csv-detective 0.9.3.dev2514__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +11 -38
- csv_detective/explore_csv.py +3 -2
- csv_detective/format.py +11 -4
- csv_detective/formats/adresse.py +9 -9
- csv_detective/formats/binary.py +2 -1
- csv_detective/formats/booleen.py +3 -2
- csv_detective/formats/code_commune_insee.py +12 -10
- csv_detective/formats/code_csp_insee.py +1 -1
- csv_detective/formats/code_departement.py +8 -7
- csv_detective/formats/code_fantoir.py +6 -5
- csv_detective/formats/code_import.py +1 -1
- csv_detective/formats/code_postal.py +10 -9
- csv_detective/formats/code_region.py +7 -6
- csv_detective/formats/code_rna.py +7 -6
- csv_detective/formats/code_waldec.py +1 -1
- csv_detective/formats/commune.py +5 -5
- csv_detective/formats/csp_insee.py +6 -5
- csv_detective/formats/data/insee_ape700.txt +1 -1
- csv_detective/formats/data/iso_country_code_alpha2.txt +153 -397
- csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
- csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
- csv_detective/formats/date.py +18 -17
- csv_detective/formats/date_fr.py +1 -1
- csv_detective/formats/datetime_aware.py +7 -2
- csv_detective/formats/datetime_naive.py +3 -0
- csv_detective/formats/datetime_rfc822.py +1 -0
- csv_detective/formats/departement.py +15 -15
- csv_detective/formats/email.py +13 -13
- csv_detective/formats/float.py +2 -1
- csv_detective/formats/geojson.py +10 -10
- csv_detective/formats/insee_ape700.py +10 -8
- csv_detective/formats/insee_canton.py +6 -6
- csv_detective/formats/int.py +2 -1
- csv_detective/formats/iso_country_code_alpha2.py +14 -14
- csv_detective/formats/iso_country_code_alpha3.py +6 -13
- csv_detective/formats/iso_country_code_numeric.py +2 -9
- csv_detective/formats/jour_de_la_semaine.py +11 -12
- csv_detective/formats/json.py +6 -0
- csv_detective/formats/latitude_l93.py +8 -22
- csv_detective/formats/latitude_wgs.py +31 -29
- csv_detective/formats/latitude_wgs_fr_metropole.py +7 -30
- csv_detective/formats/latlon_wgs.py +30 -28
- csv_detective/formats/longitude_l93.py +8 -13
- csv_detective/formats/longitude_wgs.py +34 -19
- csv_detective/formats/longitude_wgs_fr_metropole.py +6 -19
- csv_detective/formats/lonlat_wgs.py +12 -11
- csv_detective/formats/mois_de_lannee.py +1 -1
- csv_detective/formats/money.py +1 -1
- csv_detective/formats/mongo_object_id.py +1 -1
- csv_detective/formats/pays.py +11 -13
- csv_detective/formats/percent.py +1 -1
- csv_detective/formats/region.py +13 -13
- csv_detective/formats/sexe.py +1 -1
- csv_detective/formats/siren.py +9 -10
- csv_detective/formats/siret.py +9 -9
- csv_detective/formats/tel_fr.py +7 -13
- csv_detective/formats/uai.py +17 -18
- csv_detective/formats/url.py +16 -16
- csv_detective/formats/username.py +1 -1
- csv_detective/formats/uuid.py +1 -1
- csv_detective/formats/year.py +7 -12
- csv_detective/output/dataframe.py +6 -1
- csv_detective/output/profile.py +5 -1
- csv_detective/parsing/text.py +13 -12
- {csv_detective-0.9.3.dev2514.dist-info → csv_detective-0.10.1.dist-info}/METADATA +2 -2
- csv_detective-0.10.1.dist-info/RECORD +92 -0
- {csv_detective-0.9.3.dev2514.dist-info → csv_detective-0.10.1.dist-info}/WHEEL +1 -1
- csv_detective-0.9.3.dev2514.dist-info/RECORD +0 -92
- {csv_detective-0.9.3.dev2514.dist-info → csv_detective-0.10.1.dist-info}/entry_points.txt +0 -0
csv_detective/formats/date.py
CHANGED
|
@@ -7,23 +7,24 @@ from dateutil.parser import parse as dateutil_parser
|
|
|
7
7
|
|
|
8
8
|
proportion = 1
|
|
9
9
|
tags = ["temp", "type"]
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
|
|
10
|
+
python_type = "date"
|
|
11
|
+
SHARED_DATE_LABELS = {
|
|
12
|
+
"date": 1,
|
|
13
|
+
"mise à jour": 1,
|
|
14
|
+
"modifie": 1,
|
|
15
|
+
"maj": 0.75,
|
|
16
|
+
"datemaj": 1,
|
|
17
|
+
"update": 1,
|
|
18
|
+
"created": 1,
|
|
19
|
+
"modified": 1,
|
|
20
|
+
}
|
|
21
|
+
labels = SHARED_DATE_LABELS | {
|
|
22
|
+
"jour": 0.75,
|
|
23
|
+
"periode": 0.75,
|
|
24
|
+
"dpc": 0.5,
|
|
25
|
+
"yyyymmdd": 1,
|
|
26
|
+
"aaaammjj": 1,
|
|
27
|
+
}
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
def date_casting(val: str) -> datetime | None:
|
csv_detective/formats/date_fr.py
CHANGED
|
@@ -4,7 +4,8 @@ from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, dat
|
|
|
4
4
|
|
|
5
5
|
proportion = 1
|
|
6
6
|
tags = ["temp", "type"]
|
|
7
|
-
|
|
7
|
+
python_type = "datetime"
|
|
8
|
+
labels = SHARED_DATE_LABELS | {"datetime": 1, "timestamp": 1}
|
|
8
9
|
|
|
9
10
|
threshold = 0.7
|
|
10
11
|
pat = (
|
|
@@ -12,7 +13,9 @@ pat = (
|
|
|
12
13
|
+ r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
|
|
13
14
|
+ r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
|
|
14
15
|
)
|
|
15
|
-
|
|
16
|
+
# date_casting is very (too?) good at finding date(time)s where there sometimes is just a number
|
|
17
|
+
# this prefix check asserts we only consider strings that have a somewhat fine structure trying to cast
|
|
18
|
+
prefix = r"^\d{2}[-/:]?\d{2}"
|
|
16
19
|
|
|
17
20
|
|
|
18
21
|
def _is(val):
|
|
@@ -41,6 +44,8 @@ _test_values = {
|
|
|
41
44
|
"2000-12-21 10:20:10.1Z",
|
|
42
45
|
"2024-12-19T10:53:36.428000+00:00",
|
|
43
46
|
"1996/06/22 10:20:10 GMT",
|
|
47
|
+
"12/31/2022 12:00:00-04:00",
|
|
48
|
+
"12:00:00-04:00 12/31/2022",
|
|
44
49
|
],
|
|
45
50
|
False: [
|
|
46
51
|
"2021-06-22T30:20:10",
|
|
@@ -6,6 +6,7 @@ from csv_detective.formats.datetime_aware import labels, prefix # noqa
|
|
|
6
6
|
|
|
7
7
|
proportion = 1
|
|
8
8
|
tags = ["temp", "type"]
|
|
9
|
+
python_type = "datetime"
|
|
9
10
|
threshold = 0.7
|
|
10
11
|
|
|
11
12
|
# matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
|
|
@@ -36,6 +37,8 @@ _test_values = {
|
|
|
36
37
|
"2021-06-22 10:20:10",
|
|
37
38
|
"2030/06-22 00:00:00",
|
|
38
39
|
"2030/06/22 00:00:00.0028",
|
|
40
|
+
"12/31/2022 12:00:00",
|
|
41
|
+
"12:00:00 12/31/2022",
|
|
39
42
|
],
|
|
40
43
|
False: [
|
|
41
44
|
"2021-06-22T30:20:10",
|
|
@@ -2,21 +2,21 @@ from frformat import Departement, Millesime, Options
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.9
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
labels =
|
|
6
|
-
"departement",
|
|
7
|
-
"libelle du departement",
|
|
8
|
-
"deplib",
|
|
9
|
-
"nom dept",
|
|
10
|
-
"dept",
|
|
11
|
-
"libdepartement",
|
|
12
|
-
"nom departement",
|
|
13
|
-
"libelle dep",
|
|
14
|
-
"libelle departement",
|
|
15
|
-
"lb departements",
|
|
16
|
-
"dep libusage",
|
|
17
|
-
"lb departement",
|
|
18
|
-
"nom dep",
|
|
19
|
-
|
|
5
|
+
labels = {
|
|
6
|
+
"departement": 1,
|
|
7
|
+
"libelle du departement": 1,
|
|
8
|
+
"deplib": 1,
|
|
9
|
+
"nom dept": 1,
|
|
10
|
+
"dept": 0.75,
|
|
11
|
+
"libdepartement": 1,
|
|
12
|
+
"nom departement": 1,
|
|
13
|
+
"libelle dep": 1,
|
|
14
|
+
"libelle departement": 1,
|
|
15
|
+
"lb departements": 1,
|
|
16
|
+
"dep libusage": 1,
|
|
17
|
+
"lb departement": 1,
|
|
18
|
+
"nom dep": 1,
|
|
19
|
+
}
|
|
20
20
|
|
|
21
21
|
_options = Options(
|
|
22
22
|
ignore_case=True,
|
csv_detective/formats/email.py
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
import re
|
|
2
2
|
|
|
3
3
|
proportion = 0.9
|
|
4
|
-
labels =
|
|
5
|
-
"email",
|
|
6
|
-
"mail",
|
|
7
|
-
"courriel",
|
|
8
|
-
"contact",
|
|
9
|
-
"mel",
|
|
10
|
-
"lieucourriel",
|
|
11
|
-
"coordinates.emailcontact",
|
|
12
|
-
"e mail",
|
|
13
|
-
"mo mail",
|
|
14
|
-
"adresse mail",
|
|
15
|
-
"adresse email",
|
|
16
|
-
|
|
4
|
+
labels = {
|
|
5
|
+
"email": 1,
|
|
6
|
+
"mail": 1,
|
|
7
|
+
"courriel": 1,
|
|
8
|
+
"contact": 1,
|
|
9
|
+
"mel": 1,
|
|
10
|
+
"lieucourriel": 1,
|
|
11
|
+
"coordinates.emailcontact": 1,
|
|
12
|
+
"e mail": 1,
|
|
13
|
+
"mo mail": 1,
|
|
14
|
+
"adresse mail": 1,
|
|
15
|
+
"adresse email": 1,
|
|
16
|
+
}
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def _is(val):
|
csv_detective/formats/float.py
CHANGED
csv_detective/formats/geojson.py
CHANGED
|
@@ -2,16 +2,16 @@ import json
|
|
|
2
2
|
|
|
3
3
|
proportion = 1
|
|
4
4
|
tags = ["geo"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"json",
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
"geoshape",
|
|
14
|
-
|
|
5
|
+
python_type = "json"
|
|
6
|
+
labels = {
|
|
7
|
+
"json geojson": 1,
|
|
8
|
+
"json": 1,
|
|
9
|
+
"geojson": 1,
|
|
10
|
+
"geo shape": 1,
|
|
11
|
+
"geom": 0.75,
|
|
12
|
+
"geometry": 1,
|
|
13
|
+
"geoshape": 1,
|
|
14
|
+
}
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def _is(val) -> bool:
|
|
@@ -4,14 +4,16 @@ from csv_detective.parsing.text import _process_text
|
|
|
4
4
|
|
|
5
5
|
proportion = 0.8
|
|
6
6
|
tags = ["fr"]
|
|
7
|
-
labels =
|
|
8
|
-
"code ape",
|
|
9
|
-
"code activite (ape)",
|
|
10
|
-
"code naf",
|
|
11
|
-
"code naf organisme designe",
|
|
12
|
-
"code naf organisme designant",
|
|
13
|
-
"base sirene : code ape de l'etablissement siege",
|
|
14
|
-
|
|
7
|
+
labels = {
|
|
8
|
+
"code ape": 1,
|
|
9
|
+
"code activite (ape)": 1,
|
|
10
|
+
"code naf": 1,
|
|
11
|
+
"code naf organisme designe": 1,
|
|
12
|
+
"code naf organisme designant": 1,
|
|
13
|
+
"base sirene : code ape de l'etablissement siege": 1,
|
|
14
|
+
"naf": 0.75,
|
|
15
|
+
"ape": 0.5,
|
|
16
|
+
}
|
|
15
17
|
|
|
16
18
|
f = open(join(dirname(__file__), "data", "insee_ape700.txt"), "r")
|
|
17
19
|
condes_insee_ape = f.read().split("\n")
|
|
@@ -2,12 +2,12 @@ from frformat import Canton, Millesime, Options
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.9
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
labels =
|
|
6
|
-
"insee canton",
|
|
7
|
-
"canton",
|
|
8
|
-
"cant",
|
|
9
|
-
"nom canton",
|
|
10
|
-
|
|
5
|
+
labels = {
|
|
6
|
+
"insee canton": 1,
|
|
7
|
+
"canton": 1,
|
|
8
|
+
"cant": 0.5,
|
|
9
|
+
"nom canton": 1,
|
|
10
|
+
}
|
|
11
11
|
|
|
12
12
|
_options = Options(
|
|
13
13
|
ignore_case=True,
|
csv_detective/formats/int.py
CHANGED
|
@@ -3,28 +3,28 @@ from os.path import dirname, join
|
|
|
3
3
|
|
|
4
4
|
proportion = 1
|
|
5
5
|
tags = ["geo"]
|
|
6
|
-
labels =
|
|
7
|
-
"iso country code",
|
|
8
|
-
"code pays",
|
|
9
|
-
"pays",
|
|
10
|
-
"country",
|
|
11
|
-
"nation",
|
|
12
|
-
"pays code",
|
|
13
|
-
"code pays (iso)",
|
|
14
|
-
|
|
6
|
+
labels = {
|
|
7
|
+
"iso country code": 1,
|
|
8
|
+
"code pays": 1,
|
|
9
|
+
"pays": 1,
|
|
10
|
+
"country": 1,
|
|
11
|
+
"nation": 1,
|
|
12
|
+
"pays code": 1,
|
|
13
|
+
"code pays (iso)": 1,
|
|
14
|
+
"code": 0.5,
|
|
15
|
+
}
|
|
15
16
|
|
|
16
17
|
with open(join(dirname(__file__), "data", "iso_country_code_alpha2.txt"), "r") as iofile:
|
|
17
|
-
liste_pays = iofile.read().split("\n")
|
|
18
|
-
liste_pays = set(liste_pays)
|
|
18
|
+
liste_pays = set(iofile.read().split("\n"))
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def _is(val):
|
|
22
|
-
if not isinstance(val, str) or not bool(re.match(r"[
|
|
22
|
+
if not isinstance(val, str) or not bool(re.match(r"[a-zA-Z]{2}$", val)):
|
|
23
23
|
return False
|
|
24
|
-
return val in liste_pays
|
|
24
|
+
return val.upper() in liste_pays
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
_test_values = {
|
|
28
|
-
True: ["FR"],
|
|
28
|
+
True: ["FR", "sj"],
|
|
29
29
|
False: ["XX", "A", "FRA"],
|
|
30
30
|
}
|
|
@@ -1,30 +1,23 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from os.path import dirname, join
|
|
3
3
|
|
|
4
|
+
from csv_detective.formats.iso_country_code_alpha2 import labels # noqa
|
|
5
|
+
|
|
4
6
|
proportion = 1
|
|
5
7
|
tags = ["geo"]
|
|
6
|
-
labels = [
|
|
7
|
-
"iso country code",
|
|
8
|
-
"code pays",
|
|
9
|
-
"pays",
|
|
10
|
-
"country",
|
|
11
|
-
"nation",
|
|
12
|
-
"pays code",
|
|
13
|
-
"code pays (iso)",
|
|
14
|
-
]
|
|
15
8
|
|
|
16
9
|
with open(join(dirname(__file__), "data", "iso_country_code_alpha3.txt"), "r") as iofile:
|
|
17
|
-
liste_pays = iofile.read().split("\n")
|
|
10
|
+
liste_pays = set(iofile.read().split("\n"))
|
|
18
11
|
|
|
19
12
|
|
|
20
13
|
def _is(val):
|
|
21
14
|
"""Renvoie True si val peut etre un code iso pays alpha-3, False sinon"""
|
|
22
|
-
if not isinstance(val, str) or not bool(re.match(r"[
|
|
15
|
+
if not isinstance(val, str) or not bool(re.match(r"[a-zA-Z]{3}$", val)):
|
|
23
16
|
return False
|
|
24
|
-
return val in
|
|
17
|
+
return val.upper() in liste_pays
|
|
25
18
|
|
|
26
19
|
|
|
27
20
|
_test_values = {
|
|
28
|
-
True: ["FRA"],
|
|
21
|
+
True: ["FRA", "brb"],
|
|
29
22
|
False: ["XXX", "FR", "A"],
|
|
30
23
|
}
|
|
@@ -1,17 +1,10 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from os.path import dirname, join
|
|
3
3
|
|
|
4
|
+
from csv_detective.formats.iso_country_code_alpha2 import labels # noqa
|
|
5
|
+
|
|
4
6
|
proportion = 1
|
|
5
7
|
tags = ["geo"]
|
|
6
|
-
labels = [
|
|
7
|
-
"iso country code",
|
|
8
|
-
"code pays",
|
|
9
|
-
"pays",
|
|
10
|
-
"country",
|
|
11
|
-
"nation",
|
|
12
|
-
"pays code",
|
|
13
|
-
"code pays (iso)",
|
|
14
|
-
]
|
|
15
8
|
|
|
16
9
|
with open(join(dirname(__file__), "data", "iso_country_code_numeric.txt"), "r") as iofile:
|
|
17
10
|
liste_pays = iofile.read().split("\n")
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
proportion = 0.8
|
|
2
2
|
tags = ["fr", "temp"]
|
|
3
|
-
labels =
|
|
4
|
-
"jour semaine",
|
|
5
|
-
"type jour",
|
|
6
|
-
"jour de la semaine",
|
|
7
|
-
"saufjour",
|
|
8
|
-
"nomjour",
|
|
9
|
-
"jour",
|
|
10
|
-
"jour de fermeture",
|
|
11
|
-
|
|
3
|
+
labels = {
|
|
4
|
+
"jour semaine": 1,
|
|
5
|
+
"type jour": 1,
|
|
6
|
+
"jour de la semaine": 1,
|
|
7
|
+
"saufjour": 1,
|
|
8
|
+
"nomjour": 1,
|
|
9
|
+
"jour": 0.75,
|
|
10
|
+
"jour de fermeture": 1,
|
|
11
|
+
}
|
|
12
12
|
|
|
13
13
|
jours = {
|
|
14
14
|
"lundi",
|
|
@@ -31,11 +31,10 @@ jours = {
|
|
|
31
31
|
def _is(val):
|
|
32
32
|
if not isinstance(val, str):
|
|
33
33
|
return False
|
|
34
|
-
|
|
35
|
-
return val in jours
|
|
34
|
+
return val.lower() in jours
|
|
36
35
|
|
|
37
36
|
|
|
38
37
|
_test_values = {
|
|
39
38
|
True: ["lundi"],
|
|
40
|
-
False: ["jour
|
|
39
|
+
False: ["jour"],
|
|
41
40
|
}
|
csv_detective/formats/json.py
CHANGED
|
@@ -2,31 +2,17 @@ from frformat import LatitudeL93
|
|
|
2
2
|
|
|
3
3
|
from csv_detective.formats.float import _is as is_float
|
|
4
4
|
from csv_detective.formats.float import float_casting
|
|
5
|
+
from csv_detective.formats.latitude_wgs import SHARED_LATITUDE_LABELS
|
|
5
6
|
|
|
6
7
|
proportion = 1
|
|
7
8
|
tags = ["fr", "geo"]
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
"y",
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
|
|
15
|
-
"coordonnee y",
|
|
16
|
-
"latitude lb93",
|
|
17
|
-
"coord y",
|
|
18
|
-
"ycoord",
|
|
19
|
-
"geocodage y gps",
|
|
20
|
-
"location latitude",
|
|
21
|
-
"ylatitude",
|
|
22
|
-
"ylat",
|
|
23
|
-
"latitude (y)",
|
|
24
|
-
"latitudeorg",
|
|
25
|
-
"coordinates.latitude",
|
|
26
|
-
"googlemap latitude",
|
|
27
|
-
"latitudelieu",
|
|
28
|
-
"latitude googlemap",
|
|
29
|
-
]
|
|
9
|
+
mandatory_label = True
|
|
10
|
+
python_type = "float"
|
|
11
|
+
labels = SHARED_LATITUDE_LABELS | {
|
|
12
|
+
"y l93": 1,
|
|
13
|
+
"latitude lb93": 1,
|
|
14
|
+
"lamby": 1,
|
|
15
|
+
}
|
|
30
16
|
|
|
31
17
|
_latitudel93 = LatitudeL93()
|
|
32
18
|
|
|
@@ -1,42 +1,44 @@
|
|
|
1
1
|
from csv_detective.formats.float import _is as is_float
|
|
2
|
+
from csv_detective.formats.int import _is as is_int
|
|
2
3
|
|
|
3
4
|
proportion = 1
|
|
4
5
|
tags = ["geo"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
"
|
|
20
|
-
"
|
|
21
|
-
"
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
|
|
25
|
-
"y wgs84",
|
|
26
|
-
"latitude (wgs84)",
|
|
27
|
-
]
|
|
6
|
+
mandatory_label = True
|
|
7
|
+
python_type = "float"
|
|
8
|
+
SHARED_LATITUDE_LABELS = {
|
|
9
|
+
"latitude": 1,
|
|
10
|
+
"lat": 0.75,
|
|
11
|
+
"y": 0.5,
|
|
12
|
+
"yf": 0.5,
|
|
13
|
+
"yd": 0.5,
|
|
14
|
+
"coordonnee y": 1,
|
|
15
|
+
"coord y": 1,
|
|
16
|
+
"ycoord": 1,
|
|
17
|
+
"ylat": 1,
|
|
18
|
+
}
|
|
19
|
+
labels = SHARED_LATITUDE_LABELS | {
|
|
20
|
+
"y gps": 1,
|
|
21
|
+
"latitude wgs84": 1,
|
|
22
|
+
"y wgs84": 1,
|
|
23
|
+
"wsg": 0.75,
|
|
24
|
+
"gps": 0.5,
|
|
25
|
+
}
|
|
28
26
|
|
|
29
27
|
|
|
30
28
|
def _is(val):
|
|
31
29
|
try:
|
|
32
|
-
return
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
return (
|
|
31
|
+
is_float(val)
|
|
32
|
+
and -90 <= float(val) <= 90
|
|
33
|
+
# we ideally would like a certain level of decimal precision
|
|
34
|
+
# but 1.200 is saved as 1.2 in csv so we just discriminate ints
|
|
35
|
+
and not is_int(val)
|
|
36
|
+
)
|
|
37
|
+
except Exception:
|
|
36
38
|
return False
|
|
37
39
|
|
|
38
40
|
|
|
39
41
|
_test_values = {
|
|
40
|
-
True: ["43.
|
|
41
|
-
False: ["100"],
|
|
42
|
+
True: ["43.2872", "-22.61", "-3.0"],
|
|
43
|
+
False: ["100.1973", "40"],
|
|
42
44
|
}
|
|
@@ -1,42 +1,19 @@
|
|
|
1
|
-
from csv_detective.formats.
|
|
1
|
+
from csv_detective.formats.latitude_wgs import _is as is_latitude, labels # noqa
|
|
2
2
|
|
|
3
3
|
proportion = 1
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"lat",
|
|
8
|
-
"y",
|
|
9
|
-
"yf",
|
|
10
|
-
"yd",
|
|
11
|
-
"coordonnee y",
|
|
12
|
-
"coord y",
|
|
13
|
-
"ycoord",
|
|
14
|
-
"geocodage y gps",
|
|
15
|
-
"location latitude",
|
|
16
|
-
"ylatitude",
|
|
17
|
-
"ylat",
|
|
18
|
-
"latitude (y)",
|
|
19
|
-
"latitudeorg",
|
|
20
|
-
"coordinates.latitude",
|
|
21
|
-
"googlemap latitude",
|
|
22
|
-
"latitudelieu",
|
|
23
|
-
"latitude googlemap",
|
|
24
|
-
"latitude wgs84",
|
|
25
|
-
"y wgs84",
|
|
26
|
-
"latitude (wgs84)",
|
|
27
|
-
]
|
|
5
|
+
mandatory_label = True
|
|
6
|
+
python_type = "float"
|
|
28
7
|
|
|
29
8
|
|
|
30
9
|
def _is(val):
|
|
31
10
|
try:
|
|
32
|
-
return
|
|
33
|
-
except
|
|
34
|
-
return False
|
|
35
|
-
except OverflowError:
|
|
11
|
+
return is_latitude(val) and 41.3 <= float(val) <= 51.3
|
|
12
|
+
except Exception:
|
|
36
13
|
return False
|
|
37
14
|
|
|
38
15
|
|
|
39
16
|
_test_values = {
|
|
40
|
-
True: ["42.5"],
|
|
41
|
-
False: ["22.5"
|
|
17
|
+
True: ["42.576", "42.5"],
|
|
18
|
+
False: ["22.5"],
|
|
42
19
|
}
|
|
@@ -3,37 +3,39 @@ from csv_detective.formats.longitude_wgs import _is as is_lon
|
|
|
3
3
|
|
|
4
4
|
proportion = 1
|
|
5
5
|
tags = ["geo"]
|
|
6
|
+
mandatory_label = True
|
|
7
|
+
|
|
8
|
+
SHARED_COORDS_LABELS = {
|
|
9
|
+
"ban": 1,
|
|
10
|
+
"coordinates": 1,
|
|
11
|
+
"coordonnees": 1,
|
|
12
|
+
"coordonnees insee": 1,
|
|
13
|
+
"coord": 1,
|
|
14
|
+
"geo": 0.5,
|
|
15
|
+
"geopoint": 1,
|
|
16
|
+
"geoloc": 1,
|
|
17
|
+
"geolocalisation": 1,
|
|
18
|
+
"geom": 0.75,
|
|
19
|
+
"geometry": 1,
|
|
20
|
+
"gps": 1,
|
|
21
|
+
"localisation": 1,
|
|
22
|
+
"point": 1,
|
|
23
|
+
"position": 1,
|
|
24
|
+
"wgs84": 1,
|
|
25
|
+
}
|
|
6
26
|
|
|
7
|
-
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
|
|
13
|
-
"geopoint",
|
|
14
|
-
"geoloc",
|
|
15
|
-
"geolocalisation",
|
|
16
|
-
"geom",
|
|
17
|
-
"geometry",
|
|
18
|
-
"gps",
|
|
19
|
-
"localisation",
|
|
20
|
-
"point",
|
|
21
|
-
"position",
|
|
22
|
-
"wgs84",
|
|
23
|
-
]
|
|
24
|
-
|
|
25
|
-
specific = [
|
|
26
|
-
"latlon",
|
|
27
|
-
"lat lon",
|
|
28
|
-
"x y",
|
|
29
|
-
"xy",
|
|
30
|
-
]
|
|
27
|
+
specific = {
|
|
28
|
+
"latlon": 1,
|
|
29
|
+
"lat lon": 1,
|
|
30
|
+
"x y": 0.75,
|
|
31
|
+
"xy": 0.75,
|
|
32
|
+
}
|
|
31
33
|
|
|
32
34
|
# we aim wide to catch exact matches if possible for the highest possible score
|
|
33
35
|
labels = (
|
|
34
36
|
SHARED_COORDS_LABELS
|
|
35
|
-
|
|
36
|
-
|
|
37
|
+
| specific
|
|
38
|
+
| {w + sep + suf: 1 for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]}
|
|
37
39
|
)
|
|
38
40
|
|
|
39
41
|
|
|
@@ -48,6 +50,6 @@ def _is(val):
|
|
|
48
50
|
|
|
49
51
|
|
|
50
52
|
_test_values = {
|
|
51
|
-
True: ["43.2,-22.6", "-10.
|
|
52
|
-
False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
|
|
53
|
+
True: ["43.2,-22.6", "-10.71,140.0", "-40.791, 10.81", "[12.01,-0.28]"],
|
|
54
|
+
False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27", "1,2", "43, -23"],
|
|
53
55
|
}
|