csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from csv_detective.formats.datetime_aware import labels # noqa
|
|
4
|
+
|
|
5
|
+
proportion = 1
|
|
6
|
+
tags = ["temp", "type"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _is(val):
|
|
10
|
+
return isinstance(val, str) and bool(
|
|
11
|
+
re.match(
|
|
12
|
+
r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
|
|
13
|
+
r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
|
|
14
|
+
r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
|
|
15
|
+
val.lower(),
|
|
16
|
+
re.IGNORECASE,
|
|
17
|
+
)
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
_test_values = {
|
|
22
|
+
True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
|
|
23
|
+
False: ["2021-06-22T10:20:10"],
|
|
24
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from frformat import Departement, Millesime, Options
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"departement",
|
|
7
|
+
"libelle du departement",
|
|
8
|
+
"deplib",
|
|
9
|
+
"nom dept",
|
|
10
|
+
"dept",
|
|
11
|
+
"libdepartement",
|
|
12
|
+
"nom departement",
|
|
13
|
+
"libelle dep",
|
|
14
|
+
"libelle departement",
|
|
15
|
+
"lb departements",
|
|
16
|
+
"dep libusage",
|
|
17
|
+
"lb departement",
|
|
18
|
+
"nom dep",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
_options = Options(
|
|
22
|
+
ignore_case=True,
|
|
23
|
+
ignore_accents=True,
|
|
24
|
+
replace_non_alphanumeric_with_space=True,
|
|
25
|
+
ignore_extra_whitespace=True,
|
|
26
|
+
)
|
|
27
|
+
_departement = Departement(Millesime.LATEST, _options)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _is(val):
    """Check ``val`` against the module-level ``_departement`` validator.

    Non-string values are rejected without calling the validator; for strings
    the result of ``_departement.is_valid`` is returned as is.
    """
    if not isinstance(val, str):
        return False
    return _departement.is_valid(val)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
_test_values = {
|
|
35
|
+
True: ["essonne"],
|
|
36
|
+
False: ["alabama", "auvergne"],
|
|
37
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
labels = [
|
|
5
|
+
"email",
|
|
6
|
+
"mail",
|
|
7
|
+
"courriel",
|
|
8
|
+
"contact",
|
|
9
|
+
"mel",
|
|
10
|
+
"lieucourriel",
|
|
11
|
+
"coordinates.emailcontact",
|
|
12
|
+
"e mail",
|
|
13
|
+
"mo mail",
|
|
14
|
+
"adresse mail",
|
|
15
|
+
"adresse email",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _is(val):
|
|
20
|
+
return isinstance(val, str) and bool(
|
|
21
|
+
re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
_test_values = {
|
|
26
|
+
True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
|
|
27
|
+
False: ["cdo@@gouv.sfd"],
|
|
28
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
proportion = 1
|
|
2
|
+
tags = ["type"]
|
|
3
|
+
labels = ["part", "ratio", "taux"]
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def float_casting(val: str) -> float:
|
|
7
|
+
return float(val.replace(",", "."))
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _is(val):
|
|
11
|
+
"""Detects floats, assuming that tables will not have scientific
|
|
12
|
+
notations (3e6) or "+" in the string. "-" is still accepted."""
|
|
13
|
+
try:
|
|
14
|
+
if (
|
|
15
|
+
not isinstance(val, str)
|
|
16
|
+
or any([k in val for k in ["_", "+", "e", "E"]])
|
|
17
|
+
or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
|
|
18
|
+
):
|
|
19
|
+
return False
|
|
20
|
+
float_casting(val)
|
|
21
|
+
return True
|
|
22
|
+
except ValueError:
|
|
23
|
+
return False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
_test_values = {
|
|
27
|
+
True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
|
|
28
|
+
False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
|
|
29
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["geo"]
|
|
5
|
+
# Column names hinting at this format ("geo shape" used to be listed twice).
labels = [
    "json geojson",
    "json",
    "geojson",
    "geo shape",
    "geom",
    "geometry",
    "geoshape",
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is(val) -> bool:
|
|
18
|
+
try:
|
|
19
|
+
j = json.loads(val)
|
|
20
|
+
if isinstance(j, dict):
|
|
21
|
+
if "type" in j and "coordinates" in j:
|
|
22
|
+
return True
|
|
23
|
+
if "geometry" in j and "coordinates" in j["geometry"]:
|
|
24
|
+
return True
|
|
25
|
+
except Exception:
|
|
26
|
+
pass
|
|
27
|
+
return False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
_test_values = {
|
|
31
|
+
True: [
|
|
32
|
+
'{"coordinates": [45.783753, 3.049342], "type": "63870"}',
|
|
33
|
+
'{"geometry": {"coordinates": [45.783753, 3.049342]}}',
|
|
34
|
+
],
|
|
35
|
+
False: ['{"pomme": "fruit", "reponse": 42}'],
|
|
36
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from os.path import dirname, join
|
|
2
|
+
|
|
3
|
+
from csv_detective.parsing.text import _process_text
|
|
4
|
+
|
|
5
|
+
proportion = 0.8
|
|
6
|
+
tags = ["fr"]
|
|
7
|
+
labels = [
|
|
8
|
+
"code ape",
|
|
9
|
+
"code activite (ape)",
|
|
10
|
+
"code naf",
|
|
11
|
+
"code naf organisme designe",
|
|
12
|
+
"code naf organisme designant",
|
|
13
|
+
"base sirene : code ape de l'etablissement siege",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
# Reference codes are read once, at import time, from the bundled data file.
# The context manager guarantees the file is closed even if reading fails.
with open(join(dirname(__file__), "data", "insee_ape700.txt"), "r") as f:
    # empty strings (e.g. the one produced by the trailing newline) are
    # dropped; filtering instead of deleting the last item keeps the last
    # code when the file does not end with a newline
    condes_insee_ape = {line for line in f.read().split("\n") if line}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _is(val):
    """Tell whether ``val`` belongs to the ``condes_insee_ape`` reference set.

    String values are passed through ``_process_text`` and upper-cased before
    the lookup; any other type is rejected.
    """
    if isinstance(val, str):
        normalized = _process_text(val).upper()
        return normalized in condes_insee_ape
    return False
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
_test_values = {True: ["0116Z"], False: ["0116A"]}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from frformat import Canton, Millesime, Options
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"insee canton",
|
|
7
|
+
"canton",
|
|
8
|
+
"cant",
|
|
9
|
+
"nom canton",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
_options = Options(
|
|
13
|
+
ignore_case=True,
|
|
14
|
+
ignore_accents=True,
|
|
15
|
+
replace_non_alphanumeric_with_space=True,
|
|
16
|
+
ignore_extra_whitespace=True,
|
|
17
|
+
)
|
|
18
|
+
_canton = Canton(Millesime.LATEST, _options)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _is(val):
    """Check ``val`` against the module-level ``_canton`` validator.

    Non-string values are rejected without calling the validator; for strings
    the result of ``_canton.is_valid`` is returned as is.
    """
    if not isinstance(val, str):
        return False
    return _canton.is_valid(val)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
_test_values = {
|
|
26
|
+
True: ["nantua"],
|
|
27
|
+
False: ["california"],
|
|
28
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
labels = ["nb", "nombre", "nbre"]
# The other format modules expose this list as ``tags``; the original spelling
# ``tag`` is kept as an alias so that existing references keep working.
tags = ["type"]
tag = tags
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _is(val):
|
|
6
|
+
"""Detects integers"""
|
|
7
|
+
if (
|
|
8
|
+
not isinstance(val, str)
|
|
9
|
+
or any([v in val for v in [".", "_", "+"]])
|
|
10
|
+
or (val.startswith("0") and len(val) > 1)
|
|
11
|
+
):
|
|
12
|
+
return False
|
|
13
|
+
try:
|
|
14
|
+
int(val)
|
|
15
|
+
return True
|
|
16
|
+
except ValueError:
|
|
17
|
+
return False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
_test_values = {
|
|
21
|
+
True: ["1", "0", "1764", "-24"],
|
|
22
|
+
False: ["01053", "1.2", "123_456", "+35"],
|
|
23
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from os.path import dirname, join
|
|
3
|
+
|
|
4
|
+
proportion = 1
|
|
5
|
+
tags = ["geo"]
|
|
6
|
+
labels = [
|
|
7
|
+
"iso country code",
|
|
8
|
+
"code pays",
|
|
9
|
+
"pays",
|
|
10
|
+
"country",
|
|
11
|
+
"nation",
|
|
12
|
+
"pays code",
|
|
13
|
+
"code pays (iso)",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
with open(join(dirname(__file__), "data", "iso_country_code_alpha2.txt"), "r") as iofile:
|
|
17
|
+
liste_pays = iofile.read().split("\n")
|
|
18
|
+
liste_pays = set(liste_pays)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _is(val):
    """Tell whether ``val`` is one of the codes listed in ``liste_pays``.

    The value must be a string starting with exactly two uppercase ASCII
    letters (optionally followed by a newline, as allowed by ``$``) before the
    membership test is attempted.
    """
    has_code_shape = isinstance(val, str) and re.match(r"[A-Z]{2}$", val) is not None
    return has_code_shape and val in liste_pays
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
_test_values = {
|
|
28
|
+
True: ["FR"],
|
|
29
|
+
False: ["XX", "A", "FRA"],
|
|
30
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from os.path import dirname, join
|
|
3
|
+
|
|
4
|
+
proportion = 1
|
|
5
|
+
tags = ["geo"]
|
|
6
|
+
labels = [
|
|
7
|
+
"iso country code",
|
|
8
|
+
"code pays",
|
|
9
|
+
"pays",
|
|
10
|
+
"country",
|
|
11
|
+
"nation",
|
|
12
|
+
"pays code",
|
|
13
|
+
"code pays (iso)",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
with open(join(dirname(__file__), "data", "iso_country_code_alpha3.txt"), "r") as iofile:
    liste_pays = iofile.read().split("\n")
# Built once at import time so that each lookup in _is is a constant-time set
# membership test instead of rebuilding a set on every call.
liste_pays = set(liste_pays)


def _is(val):
    """Return True if val can be an ISO alpha-3 country code, False otherwise.

    The value must be a string made of three uppercase ASCII letters and be
    present in ``liste_pays``.
    """
    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)):
        return False
    return val in liste_pays
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
_test_values = {
|
|
28
|
+
True: ["FRA"],
|
|
29
|
+
False: ["XXX", "FR", "A"],
|
|
30
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from os.path import dirname, join
|
|
3
|
+
|
|
4
|
+
proportion = 1
|
|
5
|
+
tags = ["geo"]
|
|
6
|
+
labels = [
|
|
7
|
+
"iso country code",
|
|
8
|
+
"code pays",
|
|
9
|
+
"pays",
|
|
10
|
+
"country",
|
|
11
|
+
"nation",
|
|
12
|
+
"pays code",
|
|
13
|
+
"code pays (iso)",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
with open(join(dirname(__file__), "data", "iso_country_code_numeric.txt"), "r") as iofile:
|
|
17
|
+
liste_pays = iofile.read().split("\n")
|
|
18
|
+
liste_pays = set(liste_pays)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _is(val):
    """Return True if val can be a numeric ISO country code, False otherwise.

    The value must be a string starting with exactly three ASCII digits
    (optionally followed by a newline, as allowed by ``$``) and be present in
    ``liste_pays``.
    """
    has_code_shape = isinstance(val, str) and re.match(r"[0-9]{3}$", val) is not None
    return has_code_shape and val in liste_pays
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_test_values = {
|
|
29
|
+
True: ["250"],
|
|
30
|
+
False: ["003"],
|
|
31
|
+
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
proportion = 0.8
|
|
2
|
+
tags = ["fr", "temp"]
|
|
3
|
+
labels = [
|
|
4
|
+
"jour semaine",
|
|
5
|
+
"type jour",
|
|
6
|
+
"jour de la semaine",
|
|
7
|
+
"saufjour",
|
|
8
|
+
"nomjour",
|
|
9
|
+
"jour",
|
|
10
|
+
"jour de fermeture",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
jours = {
|
|
14
|
+
"lundi",
|
|
15
|
+
"mardi",
|
|
16
|
+
"mercredi",
|
|
17
|
+
"jeudi",
|
|
18
|
+
"vendredi",
|
|
19
|
+
"samedi",
|
|
20
|
+
"dimanche",
|
|
21
|
+
"lun",
|
|
22
|
+
"mar",
|
|
23
|
+
"mer",
|
|
24
|
+
"jeu",
|
|
25
|
+
"ven",
|
|
26
|
+
"sam",
|
|
27
|
+
"dim",
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _is(val):
    """Tell whether ``val`` is one of the names in ``jours``.

    The comparison is done on the lower-cased string; non-string values are
    rejected.
    """
    return isinstance(val, str) and val.lower() in jours
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
_test_values = {
|
|
39
|
+
True: ["lundi"],
|
|
40
|
+
False: ["jour de la biere"],
|
|
41
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from json import JSONDecodeError
|
|
3
|
+
|
|
4
|
+
proportion = 1
|
|
5
|
+
tags = ["type"]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _is(val):
|
|
9
|
+
try:
|
|
10
|
+
loaded = json.loads(val)
|
|
11
|
+
# we don't want to consider integers for instance
|
|
12
|
+
return isinstance(loaded, (list, dict))
|
|
13
|
+
except (JSONDecodeError, TypeError):
|
|
14
|
+
return False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
_test_values = {
|
|
18
|
+
True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"],
|
|
19
|
+
False: ["5", '{"zefib":', '{"a"}'],
|
|
20
|
+
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from frformat import LatitudeL93
|
|
2
|
+
|
|
3
|
+
from csv_detective.formats.float import _is as is_float
|
|
4
|
+
from csv_detective.formats.float import float_casting
|
|
5
|
+
|
|
6
|
+
proportion = 1
|
|
7
|
+
tags = ["fr", "geo"]
|
|
8
|
+
labels = [
|
|
9
|
+
"latitude",
|
|
10
|
+
"lat",
|
|
11
|
+
"y",
|
|
12
|
+
"yf",
|
|
13
|
+
"yd",
|
|
14
|
+
"y l93",
|
|
15
|
+
"coordonnee y",
|
|
16
|
+
"latitude lb93",
|
|
17
|
+
"coord y",
|
|
18
|
+
"ycoord",
|
|
19
|
+
"geocodage y gps",
|
|
20
|
+
"location latitude",
|
|
21
|
+
"ylatitude",
|
|
22
|
+
"ylat",
|
|
23
|
+
"latitude (y)",
|
|
24
|
+
"latitudeorg",
|
|
25
|
+
"coordinates.latitude",
|
|
26
|
+
"googlemap latitude",
|
|
27
|
+
"latitudelieu",
|
|
28
|
+
"latitude googlemap",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
_latitudel93 = LatitudeL93()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _is(val):
    """Check ``val`` against the module-level ``_latitudel93`` validator.

    Only strings accepted by ``is_float`` are considered; they are converted
    with ``float_casting`` before being validated. A ``ValueError`` or
    ``OverflowError`` raised along the way yields ``False``.
    """
    try:
        is_candidate = isinstance(val, str) and is_float(val)
        return _latitudel93.is_valid(float_casting(val)) if is_candidate else False
    except (ValueError, OverflowError):
        return False
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
_test_values = {
|
|
46
|
+
True: ["6037008", "7123528.5", "7124528,5"],
|
|
47
|
+
False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"],
|
|
48
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from csv_detective.formats.float import _is as is_float
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"latitude",
|
|
7
|
+
"lat",
|
|
8
|
+
"y",
|
|
9
|
+
"yf",
|
|
10
|
+
"yd",
|
|
11
|
+
"coordonnee y",
|
|
12
|
+
"coord y",
|
|
13
|
+
"ycoord",
|
|
14
|
+
"geocodage y gps",
|
|
15
|
+
"location latitude",
|
|
16
|
+
"ylatitude",
|
|
17
|
+
"ylat",
|
|
18
|
+
"latitude (y)",
|
|
19
|
+
"latitudeorg",
|
|
20
|
+
"coordinates.latitude",
|
|
21
|
+
"googlemap latitude",
|
|
22
|
+
"latitudelieu",
|
|
23
|
+
"latitude googlemap",
|
|
24
|
+
"latitude wgs84",
|
|
25
|
+
"y wgs84",
|
|
26
|
+
"latitude (wgs84)",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _is(val):
    """Tell whether ``val`` is a float-like string between -90 and 90 inclusive.

    Values rejected by ``is_float`` yield ``False``; so do values for which
    ``float()`` raises ``ValueError`` or ``OverflowError``.
    """
    # NOTE(review): a comma decimal such as "43,2" makes float() raise
    # ValueError and is therefore rejected here — confirm this is intended.
    try:
        # the value is parsed once and compared with a chained comparison
        return is_float(val) and -90 <= float(val) <= 90
    except (ValueError, OverflowError):
        return False
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
_test_values = {
|
|
40
|
+
True: ["43.2", "-22"],
|
|
41
|
+
False: ["100"],
|
|
42
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from csv_detective.formats.float import _is as is_float
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"latitude",
|
|
7
|
+
"lat",
|
|
8
|
+
"y",
|
|
9
|
+
"yf",
|
|
10
|
+
"yd",
|
|
11
|
+
"coordonnee y",
|
|
12
|
+
"coord y",
|
|
13
|
+
"ycoord",
|
|
14
|
+
"geocodage y gps",
|
|
15
|
+
"location latitude",
|
|
16
|
+
"ylatitude",
|
|
17
|
+
"ylat",
|
|
18
|
+
"latitude (y)",
|
|
19
|
+
"latitudeorg",
|
|
20
|
+
"coordinates.latitude",
|
|
21
|
+
"googlemap latitude",
|
|
22
|
+
"latitudelieu",
|
|
23
|
+
"latitude googlemap",
|
|
24
|
+
"latitude wgs84",
|
|
25
|
+
"y wgs84",
|
|
26
|
+
"latitude (wgs84)",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _is(val):
    """Tell whether ``val`` is a float-like string between 41.3 and 51.3 inclusive.

    Values rejected by ``is_float`` yield ``False``; so do values for which
    ``float()`` raises ``ValueError`` or ``OverflowError``.
    """
    # NOTE(review): the bounds are presumably the latitude span of
    # metropolitan France; a comma decimal such as "42,5" makes float() raise
    # ValueError and is therefore rejected — confirm both points.
    try:
        # the value is parsed once and compared with a chained comparison
        return is_float(val) and 41.3 <= float(val) <= 51.3
    except (ValueError, OverflowError):
        return False
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
_test_values = {
|
|
40
|
+
True: ["42.5"],
|
|
41
|
+
False: ["22.5", "62.5"],
|
|
42
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from csv_detective.formats.latitude_wgs import _is as is_lat
|
|
2
|
+
from csv_detective.formats.longitude_wgs import _is as is_lon
|
|
3
|
+
|
|
4
|
+
proportion = 1
|
|
5
|
+
tags = ["geo"]
|
|
6
|
+
|
|
7
|
+
SHARED_COORDS_LABELS = [
|
|
8
|
+
"ban",
|
|
9
|
+
"coordinates",
|
|
10
|
+
"coordonnees",
|
|
11
|
+
"coordonnees insee",
|
|
12
|
+
"geo",
|
|
13
|
+
"geopoint",
|
|
14
|
+
"geoloc",
|
|
15
|
+
"geolocalisation",
|
|
16
|
+
"geom",
|
|
17
|
+
"geometry",
|
|
18
|
+
"gps",
|
|
19
|
+
"localisation",
|
|
20
|
+
"point",
|
|
21
|
+
"position",
|
|
22
|
+
"wgs84",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
specific = [
|
|
26
|
+
"latlon",
|
|
27
|
+
"lat lon",
|
|
28
|
+
"x y",
|
|
29
|
+
"xy",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# we aim wide to catch exact matches if possible for the highest possible score
|
|
33
|
+
labels = (
|
|
34
|
+
SHARED_COORDS_LABELS
|
|
35
|
+
+ specific
|
|
36
|
+
+ [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _is(val):
    """Tell whether ``val`` is a "lat,lon" pair, optionally wrapped in brackets.

    The value must be a string containing exactly one comma. The part before
    the comma is checked with ``is_lat`` and the part after it, with spaces
    removed, with ``is_lon``.
    """
    if not isinstance(val, str) or val.count(",") != 1:
        return False
    first, _, second = val.partition(",")
    # handling [lat,lon]: brackets are only stripped when both are present
    bracketed = first.startswith("[") and second.endswith("]")
    if bracketed:
        first = first[1:]
        second = second[:-1]
    return is_lat(first) and is_lon(second.replace(" ", ""))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
_test_values = {
|
|
51
|
+
True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"],
|
|
52
|
+
False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
|
|
53
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from frformat import LongitudeL93
|
|
2
|
+
|
|
3
|
+
from csv_detective.formats.float import _is as is_float
|
|
4
|
+
from csv_detective.formats.float import float_casting
|
|
5
|
+
|
|
6
|
+
proportion = 1
|
|
7
|
+
tags = ["fr", "geo"]
|
|
8
|
+
labels = [
|
|
9
|
+
"longitude",
|
|
10
|
+
"lon",
|
|
11
|
+
"long",
|
|
12
|
+
"geocodage x gps",
|
|
13
|
+
"location longitude",
|
|
14
|
+
"xlongitude",
|
|
15
|
+
"lng",
|
|
16
|
+
"xlong",
|
|
17
|
+
"x",
|
|
18
|
+
"xf",
|
|
19
|
+
"xd",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
_longitudel93 = LongitudeL93()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _is(val):
    """Check ``val`` against the module-level ``_longitudel93`` validator.

    Only strings accepted by ``is_float`` are considered; they are converted
    with ``float_casting`` before being validated. A ``ValueError`` or
    ``OverflowError`` raised along the way yields ``False``.
    """
    try:
        is_candidate = isinstance(val, str) and is_float(val)
        return _longitudel93.is_valid(float_casting(val)) if is_candidate else False
    except (ValueError, OverflowError):
        return False
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
_test_values = {
|
|
37
|
+
True: ["0", "-154", "1265783,45", "34723.4"],
|
|
38
|
+
False: ["1456669.8", "-776225", "346_3214"],
|
|
39
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from csv_detective.formats.float import _is as is_float
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"longitude",
|
|
7
|
+
"lon",
|
|
8
|
+
"long",
|
|
9
|
+
"geocodage x gps",
|
|
10
|
+
"location longitude",
|
|
11
|
+
"xlongitude",
|
|
12
|
+
"lng",
|
|
13
|
+
"xlong",
|
|
14
|
+
"x",
|
|
15
|
+
"xf",
|
|
16
|
+
"xd",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _is(val):
    """Tell whether ``val`` is a float-like string between -180 and 180 inclusive.

    Values rejected by ``is_float`` yield ``False``; so do values for which
    ``float()`` raises ``ValueError`` or ``OverflowError``.
    """
    # NOTE(review): a comma decimal such as "12,5" makes float() raise
    # ValueError and is therefore rejected here — confirm this is intended.
    try:
        # the value is parsed once and compared with a chained comparison
        return is_float(val) and -180 <= float(val) <= 180
    except (ValueError, OverflowError):
        return False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_test_values = {
|
|
30
|
+
True: ["120", "-20.2"],
|
|
31
|
+
False: ["-200"],
|
|
32
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from csv_detective.formats.float import _is as is_float
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"longitude",
|
|
7
|
+
"lon",
|
|
8
|
+
"long",
|
|
9
|
+
"geocodage x gps",
|
|
10
|
+
"location longitude",
|
|
11
|
+
"xlongitude",
|
|
12
|
+
"lng",
|
|
13
|
+
"xlong",
|
|
14
|
+
"x",
|
|
15
|
+
"xf",
|
|
16
|
+
"xd",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _is(val):
    """Tell whether ``val`` is a float-like string between -5.5 and 9.8 inclusive.

    Values rejected by ``is_float`` yield ``False``; so do values for which
    ``float()`` raises ``ValueError`` or ``OverflowError``.
    """
    # NOTE(review): the bounds are presumably the longitude span of
    # metropolitan France; a comma decimal such as "2,5" makes float() raise
    # ValueError and is therefore rejected — confirm both points.
    try:
        # the value is parsed once and compared with a chained comparison
        return is_float(val) and -5.5 <= float(val) <= 9.8
    except (ValueError, OverflowError):
        return False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_test_values = {
|
|
30
|
+
True: ["-2.5"],
|
|
31
|
+
False: ["12.8"],
|
|
32
|
+
}
|