csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +12 -15
- csv_detective/explore_csv.py +28 -9
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
- csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
- csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
- csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
- csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
- csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
- csv_detective/formats/geojson.py +36 -0
- csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
- csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
- csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
- csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
- csv_detective/formats/sexe.py +17 -0
- csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +45 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +3 -4
- csv_detective/output/dataframe.py +3 -3
- csv_detective/output/profile.py +2 -3
- csv_detective/output/schema.py +2 -2
- csv_detective/parsing/columns.py +35 -50
- csv_detective/parsing/csv.py +2 -2
- csv_detective/parsing/load.py +4 -5
- csv_detective/validate.py +9 -4
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/METADATA +6 -5
- csv_detective-0.9.3.dev2348.dist-info/RECORD +102 -0
- tests/test_fields.py +39 -364
- tests/test_file.py +1 -1
- tests/test_labels.py +5 -3
- tests/test_structure.py +40 -36
- csv_detective/detect_fields/FR/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/__init__.py +0 -112
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/email/__init__.py +0 -10
- csv_detective/detect_fields/other/money/__init__.py +0 -11
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/percent/__init__.py +0 -9
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -14
- csv_detective/detect_fields/other/uuid/__init__.py +0 -10
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
- csv_detective/detect_labels/__init__.py +0 -94
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -8
- csv_detective/detect_labels/other/email/__init__.py +0 -20
- csv_detective/detect_labels/other/float/__init__.py +0 -8
- csv_detective/detect_labels/other/int/__init__.py +0 -8
- csv_detective/detect_labels/other/money/__init__.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_labels/other/twitter/__init__.py +0 -8
- csv_detective/detect_labels/other/url/__init__.py +0 -23
- csv_detective/detect_labels/other/uuid/__init__.py +0 -8
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -28
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
- csv_detective/detect_labels/temp/year/__init__.py +0 -19
- csv_detective/load_tests.py +0 -59
- csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from frformat import CodeFantoir
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"cadastre1",
|
|
7
|
+
"code fantoir",
|
|
8
|
+
"fantoir",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
_code_fantoir = CodeFantoir()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _is(val):
    """Tell whether `val` is a string accepted by the FANTOIR code validator.

    Non-string values are rejected up front; strings are handed to the
    module-level `_code_fantoir` instance (a `frformat.CodeFantoir`) and
    its `is_valid` result is returned as is.
    """
    if not isinstance(val, str):
        return False
    return _code_fantoir.is_valid(val)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
_test_values = {
|
|
19
|
+
True: ["7755A", "B150B", "ZA04C", "ZB03D"],
|
|
20
|
+
False: ["7755", "ZA99A"],
|
|
21
|
+
}
|
|
@@ -1,9 +1,17 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
tags = ["fr"]
|
|
5
|
+
labels = ["code"]
|
|
6
|
+
|
|
7
|
+
# Pattern a value must match from its start to be flagged as an import code.
regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$"


def _is(val):
    """Tell whether `val` is a string matching the module-level `regex`."""
    if not isinstance(val, str):
        return False
    # a match object is always truthy, so "is not None" equals bool(match)
    return re.match(regex, val) is not None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
_test_values = {
|
|
15
|
+
True: ["123S1871092288"],
|
|
16
|
+
False: ["AA751PEE00188854", "W123456789"],
|
|
17
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from frformat import CodePostal
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"code postal",
|
|
7
|
+
"postal code",
|
|
8
|
+
"postcode",
|
|
9
|
+
"post code",
|
|
10
|
+
"cp",
|
|
11
|
+
"codes postaux",
|
|
12
|
+
"location postcode",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
_code_postal = CodePostal()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _is(val):
    """Tell whether `val` is a string accepted by the postal code validator.

    Only strings are forwarded to the module-level `_code_postal` instance
    (a `frformat.CodePostal`); anything else yields False.
    """
    if not isinstance(val, str):
        return False
    return _code_postal.is_valid(val)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
_test_values = {
|
|
23
|
+
True: ["75020", "01000"],
|
|
24
|
+
False: ["77777", "018339"],
|
|
25
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from frformat import CodeRegion, Millesime
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"code region",
|
|
7
|
+
"reg",
|
|
8
|
+
"code insee region",
|
|
9
|
+
"region",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
_code_region = CodeRegion(Millesime.LATEST)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _is(val):
    """Tell whether `val` is a string accepted by the region code validator.

    Strings are checked with the module-level `_code_region` instance
    (a `frformat.CodeRegion` built for `Millesime.LATEST`); non-strings
    yield False without calling it.
    """
    if not isinstance(val, str):
        return False
    return _code_region.is_valid(val)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
_test_values = {
|
|
20
|
+
True: ["32"],
|
|
21
|
+
False: ["55"],
|
|
22
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from frformat import CodeRNA
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
tags = ["fr"]
|
|
5
|
+
labels = [
|
|
6
|
+
"code rna",
|
|
7
|
+
"rna",
|
|
8
|
+
"n° inscription association",
|
|
9
|
+
"identifiant association",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
_code_rna = CodeRNA()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _is(val):
    """Tell whether `val` is a string accepted by the RNA code validator.

    Non-string input is rejected directly; strings are validated by the
    module-level `_code_rna` instance (a `frformat.CodeRNA`).
    """
    if not isinstance(val, str):
        return False
    return _code_rna.is_valid(val)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
_test_values = {
|
|
20
|
+
True: ["W751515517"],
|
|
21
|
+
False: [
|
|
22
|
+
"W111111111111111111111111111111111111",
|
|
23
|
+
"w143788974",
|
|
24
|
+
"W12",
|
|
25
|
+
"678W23456",
|
|
26
|
+
"165789325",
|
|
27
|
+
"Wa1#89sf&h",
|
|
28
|
+
],
|
|
29
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
tags = ["fr"]
|
|
5
|
+
labels = ["code waldec", "waldec"]
|
|
6
|
+
|
|
7
|
+
# "W", one digit, one digit or uppercase letter, then exactly seven digits.
regex = r"^W\d[\dA-Z]\d{7}$"


def _is(val):
    """Tell whether `val` is a string matching the module-level `regex`."""
    if not isinstance(val, str):
        return False
    # a match object is always truthy, so "is not None" equals bool(match)
    return re.match(regex, val) is not None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
_test_values = {
|
|
15
|
+
True: ["W123456789", "W2D1234567"],
|
|
16
|
+
False: ["AA751PEE00188854"],
|
|
17
|
+
}
|
|
@@ -1,16 +1,27 @@
|
|
|
1
|
-
from frformat import Commune, Millesime, Options
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
1
|
+
from frformat import Commune, Millesime, Options
|
|
2
|
+
|
|
3
|
+
proportion = 0.8
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"commune",
|
|
7
|
+
"ville",
|
|
8
|
+
"libelle commune",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
_options = Options(
|
|
12
|
+
ignore_case=True,
|
|
13
|
+
ignore_accents=True,
|
|
14
|
+
replace_non_alphanumeric_with_space=True,
|
|
15
|
+
ignore_extra_whitespace=True,
|
|
16
|
+
)
|
|
17
|
+
_commune = Commune(Millesime.LATEST, _options)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _is(val):
    """Tell whether `val` is a string accepted by the commune name validator.

    Strings are passed to the module-level `_commune` instance (a
    `frformat.Commune` configured with `_options`); other types yield False.
    """
    if not isinstance(val, str):
        return False
    return _commune.is_valid(val)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
_test_values = {
|
|
25
|
+
True: ["saint denis"],
|
|
26
|
+
False: ["new york", "lion"],
|
|
27
|
+
}
|
|
@@ -1,19 +1,31 @@
|
|
|
1
|
-
from os.path import dirname, join
|
|
2
|
-
|
|
3
|
-
from csv_detective.parsing.text import _process_text
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
1
|
+
from os.path import dirname, join
|
|
2
|
+
|
|
3
|
+
from csv_detective.parsing.text import _process_text
|
|
4
|
+
|
|
5
|
+
proportion = 1
|
|
6
|
+
tags = ["fr"]
|
|
7
|
+
labels = [
|
|
8
|
+
"csp insee",
|
|
9
|
+
"csp",
|
|
10
|
+
"categorie socioprofessionnelle",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
# Load the reference values shipped in data/csp_insee.txt, one entry per line.
# The context manager closes the handle even if reading raises, which the
# previous open()/close() pair did not guarantee.
with open(join(dirname(__file__), "data", "csp_insee.txt"), "r") as f:
    codes_insee = f.read().split("\n")
# removing empty str due to additional line in file
# NOTE(review): this assumes the file ends with a newline - confirm
del codes_insee[-1]
# a set gives constant-time membership tests in `_is`
codes_insee = set(codes_insee)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _is(val):
    """Tell whether `val` is a string whose processed form is in `codes_insee`.

    The value goes through `_process_text` before the membership test;
    non-string input yields False without any processing.
    """
    return isinstance(val, str) and _process_text(val) in codes_insee
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_test_values = {
|
|
29
|
+
True: ["employes de la poste"],
|
|
30
|
+
False: ["super-heros"],
|
|
31
|
+
}
|
|
File without changes
|
|
@@ -1,62 +1,99 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from datetime import datetime
|
|
3
|
-
|
|
4
|
-
from dateparser import parse as date_parser
|
|
5
|
-
from dateutil.parser import ParserError
|
|
6
|
-
from dateutil.parser import parse as dateutil_parser
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
""
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
"
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
threshold = 0.3
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
""
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
if not
|
|
61
|
-
return False
|
|
62
|
-
|
|
1
|
+
import re
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
|
|
4
|
+
from dateparser import parse as date_parser
|
|
5
|
+
from dateutil.parser import ParserError
|
|
6
|
+
from dateutil.parser import parse as dateutil_parser
|
|
7
|
+
|
|
8
|
+
proportion = 1
|
|
9
|
+
tags = ["temp", "type"]
|
|
10
|
+
SHARED_DATE_LABELS = [
|
|
11
|
+
"date",
|
|
12
|
+
"mise à jour",
|
|
13
|
+
"modifie",
|
|
14
|
+
"maj",
|
|
15
|
+
"datemaj",
|
|
16
|
+
"update",
|
|
17
|
+
"created",
|
|
18
|
+
"modified",
|
|
19
|
+
]
|
|
20
|
+
labels = SHARED_DATE_LABELS + [
|
|
21
|
+
"jour",
|
|
22
|
+
"periode",
|
|
23
|
+
"dpc",
|
|
24
|
+
"yyyymmdd",
|
|
25
|
+
"aaaammjj",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def date_casting(val: str) -> datetime | None:
    """Parse `val` with dateutil first and fall back on dateparser.

    dateutil is tried first for performance reasons. dateparser is only
    called when dateutil raises `ParserError`; any other exception raised
    by dateutil yields None. Exceptions raised by the dateparser fallback
    itself are not caught here.
    """
    try:
        parsed = dateutil_parser(val)
    except ParserError:
        parsed = date_parser(val)
    except Exception:
        parsed = None
    return parsed
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# minimal share of digit characters a value needs before the parsers are tried
threshold = 0.3
# character class of the separators accepted between date parts
seps = r"[\s/\-\*_\|;.,]"
# matches DD-MM-YYYY with any of the listed separators
jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
    "SEP", seps
)
# matches YYYY-MM-DD with any of the listed separators OR NO SEPARATOR
aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
    "SEP", seps + "?"
)
# matches DD-mmm-YYYY and DD-mmm...mm-YYYY with any of the listed separators OR NO SEPARATOR
# (only the uppercase "SEP" placeholder is substituted; the lowercase "sep" and
# "septembre" month alternatives are left untouched)
string_month_pattern = (
    r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
    r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
    # "juillet" was previously misspelled "jullet", so July written in full
    # never matched and had to go through the slower parser fallback
    r"mai|juin|juillet|aout|septembre|octobre|novembre|decembre)SEP"
    r"([0-9]{2}$|(19|20)[0-9]{2}$)"
).replace("SEP", seps + "?")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _is(val):
    """Tell whether `val` looks like a date without a time of day.

    Returns False for non-strings and for strings shorter than 8 or longer
    than 20 characters. Returns True as soon as one of the module-level
    date regexes matches. Otherwise the value is parsed with `date_casting`,
    provided its share of digits reaches `threshold`, and is accepted only
    if the result is truthy and has no hour, minute or second set.
    """
    # early stops, to cut processing time
    if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
        return False
    # usual date patterns: `or` short-circuits, so the later regexes are only
    # evaluated when the earlier ones did not match
    if (
        re.match(jjmmaaaa_pattern, val)
        or re.match(aaaammjj_pattern, val)
        or re.match(string_month_pattern, val, re.IGNORECASE)
    ):
        return True
    # too few digits to be a date: skip the parsers
    if sum(char.isdigit() for char in val) / len(val) < threshold:
        return False
    res = date_casting(val)
    # a parsed value with a time of day set is not treated as a plain date
    if not res or res.hour or res.minute or res.second:
        return False
    return True
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
_test_values = {
|
|
81
|
+
True: [
|
|
82
|
+
"1960-08-07",
|
|
83
|
+
"12/02/2007",
|
|
84
|
+
"15 jan 1985",
|
|
85
|
+
"15 décembre 1985",
|
|
86
|
+
"02 05 2003",
|
|
87
|
+
"20030502",
|
|
88
|
+
"1993-12/02",
|
|
89
|
+
],
|
|
90
|
+
False: [
|
|
91
|
+
"1993-1993-1993",
|
|
92
|
+
"39-10-1993",
|
|
93
|
+
"19-15-1993",
|
|
94
|
+
"15 tambour 1985",
|
|
95
|
+
"12152003",
|
|
96
|
+
"20031512",
|
|
97
|
+
"02052003",
|
|
98
|
+
],
|
|
99
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from csv_detective.parsing.text import _process_text
|
|
4
|
+
|
|
5
|
+
proportion = 1
|
|
6
|
+
tags = ["fr", "temp"]
|
|
7
|
+
labels = ["date"]
|
|
8
|
+
|
|
9
|
+
pattern = (
|
|
10
|
+
r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"
|
|
11
|
+
r"|octobre|novembre|decembre)[ \-/]\d{4}$"
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _is(val):
    """Tell whether `val` is a string whose processed form matches `pattern`.

    The value goes through `_process_text` before being matched against the
    module-level `pattern`; non-string input yields False.
    """
    if not isinstance(val, str):
        return False
    # a match object is always truthy, so "is not None" equals bool(match)
    return re.match(pattern, _process_text(val)) is not None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
_test_values = {
|
|
20
|
+
True: ["13 février 1996", "15 decembre 2024"],
|
|
21
|
+
False: ["44 march 2025"],
|
|
22
|
+
}
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from typing import Any
|
|
3
2
|
|
|
4
|
-
from csv_detective.
|
|
3
|
+
from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting
|
|
5
4
|
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
proportion = 1
|
|
6
|
+
tags = ["temp", "type"]
|
|
7
|
+
labels = SHARED_DATE_LABELS + ["datetime", "timestamp"]
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
threshold = 0.7
|
|
10
10
|
pat = (
|
|
11
11
|
aaaammjj_pattern.replace("$", "")
|
|
12
12
|
+ r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
|
|
@@ -14,8 +14,7 @@ pat = (
|
|
|
14
14
|
)
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
def _is(val
|
|
18
|
-
"""Detects timezone-aware datetimes only"""
|
|
17
|
+
def _is(val):
|
|
19
18
|
# early stops, to cut processing time
|
|
20
19
|
# 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
|
|
21
20
|
# 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
|
|
@@ -32,3 +31,15 @@ def _is(val: Any | None) -> bool:
|
|
|
32
31
|
and bool(res.hour or res.minute or res.second or res.microsecond)
|
|
33
32
|
and bool(res.tzinfo)
|
|
34
33
|
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
_test_values = {
|
|
37
|
+
True: [
|
|
38
|
+
"2021-06-22 10:20:10-04:00",
|
|
39
|
+
"2030-06-22 00:00:00.0028+02:00",
|
|
40
|
+
"2000-12-21 10:20:10.1Z",
|
|
41
|
+
"2024-12-19T10:53:36.428000+00:00",
|
|
42
|
+
"1996/06/22 10:20:10 GMT",
|
|
43
|
+
],
|
|
44
|
+
False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
|
|
45
|
+
}
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from typing import Any
|
|
3
3
|
|
|
4
|
-
from csv_detective.
|
|
4
|
+
from csv_detective.formats.date import aaaammjj_pattern, date_casting
|
|
5
|
+
from csv_detective.formats.datetime_aware import labels # noqa
|
|
5
6
|
|
|
6
|
-
|
|
7
|
+
proportion = 1
|
|
8
|
+
tags = ["temp", "type"]
|
|
7
9
|
threshold = 0.7
|
|
8
10
|
|
|
9
11
|
# matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
|
|
@@ -27,3 +29,20 @@ def _is(val: Any | None) -> bool:
|
|
|
27
29
|
return False
|
|
28
30
|
res = date_casting(val)
|
|
29
31
|
return res is not None and not bool(res.tzinfo)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
_test_values = {
|
|
35
|
+
True: [
|
|
36
|
+
"2021-06-22 10:20:10",
|
|
37
|
+
"2030/06-22 00:00:00",
|
|
38
|
+
"2030/06/22 00:00:00.0028",
|
|
39
|
+
],
|
|
40
|
+
False: [
|
|
41
|
+
"2021-06-22T30:20:10",
|
|
42
|
+
"Sun, 06 Nov 1994 08:49:37 GMT",
|
|
43
|
+
"2021-06-44 10:20:10+02:00",
|
|
44
|
+
"1999-12-01T00:00:00Z",
|
|
45
|
+
"2021-06-44",
|
|
46
|
+
"15 décembre 1985",
|
|
47
|
+
],
|
|
48
|
+
}
|
|
@@ -1,18 +1,24 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
return isinstance(val, str) and bool(
|
|
11
|
-
re.match(
|
|
12
|
-
r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
|
|
13
|
-
r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
|
|
14
|
-
r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
|
|
15
|
-
val.lower(),
|
|
16
|
-
re.IGNORECASE,
|
|
17
|
-
)
|
|
18
|
-
)
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from csv_detective.formats.datetime_aware import labels # noqa
|
|
4
|
+
|
|
5
|
+
proportion = 1
|
|
6
|
+
tags = ["temp", "type"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _is(val):
    """Tell whether `val` is a string shaped like an RFC 822 datetime.

    Expected shape: "Ddd, DD Mmm YYYY HH:MM:SS ZZZ", where ZZZ is one of the
    listed zone names or a whole-hour numeric offset from +/-0000 to
    +/-1300. Non-string input yields False.
    """
    return isinstance(val, str) and bool(
        re.match(
            r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
            # hours are limited to 00-23; the previous "([0-2])([0-9])" also
            # accepted the invalid hours 24 to 29
            r"([01][0-9]|2[0-3]):([0-5])([0-9]):([0-5])([0-9]) "
            r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
            val.lower(),
            re.IGNORECASE,
        )
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
_test_values = {
|
|
22
|
+
True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
|
|
23
|
+
False: ["2021-06-22T10:20:10"],
|
|
24
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from frformat import Departement, Millesime, Options
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"departement",
|
|
7
|
+
"libelle du departement",
|
|
8
|
+
"deplib",
|
|
9
|
+
"nom dept",
|
|
10
|
+
"dept",
|
|
11
|
+
"libdepartement",
|
|
12
|
+
"nom departement",
|
|
13
|
+
"libelle dep",
|
|
14
|
+
"libelle departement",
|
|
15
|
+
"lb departements",
|
|
16
|
+
"dep libusage",
|
|
17
|
+
"lb departement",
|
|
18
|
+
"nom dep",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
_options = Options(
|
|
22
|
+
ignore_case=True,
|
|
23
|
+
ignore_accents=True,
|
|
24
|
+
replace_non_alphanumeric_with_space=True,
|
|
25
|
+
ignore_extra_whitespace=True,
|
|
26
|
+
)
|
|
27
|
+
_departement = Departement(Millesime.LATEST, _options)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _is(val):
    """Tell whether `val` is a string accepted by the departement name validator.

    Strings are validated by the module-level `_departement` instance (a
    `frformat.Departement` configured with `_options`); other types yield
    False.
    """
    if not isinstance(val, str):
        return False
    return _departement.is_valid(val)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
_test_values = {
|
|
35
|
+
True: ["essonne"],
|
|
36
|
+
False: ["alabama", "auvergne"],
|
|
37
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
labels = [
|
|
5
|
+
"email",
|
|
6
|
+
"mail",
|
|
7
|
+
"courriel",
|
|
8
|
+
"contact",
|
|
9
|
+
"mel",
|
|
10
|
+
"lieucourriel",
|
|
11
|
+
"coordinates.emailcontact",
|
|
12
|
+
"e mail",
|
|
13
|
+
"mo mail",
|
|
14
|
+
"adresse mail",
|
|
15
|
+
"adresse email",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _is(val):
    """Tell whether `val` is a string shaped like an email address.

    The check is a case-insensitive regex: a local part, a single "@", a
    domain, and a final dot followed by at least two letters. Non-string
    input yields False.
    """
    if not isinstance(val, str):
        return False
    found = re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
    # a match object is always truthy, so "is not None" equals bool(match)
    return found is not None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
_test_values = {
|
|
26
|
+
True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
|
|
27
|
+
False: ["cdo@@gouv.sfd"],
|
|
28
|
+
}
|
|
@@ -1,21 +1,29 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return
|
|
1
|
+
proportion = 1
|
|
2
|
+
tags = ["type"]
|
|
3
|
+
labels = ["part", "ratio", "taux"]
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def float_casting(val: str) -> float:
    """Convert `val` to a float after replacing every "," with ".".

    Raises ValueError when the resulting text is not accepted by `float`.
    """
    normalized = val.replace(",", ".")
    return float(normalized)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _is(val):
    """Detect floats, assuming that tables will not have scientific
    notations (3e6) or "+" in the string. "-" is still accepted.

    Non-strings are rejected, as are values containing "_", "+", "e" or "E"
    and values starting with "0" unless that zero stands alone or is
    directly followed by "." or ",". Anything left is accepted when
    `float_casting` converts it without raising ValueError.
    """
    if not isinstance(val, str):
        return False
    if any(marker in val for marker in ("_", "+", "e", "E")):
        return False
    # a leading zero is only accepted alone or right before a decimal mark
    if val.startswith("0") and len(val) > 1 and val[1] not in (".", ","):
        return False
    try:
        float_casting(val)
    except ValueError:
        return False
    return True
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
_test_values = {
|
|
27
|
+
True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
|
|
28
|
+
False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
|
|
29
|
+
}
|