csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from csv_detective.formats.latitude_wgs import _is as is_lat
|
|
2
|
+
from csv_detective.formats.latlon_wgs import SHARED_COORDS_LABELS
|
|
3
|
+
from csv_detective.formats.longitude_wgs import _is as is_lon
|
|
4
|
+
|
|
5
|
+
proportion = 1
|
|
6
|
+
tags = ["geo"]
|
|
7
|
+
|
|
8
|
+
specific = [
|
|
9
|
+
"lonlat",
|
|
10
|
+
"lon lat",
|
|
11
|
+
"y x",
|
|
12
|
+
"yx",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
# we aim wide to catch exact matches if possible for the highest possible score
|
|
16
|
+
words = (
|
|
17
|
+
SHARED_COORDS_LABELS
|
|
18
|
+
+ specific
|
|
19
|
+
+ [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _is(val):
    """Tell whether ``val`` is a "lon,lat" string of WGS coordinates.

    The value must be a string holding exactly one comma. The part before
    the comma is checked with ``is_lon`` and the part after it with
    ``is_lat``. A pair wrapped in square brackets ("[lon,lat]") is accepted:
    the brackets are dropped only when both are present. Spaces are removed
    from the latitude part only.
    """
    if not isinstance(val, str):
        return False
    parts = val.split(",")
    # exactly one comma <=> exactly two parts
    if len(parts) != 2:
        return False
    first, second = parts
    # "[lon,lat]" form: strip the surrounding brackets
    if first.startswith("[") and second.endswith("]"):
        first = first[1:]
        second = second[:-1]
    return is_lon(first) and is_lat(second.replace(" ", ""))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
_test_values = {
|
|
34
|
+
True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
|
|
35
|
+
False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
|
|
36
|
+
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from unidecode import unidecode
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["fr", "temp"]
|
|
5
|
+
labels = ["mois", "month"]
|
|
6
|
+
|
|
7
|
+
mois = {
|
|
8
|
+
"janvier",
|
|
9
|
+
"fevrier",
|
|
10
|
+
"mars",
|
|
11
|
+
"avril",
|
|
12
|
+
"mai",
|
|
13
|
+
"juin",
|
|
14
|
+
"juillet",
|
|
15
|
+
"aout",
|
|
16
|
+
"septembre",
|
|
17
|
+
"octobre",
|
|
18
|
+
"novembre",
|
|
19
|
+
"decembre",
|
|
20
|
+
"jan",
|
|
21
|
+
"fev",
|
|
22
|
+
"mar",
|
|
23
|
+
"avr",
|
|
24
|
+
"mai",
|
|
25
|
+
"jun",
|
|
26
|
+
"jui",
|
|
27
|
+
"juil",
|
|
28
|
+
"aou",
|
|
29
|
+
"sep",
|
|
30
|
+
"sept",
|
|
31
|
+
"oct",
|
|
32
|
+
"nov",
|
|
33
|
+
"dec",
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _is(val):
    """Return True when ``val`` is one of the month names listed in ``mois``.

    The comparison ignores case and accents: the value is lower-cased and
    transliterated with ``unidecode`` before the lookup. Anything that is
    not a string is rejected.
    """
    if isinstance(val, str):
        normalized = unidecode(val.lower())
        return normalized in mois
    return False
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
_test_values = {
|
|
46
|
+
True: ["JUIN", "décembre"],
|
|
47
|
+
False: ["november"],
|
|
48
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from csv_detective.formats.float import _is as is_float
|
|
2
|
+
|
|
3
|
+
proportion = 0.8
|
|
4
|
+
labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
|
|
5
|
+
|
|
6
|
+
currencies = {"€", "$", "£", "¥"}
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _is(val):
    """Return True when ``val`` is a float followed by a currency symbol.

    The last character must be one of ``currencies`` and everything before
    it must be accepted by ``is_float``. Non-string and empty values are
    rejected.
    """
    # An empty string has no last character: reject it explicitly instead of
    # letting val[-1] raise IndexError.
    if not isinstance(val, str) or not val or val[-1] not in currencies:
        return False
    return is_float(val[:-1])
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
_test_values = {
|
|
16
|
+
True: ["120€", "-20.2$"],
|
|
17
|
+
False: ["200", "100 euros"],
|
|
18
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.8
|
|
4
|
+
labels = ["id", "objectid"]
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _is(val):
    """Return True when ``val`` is a string of exactly 24 hexadecimal digits.

    ``re.fullmatch`` is used rather than ``re.match`` with ``$`` because
    ``$`` also matches just before a trailing newline, which would let
    "<24 hex digits>\\n" through.
    """
    return isinstance(val, str) and bool(re.fullmatch(r"[0-9a-fA-F]{24}", val))
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
_test_values = {
|
|
12
|
+
True: ["62320e50f981bc2b57bcc044"],
|
|
13
|
+
False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
|
|
14
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from frformat import Millesime, Options, Pays
|
|
2
|
+
|
|
3
|
+
proportion = 0.6
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"pays",
|
|
7
|
+
"payslieu",
|
|
8
|
+
"paysorg",
|
|
9
|
+
"country",
|
|
10
|
+
"pays lib",
|
|
11
|
+
"lieupays",
|
|
12
|
+
"pays beneficiaire",
|
|
13
|
+
"nom du pays",
|
|
14
|
+
"journey start country",
|
|
15
|
+
"libelle pays",
|
|
16
|
+
"journey end country",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
_options = Options(
|
|
20
|
+
ignore_case=True,
|
|
21
|
+
ignore_accents=True,
|
|
22
|
+
replace_non_alphanumeric_with_space=True,
|
|
23
|
+
ignore_extra_whitespace=True,
|
|
24
|
+
)
|
|
25
|
+
_pays = Pays(Millesime.LATEST, _options)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _is(val):
    """Return whether ``val`` is a string accepted by ``_pays.is_valid``.

    Non-string values are rejected without consulting the validator.
    """
    if not isinstance(val, str):
        return False
    return _pays.is_valid(val)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
_test_values = {
|
|
33
|
+
True: ["france", "italie"],
|
|
34
|
+
False: ["amerique", "paris"],
|
|
35
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from csv_detective.formats.float import _is as is_float
|
|
2
|
+
|
|
3
|
+
proportion = 0.8
|
|
4
|
+
labels = []
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _is(val):
    """Return True when ``val`` is a float followed by a percent sign.

    The string must end with "%" and everything before it must be accepted
    by ``is_float``. Non-string and empty values are rejected.
    """
    # endswith() is simply False on an empty string, whereas val[-1] would
    # raise IndexError.
    if not isinstance(val, str) or not val.endswith("%"):
        return False
    return is_float(val[:-1])
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
_test_values = {
|
|
14
|
+
True: ["120%", "-20.2%"],
|
|
15
|
+
False: ["200", "100 pourcents"],
|
|
16
|
+
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from frformat import Millesime, Options, Region
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["fr", "geo"]
|
|
5
|
+
labels = [
|
|
6
|
+
"region",
|
|
7
|
+
"libelle region",
|
|
8
|
+
"nom region",
|
|
9
|
+
"libelle reg",
|
|
10
|
+
"nom reg",
|
|
11
|
+
"reg libusage",
|
|
12
|
+
"nom de la region",
|
|
13
|
+
"regionorg",
|
|
14
|
+
"regionlieu",
|
|
15
|
+
"reg",
|
|
16
|
+
"nom officiel region",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
_extra_valid_values_set = frozenset(
|
|
20
|
+
{
|
|
21
|
+
"alsace",
|
|
22
|
+
"aquitaine",
|
|
23
|
+
"ara",
|
|
24
|
+
"aura",
|
|
25
|
+
"auvergne",
|
|
26
|
+
"auvergne et rhone alpes",
|
|
27
|
+
"basse normandie",
|
|
28
|
+
"bfc",
|
|
29
|
+
"bourgogne",
|
|
30
|
+
"bourgogne et franche comte",
|
|
31
|
+
"centre",
|
|
32
|
+
"champagne ardenne",
|
|
33
|
+
"franche comte",
|
|
34
|
+
"ge",
|
|
35
|
+
"haute normandie",
|
|
36
|
+
"hdf",
|
|
37
|
+
"languedoc roussillon",
|
|
38
|
+
"limousin",
|
|
39
|
+
"lorraine",
|
|
40
|
+
"midi pyrenees",
|
|
41
|
+
"nord pas de calais",
|
|
42
|
+
"npdc",
|
|
43
|
+
"paca",
|
|
44
|
+
"picardie",
|
|
45
|
+
"poitou charentes",
|
|
46
|
+
"reunion",
|
|
47
|
+
"rhone alpes",
|
|
48
|
+
}
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
_options = Options(
|
|
53
|
+
ignore_case=True,
|
|
54
|
+
ignore_accents=True,
|
|
55
|
+
replace_non_alphanumeric_with_space=True,
|
|
56
|
+
ignore_extra_whitespace=True,
|
|
57
|
+
extra_valid_values=_extra_valid_values_set,
|
|
58
|
+
)
|
|
59
|
+
_region = Region(Millesime.LATEST, _options)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _is(val):
    """Match against region names.

    Returns False for non-string values, otherwise whatever
    ``_region.is_valid`` returns for ``val``.
    """
    if not isinstance(val, str):
        return False
    return _region.is_valid(val)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
_test_values = {
|
|
68
|
+
True: ["bretagne", "ile-de-france"],
|
|
69
|
+
False: ["baviere", "overgne"],
|
|
70
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from csv_detective.parsing.text import _process_text
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
tags = ["fr"]
|
|
5
|
+
labels = ["sexe", "sex", "civilite", "genre", "id sexe"]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _is(val):
    """Return True when ``val``, once passed through ``_process_text``, is one
    of the accepted gender words or initials.

    Non-string values are rejected.
    """
    if isinstance(val, str):
        cleaned = _process_text(val)
        return cleaned in {"homme", "femme", "h", "f", "m", "masculin", "feminin"}
    return False
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
_test_values = {
|
|
15
|
+
True: ["femme", "H"],
|
|
16
|
+
False: ["adulte"],
|
|
17
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.9
|
|
4
|
+
tags = ["fr"]
|
|
5
|
+
labels = [
|
|
6
|
+
"siren",
|
|
7
|
+
"siren organisme designe",
|
|
8
|
+
"siren organisme designant",
|
|
9
|
+
"n° siren",
|
|
10
|
+
"siren organisme",
|
|
11
|
+
"siren titulaire",
|
|
12
|
+
"numero siren",
|
|
13
|
+
"epci",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is(val):
    """Detect SIREN codes.

    A value is accepted when, once spaces are removed, it consists of
    exactly 9 ASCII digits whose Luhn-style checksum is a multiple of 10.
    Non-string values are rejected.
    """
    if not isinstance(val, str):
        return False
    val = val.replace(" ", "")
    # fullmatch (rather than match with "$") also rejects a trailing newline,
    # which would otherwise reach int() below and raise ValueError.
    if not re.fullmatch(r"[0-9]{9}", val):
        return False
    # Checksum specific to SIREN codes: every second digit (starting with the
    # second one) is doubled, and the digits of each product are summed.
    cle = 0
    for position, digit in enumerate(val):
        y = int(digit) * (2 if position % 2 else 1)
        cle += y // 10 + y % 10
    return cle % 10 == 0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
_test_values = {
|
|
35
|
+
True: ["552 100 554", "552100554"],
|
|
36
|
+
False: ["42"],
|
|
37
|
+
}
|
|
@@ -1,29 +1,47 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.8
|
|
4
|
+
tags = ["fr"]
|
|
5
|
+
labels = [
|
|
6
|
+
"siret",
|
|
7
|
+
"siret d",
|
|
8
|
+
"num siret",
|
|
9
|
+
"siretacheteur",
|
|
10
|
+
"n° siret",
|
|
11
|
+
"coll siret",
|
|
12
|
+
"epci",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _luhn_is_valid(digits, double_first):
    """Return True when the Luhn sum of ``digits`` is a multiple of 10.

    ``double_first`` tells whether the first digit is doubled; doubling then
    alternates from one digit to the next.
    """
    total = 0
    double = double_first
    for char in digits:
        product = int(char) * (2 if double else 1)
        # add the digits of the product (at most two digits)
        total += product // 10 + product % 10
        double = not double
    return total % 10 == 0


def _is(val):
    """Detect SIRET identifiers (SIRENE).

    A value is accepted when, once spaces are removed, it consists of
    exactly 14 ASCII digits, its first 9 digits (the SIREN part) pass the
    Luhn check, and the full 14 digits pass it as well. Non-string values
    are rejected.
    """
    if not isinstance(val, str):
        return False
    val = val.replace(" ", "")
    # fullmatch (rather than match with "$") also rejects a trailing newline,
    # which would otherwise reach int() in the checksum and raise ValueError.
    if not re.fullmatch(r"[0-9]{14}", val):
        return False

    # Luhn check of the SIREN part: doubling starts on the second digit.
    if not _luhn_is_valid(val[:9], False):
        return False

    # Luhn check of the whole SIRET: the length is even (14), so doubling
    # starts on the first digit.
    return _luhn_is_valid(val, True)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
_test_values = {
|
|
45
|
+
True: ["13002526500013", "130 025 265 00013"],
|
|
46
|
+
False: ["13002526500012"],
|
|
47
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.7
|
|
4
|
+
tags = ["fr"]
|
|
5
|
+
labels = [
|
|
6
|
+
"telephone",
|
|
7
|
+
"tel",
|
|
8
|
+
"tel1",
|
|
9
|
+
"tel2",
|
|
10
|
+
"phone",
|
|
11
|
+
"num tel",
|
|
12
|
+
"tel mob",
|
|
13
|
+
"telephone sav",
|
|
14
|
+
"telephone1",
|
|
15
|
+
"coordinates.phone",
|
|
16
|
+
"telephone du lieu",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _is(val):
    """Return True when ``val`` looks like a phone number in the expected form.

    The raw string must be at least 10 characters long (checked before any
    clean-up). Dots, dashes and spaces are then removed and the remainder
    must be an optional "0", "+33" or "0033" prefix followed by 9 digits.
    Non-string values are rejected.
    """
    if not isinstance(val, str) or len(val) < 10:
        return False

    digits = val
    for separator in (".", "-", " "):
        digits = digits.replace(separator, "")

    return re.match(r"^(0|\+33|0033)?[0-9]{9}$", digits) is not None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
_test_values = {
|
|
34
|
+
True: ["0134643467"],
|
|
35
|
+
False: ["6625388263", "01288398"],
|
|
36
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.8
|
|
4
|
+
tags = ["fr"]
|
|
5
|
+
labels = [
|
|
6
|
+
"uai",
|
|
7
|
+
"code etablissement",
|
|
8
|
+
"code uai",
|
|
9
|
+
"uai - identifiant",
|
|
10
|
+
"numero uai",
|
|
11
|
+
"rne",
|
|
12
|
+
"numero de l'etablissement",
|
|
13
|
+
"code rne",
|
|
14
|
+
"codeetab",
|
|
15
|
+
"code uai de l'etablissement",
|
|
16
|
+
"ref uai",
|
|
17
|
+
"cd rne",
|
|
18
|
+
"numerouai",
|
|
19
|
+
"numero d etablissement",
|
|
20
|
+
"code etablissement",
|
|
21
|
+
"numero etablissement",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _is(val):
    """Return True when ``val`` is an 8-character string of the UAI form.

    The expected form is a 3-digit prefix restricted by the pattern below
    (000-095, 970-989, 620 or 720), 4 digits, then one uppercase ASCII
    letter. Non-string values and strings of any other length are rejected.
    """
    if not isinstance(val, str):
        return False
    if len(val) != 8:
        return False
    found = re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val)
    return found is not None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
_test_values = {
|
|
34
|
+
True: ["0422170F"],
|
|
35
|
+
False: ["04292E"],
|
|
36
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 1
|
|
4
|
+
labels = [
|
|
5
|
+
"url",
|
|
6
|
+
"url source",
|
|
7
|
+
"site web",
|
|
8
|
+
"source url",
|
|
9
|
+
"site internet",
|
|
10
|
+
"remote url",
|
|
11
|
+
"web",
|
|
12
|
+
"site",
|
|
13
|
+
"lien",
|
|
14
|
+
"site data",
|
|
15
|
+
"lien url",
|
|
16
|
+
"lien vers le fichier",
|
|
17
|
+
"sitweb",
|
|
18
|
+
"interneturl",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
# Compiled once at import time. First line: scheme ("http://", "https://",
# "ftp://") or a bare "www." prefix, then dot-separated host labels ending in
# a 2-6 letter suffix. Second line: optional path/query part starting with
# "/", allowing accented Latin letters and whitespace.
pattern = re.compile(
    r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
    r"(/[A-Za-z\u00C0-\u024F\u1E00-\u1EFF0-9\s._~:/?#[@!$&'()*+,;=%-]*)?$"
)


def _is(val):
    """Return True when ``val`` is a string matched by ``pattern``.

    Non-string values are rejected.
    """
    return isinstance(val, str) and pattern.match(val) is not None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
_test_values = {
|
|
34
|
+
True: [
|
|
35
|
+
"www.data.gouv.fr",
|
|
36
|
+
"http://data.gouv.fr",
|
|
37
|
+
"https://www.youtube.com/@data-gouv-fr",
|
|
38
|
+
(
|
|
39
|
+
"https://tabular-api.data.gouv.fr/api/resources/"
|
|
40
|
+
"aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
|
|
41
|
+
"?score__greater=0.9&decompte__exact=13"
|
|
42
|
+
),
|
|
43
|
+
"https://une-ville.fr/délibérations/2025/Doc avec espaces et àccëñts.pdf",
|
|
44
|
+
],
|
|
45
|
+
False: ["tmp@data.gouv.fr"],
|
|
46
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
proportion = 0.8
|
|
4
|
+
labels = ["id", "identifiant"]
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _is(val) -> bool:
    """Return True when ``val`` is a string shaped like a UUID.

    Accepted shape: 32 hexadecimal digits grouped 8-4-4-4-12, where each
    dash is optional, optionally preceded by "{" and/or followed by "}"
    (the braces are not required to be balanced).

    ``re.fullmatch`` is used rather than ``re.match`` with ``$`` because
    ``$`` also matches just before a trailing newline.
    """
    return isinstance(val, str) and bool(
        re.fullmatch(r"[{]?[0-9a-fA-F]{8}-?([0-9a-fA-F]{4}-?){3}[0-9a-fA-F]{12}[}]?", val)
    )
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
_test_values = {
|
|
14
|
+
True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
|
|
15
|
+
False: ["0610928327"],
|
|
16
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
proportion = 1
|
|
2
|
+
tags = ["temp"]
|
|
3
|
+
labels = [
|
|
4
|
+
"year",
|
|
5
|
+
"annee",
|
|
6
|
+
"annee depot",
|
|
7
|
+
"an nais",
|
|
8
|
+
"exercice",
|
|
9
|
+
"data year",
|
|
10
|
+
"annee de publication",
|
|
11
|
+
"exercice comptable",
|
|
12
|
+
"annee de naissance",
|
|
13
|
+
"annee ouverture",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is(val):
    """Return True when ``val`` converts to an integer between 1800 and 2100.

    The conversion is done with ``int()``; both bounds are inclusive. Values
    that ``int()`` cannot convert are rejected rather than raising.
    """
    try:
        year = int(val)
    # TypeError: None and other non-numeric objects; OverflowError: infinite
    # floats. Both used to escape the ValueError-only handler.
    except (TypeError, ValueError, OverflowError):
        return False
    return 1800 <= year <= 2100
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
_test_values = {
|
|
26
|
+
True: ["2015"],
|
|
27
|
+
False: ["20166", "123"],
|
|
28
|
+
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from typing import Iterator
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from csv_detective.output.dataframe import cast_df_chunks
|
|
8
|
+
from csv_detective.output.profile import create_profile
|
|
9
|
+
from csv_detective.output.schema import generate_table_schema
|
|
10
|
+
from csv_detective.utils import is_url
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def generate_output(
|
|
14
|
+
table: pd.DataFrame,
|
|
15
|
+
analysis: dict,
|
|
16
|
+
file_path: str,
|
|
17
|
+
num_rows: int = 500,
|
|
18
|
+
limited_output: bool = True,
|
|
19
|
+
save_results: bool | str = True,
|
|
20
|
+
output_profile: bool = False,
|
|
21
|
+
output_schema: bool = False,
|
|
22
|
+
output_df: bool = False,
|
|
23
|
+
cast_json: bool = True,
|
|
24
|
+
verbose: bool = False,
|
|
25
|
+
sheet_name: str | int | None = None,
|
|
26
|
+
_col_values: dict[str, pd.Series] | None = None,
|
|
27
|
+
) -> dict | tuple[dict, Iterator[pd.DataFrame]]:
|
|
28
|
+
if output_profile:
|
|
29
|
+
analysis["profile"] = create_profile(
|
|
30
|
+
table=table,
|
|
31
|
+
columns=analysis["columns"],
|
|
32
|
+
num_rows=num_rows,
|
|
33
|
+
limited_output=limited_output,
|
|
34
|
+
cast_json=cast_json,
|
|
35
|
+
verbose=verbose,
|
|
36
|
+
_col_values=_col_values,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
if save_results:
|
|
40
|
+
if isinstance(save_results, str):
|
|
41
|
+
output_path = save_results
|
|
42
|
+
else:
|
|
43
|
+
output_path = os.path.splitext(file_path)[0]
|
|
44
|
+
if is_url(output_path):
|
|
45
|
+
output_path = output_path.split("/")[-1]
|
|
46
|
+
if analysis.get("sheet_name"):
|
|
47
|
+
output_path += "_sheet-" + str(sheet_name)
|
|
48
|
+
output_path += ".json"
|
|
49
|
+
with open(output_path, "w", encoding="utf8") as fp:
|
|
50
|
+
json.dump(
|
|
51
|
+
analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
if output_schema:
|
|
55
|
+
analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
|
|
56
|
+
|
|
57
|
+
if output_df:
|
|
58
|
+
return analysis, cast_df_chunks(
|
|
59
|
+
df=table,
|
|
60
|
+
analysis=analysis,
|
|
61
|
+
file_path=file_path,
|
|
62
|
+
cast_json=cast_json,
|
|
63
|
+
verbose=verbose,
|
|
64
|
+
)
|
|
65
|
+
return analysis
|