PyPI - csv-detective - Versions diffs - 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl - Mend

csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228) hide show

csv_detective/__init__.py +7 -1
csv_detective/cli.py +33 -21
csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
csv_detective/detection/columns.py +89 -0
csv_detective/detection/encoding.py +29 -0
csv_detective/detection/engine.py +46 -0
csv_detective/detection/formats.py +156 -0
csv_detective/detection/headers.py +28 -0
csv_detective/detection/rows.py +18 -0
csv_detective/detection/separator.py +44 -0
csv_detective/detection/variables.py +97 -0
csv_detective/explore_csv.py +151 -377
csv_detective/format.py +67 -0
csv_detective/formats/__init__.py +9 -0
csv_detective/formats/adresse.py +116 -0
csv_detective/formats/binary.py +26 -0
csv_detective/formats/booleen.py +35 -0
csv_detective/formats/code_commune_insee.py +26 -0
csv_detective/formats/code_csp_insee.py +36 -0
csv_detective/formats/code_departement.py +29 -0
csv_detective/formats/code_fantoir.py +21 -0
csv_detective/formats/code_import.py +17 -0
csv_detective/formats/code_postal.py +25 -0
csv_detective/formats/code_region.py +22 -0
csv_detective/formats/code_rna.py +29 -0
csv_detective/formats/code_waldec.py +17 -0
csv_detective/formats/commune.py +27 -0
csv_detective/formats/csp_insee.py +31 -0
csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
csv_detective/formats/date.py +99 -0
csv_detective/formats/date_fr.py +22 -0
csv_detective/formats/datetime_aware.py +45 -0
csv_detective/formats/datetime_naive.py +48 -0
csv_detective/formats/datetime_rfc822.py +24 -0
csv_detective/formats/departement.py +37 -0
csv_detective/formats/email.py +28 -0
csv_detective/formats/float.py +29 -0
csv_detective/formats/geojson.py +36 -0
csv_detective/formats/insee_ape700.py +31 -0
csv_detective/formats/insee_canton.py +28 -0
csv_detective/formats/int.py +23 -0
csv_detective/formats/iso_country_code_alpha2.py +30 -0
csv_detective/formats/iso_country_code_alpha3.py +30 -0
csv_detective/formats/iso_country_code_numeric.py +31 -0
csv_detective/formats/jour_de_la_semaine.py +41 -0
csv_detective/formats/json.py +20 -0
csv_detective/formats/latitude_l93.py +48 -0
csv_detective/formats/latitude_wgs.py +42 -0
csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
csv_detective/formats/latlon_wgs.py +53 -0
csv_detective/formats/longitude_l93.py +39 -0
csv_detective/formats/longitude_wgs.py +32 -0
csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
csv_detective/formats/lonlat_wgs.py +36 -0
csv_detective/formats/mois_de_lannee.py +48 -0
csv_detective/formats/money.py +18 -0
csv_detective/formats/mongo_object_id.py +14 -0
csv_detective/formats/pays.py +35 -0
csv_detective/formats/percent.py +16 -0
csv_detective/formats/region.py +70 -0
csv_detective/formats/sexe.py +17 -0
csv_detective/formats/siren.py +37 -0
csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
csv_detective/formats/tel_fr.py +36 -0
csv_detective/formats/uai.py +36 -0
csv_detective/formats/url.py +46 -0
csv_detective/formats/username.py +14 -0
csv_detective/formats/uuid.py +16 -0
csv_detective/formats/year.py +28 -0
csv_detective/output/__init__.py +65 -0
csv_detective/output/dataframe.py +96 -0
csv_detective/output/example.py +250 -0
csv_detective/output/profile.py +119 -0
csv_detective/{schema_generation.py → output/schema.py} +268 -343
csv_detective/output/utils.py +74 -0
csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
csv_detective/parsing/columns.py +235 -0
csv_detective/parsing/compression.py +11 -0
csv_detective/parsing/csv.py +56 -0
csv_detective/parsing/excel.py +167 -0
csv_detective/parsing/load.py +111 -0
csv_detective/parsing/text.py +56 -0
csv_detective/utils.py +23 -196
csv_detective/validate.py +138 -0
csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
{csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
csv_detective/all_packages.txt +0 -104
csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
csv_detective/detect_fields/FR/other/__init__.py +0 -0
csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
csv_detective/detect_fields/FR/temp/__init__.py +0 -0
csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
csv_detective/detect_fields/__init__.py +0 -57
csv_detective/detect_fields/geo/__init__.py +0 -0
csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
csv_detective/detect_fields/other/__init__.py +0 -0
csv_detective/detect_fields/other/booleen/__init__.py +0 -21
csv_detective/detect_fields/other/email/__init__.py +0 -8
csv_detective/detect_fields/other/float/__init__.py +0 -17
csv_detective/detect_fields/other/int/__init__.py +0 -12
csv_detective/detect_fields/other/json/__init__.py +0 -24
csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
csv_detective/detect_fields/other/twitter/__init__.py +0 -8
csv_detective/detect_fields/other/url/__init__.py +0 -11
csv_detective/detect_fields/other/uuid/__init__.py +0 -11
csv_detective/detect_fields/temp/__init__.py +0 -0
csv_detective/detect_fields/temp/date/__init__.py +0 -62
csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
csv_detective/detect_fields/temp/year/__init__.py +0 -10
csv_detective/detect_labels/FR/__init__.py +0 -0
csv_detective/detect_labels/FR/geo/__init__.py +0 -0
csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
csv_detective/detect_labels/FR/other/__init__.py +0 -0
csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
csv_detective/detect_labels/FR/temp/__init__.py +0 -0
csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
csv_detective/detect_labels/__init__.py +0 -43
csv_detective/detect_labels/geo/__init__.py +0 -0
csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
csv_detective/detect_labels/other/__init__.py +0 -0
csv_detective/detect_labels/other/booleen/__init__.py +0 -34
csv_detective/detect_labels/other/email/__init__.py +0 -45
csv_detective/detect_labels/other/float/__init__.py +0 -33
csv_detective/detect_labels/other/int/__init__.py +0 -33
csv_detective/detect_labels/other/money/__init__.py +0 -11
csv_detective/detect_labels/other/money/check_col_name.py +0 -8
csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
csv_detective/detect_labels/other/twitter/__init__.py +0 -33
csv_detective/detect_labels/other/url/__init__.py +0 -48
csv_detective/detect_labels/other/uuid/__init__.py +0 -33
csv_detective/detect_labels/temp/__init__.py +0 -0
csv_detective/detect_labels/temp/date/__init__.py +0 -51
csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
csv_detective/detect_labels/temp/year/__init__.py +0 -44
csv_detective/detection.py +0 -361
csv_detective/process_text.py +0 -39
csv_detective/s3_utils.py +0 -48
csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
csv_detective-0.6.7.dist-info/METADATA +0 -23
csv_detective-0.6.7.dist-info/RECORD +0 -150
csv_detective-0.6.7.dist-info/WHEEL +0 -5
csv_detective-0.6.7.dist-info/top_level.txt +0 -2
tests/__init__.py +0 -0
tests/test_fields.py +0 -360
tests/test_file.py +0 -116
tests/test_labels.py +0 -7
/csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
/csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
/csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
/csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0

csv_detective/formats/lonlat_wgs.py ADDED Viewed

@@ -0,0 +1,36 @@
+from csv_detective.formats.latitude_wgs import _is as is_lat
+from csv_detective.formats.latlon_wgs import SHARED_COORDS_LABELS
+from csv_detective.formats.longitude_wgs import _is as is_lon
+proportion = 1
+tags = ["geo"]
+specific = [
+    "lonlat",
+    "lon lat",
+    "y x",
+    "yx",
+]
+# we aim wide to catch exact matches if possible for the highest possible score
+words = (
+    SHARED_COORDS_LABELS
+    + specific
+    + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
+)
+def _is(val):
+    """Tell whether `val` is a "lon,lat" string, optionally wrapped as "[lon,lat]"."""
+    # exactly one comma is required so that the split below yields two parts
+    if not (isinstance(val, str) and val.count(",") == 1):
+        return False
+    lon_part, lat_part = val.split(",")
+    # brackets are stripped only when both the opening and the closing one are present
+    if lon_part.startswith("[") and lat_part.endswith("]"):
+        lon_part = lon_part[1:]
+        lat_part = lat_part[:-1]
+    # spaces are removed from the latitude part only before it is validated
+    return is_lon(lon_part) and is_lat(lat_part.replace(" ", ""))
+_test_values = {
+    True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
+    False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
+}

csv_detective/formats/mois_de_lannee.py ADDED Viewed

@@ -0,0 +1,48 @@
+from unidecode import unidecode
+proportion = 1
+tags = ["fr", "temp"]
+labels = ["mois", "month"]
+mois = {
+    "janvier",
+    "fevrier",
+    "mars",
+    "avril",
+    "mai",
+    "juin",
+    "juillet",
+    "aout",
+    "septembre",
+    "octobre",
+    "novembre",
+    "decembre",
+    "jan",
+    "fev",
+    "mar",
+    "avr",
+    "mai",
+    "jun",
+    "jui",
+    "juil",
+    "aou",
+    "sep",
+    "sept",
+    "oct",
+    "nov",
+    "dec",
+}
+def _is(val):
+    """Return True if the value can be a month of the year (as listed in `mois`)."""
+    if isinstance(val, str):
+        # lowercase then strip accents so that "JUIN" and "décembre" both match
+        return unidecode(val.lower()) in mois
+    return False
+_test_values = {
+    True: ["JUIN", "décembre"],
+    False: ["november"],
+}

csv_detective/formats/money.py ADDED Viewed

@@ -0,0 +1,18 @@
+from csv_detective.formats.float import _is as is_float
+proportion = 0.8
+labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
+currencies = {"€", "$", "£", "¥"}
+def _is(val):
+    """Tell whether `val` is a float immediately followed by one currency symbol.
+    Non-string and empty values return False."""
+    # `not val` guards the `val[-1]` lookup, which raises IndexError on ""
+    if not isinstance(val, str) or not val or val[-1] not in currencies:
+        return False
+    # everything before the trailing symbol must be a valid float
+    return is_float(val[:-1])
+_test_values = {
+    True: ["120€", "-20.2$"],
+    False: ["200", "100 euros"],
+}

csv_detective/formats/mongo_object_id.py ADDED Viewed

@@ -0,0 +1,14 @@
+import re
+proportion = 0.8
+labels = ["id", "objectid"]
+def _is(val):
+    """Tell whether `val` is a string made of 24 hexadecimal characters."""
+    if not isinstance(val, str):
+        return False
+    return re.match(r"^[0-9a-fA-F]{24}$", val) is not None
+_test_values = {
+    True: ["62320e50f981bc2b57bcc044"],
+    False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
+}

csv_detective/formats/pays.py ADDED Viewed

@@ -0,0 +1,35 @@
+from frformat import Millesime, Options, Pays
+proportion = 0.6
+tags = ["fr", "geo"]
+labels = [
+    "pays",
+    "payslieu",
+    "paysorg",
+    "country",
+    "pays lib",
+    "lieupays",
+    "pays beneficiaire",
+    "nom du pays",
+    "journey start country",
+    "libelle pays",
+    "journey end country",
+]
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+_pays = Pays(Millesime.LATEST, _options)
+def _is(val):
+    """Tell whether `val` is a string accepted by the module-level `_pays` validator."""
+    if not isinstance(val, str):
+        return False
+    return _pays.is_valid(val)
+_test_values = {
+    True: ["france", "italie"],
+    False: ["amerique", "paris"],
+}

csv_detective/formats/percent.py ADDED Viewed

@@ -0,0 +1,16 @@
+from csv_detective.formats.float import _is as is_float
+proportion = 0.8
+labels = []
+def _is(val):
+    """Tell whether `val` is a float immediately followed by a "%" sign.
+    Non-string and empty values return False."""
+    # endswith() is safe on "", unlike `val[-1]` which raises IndexError
+    if not isinstance(val, str) or not val.endswith("%"):
+        return False
+    # everything before the trailing "%" must be a valid float
+    return is_float(val[:-1])
+_test_values = {
+    True: ["120%", "-20.2%"],
+    False: ["200", "100 pourcents"],
+}

csv_detective/formats/region.py ADDED Viewed

@@ -0,0 +1,70 @@
+from frformat import Millesime, Options, Region
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "region",
+    "libelle region",
+    "nom region",
+    "libelle reg",
+    "nom reg",
+    "reg libusage",
+    "nom de la region",
+    "regionorg",
+    "regionlieu",
+    "reg",
+    "nom officiel region",
+]
+_extra_valid_values_set = frozenset(
+    {
+        "alsace",
+        "aquitaine",
+        "ara",
+        "aura",
+        "auvergne",
+        "auvergne et rhone alpes",
+        "basse normandie",
+        "bfc",
+        "bourgogne",
+        "bourgogne et franche comte",
+        "centre",
+        "champagne ardenne",
+        "franche comte",
+        "ge",
+        "haute normandie",
+        "hdf",
+        "languedoc roussillon",
+        "limousin",
+        "lorraine",
+        "midi pyrenees",
+        "nord pas de calais",
+        "npdc",
+        "paca",
+        "picardie",
+        "poitou charentes",
+        "reunion",
+        "rhone alpes",
+    }
+)
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+    extra_valid_values=_extra_valid_values_set,
+)
+_region = Region(Millesime.LATEST, _options)
+def _is(val):
+    """Match against region names, using the module-level `_region` validator."""
+    if not isinstance(val, str):
+        return False
+    return _region.is_valid(val)
+_test_values = {
+    True: ["bretagne", "ile-de-france"],
+    False: ["baviere", "overgne"],
+}

csv_detective/formats/sexe.py ADDED Viewed

@@ -0,0 +1,17 @@
+from csv_detective.parsing.text import _process_text
+proportion = 1
+tags = ["fr"]
+labels = ["sexe", "sex", "civilite", "genre", "id sexe"]
+def _is(val):
+    """Tell whether `val`, once normalised by `_process_text`, is one of the accepted
+    gender labels."""
+    accepted = {"homme", "femme", "h", "f", "m", "masculin", "feminin"}
+    return isinstance(val, str) and _process_text(val) in accepted
+_test_values = {
+    True: ["femme", "H"],
+    False: ["adulte"],
+}

csv_detective/formats/siren.py ADDED Viewed

@@ -0,0 +1,37 @@
+import re
+proportion = 0.9
+tags = ["fr"]
+labels = [
+    "siren",
+    "siren organisme designe",
+    "siren organisme designant",
+    "n° siren",
+    "siren organisme",
+    "siren titulaire",
+    "numero siren",
+    "epci",
+]
+def _is(val):
+    """Detect SIREN codes: 9 digits (spaces ignored) passing the checksum below.
+    Non-string values return False."""
+    if not isinstance(val, str):
+        return False
+    val = val.replace(" ", "")
+    # fullmatch instead of match + "$": "$" also matches just before a trailing
+    # newline, which would let "\n" reach int() below and raise ValueError
+    if re.fullmatch(r"[0-9]{9}", val) is None:
+        return False
+    # Key check specific to SIREN codes: every second digit (starting with the
+    # second one) is doubled, then the digits of each product are added up
+    cle = 0
+    pair = False
+    for x in val:
+        y = int(x) * (1 + pair)
+        cle += y // 10 + y % 10
+        pair = not pair
+    return cle % 10 == 0
+_test_values = {
+    True: ["552 100 554", "552100554"],
+    False: ["42"],
+}

csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} RENAMED Viewed

@@ -1,29 +1,47 @@
-import re
-PROPORTION = 0.8
-def _is(val):
-    '''Détection des identifiants SIRET (SIRENE)'''
-    val = val.replace(' ', '')
-    if not bool(re.match(r'^[0-9]{14}$', val)):
-        return False
-    # Vérification par clé de luhn du SIREN
-    cle = 0
-    pair = False
-    for x in val[:9]:
-        y = int(x) * (1 + pair)
-        cle += y // 10 + y % 10
-        pair = not pair
-    if cle % 10 != 0:
-        return cle % 10 == 0
-    # Vérification par clé de luhn du SIRET
-    cle = 0
-    pair = len(val) % 2 == 0
-    for x in val:
-        y = int(x) * (1 + pair)
-        cle += y // 10 + y % 10
-        pair = not pair
-    return cle % 10 == 0
+import re
+proportion = 0.8
+tags = ["fr"]
+labels = [
+    "siret",
+    "siret d",
+    "num siret",
+    "siretacheteur",
+    "n° siret",
+    "coll siret",
+    "epci",
+]
+def _is(val):
+    """Detect SIRET identifiers (SIRENE): 14 digits (spaces ignored) where both the
+    first 9 digits (the SIREN) and the full number pass a Luhn key check.
+    Non-string values return False."""
+    if not isinstance(val, str):
+        return False
+    val = val.replace(" ", "")
+    # fullmatch instead of match + "$": "$" also matches just before a trailing
+    # newline, which would let "\n" reach int() in the second loop and raise ValueError
+    if re.fullmatch(r"[0-9]{14}", val) is None:
+        return False
+    # Luhn key check on the SIREN part (first 9 digits)
+    cle = 0
+    pair = False
+    for x in val[:9]:
+        y = int(x) * (1 + pair)
+        cle += y // 10 + y % 10
+        pair = not pair
+    if cle % 10 != 0:
+        return False
+    # Luhn key check on the whole SIRET; the length is even here, so doubling
+    # starts with the first digit
+    cle = 0
+    pair = len(val) % 2 == 0
+    for x in val:
+        y = int(x) * (1 + pair)
+        cle += y // 10 + y % 10
+        pair = not pair
+    return cle % 10 == 0
+_test_values = {
+    True: ["13002526500013", "130 025 265 00013"],
+    False: ["13002526500012"],
+}

csv_detective/formats/tel_fr.py ADDED Viewed

@@ -0,0 +1,36 @@
+import re
+proportion = 0.7
+tags = ["fr"]
+labels = [
+    "telephone",
+    "tel",
+    "tel1",
+    "tel2",
+    "phone",
+    "num tel",
+    "tel mob",
+    "telephone sav",
+    "telephone1",
+    "coordinates.phone",
+    "telephone du lieu",
+]
+def _is(val):
+    """Tell whether `val` looks like a French phone number: 9 digits, optionally
+    preceded by "0", "+33" or "0033", once dots, dashes and spaces are removed."""
+    # the length check applies to the raw value, separators included
+    if not isinstance(val, str) or len(val) < 10:
+        return False
+    digits = val
+    for sep in (".", "-", " "):
+        digits = digits.replace(sep, "")
+    return re.match(r"^(0|\+33|0033)?[0-9]{9}$", digits) is not None
+_test_values = {
+    True: ["0134643467"],
+    False: ["6625388263", "01288398"],
+}

csv_detective/formats/uai.py ADDED Viewed

@@ -0,0 +1,36 @@
+import re
+proportion = 0.8
+tags = ["fr"]
+labels = [
+    "uai",
+    "code etablissement",
+    "code uai",
+    "uai - identifiant",
+    "numero uai",
+    "rne",
+    "numero de l'etablissement",
+    "code rne",
+    "codeetab",
+    "code uai de l'etablissement",
+    "ref uai",
+    "cd rne",
+    "numerouai",
+    "numero d etablissement",
+    "code etablissement",
+    "numero etablissement",
+]
+def _is(val):
+    """Tell whether `val` is an 8-character string shaped like a UAI code: a 3-digit
+    prefix from the allowed ranges, 4 digits, then one uppercase letter."""
+    if isinstance(val, str) and len(val) == 8:
+        uai_match = re.match(
+            r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val
+        )
+        return uai_match is not None
+    return False
+_test_values = {
+    True: ["0422170F"],
+    False: ["04292E"],
+}

csv_detective/formats/url.py ADDED Viewed

@@ -0,0 +1,46 @@
+import re
+proportion = 1
+labels = [
+    "url",
+    "url source",
+    "site web",
+    "source url",
+    "site internet",
+    "remote url",
+    "web",
+    "site",
+    "lien",
+    "site data",
+    "lien url",
+    "lien vers le fichier",
+    "sitweb",
+    "interneturl",
+]
+pattern = re.compile(
+    r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
+    r"(/[A-Za-z\u00C0-\u024F\u1E00-\u1EFF0-9\s._~:/?#[@!$&'()*+,;=%-]*)?$"
+)
+def _is(val):
+    """Tell whether `val` is a string matching the module-level URL `pattern`."""
+    return isinstance(val, str) and pattern.match(val) is not None
+_test_values = {
+    True: [
+        "www.data.gouv.fr",
+        "http://data.gouv.fr",
+        "https://www.youtube.com/@data-gouv-fr",
+        (
+            "https://tabular-api.data.gouv.fr/api/resources/"
+            "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
+            "?score__greater=0.9&decompte__exact=13"
+        ),
+        "https://une-ville.fr/délibérations/2025/Doc avec espaces et àccëñts.pdf",
+    ],
+    False: ["tmp@data.gouv.fr"],
+}

csv_detective/formats/username.py ADDED Viewed

@@ -0,0 +1,14 @@
+import re
+proportion = 1
+labels = ["account", "username", "user"]
+def _is(val):
+    """Tell whether `val` is an "@" followed by letters, digits or underscores."""
+    if not isinstance(val, str):
+        return False
+    return re.match(r"^@[A-Za-z0-9_]+$", val) is not None
+_test_values = {
+    True: ["@accueil1"],
+    False: ["adresse@mail"],
+}

csv_detective/formats/uuid.py ADDED Viewed

@@ -0,0 +1,16 @@
+import re
+proportion = 0.8
+labels = ["id", "identifiant"]
+def _is(val) -> bool:
+    """Tell whether `val` is a UUID-shaped string: 8-4-4-4-12 hexadecimal characters,
+    with optional dashes and optional surrounding braces."""
+    if not isinstance(val, str):
+        return False
+    uuid_regex = r"^[{]?[0-9a-fA-F]{8}-?([0-9a-fA-F]{4}-?){3}[0-9a-fA-F]{12}[}]?$"
+    return re.match(uuid_regex, val) is not None
+_test_values = {
+    True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
+    False: ["0610928327"],
+}

csv_detective/formats/year.py ADDED Viewed

@@ -0,0 +1,28 @@
+proportion = 1
+tags = ["temp"]
+labels = [
+    "year",
+    "annee",
+    "annee depot",
+    "an nais",
+    "exercice",
+    "data year",
+    "annee de publication",
+    "exercice comptable",
+    "annee de naissance",
+    "annee ouverture",
+]
+def _is(val):
+    """Tell whether `val` converts to an integer between 1800 and 2100 (inclusive).
+    Values that `int()` cannot convert return False."""
+    try:
+        val = int(val)
+    # TypeError covers None and other non-numeric objects, OverflowError covers
+    # infinite floats; both used to propagate to the caller
+    except (TypeError, ValueError, OverflowError):
+        return False
+    return 1800 <= val <= 2100
+_test_values = {
+    True: ["2015"],
+    False: ["20166", "123"],
+}

csv_detective/output/__init__.py ADDED Viewed

@@ -0,0 +1,65 @@
+import json
+import os
+from typing import Iterator
+import pandas as pd
+from csv_detective.output.dataframe import cast_df_chunks
+from csv_detective.output.profile import create_profile
+from csv_detective.output.schema import generate_table_schema
+from csv_detective.utils import is_url
+def generate_output(
+    table: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    num_rows: int = 500,
+    limited_output: bool = True,
+    save_results: bool | str = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: str | int | None = None,
+    _col_values: dict[str, pd.Series] | None = None,
+) -> dict | tuple[dict, Iterator[pd.DataFrame]]:
+    """Enrich `analysis` in place, optionally save it as JSON, and return it.
+    Steps, in order:
+    - if `output_profile`, store the result of `create_profile` under "profile";
+    - if `save_results`, dump `analysis` to a JSON file (path given by
+      `save_results` when it is a string, otherwise derived from `file_path`);
+    - if `output_schema`, store the result of `generate_table_schema` under "schema"
+      (this happens after the dump, so the saved file does not contain it);
+    - if `output_df`, return `(analysis, cast_df_chunks(...))` instead of `analysis`.
+    """
+    if output_profile:
+        analysis["profile"] = create_profile(
+            table=table,
+            columns=analysis["columns"],
+            num_rows=num_rows,
+            limited_output=limited_output,
+            cast_json=cast_json,
+            verbose=verbose,
+            _col_values=_col_values,
+        )
+    if save_results:
+        if isinstance(save_results, str):
+            # an explicit path is used as is
+            output_path = save_results
+        else:
+            # default path: the input path with its extension replaced by ".json"
+            output_path = os.path.splitext(file_path)[0]
+            if is_url(output_path):
+                # for URLs, keep only the last path segment
+                output_path = output_path.split("/")[-1]
+            # NOTE(review): the condition reads analysis["sheet_name"] but the suffix
+            # uses the `sheet_name` parameter; confirm the two always agree, otherwise
+            # the file name can end in "_sheet-None"
+            if analysis.get("sheet_name"):
+                output_path += "_sheet-" + str(sheet_name)
+            output_path += ".json"
+        with open(output_path, "w", encoding="utf8") as fp:
+            # default=str stringifies any value json cannot serialise natively
+            json.dump(
+                analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str
+            )
+    if output_schema:
+        analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
+    if output_df:
+        return analysis, cast_df_chunks(
+            df=table,
+            analysis=analysis,
+            file_path=file_path,
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    return analysis

csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl