PyPI - csv-detective - Versions diffs - 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl - Mend

csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228) hide show

csv_detective/__init__.py +7 -1
csv_detective/cli.py +33 -21
csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
csv_detective/detection/columns.py +89 -0
csv_detective/detection/encoding.py +29 -0
csv_detective/detection/engine.py +46 -0
csv_detective/detection/formats.py +156 -0
csv_detective/detection/headers.py +28 -0
csv_detective/detection/rows.py +18 -0
csv_detective/detection/separator.py +44 -0
csv_detective/detection/variables.py +97 -0
csv_detective/explore_csv.py +151 -377
csv_detective/format.py +67 -0
csv_detective/formats/__init__.py +9 -0
csv_detective/formats/adresse.py +116 -0
csv_detective/formats/binary.py +26 -0
csv_detective/formats/booleen.py +35 -0
csv_detective/formats/code_commune_insee.py +26 -0
csv_detective/formats/code_csp_insee.py +36 -0
csv_detective/formats/code_departement.py +29 -0
csv_detective/formats/code_fantoir.py +21 -0
csv_detective/formats/code_import.py +17 -0
csv_detective/formats/code_postal.py +25 -0
csv_detective/formats/code_region.py +22 -0
csv_detective/formats/code_rna.py +29 -0
csv_detective/formats/code_waldec.py +17 -0
csv_detective/formats/commune.py +27 -0
csv_detective/formats/csp_insee.py +31 -0
csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
csv_detective/formats/date.py +99 -0
csv_detective/formats/date_fr.py +22 -0
csv_detective/formats/datetime_aware.py +45 -0
csv_detective/formats/datetime_naive.py +48 -0
csv_detective/formats/datetime_rfc822.py +24 -0
csv_detective/formats/departement.py +37 -0
csv_detective/formats/email.py +28 -0
csv_detective/formats/float.py +29 -0
csv_detective/formats/geojson.py +36 -0
csv_detective/formats/insee_ape700.py +31 -0
csv_detective/formats/insee_canton.py +28 -0
csv_detective/formats/int.py +23 -0
csv_detective/formats/iso_country_code_alpha2.py +30 -0
csv_detective/formats/iso_country_code_alpha3.py +30 -0
csv_detective/formats/iso_country_code_numeric.py +31 -0
csv_detective/formats/jour_de_la_semaine.py +41 -0
csv_detective/formats/json.py +20 -0
csv_detective/formats/latitude_l93.py +48 -0
csv_detective/formats/latitude_wgs.py +42 -0
csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
csv_detective/formats/latlon_wgs.py +53 -0
csv_detective/formats/longitude_l93.py +39 -0
csv_detective/formats/longitude_wgs.py +32 -0
csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
csv_detective/formats/lonlat_wgs.py +36 -0
csv_detective/formats/mois_de_lannee.py +48 -0
csv_detective/formats/money.py +18 -0
csv_detective/formats/mongo_object_id.py +14 -0
csv_detective/formats/pays.py +35 -0
csv_detective/formats/percent.py +16 -0
csv_detective/formats/region.py +70 -0
csv_detective/formats/sexe.py +17 -0
csv_detective/formats/siren.py +37 -0
csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
csv_detective/formats/tel_fr.py +36 -0
csv_detective/formats/uai.py +36 -0
csv_detective/formats/url.py +46 -0
csv_detective/formats/username.py +14 -0
csv_detective/formats/uuid.py +16 -0
csv_detective/formats/year.py +28 -0
csv_detective/output/__init__.py +65 -0
csv_detective/output/dataframe.py +96 -0
csv_detective/output/example.py +250 -0
csv_detective/output/profile.py +119 -0
csv_detective/{schema_generation.py → output/schema.py} +268 -343
csv_detective/output/utils.py +74 -0
csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
csv_detective/parsing/columns.py +235 -0
csv_detective/parsing/compression.py +11 -0
csv_detective/parsing/csv.py +56 -0
csv_detective/parsing/excel.py +167 -0
csv_detective/parsing/load.py +111 -0
csv_detective/parsing/text.py +56 -0
csv_detective/utils.py +23 -196
csv_detective/validate.py +138 -0
csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
{csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
csv_detective/all_packages.txt +0 -104
csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
csv_detective/detect_fields/FR/other/__init__.py +0 -0
csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
csv_detective/detect_fields/FR/temp/__init__.py +0 -0
csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
csv_detective/detect_fields/__init__.py +0 -57
csv_detective/detect_fields/geo/__init__.py +0 -0
csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
csv_detective/detect_fields/other/__init__.py +0 -0
csv_detective/detect_fields/other/booleen/__init__.py +0 -21
csv_detective/detect_fields/other/email/__init__.py +0 -8
csv_detective/detect_fields/other/float/__init__.py +0 -17
csv_detective/detect_fields/other/int/__init__.py +0 -12
csv_detective/detect_fields/other/json/__init__.py +0 -24
csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
csv_detective/detect_fields/other/twitter/__init__.py +0 -8
csv_detective/detect_fields/other/url/__init__.py +0 -11
csv_detective/detect_fields/other/uuid/__init__.py +0 -11
csv_detective/detect_fields/temp/__init__.py +0 -0
csv_detective/detect_fields/temp/date/__init__.py +0 -62
csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
csv_detective/detect_fields/temp/year/__init__.py +0 -10
csv_detective/detect_labels/FR/__init__.py +0 -0
csv_detective/detect_labels/FR/geo/__init__.py +0 -0
csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
csv_detective/detect_labels/FR/other/__init__.py +0 -0
csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
csv_detective/detect_labels/FR/temp/__init__.py +0 -0
csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
csv_detective/detect_labels/__init__.py +0 -43
csv_detective/detect_labels/geo/__init__.py +0 -0
csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
csv_detective/detect_labels/other/__init__.py +0 -0
csv_detective/detect_labels/other/booleen/__init__.py +0 -34
csv_detective/detect_labels/other/email/__init__.py +0 -45
csv_detective/detect_labels/other/float/__init__.py +0 -33
csv_detective/detect_labels/other/int/__init__.py +0 -33
csv_detective/detect_labels/other/money/__init__.py +0 -11
csv_detective/detect_labels/other/money/check_col_name.py +0 -8
csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
csv_detective/detect_labels/other/twitter/__init__.py +0 -33
csv_detective/detect_labels/other/url/__init__.py +0 -48
csv_detective/detect_labels/other/uuid/__init__.py +0 -33
csv_detective/detect_labels/temp/__init__.py +0 -0
csv_detective/detect_labels/temp/date/__init__.py +0 -51
csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
csv_detective/detect_labels/temp/year/__init__.py +0 -44
csv_detective/detection.py +0 -361
csv_detective/process_text.py +0 -39
csv_detective/s3_utils.py +0 -48
csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
csv_detective-0.6.7.dist-info/METADATA +0 -23
csv_detective-0.6.7.dist-info/RECORD +0 -150
csv_detective-0.6.7.dist-info/WHEEL +0 -5
csv_detective-0.6.7.dist-info/top_level.txt +0 -2
tests/__init__.py +0 -0
tests/test_fields.py +0 -360
tests/test_file.py +0 -116
tests/test_labels.py +0 -7
/csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
/csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
/csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
/csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0

csv_detective/formats/adresse.py ADDED Viewed

@@ -0,0 +1,116 @@
+from csv_detective.parsing.text import _process_text
+# NOTE(review): presumably the minimum share of a column's values that must pass `_is`
+# for the column to be reported as this format -- confirm against the detection engine.
+proportion = 0.55
+# Tags attached to this format
+tags = ["fr", "geo"]
+# Column-header strings associated with this format
+labels = [
+    "adresse",
+    "localisation",
+    "adresse postale",
+    "adresse geographique",
+    "adr",
+    "adresse complete",
+    "adresse station",
+]
+# Street-type markers (full words, then abbreviations) that `_is` searches for as substrings
+# of the normalised value. Most entries end with a space so that only whole words match.
+voies = {
+    "aire ",
+    "allee ",
+    "avenue ",
+    "base ",
+    "boulevard ",
+    "cami ",
+    "carrefour ",
+    "chemin ",
+    "cheminement ",
+    "chaussee ",
+    "cite ",
+    "clos ",
+    "coin ",
+    "corniche ",
+    "cote ",
+    "cour ",
+    "cours ",
+    "domaine ",
+    "descente ",
+    "ecart ",
+    "esplanade ",
+    "faubourg ",
+    "gare ",
+    "grande rue",
+    "hameau ",
+    "halle ",
+    "ilot ",
+    "impasse ",
+    "lieu dit",
+    "lotissement ",
+    "marche ",
+    "montee ",
+    "parc ",
+    "passage ",
+    "place ",
+    "plan ",
+    "plaine ",
+    "plateau ",
+    "pont ",
+    "port ",
+    "promenade ",
+    "parvis ",
+    "quartier ",
+    "quai ",
+    "residence ",
+    "ruelle ",
+    "rocade ",
+    "rond point",
+    "route ",
+    "rue ",
+    # 'sente - sentier',
+    "square ",
+    "tour ",
+    # 'terre-plein',
+    "traverse ",
+    "villa ",
+    "village ",
+    "voie ",
+    "zone artisanale",
+    "zone d’amenagement concerte",
+    "zone d’amenagement differe",
+    "zone industrielle",
+    "zone ",
+    # abbreviations ("cami " is already listed above, the duplicate entry was dropped)
+    # 'r',
+    "av ",
+    "pl ",
+    "bd ",
+    # 'che',
+    "chs ",
+    "dom ",
+    "ham ",
+    "ld ",
+    # 'pro',
+    # 'rte',
+    "vlge ",
+    "za ",
+    "zac ",
+    "zad ",
+    "zi ",
+    # 'car',
+    "fg ",
+    # 'lot',
+    "imp ",
+    # 'qu',
+    # NOTE(review): unlike the other abbreviations this one has no trailing space, so it also
+    # matches inside longer words -- confirm whether that is intended.
+    "mte",
+}
+def _is(val):
+    """Detect postal addresses: True when `val` is a string of at most 150 characters whose
+    `_process_text` form contains at least one of the markers listed in `voies`."""
+    if isinstance(val, str) and len(val) <= 150:
+        cleaned = _process_text(val)
+        for marker in voies:
+            if marker in cleaned:
+                return True
+    return False
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["rue du martyr"],
+    False: ["un batiment"],
+}

csv_detective/formats/binary.py ADDED Viewed

@@ -0,0 +1,26 @@
+import codecs
+proportion = 1
+tags = ["type"]
+labels = ["bytes", "binary", "image", "encode", "content"]
+def binary_casting(val: str) -> bytes:
+    """Convert the textual form of a bytes literal (b'...' or b"...") into a bytes object."""
+    # strip the leading `b` + opening quote and the closing quote before decoding
+    inner = val[2:-1]
+    decoded = codecs.escape_decode(inner)
+    # escape_decode returns a tuple, the decoded bytes come first
+    return decoded[0]
+def _is(val) -> bool:
+    """Return True when `val` is a string shaped like a bytes literal that `binary_casting` can decode."""
+    if not isinstance(val, str):
+        return False
+    single_quoted = val.startswith("b'") and val.endswith("'")
+    double_quoted = val.startswith('b"') and val.endswith('"')
+    if not (single_quoted or double_quoted):
+        return False
+    try:
+        return isinstance(binary_casting(val), bytes)
+    except Exception:
+        # any decoding failure means the value is not a usable bytes literal
+        return False
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["b'\x01\x01'", 'b"\x01\x01\x00\x00\x00;\xb7\xd4\xc5_)J\xc0\xcb\x16>\x9e\xd1\xc4\x13@"'],
+    False: ["bytes", 'b"ytes'],
+}

csv_detective/formats/booleen.py ADDED Viewed

@@ -0,0 +1,35 @@
+proportion = 1
+tags = ["type"]
+labels = ["is ", "has ", "est "]
+# Lower-case spellings recognised as booleans, mapped to the value they stand for
+bool_mapping = {
+    "1": True,
+    "0": False,
+    "vrai": True,
+    "faux": False,
+    "true": True,
+    "false": False,
+    "oui": True,
+    "non": False,
+    "yes": True,
+    "no": False,
+    "y": True,
+    "n": False,
+    "o": True,
+}
+# Set of accepted spellings, used for membership tests in `_is`
+liste_bool = set(bool_mapping)
+def bool_casting(val: str) -> bool | None:
+    """Return the boolean that `val` stands for (case-insensitive), or None when `val` is not a known spelling."""
+    return bool_mapping.get(val.lower())
+def _is(val):
+    """Return True when `val` is a string whose lower-cased form is a known boolean spelling."""
+    return isinstance(val, str) and val.lower() in liste_bool
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["oui", "0", "1", "yes", "false", "True"],
+    False: ["nein", "ja", "2", "-0"],
+}

csv_detective/formats/code_commune_insee.py ADDED Viewed

@@ -0,0 +1,26 @@
+from frformat import CodeCommuneInsee, Millesime
+proportion = 0.75
+tags = ["fr", "geo"]
+labels = [
+    "code commune insee",
+    "code insee",
+    "codes insee",
+    "code commune",
+    "code insee commune",
+    "insee",
+    "code com",
+    "com",
+]
+# Validator built once at import time and reused for every value
+_code_commune_insee = CodeCommuneInsee(Millesime.LATEST)
+def _is(val):
+    """Return whether `val` is a string accepted by the frformat `CodeCommuneInsee` validator."""
+    if not isinstance(val, str):
+        return False
+    return _code_commune_insee.is_valid(val)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["91471", "01053"],
+    False: ["914712", "01000"],
+}

csv_detective/formats/code_csp_insee.py ADDED Viewed

@@ -0,0 +1,36 @@
+import re
+from csv_detective.parsing.text import _process_text
+proportion = 1
+tags = ["fr"]
+labels = ["code csp insee", "code csp"]
+def _is(val):
+    """Return True when the `_process_text` form of `val` is a 4-character code that is either
+    one of the listed all-digit codes or a digit 1-6, two digits and a letter a-l."""
+    if not isinstance(val, str):
+        return False
+    cleaned = _process_text(val)
+    if len(cleaned) != 4:
+        return False
+    numeric_codes = {
+        "7100",
+        "7200",
+        "7400",
+        "7500",
+        "7700",
+        "7800",
+        "8100",
+        "8300",
+        "8400",
+        "8500",
+        "8600",
+    }
+    if cleaned in numeric_codes:
+        return True
+    return re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", cleaned) is not None
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["121f"],
+    False: ["121x"],
+}

csv_detective/formats/code_departement.py ADDED Viewed

@@ -0,0 +1,29 @@
+from frformat import Millesime, NumeroDepartement, Options
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "code departement",
+    "code_departement",
+    "dep",
+    "departement",
+    "dept",
+]
+# Matching options handed to the validator (case, accents, punctuation and extra whitespace are ignored)
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+# Validator built once at import time and reused for every value
+_numero_departement = NumeroDepartement(Millesime.LATEST, _options)
+def _is(val):
+    """Return whether `val` is a string accepted by the frformat `NumeroDepartement` validator."""
+    if not isinstance(val, str):
+        return False
+    return _numero_departement.is_valid(val)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["75", "2A", "2b", "974", "01"],
+    False: ["00", "96", "101"],
+}

csv_detective/formats/code_fantoir.py ADDED Viewed

@@ -0,0 +1,21 @@
+from frformat import CodeFantoir
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "cadastre1",
+    "code fantoir",
+    "fantoir",
+]
+# Validator built once at import time and reused for every value
+_code_fantoir = CodeFantoir()
+def _is(val):
+    """Return whether `val` is a string accepted by the frformat `CodeFantoir` validator."""
+    if not isinstance(val, str):
+        return False
+    return _code_fantoir.is_valid(val)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["7755A", "B150B", "ZA04C", "ZB03D"],
+    False: ["7755", "ZA99A"],
+}

csv_detective/formats/code_import.py ADDED Viewed

@@ -0,0 +1,17 @@
+import re
+proportion = 0.9
+tags = ["fr"]
+labels = ["code"]
+# Two accepted layouts, anchored at both ends: one starting with three digits then S or P,
+# the other starting with digit / alphanumeric / digit then S or P
+regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$"
+def _is(val):
+    """Return True when `val` is a string matching `regex`."""
+    if not isinstance(val, str):
+        return False
+    return re.match(regex, val) is not None
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["123S1871092288"],
+    False: ["AA751PEE00188854", "W123456789"],
+}

csv_detective/formats/code_postal.py ADDED Viewed

@@ -0,0 +1,25 @@
+from frformat import CodePostal
+proportion = 0.9
+tags = ["fr", "geo"]
+labels = [
+    "code postal",
+    "postal code",
+    "postcode",
+    "post code",
+    "cp",
+    "codes postaux",
+    "location postcode",
+]
+# Validator built once at import time and reused for every value
+_code_postal = CodePostal()
+def _is(val):
+    """Return whether `val` is a string accepted by the frformat `CodePostal` validator."""
+    if not isinstance(val, str):
+        return False
+    return _code_postal.is_valid(val)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["75020", "01000"],
+    False: ["77777", "018339"],
+}

csv_detective/formats/code_region.py ADDED Viewed

@@ -0,0 +1,22 @@
+from frformat import CodeRegion, Millesime
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "code region",
+    "reg",
+    "code insee region",
+    "region",
+]
+# Validator built once at import time and reused for every value
+_code_region = CodeRegion(Millesime.LATEST)
+def _is(val):
+    """Return whether `val` is a string accepted by the frformat `CodeRegion` validator."""
+    if not isinstance(val, str):
+        return False
+    return _code_region.is_valid(val)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["32"],
+    False: ["55"],
+}

csv_detective/formats/code_rna.py ADDED Viewed

@@ -0,0 +1,29 @@
+from frformat import CodeRNA
+proportion = 0.9
+tags = ["fr"]
+labels = [
+    "code rna",
+    "rna",
+    "n° inscription association",
+    "identifiant association",
+]
+# Validator built once at import time and reused for every value
+_code_rna = CodeRNA()
+def _is(val):
+    """Return whether `val` is a string accepted by the frformat `CodeRNA` validator."""
+    if not isinstance(val, str):
+        return False
+    return _code_rna.is_valid(val)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["W751515517"],
+    False: [
+        "W111111111111111111111111111111111111",
+        "w143788974",
+        "W12",
+        "678W23456",
+        "165789325",
+        "Wa1#89sf&h",
+    ],
+}

csv_detective/formats/code_waldec.py ADDED Viewed

@@ -0,0 +1,17 @@
+import re
+proportion = 0.9
+tags = ["fr"]
+labels = ["code waldec", "waldec"]
+# "W", one digit, one digit or upper-case letter, then seven digits
+regex = r"^W\d[\dA-Z]\d{7}$"
+def _is(val):
+    """Return True when `val` is a string matching `regex`."""
+    if not isinstance(val, str):
+        return False
+    return re.match(regex, val) is not None
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["W123456789", "W2D1234567"],
+    False: ["AA751PEE00188854"],
+}

csv_detective/formats/commune.py ADDED Viewed

@@ -0,0 +1,27 @@
+from frformat import Commune, Millesime, Options
+proportion = 0.8
+tags = ["fr", "geo"]
+labels = [
+    "commune",
+    "ville",
+    "libelle commune",
+]
+# Matching options handed to the validator (case, accents, punctuation and extra whitespace are ignored)
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+# Validator built once at import time and reused for every value
+_commune = Commune(Millesime.LATEST, _options)
+def _is(val):
+    """Return whether `val` is a string accepted by the frformat `Commune` validator."""
+    if not isinstance(val, str):
+        return False
+    return _commune.is_valid(val)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["saint denis"],
+    False: ["new york", "lion"],
+}

csv_detective/formats/csp_insee.py ADDED Viewed

@@ -0,0 +1,31 @@
+from os.path import dirname, join
+from csv_detective.parsing.text import _process_text
+proportion = 1
+tags = ["fr"]
+labels = [
+    "csp insee",
+    "csp",
+    "categorie socioprofessionnelle",
+]
+# Load the reference list once at import time; the context manager closes the handle even
+# if reading fails.
+# NOTE(review): no explicit encoding is given, so the platform default is used -- confirm
+# the data file is plain ASCII or pass an encoding.
+with open(join(dirname(__file__), "data", "csp_insee.txt"), "r") as f:
+    codes_insee = f.read().split("\n")
+# drop the empty string produced by a trailing newline, but only when it is really there,
+# so that a file without a final newline does not lose its last entry
+if codes_insee[-1] == "":
+    del codes_insee[-1]
+codes_insee = set(codes_insee)
+def _is(val):
+    """Return True when the `_process_text` form of `val` is one of the entries loaded from csp_insee.txt."""
+    if not isinstance(val, str):
+        return False
+    val = _process_text(val)
+    return val in codes_insee
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["employes de la poste"],
+    False: ["super-heros"],
+}

csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt RENAMED Viewed

File without changes

csv_detective/formats/date.py ADDED Viewed

@@ -0,0 +1,99 @@
+import re
+from datetime import datetime
+from dateparser import parse as date_parser
+from dateutil.parser import ParserError
+from dateutil.parser import parse as dateutil_parser
+# NOTE(review): presumably the minimum share of a column's values that must pass `_is`
+# for the column to be reported as this format -- confirm against the detection engine.
+proportion = 1
+# Tags attached to this format
+tags = ["temp", "type"]
+# Column-header strings common to date-like formats (also imported by datetime_aware)
+SHARED_DATE_LABELS = [
+    "date",
+    "mise à jour",
+    "modifie",
+    "maj",
+    "datemaj",
+    "update",
+    "created",
+    "modified",
+]
+# Labels for plain dates: the shared ones plus date-only spellings
+labels = SHARED_DATE_LABELS + [
+    "jour",
+    "periode",
+    "dpc",
+    "yyyymmdd",
+    "aaaammjj",
+]
+def date_casting(val: str) -> datetime | None:
+    """Parse `val` into a datetime, returning None when neither parser succeeds.
+    For performance reasons dateutil is tried first, dateparser is only the fallback."""
+    try:
+        return dateutil_parser(val)
+    except ParserError:
+        # an exception raised inside an `except` clause is not handled by the sibling
+        # clauses of the same `try`, so the fallback needs its own guard
+        try:
+            return date_parser(val)
+        except Exception:
+            return None
+    except Exception:
+        return None
+# Minimum share of digit characters a value needs before `_is` hands it to the parser
+threshold = 0.3
+# Characters accepted as separators between the parts of a date
+seps = r"[\s/\-\*_\|;.,]"
+# matches JJ-MM-AAAA with any of the listed separators
+jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
+    "SEP", seps
+)
+# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
+aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
+    "SEP", seps + "?"
+)
+# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
+# (month given as a 3-letter abbreviation or as a full unaccented French name, year on 2 or 4 digits)
+string_month_pattern = (
+    r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
+    r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
+    r"mai|juin|juillet|aout|septembre|octobre|novembre|decembre)SEP"
+    r"([0-9]{2}$|(19|20)[0-9]{2}$)"
+).replace("SEP", seps + "?")
+def _is(val):
+    """Detect dates without a time component: True for strings of 8 to 20 characters that match
+    one of the date patterns, or that `date_casting` parses to a value whose hour, minute and second are all zero."""
+    # cheap rejections first, to cut processing time
+    if not isinstance(val, str):
+        return False
+    if not 8 <= len(val) <= 20:
+        return False
+    # usual layouts are recognised by regex alone; `or` stops at the first match
+    if (
+        re.match(jjmmaaaa_pattern, val)
+        or re.match(aaaammjj_pattern, val)
+        or re.match(string_month_pattern, val, re.IGNORECASE)
+    ):
+        return True
+    # values with too few digits are not worth parsing
+    digit_share = sum(char.isdigit() for char in val) / len(val)
+    if digit_share < threshold:
+        return False
+    parsed = date_casting(val)
+    if not parsed:
+        return False
+    return not (parsed.hour or parsed.minute or parsed.second)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: [
+        "1960-08-07",
+        "12/02/2007",
+        "15 jan 1985",
+        "15 décembre 1985",
+        "02 05 2003",
+        "20030502",
+        "1993-12/02",
+    ],
+    False: [
+        "1993-1993-1993",
+        "39-10-1993",
+        "19-15-1993",
+        "15 tambour 1985",
+        "12152003",
+        "20031512",
+        "02052003",
+    ],
+}

csv_detective/formats/date_fr.py ADDED Viewed

@@ -0,0 +1,22 @@
+import re
+from csv_detective.parsing.text import _process_text
+proportion = 1
+tags = ["fr", "temp"]
+labels = ["date"]
+# day (1-31, leading zero optional), full unaccented French month name, 4-digit year,
+# each separated by a space, a hyphen or a slash
+pattern = (
+    r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"
+    r"|octobre|novembre|decembre)[ \-/]\d{4}$"
+)
+def _is(val):
+    """Return True when `val` is a string whose `_process_text` form matches `pattern`."""
+    if not isinstance(val, str):
+        return False
+    return re.match(pattern, _process_text(val)) is not None
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: ["13 février 1996", "15 decembre 2024"],
+    False: ["44 march 2025"],
+}

csv_detective/formats/datetime_aware.py ADDED Viewed

@@ -0,0 +1,45 @@
+import re
+from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting
+proportion = 1
+tags = ["temp", "type"]
+labels = SHARED_DATE_LABELS + ["datetime", "timestamp"]
+# Minimum share of digits and date/time punctuation a value needs before `_is` hands it to the parser
+threshold = 0.7
+# matches AAAA-MM-JJ (any listed separator or none), then "T" or a whitespace, HH:MM:SS,
+# optional fractional seconds and a mandatory offset (+HH:MM / -HH:MM) or "Z".
+# The fraction separator is restricted to "." or "," -- a bare `.` would accept any character.
+pat = (
+    aaaammjj_pattern.replace("$", "")
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])([.,]\d{1,6})"
+    + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
+)
+def _is(val):
+    """Detect timezone-aware datetimes: True for strings of 16 to 35 characters that match `pat`,
+    or that `date_casting` parses to a value carrying both a non-zero time part and a tzinfo."""
+    # early stops, to cut processing time
+    # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
+    # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
+    if not isinstance(val, str) or not 16 <= len(val) <= 35:
+        return False
+    # usual format: the regex is enough, no parsing needed
+    if re.match(pat, val):
+        return True
+    # values made of too few digits / date punctuation are not worth parsing
+    allowed = {"-", "/", ":", " "}
+    plausible = sum(char.isdigit() or char in allowed for char in val)
+    if plausible / len(val) < threshold:
+        return False
+    parsed = date_casting(val)
+    if parsed is None:
+        return False
+    has_time = bool(parsed.hour or parsed.minute or parsed.second or parsed.microsecond)
+    return has_time and bool(parsed.tzinfo)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: [
+        "2021-06-22 10:20:10-04:00",
+        "2030-06-22 00:00:00.0028+02:00",
+        "2000-12-21 10:20:10.1Z",
+        "2024-12-19T10:53:36.428000+00:00",
+        "1996/06/22 10:20:10 GMT",
+    ],
+    False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
+}

csv_detective/formats/datetime_naive.py ADDED Viewed

@@ -0,0 +1,48 @@
+import re
+from typing import Any
+from csv_detective.formats.date import aaaammjj_pattern, date_casting
+from csv_detective.formats.datetime_aware import labels  # noqa
+proportion = 1
+tags = ["temp", "type"]
+# Minimum share of digits and date/time punctuation a value needs before `_is` hands it to the parser
+threshold = 0.7
+# matches AAAA-MM-JJTHH:MM:SS(.dddddd), without any timezone suffix, with any of the listed
+# separators for the date OR NO SEPARATOR.
+# The fraction separator is restricted to "." or "," -- a bare `.` would accept any character.
+pat = (
+    aaaammjj_pattern.replace("$", "")
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])([.,]\d{1,6})?$"
+)
+def _is(val: Any | None) -> bool:
+    """Detect naive datetimes only: True for strings of 15 to 30 characters that match `pat`,
+    or that `date_casting` parses to a value without tzinfo."""
+    # early stops, to cut processing time
+    # 15 is the minimal length of a datetime format YYMMDDTHH:MM:SS
+    # 26 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd, keeping some slack
+    if not isinstance(val, str) or not 15 <= len(val) <= 30:
+        return False
+    # usual format: the regex is enough, no parsing needed
+    if re.match(pat, val):
+        return True
+    # values made of too few digits / date punctuation are not worth parsing
+    allowed = {"-", "/", ":", " "}
+    plausible = sum(char.isdigit() or char in allowed for char in val)
+    if plausible / len(val) < threshold:
+        return False
+    parsed = date_casting(val)
+    if parsed is None:
+        return False
+    return not bool(parsed.tzinfo)
+# Sample inputs keyed by the result `_is` is expected to return for them
+_test_values = {
+    True: [
+        "2021-06-22 10:20:10",
+        "2030/06-22   00:00:00",
+        "2030/06/22 00:00:00.0028",
+    ],
+    False: [
+        "2021-06-22T30:20:10",
+        "Sun, 06 Nov 1994 08:49:37 GMT",
+        "2021-06-44 10:20:10+02:00",
+        "1999-12-01T00:00:00Z",
+        "2021-06-44",
+        "15 décembre 1985",
+    ],
+}

csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl