PyPI - csv-detective - Versions diffs - 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl - Mend

csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

csv_detective/formats/lonlat_wgs.py ADDED Viewed

@@ -0,0 +1,36 @@
+from csv_detective.formats.latitude_wgs import _is as is_lat
+from csv_detective.formats.latlon_wgs import SHARED_COORDS_LABELS
+from csv_detective.formats.longitude_wgs import _is as is_lon
+proportion = 1
+tags = ["geo"]
+specific = [
+    "lonlat",
+    "lon lat",
+    "y x",
+    "yx",
+]
+# we aim wide to catch exact matches if possible for the highest possible score
+words = (
+    SHARED_COORDS_LABELS
+    + specific
+    + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
+)
+def _is(val):
+    if not isinstance(val, str) or val.count(",") != 1:
+        return False
+    lon, lat = val.split(",")
+    # handling [lon,lat]
+    if lon.startswith("[") and lat.endswith("]"):
+        lon, lat = lon[1:], lat[:-1]
+    return is_lon(lon) and is_lat(lat.replace(" ", ""))
+_test_values = {
+    True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
+    False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
+}

csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} RENAMED Viewed

@@ -1,39 +1,48 @@
-from unidecode import unidecode
-PROPORTION = 1
-mois = {
-    "janvier",
-    "fevrier",
-    "mars",
-    "avril",
-    "mai",
-    "juin",
-    "juillet",
-    "aout",
-    "septembre",
-    "octobre",
-    "novembre",
-    "decembre",
-    "jan",
-    "fev",
-    "mar",
-    "avr",
-    "mai",
-    "jun",
-    "jui",
-    "juil",
-    "aou",
-    "sep",
-    "sept",
-    "oct",
-    "nov",
-    "dec",
-}
-def _is(val):
-    """Renvoie True si les champs peuvent être des mois de l'année"""
-    if not isinstance(val, str):
-        return False
-    val = unidecode(val.lower())
-    return val in mois
+from unidecode import unidecode
+proportion = 1
+tags = ["fr", "temp"]
+labels = ["mois", "month"]
+mois = {
+    "janvier",
+    "fevrier",
+    "mars",
+    "avril",
+    "mai",
+    "juin",
+    "juillet",
+    "aout",
+    "septembre",
+    "octobre",
+    "novembre",
+    "decembre",
+    "jan",
+    "fev",
+    "mar",
+    "avr",
+    "mai",
+    "jun",
+    "jui",
+    "juil",
+    "aou",
+    "sep",
+    "sept",
+    "oct",
+    "nov",
+    "dec",
+}
+def _is(val):
+    """Renvoie True si les champs peuvent être des mois de l'année"""
+    if not isinstance(val, str):
+        return False
+    val = unidecode(val.lower())
+    return val in mois
+_test_values = {
+    True: ["JUIN", "décembre"],
+    False: ["november"],
+}

csv_detective/formats/money.py ADDED Viewed

@@ -0,0 +1,18 @@
+from csv_detective.formats.float import _is as is_float
+proportion = 0.8
+labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
+currencies = {"€", "$", "£", "¥"}
+def _is(val):
+    if not isinstance(val, str) or val[-1] not in currencies:
+        return False
+    return is_float(val[:-1])
+_test_values = {
+    True: ["120€", "-20.2$"],
+    False: ["200", "100 euros"],
+}

csv_detective/formats/mongo_object_id.py ADDED Viewed

@@ -0,0 +1,14 @@
+import re
+proportion = 0.8
+labels = ["id", "objectid"]
+def _is(val):
+    return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val))
+_test_values = {
+    True: ["62320e50f981bc2b57bcc044"],
+    False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
+}

csv_detective/formats/pays.py ADDED Viewed

@@ -0,0 +1,35 @@
+from frformat import Millesime, Options, Pays
+proportion = 0.6
+tags = ["fr", "geo"]
+labels = [
+    "pays",
+    "payslieu",
+    "paysorg",
+    "country",
+    "pays lib",
+    "lieupays",
+    "pays beneficiaire",
+    "nom du pays",
+    "journey start country",
+    "libelle pays",
+    "journey end country",
+]
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+_pays = Pays(Millesime.LATEST, _options)
+def _is(val):
+    return isinstance(val, str) and _pays.is_valid(val)
+_test_values = {
+    True: ["france", "italie"],
+    False: ["amerique", "paris"],
+}

csv_detective/formats/percent.py ADDED Viewed

@@ -0,0 +1,16 @@
+from csv_detective.formats.float import _is as is_float
+proportion = 0.8
+labels = []
+def _is(val):
+    if not isinstance(val, str) or val[-1] != "%":
+        return False
+    return is_float(val[:-1])
+_test_values = {
+    True: ["120%", "-20.2%"],
+    False: ["200", "100 pourcents"],
+}

csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} RENAMED Viewed

@@ -1,50 +1,70 @@
-from frformat import Millesime, Options, Region
-PROPORTION = 1
-_extra_valid_values_set = frozenset(
-    {
-        "alsace",
-        "aquitaine",
-        "ara",
-        "aura",
-        "auvergne",
-        "auvergne et rhone alpes",
-        "basse normandie",
-        "bfc",
-        "bourgogne",
-        "bourgogne et franche comte",
-        "centre",
-        "champagne ardenne",
-        "franche comte",
-        "ge",
-        "haute normandie",
-        "hdf",
-        "languedoc roussillon",
-        "limousin",
-        "lorraine",
-        "midi pyrenees",
-        "nord pas de calais",
-        "npdc",
-        "paca",
-        "picardie",
-        "poitou charentes",
-        "reunion",
-        "rhone alpes",
-    }
-)
-_options = Options(
-    ignore_case=True,
-    ignore_accents=True,
-    replace_non_alphanumeric_with_space=True,
-    ignore_extra_whitespace=True,
-    extra_valid_values=_extra_valid_values_set,
-)
-_region = Region(Millesime.LATEST, _options)
-def _is(val):
-    """Match avec le nom des regions"""
-    return isinstance(val, str) and _region.is_valid(val)
+from frformat import Millesime, Options, Region
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "region",
+    "libelle region",
+    "nom region",
+    "libelle reg",
+    "nom reg",
+    "reg libusage",
+    "nom de la region",
+    "regionorg",
+    "regionlieu",
+    "reg",
+    "nom officiel region",
+]
+_extra_valid_values_set = frozenset(
+    {
+        "alsace",
+        "aquitaine",
+        "ara",
+        "aura",
+        "auvergne",
+        "auvergne et rhone alpes",
+        "basse normandie",
+        "bfc",
+        "bourgogne",
+        "bourgogne et franche comte",
+        "centre",
+        "champagne ardenne",
+        "franche comte",
+        "ge",
+        "haute normandie",
+        "hdf",
+        "languedoc roussillon",
+        "limousin",
+        "lorraine",
+        "midi pyrenees",
+        "nord pas de calais",
+        "npdc",
+        "paca",
+        "picardie",
+        "poitou charentes",
+        "reunion",
+        "rhone alpes",
+    }
+)
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+    extra_valid_values=_extra_valid_values_set,
+)
+_region = Region(Millesime.LATEST, _options)
+def _is(val):
+    """Match avec le nom des regions"""
+    return isinstance(val, str) and _region.is_valid(val)
+_test_values = {
+    True: ["bretagne", "ile-de-france"],
+    False: ["baviere", "overgne"],
+}

csv_detective/formats/sexe.py ADDED Viewed

@@ -0,0 +1,17 @@
+from csv_detective.parsing.text import _process_text
+proportion = 1
+tags = ["fr"]
+labels = ["sexe", "sex", "civilite", "genre", "id sexe"]
+def _is(val):
+    if not isinstance(val, str):
+        return False
+    return _process_text(val) in {"homme", "femme", "h", "f", "m", "masculin", "feminin"}
+_test_values = {
+    True: ["femme", "H"],
+    False: ["adulte"],
+}

csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} RENAMED Viewed

@@ -1,20 +1,37 @@
-import re
-PROPORTION = 0.9
-def _is(val):
-    """Repere les codes SIREN"""
-    if not isinstance(val, str):
-        return False
-    val = val.replace(" ", "")
-    if not bool(re.match(r"^[0-9]{9}$", val)):
-        return False
-    # Vérification par clé propre aux codes siren
-    cle = 0
-    pair = False
-    for x in val:
-        y = int(x) * (1 + pair)
-        cle += y // 10 + y % 10
-        pair = not pair
-    return cle % 10 == 0
+import re
+proportion = 0.9
+tags = ["fr"]
+labels = [
+    "siren",
+    "siren organisme designe",
+    "siren organisme designant",
+    "n° siren",
+    "siren organisme",
+    "siren titulaire",
+    "numero siren",
+    "epci",
+]
+def _is(val):
+    """Repere les codes SIREN"""
+    if not isinstance(val, str):
+        return False
+    val = val.replace(" ", "")
+    if not bool(re.match(r"^[0-9]{9}$", val)):
+        return False
+    # Vérification par clé propre aux codes siren
+    cle = 0
+    pair = False
+    for x in val:
+        y = int(x) * (1 + pair)
+        cle += y // 10 + y % 10
+        pair = not pair
+    return cle % 10 == 0
+_test_values = {
+    True: ["552 100 554", "552100554"],
+    False: ["42"],
+}

csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} RENAMED Viewed

@@ -1,31 +1,47 @@
-import re
-PROPORTION = 0.8
-def _is(val):
-    """Détection des identifiants SIRET (SIRENE)"""
-    if not isinstance(val, str):
-        return False
-    val = val.replace(" ", "")
-    if not bool(re.match(r"^[0-9]{14}$", val)):
-        return False
-    # Vérification par clé de luhn du SIREN
-    cle = 0
-    pair = False
-    for x in val[:9]:
-        y = int(x) * (1 + pair)
-        cle += y // 10 + y % 10
-        pair = not pair
-    if cle % 10 != 0:
-        return cle % 10 == 0
-    # Vérification par clé de luhn du SIRET
-    cle = 0
-    pair = len(val) % 2 == 0
-    for x in val:
-        y = int(x) * (1 + pair)
-        cle += y // 10 + y % 10
-        pair = not pair
-    return cle % 10 == 0
+import re
+proportion = 0.8
+tags = ["fr"]
+labels = [
+    "siret",
+    "siret d",
+    "num siret",
+    "siretacheteur",
+    "n° siret",
+    "coll siret",
+    "epci",
+]
+def _is(val):
+    """Détection des identifiants SIRET (SIRENE)"""
+    if not isinstance(val, str):
+        return False
+    val = val.replace(" ", "")
+    if not bool(re.match(r"^[0-9]{14}$", val)):
+        return False
+    # Vérification par clé de luhn du SIREN
+    cle = 0
+    pair = False
+    for x in val[:9]:
+        y = int(x) * (1 + pair)
+        cle += y // 10 + y % 10
+        pair = not pair
+    if cle % 10 != 0:
+        return cle % 10 == 0
+    # Vérification par clé de luhn du SIRET
+    cle = 0
+    pair = len(val) % 2 == 0
+    for x in val:
+        y = int(x) * (1 + pair)
+        cle += y // 10 + y % 10
+        pair = not pair
+    return cle % 10 == 0
+_test_values = {
+    True: ["13002526500013", "130 025 265 00013"],
+    False: ["13002526500012"],
+}

csv_detective/formats/tel_fr.py ADDED Viewed

@@ -0,0 +1,36 @@
+import re
+proportion = 0.7
+tags = ["fr"]
+labels = [
+    "telephone",
+    "tel",
+    "tel1",
+    "tel2",
+    "phone",
+    "num tel",
+    "tel mob",
+    "telephone sav",
+    "telephone1",
+    "coordinates.phone",
+    "telephone du lieu",
+]
+def _is(val):
+    if not isinstance(val, str):
+        return False
+    if len(val) < 10:
+        return False
+    val = val.replace(".", "").replace("-", "").replace(" ", "")
+    match_1 = bool(re.match(r"^(0|\+33|0033)?[0-9]{9}$", val))
+    return match_1
+_test_values = {
+    True: ["0134643467"],
+    False: ["6625388263", "01288398"],
+}

csv_detective/formats/uai.py ADDED Viewed

@@ -0,0 +1,36 @@
+import re
+proportion = 0.8
+tags = ["fr"]
+labels = [
+    "uai",
+    "code etablissement",
+    "code uai",
+    "uai - identifiant",
+    "numero uai",
+    "rne",
+    "numero de l'etablissement",
+    "code rne",
+    "codeetab",
+    "code uai de l'etablissement",
+    "ref uai",
+    "cd rne",
+    "numerouai",
+    "numero d etablissement",
+    "code etablissement",
+    "numero etablissement",
+]
+def _is(val):
+    if not isinstance(val, str) or len(val) != 8:
+        return False
+    if not bool(re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val)):
+        return False
+    return True
+_test_values = {
+    True: ["0422170F"],
+    False: ["04292E"],
+}

csv_detective/formats/url.py ADDED Viewed

@@ -0,0 +1,45 @@
+import re
+proportion = 1
+labels = [
+    "url",
+    "url source",
+    "site web",
+    "source url",
+    "site internet",
+    "remote url",
+    "web",
+    "site",
+    "lien",
+    "site data",
+    "lien url",
+    "lien vers le fichier",
+    "sitweb",
+    "interneturl",
+]
+pattern = re.compile(
+    r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
+    r"(/[A-Za-z0-9._~:/?#[@!$&'()*+,;=%-]*)?$"
+)
+def _is(val):
+    if not isinstance(val, str):
+        return False
+    return bool(pattern.match(val))
+_test_values = {
+    True: [
+        "www.data.gouv.fr",
+        "http://data.gouv.fr",
+        "https://www.youtube.com/@data-gouv-fr",
+        (
+            "https://tabular-api.data.gouv.fr/api/resources/"
+            "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
+            "?score__greater=0.9&decompte__exact=13"
+        ),
+    ],
+    False: ["tmp@data.gouv.fr"],
+}

csv_detective/formats/username.py ADDED Viewed

@@ -0,0 +1,14 @@
+import re
+proportion = 1
+labels = ["account", "username", "user"]
+def _is(val):
+    return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val))
+_test_values = {
+    True: ["@accueil1"],
+    False: ["adresse@mail"],
+}

csv_detective/formats/uuid.py ADDED Viewed

@@ -0,0 +1,16 @@
+import re
+proportion = 0.8
+labels = ["id", "identifiant"]
+def _is(val) -> bool:
+    return isinstance(val, str) and bool(
+        re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val)
+    )
+_test_values = {
+    True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
+    False: ["0610928327"],
+}

csv_detective/formats/year.py ADDED Viewed

@@ -0,0 +1,28 @@
+proportion = 1
+tags = ["temp"]
+labels = [
+    "year",
+    "annee",
+    "annee depot",
+    "an nais",
+    "exercice",
+    "data year",
+    "annee de publication",
+    "exercice comptable",
+    "annee de naissance",
+    "annee ouverture",
+]
+def _is(val):
+    try:
+        val = int(val)
+    except ValueError:
+        return False
+    return (1800 <= val) and (val <= 2100)
+_test_values = {
+    True: ["2015"],
+    False: ["20166", "123"],
+}

csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl

csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl