PyPI - csv-detective - Versions diffs - 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl - Mend

csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

csv_detective/formats/code_fantoir.py ADDED Viewed

@@ -0,0 +1,21 @@
+from frformat import CodeFantoir
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "cadastre1",
+    "code fantoir",
+    "fantoir",
+]
+_code_fantoir = CodeFantoir()
+def _is(val):
+    return isinstance(val, str) and _code_fantoir.is_valid(val)
+_test_values = {
+    True: ["7755A", "B150B", "ZA04C", "ZB03D"],
+    False: ["7755", "ZA99A"],
+}

csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} RENAMED Viewed

@@ -1,9 +1,17 @@
-import re
-PROPORTION = 0.9
-regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$"
-def _is(val):
-    """Repere le code Import (ancien RNA)"""
-    return isinstance(val, str) and bool(re.match(regex, val))
+import re
+proportion = 0.9
+tags = ["fr"]
+labels = ["code"]
+regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$"
+def _is(val):
+    return isinstance(val, str) and bool(re.match(regex, val))
+_test_values = {
+    True: ["123S1871092288"],
+    False: ["AA751PEE00188854", "W123456789"],
+}

csv_detective/formats/code_postal.py ADDED Viewed

@@ -0,0 +1,25 @@
+from frformat import CodePostal
+proportion = 0.9
+tags = ["fr", "geo"]
+labels = [
+    "code postal",
+    "postal code",
+    "postcode",
+    "post code",
+    "cp",
+    "codes postaux",
+    "location postcode",
+]
+_code_postal = CodePostal()
+def _is(val):
+    return isinstance(val, str) and _code_postal.is_valid(val)
+_test_values = {
+    True: ["75020", "01000"],
+    False: ["77777", "018339"],
+}

csv_detective/formats/code_region.py ADDED Viewed

@@ -0,0 +1,22 @@
+from frformat import CodeRegion, Millesime
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "code region",
+    "reg",
+    "code insee region",
+    "region",
+]
+_code_region = CodeRegion(Millesime.LATEST)
+def _is(val):
+    return isinstance(val, str) and _code_region.is_valid(val)
+_test_values = {
+    True: ["32"],
+    False: ["55"],
+}

csv_detective/formats/code_rna.py ADDED Viewed

@@ -0,0 +1,29 @@
+from frformat import CodeRNA
+proportion = 0.9
+tags = ["fr"]
+labels = [
+    "code rna",
+    "rna",
+    "n° inscription association",
+    "identifiant association",
+]
+_code_rna = CodeRNA()
+def _is(val):
+    return isinstance(val, str) and _code_rna.is_valid(val)
+_test_values = {
+    True: ["W751515517"],
+    False: [
+        "W111111111111111111111111111111111111",
+        "w143788974",
+        "W12",
+        "678W23456",
+        "165789325",
+        "Wa1#89sf&h",
+    ],
+}

csv_detective/formats/code_waldec.py ADDED Viewed

@@ -0,0 +1,17 @@
+import re
+proportion = 0.9
+tags = ["fr"]
+labels = ["code waldec", "waldec"]
+regex = r"^W\d[\dA-Z]\d{7}$"
+def _is(val):
+    return isinstance(val, str) and bool(re.match(regex, val))
+_test_values = {
+    True: ["W123456789", "W2D1234567"],
+    False: ["AA751PEE00188854"],
+}

csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} RENAMED Viewed

@@ -1,16 +1,27 @@
-from frformat import Commune, Millesime, Options
-PROPORTION = 0.9
-_options = Options(
-    ignore_case=True,
-    ignore_accents=True,
-    replace_non_alphanumeric_with_space=True,
-    ignore_extra_whitespace=True,
-)
-_commune = Commune(Millesime.LATEST, _options)
-def _is(val):
-    """Match avec le nom des communes"""
-    return isinstance(val, str) and _commune.is_valid(val)
+from frformat import Commune, Millesime, Options
+proportion = 0.8
+tags = ["fr", "geo"]
+labels = [
+    "commune",
+    "ville",
+    "libelle commune",
+]
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+_commune = Commune(Millesime.LATEST, _options)
+def _is(val):
+    return isinstance(val, str) and _commune.is_valid(val)
+_test_values = {
+    True: ["saint denis"],
+    False: ["new york", "lion"],
+}

csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} RENAMED Viewed

@@ -1,19 +1,31 @@
-from os.path import dirname, join
-from csv_detective.parsing.text import _process_text
-PROPORTION = 1
-f = open(join(dirname(__file__), "csp_insee.txt"), "r")
-codes_insee = f.read().split("\n")
-# removing empty str due to additionnal line in file
-del codes_insee[-1]
-codes_insee = set(codes_insee)
-f.close()
-def _is(val):
-    """Repère les csp telles que définies par l'INSEE"""
-    if not isinstance(val, str):
-        return False
-    val = _process_text(val)
-    return val in codes_insee
+from os.path import dirname, join
+from csv_detective.parsing.text import _process_text
+proportion = 1
+tags = ["fr"]
+labels = [
+    "csp insee",
+    "csp",
+    "categorie socioprofessionnelle",
+]
+f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r")
+codes_insee = f.read().split("\n")
+# removing empty str due to additionnal line in file
+del codes_insee[-1]
+codes_insee = set(codes_insee)
+f.close()
+def _is(val):
+    if not isinstance(val, str):
+        return False
+    val = _process_text(val)
+    return val in codes_insee
+_test_values = {
+    True: ["employes de la poste"],
+    False: ["super-heros"],
+}

csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt RENAMED Viewed

File without changes

csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} RENAMED Viewed

@@ -1,62 +1,99 @@
-import re
-from datetime import datetime
-from dateparser import parse as date_parser
-from dateutil.parser import ParserError
-from dateutil.parser import parse as dateutil_parser
-PROPORTION = 1
-# /!\ this is only for dates, not datetimes which are handled by other utils
-def date_casting(val: str) -> datetime | None:
-    """For performance reasons, we try first with dateutil and fallback on dateparser"""
-    try:
-        return dateutil_parser(val)
-    except ParserError:
-        return date_parser(val)
-    except Exception:
-        return None
-seps = r"[\s/\-\*_\|;.,]"
-# matches JJ-MM-AAAA with any of the listed separators
-jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
-    "SEP", seps
-)
-# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
-aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
-    "SEP", seps + "?"
-)
-# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
-string_month_pattern = (
-    r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
-    r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
-    r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP"
-    r"([0-9]{2}$|(19|20)[0-9]{2}$)"
-).replace("SEP", seps + "?")
-threshold = 0.3
-def _is(val):
-    """Renvoie True si val peut être une date, False sinon"""
-    # early stops, to cut processing time
-    if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
-        return False
-    # if it's a usual date pattern
-    if any(
-        # with this syntax, if any of the first value is True, the next ones are not computed
-        [
-            bool(re.match(jjmmaaaa_pattern, val))
-            or bool(re.match(aaaammjj_pattern, val))
-            or bool(re.match(string_month_pattern, val, re.IGNORECASE))
-        ]
-    ):
-        return True
-    if sum([char.isdigit() for char in val]) / len(val) < threshold:
-        return False
-    res = date_casting(val)
-    if not res or res.hour or res.minute or res.second:
-        return False
-    return True
+import re
+from datetime import datetime
+from dateparser import parse as date_parser
+from dateutil.parser import ParserError
+from dateutil.parser import parse as dateutil_parser
+proportion = 1
+tags = ["temp", "type"]
+SHARED_DATE_LABELS = [
+    "date",
+    "mise à jour",
+    "modifie",
+    "maj",
+    "datemaj",
+    "update",
+    "created",
+    "modified",
+]
+labels = SHARED_DATE_LABELS + [
+    "jour",
+    "periode",
+    "dpc",
+    "yyyymmdd",
+    "aaaammjj",
+]
+def date_casting(val: str) -> datetime | None:
+    """For performance reasons, we try first with dateutil and fallback on dateparser"""
+    try:
+        return dateutil_parser(val)
+    except ParserError:
+        return date_parser(val)
+    except Exception:
+        return None
+threshold = 0.3
+seps = r"[\s/\-\*_\|;.,]"
+# matches JJ-MM-AAAA with any of the listed separators
+jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
+    "SEP", seps
+)
+# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
+aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
+    "SEP", seps + "?"
+)
+# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
+string_month_pattern = (
+    r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
+    r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
+    r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP"
+    r"([0-9]{2}$|(19|20)[0-9]{2}$)"
+).replace("SEP", seps + "?")
+def _is(val):
+    # early stops, to cut processing time
+    if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
+        return False
+    # if it's a usual date pattern
+    if any(
+        # with this syntax, if any of the first value is True, the next ones are not computed
+        [
+            bool(re.match(jjmmaaaa_pattern, val))
+            or bool(re.match(aaaammjj_pattern, val))
+            or bool(re.match(string_month_pattern, val, re.IGNORECASE))
+        ]
+    ):
+        return True
+    if sum([char.isdigit() for char in val]) / len(val) < threshold:
+        return False
+    res = date_casting(val)
+    if not res or res.hour or res.minute or res.second:
+        return False
+    return True
+_test_values = {
+    True: [
+        "1960-08-07",
+        "12/02/2007",
+        "15 jan 1985",
+        "15 décembre 1985",
+        "02 05 2003",
+        "20030502",
+        "1993-12/02",
+    ],
+    False: [
+        "1993-1993-1993",
+        "39-10-1993",
+        "19-15-1993",
+        "15 tambour 1985",
+        "12152003",
+        "20031512",
+        "02052003",
+    ],
+}

csv_detective/formats/date_fr.py ADDED Viewed

@@ -0,0 +1,22 @@
+import re
+from csv_detective.parsing.text import _process_text
+proportion = 1
+tags = ["fr", "temp"]
+labels = ["date"]
+pattern = (
+    r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"
+    r"|octobre|novembre|decembre)[ \-/]\d{4}$"
+)
+def _is(val):
+    return isinstance(val, str) and bool(re.match(pattern, _process_text(val)))
+_test_values = {
+    True: ["13 février 1996", "15 decembre 2024"],
+    False: ["44 march 2025"],
+}

csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} RENAMED Viewed

@@ -1,12 +1,12 @@
 import re
-from typing import Any
-from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
+from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting
-PROPORTION = 1
-threshold = 0.7
+proportion = 1
+tags = ["temp", "type"]
+labels = SHARED_DATE_LABELS + ["datetime", "timestamp"]
-# matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR
+threshold = 0.7
 pat = (
     aaaammjj_pattern.replace("$", "")
     + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
@@ -14,8 +14,7 @@ pat = (
 )
-def _is(val: Any | None) -> bool:
-    """Detects timezone-aware datetimes only"""
+def _is(val):
     # early stops, to cut processing time
     # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
     # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
@@ -32,3 +31,15 @@ def _is(val: Any | None) -> bool:
         and bool(res.hour or res.minute or res.second or res.microsecond)
         and bool(res.tzinfo)
     )
+_test_values = {
+    True: [
+        "2021-06-22 10:20:10-04:00",
+        "2030-06-22 00:00:00.0028+02:00",
+        "2000-12-21 10:20:10.1Z",
+        "2024-12-19T10:53:36.428000+00:00",
+        "1996/06/22 10:20:10 GMT",
+    ],
+    False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
+}

csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} RENAMED Viewed

@@ -1,9 +1,11 @@
 import re
 from typing import Any
-from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
+from csv_detective.formats.date import aaaammjj_pattern, date_casting
+from csv_detective.formats.datetime_aware import labels  # noqa
-PROPORTION = 1
+proportion = 1
+tags = ["temp", "type"]
 threshold = 0.7
 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
@@ -27,3 +29,20 @@ def _is(val: Any | None) -> bool:
         return False
     res = date_casting(val)
     return res is not None and not bool(res.tzinfo)
+_test_values = {
+    True: [
+        "2021-06-22 10:20:10",
+        "2030/06-22   00:00:00",
+        "2030/06/22 00:00:00.0028",
+    ],
+    False: [
+        "2021-06-22T30:20:10",
+        "Sun, 06 Nov 1994 08:49:37 GMT",
+        "2021-06-44 10:20:10+02:00",
+        "1999-12-01T00:00:00Z",
+        "2021-06-44",
+        "15 décembre 1985",
+    ],
+}

csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} RENAMED Viewed

@@ -1,18 +1,24 @@
-import re
-PROPORTION = 1
-def _is(val):
-    """Renvoie True si val peut être une date au format rfc822, False sinon
-    Exemple: Tue, 19 Dec 2023 15:30:45 +0000"""
-    return isinstance(val, str) and bool(
-        re.match(
-            r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
-            r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
-            r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
-            val.lower(),
-            re.IGNORECASE,
-        )
-    )
+import re
+from csv_detective.formats.datetime_aware import labels  # noqa
+proportion = 1
+tags = ["temp", "type"]
+def _is(val):
+    return isinstance(val, str) and bool(
+        re.match(
+            r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
+            r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
+            r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
+            val.lower(),
+            re.IGNORECASE,
+        )
+    )
+_test_values = {
+    True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
+    False: ["2021-06-22T10:20:10"],
+}

csv_detective/formats/departement.py ADDED Viewed

@@ -0,0 +1,37 @@
+from frformat import Departement, Millesime, Options
+proportion = 0.9
+tags = ["fr", "geo"]
+labels = [
+    "departement",
+    "libelle du departement",
+    "deplib",
+    "nom dept",
+    "dept",
+    "libdepartement",
+    "nom departement",
+    "libelle dep",
+    "libelle departement",
+    "lb departements",
+    "dep libusage",
+    "lb departement",
+    "nom dep",
+]
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+_departement = Departement(Millesime.LATEST, _options)
+def _is(val):
+    return isinstance(val, str) and _departement.is_valid(val)
+_test_values = {
+    True: ["essonne"],
+    False: ["alabama", "auvergne"],
+}

csv_detective/formats/email.py ADDED Viewed

@@ -0,0 +1,28 @@
+import re
+proportion = 0.9
+labels = [
+    "email",
+    "mail",
+    "courriel",
+    "contact",
+    "mel",
+    "lieucourriel",
+    "coordinates.emailcontact",
+    "e mail",
+    "mo mail",
+    "adresse mail",
+    "adresse email",
+]
+def _is(val):
+    return isinstance(val, str) and bool(
+        re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
+    )
+_test_values = {
+    True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
+    False: ["cdo@@gouv.sfd"],
+}

csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} RENAMED Viewed

@@ -1,21 +1,29 @@
-PROPORTION = 1
-def float_casting(val: str) -> float:
-    return float(val.replace(",", "."))
-def _is(val):
-    """Detects floats, assuming that tables will not have scientific
-    notations (3e6) or "+" in the string. "-" is still accepted."""
-    try:
-        if (
-            not isinstance(val, str)
-            or any([k in val for k in ["_", "+", "e", "E"]])
-            or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
-        ):
-            return False
-        float_casting(val)
-        return True
-    except ValueError:
-        return False
+proportion = 1
+tags = ["type"]
+labels = ["part", "ratio", "taux"]
+def float_casting(val: str) -> float:
+    return float(val.replace(",", "."))
+def _is(val):
+    """Detects floats, assuming that tables will not have scientific
+    notations (3e6) or "+" in the string. "-" is still accepted."""
+    try:
+        if (
+            not isinstance(val, str)
+            or any([k in val for k in ["_", "+", "e", "E"]])
+            or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
+        ):
+            return False
+        float_casting(val)
+        return True
+    except ValueError:
+        return False
+_test_values = {
+    True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
+    False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
+}

csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl

csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl