PyPI - csv-detective - Versions diffs - 0.10.4.dev1__py3-none-any.whl → 0.10.2549__py3-none-any.whl - Mend

csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.2549__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109) hide show

csv_detective/detection/__init__.py +0 -0
csv_detective/detection/columns.py +0 -0
csv_detective/detection/encoding.py +0 -0
csv_detective/detection/engine.py +0 -0
csv_detective/detection/formats.py +38 -13
csv_detective/detection/headers.py +14 -12
csv_detective/detection/rows.py +1 -1
csv_detective/detection/separator.py +0 -0
csv_detective/detection/variables.py +0 -0
csv_detective/explore_csv.py +6 -18
csv_detective/format.py +5 -12
csv_detective/formats/__init__.py +0 -0
csv_detective/formats/adresse.py +9 -9
csv_detective/formats/binary.py +1 -2
csv_detective/formats/booleen.py +2 -3
csv_detective/formats/code_commune_insee.py +10 -12
csv_detective/formats/code_csp_insee.py +1 -1
csv_detective/formats/code_departement.py +7 -8
csv_detective/formats/code_fantoir.py +5 -6
csv_detective/formats/code_import.py +1 -1
csv_detective/formats/code_postal.py +9 -10
csv_detective/formats/code_region.py +6 -7
csv_detective/formats/code_rna.py +6 -7
csv_detective/formats/code_waldec.py +1 -1
csv_detective/formats/commune.py +5 -5
csv_detective/formats/csp_insee.py +5 -6
csv_detective/formats/data/insee_ape700.txt +1 -1
csv_detective/formats/data/iso_country_code_alpha2.txt +397 -153
csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
csv_detective/formats/date.py +18 -28
csv_detective/formats/date_fr.py +1 -1
csv_detective/formats/datetime_aware.py +2 -7
csv_detective/formats/datetime_naive.py +0 -3
csv_detective/formats/datetime_rfc822.py +0 -1
csv_detective/formats/departement.py +15 -15
csv_detective/formats/email.py +13 -13
csv_detective/formats/float.py +1 -2
csv_detective/formats/geojson.py +10 -10
csv_detective/formats/insee_ape700.py +8 -10
csv_detective/formats/insee_canton.py +6 -6
csv_detective/formats/int.py +1 -2
csv_detective/formats/iso_country_code_alpha2.py +14 -14
csv_detective/formats/iso_country_code_alpha3.py +13 -6
csv_detective/formats/iso_country_code_numeric.py +9 -2
csv_detective/formats/jour_de_la_semaine.py +12 -11
csv_detective/formats/json.py +0 -6
csv_detective/formats/latitude_l93.py +22 -8
csv_detective/formats/latitude_wgs.py +29 -31
csv_detective/formats/latitude_wgs_fr_metropole.py +30 -7
csv_detective/formats/latlon_wgs.py +28 -30
csv_detective/formats/longitude_l93.py +13 -8
csv_detective/formats/longitude_wgs.py +19 -34
csv_detective/formats/longitude_wgs_fr_metropole.py +19 -6
csv_detective/formats/lonlat_wgs.py +11 -12
csv_detective/formats/mois_de_lannee.py +1 -1
csv_detective/formats/money.py +1 -1
csv_detective/formats/mongo_object_id.py +1 -1
csv_detective/formats/pays.py +13 -11
csv_detective/formats/percent.py +1 -1
csv_detective/formats/region.py +13 -13
csv_detective/formats/sexe.py +1 -1
csv_detective/formats/siren.py +10 -9
csv_detective/formats/siret.py +9 -9
csv_detective/formats/tel_fr.py +13 -7
csv_detective/formats/uai.py +18 -17
csv_detective/formats/url.py +16 -16
csv_detective/formats/username.py +1 -1
csv_detective/formats/uuid.py +1 -1
csv_detective/formats/year.py +12 -7
csv_detective/output/__init__.py +0 -0
csv_detective/output/dataframe.py +3 -8
csv_detective/output/example.py +0 -0
csv_detective/output/profile.py +2 -6
csv_detective/output/schema.py +0 -0
csv_detective/output/utils.py +0 -0
csv_detective/parsing/__init__.py +0 -0
csv_detective/parsing/columns.py +5 -9
csv_detective/parsing/compression.py +0 -0
csv_detective/parsing/csv.py +0 -0
csv_detective/parsing/excel.py +1 -1
csv_detective/parsing/load.py +12 -11
csv_detective/parsing/text.py +12 -13
csv_detective/validate.py +36 -71
{csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.2549.dist-info}/METADATA +18 -15
csv_detective-0.10.2549.dist-info/RECORD +92 -0
csv_detective-0.10.2549.dist-info/WHEEL +4 -0
{csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.2549.dist-info}/entry_points.txt +1 -0
csv_detective-0.10.4.dev1.dist-info/RECORD +0 -111
csv_detective-0.10.4.dev1.dist-info/WHEEL +0 -5
csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE +0 -21
csv_detective-0.10.4.dev1.dist-info/top_level.txt +0 -3
tests/__init__.py +0 -0
tests/data/a_test_file.csv +0 -407
tests/data/a_test_file.json +0 -394
tests/data/b_test_file.csv +0 -7
tests/data/c_test_file.csv +0 -2
tests/data/csv_file +0 -7
tests/data/file.csv.gz +0 -0
tests/data/file.ods +0 -0
tests/data/file.xls +0 -0
tests/data/file.xlsx +0 -0
tests/data/xlsx_file +0 -0
tests/test_example.py +0 -67
tests/test_fields.py +0 -175
tests/test_file.py +0 -469
tests/test_labels.py +0 -26
tests/test_structure.py +0 -45
tests/test_validation.py +0 -163

csv_detective/formats/date.py CHANGED Viewed

@@ -7,24 +7,23 @@ from dateutil.parser import parse as dateutil_parser
 proportion = 1
 tags = ["temp", "type"]
-python_type = "date"
-SHARED_DATE_LABELS = {
-    "date": 1,
-    "mise à jour": 1,
-    "modifie": 1,
-    "maj": 0.75,
-    "datemaj": 1,
-    "update": 1,
-    "created": 1,
-    "modified": 1,
-}
-labels = SHARED_DATE_LABELS | {
-    "jour": 0.75,
-    "periode": 0.75,
-    "dpc": 0.5,
-    "yyyymmdd": 1,
-    "aaaammjj": 1,
-}
+SHARED_DATE_LABELS = [
+    "date",
+    "mise à jour",
+    "modifie",
+    "maj",
+    "datemaj",
+    "update",
+    "created",
+    "modified",
+]
+labels = SHARED_DATE_LABELS + [
+    "jour",
+    "periode",
+    "dpc",
+    "yyyymmdd",
+    "aaaammjj",
+]
 def date_casting(val: str) -> datetime | None:
@@ -57,9 +56,7 @@ string_month_pattern = (
 def _is(val):
-    # many early stops, to cut processing time
-    # and avoid the costly use of date_casting as much as possible
-    # /!\ timestamps are considered ints, not dates
+    # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
     # if it's a usual date pattern
@@ -72,13 +69,8 @@ def _is(val):
         ]
     ):
         return True
-    if re.match(r"^-?\d+[\.|,]\d+$", val):
-        # regular floats are excluded
-        return False
-    # not enough digits => not a date (slightly arbitrary)
     if sum([char.isdigit() for char in val]) / len(val) < threshold:
         return False
-    # last resort
     res = date_casting(val)
     if not res or res.hour or res.minute or res.second:
         return False
@@ -93,7 +85,6 @@ _test_values = {
         "15 décembre 1985",
         "02 05 2003",
         "20030502",
-        "2003.05.02",
         "1993-12/02",
     ],
     False: [
@@ -104,6 +95,5 @@ _test_values = {
         "12152003",
         "20031512",
         "02052003",
-        "6.27367393749392839",
     ],
 }

csv_detective/formats/date_fr.py CHANGED Viewed

@@ -4,7 +4,7 @@ from csv_detective.parsing.text import _process_text
 proportion = 1
 tags = ["fr", "temp"]
-labels = {"date": 1}
+labels = ["date"]
 pattern = (
     r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"

csv_detective/formats/datetime_aware.py CHANGED Viewed

@@ -4,8 +4,7 @@ from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, dat
 proportion = 1
 tags = ["temp", "type"]
-python_type = "datetime"
-labels = SHARED_DATE_LABELS | {"datetime": 1, "timestamp": 1}
+labels = SHARED_DATE_LABELS + ["datetime", "timestamp"]
 threshold = 0.7
 pat = (
@@ -13,9 +12,7 @@ pat = (
     + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
     + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
 )
-# date_casting is very (too?) good at finding date(time)s where there sometimes is just a number
-# this prefix check asserts we only consider strings that have a somewhat fine structure trying to cast
-prefix = r"^\d{2}[-/:]?\d{2}"
+prefix = r"^\d{4}"
 def _is(val):
@@ -44,8 +41,6 @@ _test_values = {
         "2000-12-21 10:20:10.1Z",
         "2024-12-19T10:53:36.428000+00:00",
         "1996/06/22 10:20:10 GMT",
-        "12/31/2022 12:00:00-04:00",
-        "12:00:00-04:00 12/31/2022",
     ],
     False: [
         "2021-06-22T30:20:10",

csv_detective/formats/datetime_naive.py CHANGED Viewed

@@ -6,7 +6,6 @@ from csv_detective.formats.datetime_aware import labels, prefix  # noqa
 proportion = 1
 tags = ["temp", "type"]
-python_type = "datetime"
 threshold = 0.7
 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
@@ -37,8 +36,6 @@ _test_values = {
         "2021-06-22 10:20:10",
         "2030/06-22   00:00:00",
         "2030/06/22 00:00:00.0028",
-        "12/31/2022 12:00:00",
-        "12:00:00 12/31/2022",
     ],
     False: [
         "2021-06-22T30:20:10",

csv_detective/formats/datetime_rfc822.py CHANGED Viewed

@@ -4,7 +4,6 @@ from csv_detective.formats.datetime_aware import labels  # noqa
 proportion = 1
 tags = ["temp", "type"]
-python_type = "datetime"
 def _is(val):

csv_detective/formats/departement.py CHANGED Viewed

@@ -2,21 +2,21 @@ from frformat import Departement, Millesime, Options
 proportion = 0.9
 tags = ["fr", "geo"]
-labels = {
-    "departement": 1,
-    "libelle du departement": 1,
-    "deplib": 1,
-    "nom dept": 1,
-    "dept": 0.75,
-    "libdepartement": 1,
-    "nom departement": 1,
-    "libelle dep": 1,
-    "libelle departement": 1,
-    "lb departements": 1,
-    "dep libusage": 1,
-    "lb departement": 1,
-    "nom dep": 1,
-}
+labels = [
+    "departement",
+    "libelle du departement",
+    "deplib",
+    "nom dept",
+    "dept",
+    "libdepartement",
+    "nom departement",
+    "libelle dep",
+    "libelle departement",
+    "lb departements",
+    "dep libusage",
+    "lb departement",
+    "nom dep",
+]
 _options = Options(
     ignore_case=True,

csv_detective/formats/email.py CHANGED Viewed

@@ -1,19 +1,19 @@
 import re
 proportion = 0.9
-labels = {
-    "email": 1,
-    "mail": 1,
-    "courriel": 1,
-    "contact": 1,
-    "mel": 1,
-    "lieucourriel": 1,
-    "coordinates.emailcontact": 1,
-    "e mail": 1,
-    "mo mail": 1,
-    "adresse mail": 1,
-    "adresse email": 1,
-}
+labels = [
+    "email",
+    "mail",
+    "courriel",
+    "contact",
+    "mel",
+    "lieucourriel",
+    "coordinates.emailcontact",
+    "e mail",
+    "mo mail",
+    "adresse mail",
+    "adresse email",
+]
 def _is(val):

csv_detective/formats/float.py CHANGED Viewed

@@ -2,8 +2,7 @@ import re
 proportion = 1
 tags = ["type"]
-python_type = "float"
-labels = {"part": 1, "ratio": 1, "taux": 1}
+labels = ["part", "ratio", "taux"]
 scientific_notation_pattern = r"\d+\.\d+[e|E][+|-]?\d+"

csv_detective/formats/geojson.py CHANGED Viewed

@@ -2,16 +2,16 @@ import json
 proportion = 1
 tags = ["geo"]
-python_type = "json"
-labels = {
-    "json geojson": 1,
-    "json": 1,
-    "geojson": 1,
-    "geo shape": 1,
-    "geom": 0.75,
-    "geometry": 1,
-    "geoshape": 1,
-}
+labels = [
+    "json geojson",
+    "json",
+    "geojson",
+    "geo shape",
+    "geom",
+    "geometry",
+    "geo shape",
+    "geoshape",
+]
 def _is(val) -> bool:

csv_detective/formats/insee_ape700.py CHANGED Viewed

@@ -4,16 +4,14 @@ from csv_detective.parsing.text import _process_text
 proportion = 0.8
 tags = ["fr"]
-labels = {
-    "code ape": 1,
-    "code activite (ape)": 1,
-    "code naf": 1,
-    "code naf organisme designe": 1,
-    "code naf organisme designant": 1,
-    "base sirene : code ape de l'etablissement siege": 1,
-    "naf": 0.75,
-    "ape": 0.5,
-}
+labels = [
+    "code ape",
+    "code activite (ape)",
+    "code naf",
+    "code naf organisme designe",
+    "code naf organisme designant",
+    "base sirene : code ape de l'etablissement siege",
+]
 f = open(join(dirname(__file__), "data", "insee_ape700.txt"), "r")
 condes_insee_ape = f.read().split("\n")

csv_detective/formats/insee_canton.py CHANGED Viewed

@@ -2,12 +2,12 @@ from frformat import Canton, Millesime, Options
 proportion = 0.9
 tags = ["fr", "geo"]
-labels = {
-    "insee canton": 1,
-    "canton": 1,
-    "cant": 0.5,
-    "nom canton": 1,
-}
+labels = [
+    "insee canton",
+    "canton",
+    "cant",
+    "nom canton",
+]
 _options = Options(
     ignore_case=True,

csv_detective/formats/int.py CHANGED Viewed

@@ -1,6 +1,5 @@
+labels = ["nb", "nombre", "nbre"]
 tag = ["type"]
-python_type = "int"
-labels = {"nb": 0.75, "nombre": 1, "nbre": 0.75}
 def _is(val):

csv_detective/formats/iso_country_code_alpha2.py CHANGED Viewed

@@ -3,28 +3,28 @@ from os.path import dirname, join
 proportion = 1
 tags = ["geo"]
-labels = {
-    "iso country code": 1,
-    "code pays": 1,
-    "pays": 1,
-    "country": 1,
-    "nation": 1,
-    "pays code": 1,
-    "code pays (iso)": 1,
-    "code": 0.5,
-}
+labels = [
+    "iso country code",
+    "code pays",
+    "pays",
+    "country",
+    "nation",
+    "pays code",
+    "code pays (iso)",
+]
 with open(join(dirname(__file__), "data", "iso_country_code_alpha2.txt"), "r") as iofile:
-    liste_pays = set(iofile.read().split("\n"))
+    liste_pays = iofile.read().split("\n")
+liste_pays = set(liste_pays)
 def _is(val):
-    if not isinstance(val, str) or not bool(re.match(r"[a-zA-Z]{2}$", val)):
+    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)):
         return False
-    return val.upper() in liste_pays
+    return val in liste_pays
 _test_values = {
-    True: ["FR", "sj"],
+    True: ["FR"],
     False: ["XX", "A", "FRA"],
 }

csv_detective/formats/iso_country_code_alpha3.py CHANGED Viewed

@@ -1,23 +1,30 @@
 import re
 from os.path import dirname, join
-from csv_detective.formats.iso_country_code_alpha2 import labels  # noqa
 proportion = 1
 tags = ["geo"]
+labels = [
+    "iso country code",
+    "code pays",
+    "pays",
+    "country",
+    "nation",
+    "pays code",
+    "code pays (iso)",
+]
 with open(join(dirname(__file__), "data", "iso_country_code_alpha3.txt"), "r") as iofile:
-    liste_pays = set(iofile.read().split("\n"))
+    liste_pays = iofile.read().split("\n")
 def _is(val):
     """Renvoie True si val peut etre un code iso pays alpha-3, False sinon"""
-    if not isinstance(val, str) or not bool(re.match(r"[a-zA-Z]{3}$", val)):
+    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)):
         return False
-    return val.upper() in liste_pays
+    return val in set(liste_pays)
 _test_values = {
-    True: ["FRA", "brb"],
+    True: ["FRA"],
     False: ["XXX", "FR", "A"],
 }

csv_detective/formats/iso_country_code_numeric.py CHANGED Viewed

@@ -1,10 +1,17 @@
 import re
 from os.path import dirname, join
-from csv_detective.formats.iso_country_code_alpha2 import labels  # noqa
 proportion = 1
 tags = ["geo"]
+labels = [
+    "iso country code",
+    "code pays",
+    "pays",
+    "country",
+    "nation",
+    "pays code",
+    "code pays (iso)",
+]
 with open(join(dirname(__file__), "data", "iso_country_code_numeric.txt"), "r") as iofile:
     liste_pays = iofile.read().split("\n")

csv_detective/formats/jour_de_la_semaine.py CHANGED Viewed

@@ -1,14 +1,14 @@
 proportion = 0.8
 tags = ["fr", "temp"]
-labels = {
-    "jour semaine": 1,
-    "type jour": 1,
-    "jour de la semaine": 1,
-    "saufjour": 1,
-    "nomjour": 1,
-    "jour": 0.75,
-    "jour de fermeture": 1,
-}
+labels = [
+    "jour semaine",
+    "type jour",
+    "jour de la semaine",
+    "saufjour",
+    "nomjour",
+    "jour",
+    "jour de fermeture",
+]
 jours = {
     "lundi",
@@ -31,10 +31,11 @@ jours = {
 def _is(val):
     if not isinstance(val, str):
         return False
-    return val.lower() in jours
+    val = val.lower()
+    return val in jours
 _test_values = {
     True: ["lundi"],
-    False: ["jour"],
+    False: ["jour de la biere"],
 }

csv_detective/formats/json.py CHANGED Viewed

@@ -2,13 +2,7 @@ import json
 from json import JSONDecodeError
 proportion = 1
-python_type = "json"
 tags = ["type"]
-labels = {
-    "list": 1,
-    "dict": 1,
-    "complex": 1,
-}
 def _is(val):

csv_detective/formats/latitude_l93.py CHANGED Viewed

@@ -2,17 +2,31 @@ from frformat import LatitudeL93
 from csv_detective.formats.float import _is as is_float
 from csv_detective.formats.float import float_casting
-from csv_detective.formats.latitude_wgs import SHARED_LATITUDE_LABELS
 proportion = 1
 tags = ["fr", "geo"]
-mandatory_label = True
-python_type = "float"
-labels = SHARED_LATITUDE_LABELS | {
-    "y l93": 1,
-    "latitude lb93": 1,
-    "lamby": 1,
-}
+labels = [
+    "latitude",
+    "lat",
+    "y",
+    "yf",
+    "yd",
+    "y l93",
+    "coordonnee y",
+    "latitude lb93",
+    "coord y",
+    "ycoord",
+    "geocodage y gps",
+    "location latitude",
+    "ylatitude",
+    "ylat",
+    "latitude (y)",
+    "latitudeorg",
+    "coordinates.latitude",
+    "googlemap latitude",
+    "latitudelieu",
+    "latitude googlemap",
+]
 _latitudel93 = LatitudeL93()

csv_detective/formats/latitude_wgs.py CHANGED Viewed

@@ -1,44 +1,42 @@
 from csv_detective.formats.float import _is as is_float
-from csv_detective.formats.int import _is as is_int
 proportion = 1
 tags = ["geo"]
-mandatory_label = True
-python_type = "float"
-SHARED_LATITUDE_LABELS = {
-    "latitude": 1,
-    "lat": 0.75,
-    "y": 0.5,
-    "yf": 0.5,
-    "yd": 0.5,
-    "coordonnee y": 1,
-    "coord y": 1,
-    "ycoord": 1,
-    "ylat": 1,
-}
-labels = SHARED_LATITUDE_LABELS | {
-    "y gps": 1,
-    "latitude wgs84": 1,
-    "y wgs84": 1,
-    "wsg": 0.75,
-    "gps": 0.5,
-}
+labels = [
+    "latitude",
+    "lat",
+    "y",
+    "yf",
+    "yd",
+    "coordonnee y",
+    "coord y",
+    "ycoord",
+    "geocodage y gps",
+    "location latitude",
+    "ylatitude",
+    "ylat",
+    "latitude (y)",
+    "latitudeorg",
+    "coordinates.latitude",
+    "googlemap latitude",
+    "latitudelieu",
+    "latitude googlemap",
+    "latitude wgs84",
+    "y wgs84",
+    "latitude (wgs84)",
+]
 def _is(val):
     try:
-        return (
-            is_float(val)
-            and -90 <= float(val) <= 90
-            # we ideally would like a certain level of decimal precision
-            # but 1.200 is saved as 1.2 in csv so we just discriminate ints
-            and not is_int(val)
-        )
-    except Exception:
+        return is_float(val) and float(val) >= -90 and float(val) <= 90
+    except ValueError:
+        return False
+    except OverflowError:
         return False
 _test_values = {
-    True: ["43.2872", "-22.61", "-3.0"],
-    False: ["100.1973", "40"],
+    True: ["43.2", "-22"],
+    False: ["100"],
 }

csv_detective/formats/latitude_wgs_fr_metropole.py CHANGED Viewed

@@ -1,19 +1,42 @@
-from csv_detective.formats.latitude_wgs import _is as is_latitude, labels  # noqa
+from csv_detective.formats.float import _is as is_float
 proportion = 1
 tags = ["fr", "geo"]
-mandatory_label = True
-python_type = "float"
+labels = [
+    "latitude",
+    "lat",
+    "y",
+    "yf",
+    "yd",
+    "coordonnee y",
+    "coord y",
+    "ycoord",
+    "geocodage y gps",
+    "location latitude",
+    "ylatitude",
+    "ylat",
+    "latitude (y)",
+    "latitudeorg",
+    "coordinates.latitude",
+    "googlemap latitude",
+    "latitudelieu",
+    "latitude googlemap",
+    "latitude wgs84",
+    "y wgs84",
+    "latitude (wgs84)",
+]
 def _is(val):
     try:
-        return is_latitude(val) and 41.3 <= float(val) <= 51.3
-    except Exception:
+        return is_float(val) and float(val) >= 41.3 and float(val) <= 51.3
+    except ValueError:
+        return False
+    except OverflowError:
         return False
 _test_values = {
-    True: ["42.576", "42.5"],
-    False: ["22.5"],
+    True: ["42.5"],
+    False: ["22.5", "62.5"],
 }

csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.2549__py3-none-any.whl

csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.2549__py3-none-any.whl