PyPI - csv-detective - Versions diffs - 0.10.1.dev2590__py3-none-any.whl → 0.10.1.dev2599__py3-none-any.whl - Mend

csv-detective 0.10.1.dev2590__py3-none-any.whl → 0.10.1.dev2599__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

csv_detective/format.py +3 -3
csv_detective/formats/adresse.py +9 -9
csv_detective/formats/binary.py +1 -1
csv_detective/formats/booleen.py +2 -2
csv_detective/formats/code_commune_insee.py +11 -10
csv_detective/formats/code_csp_insee.py +1 -1
csv_detective/formats/code_departement.py +7 -7
csv_detective/formats/code_fantoir.py +5 -5
csv_detective/formats/code_import.py +1 -1
csv_detective/formats/code_postal.py +9 -9
csv_detective/formats/code_region.py +6 -6
csv_detective/formats/code_rna.py +7 -6
csv_detective/formats/code_waldec.py +1 -1
csv_detective/formats/commune.py +5 -5
csv_detective/formats/csp_insee.py +6 -5
csv_detective/formats/date.py +17 -17
csv_detective/formats/date_fr.py +1 -1
csv_detective/formats/datetime_aware.py +1 -1
csv_detective/formats/departement.py +15 -15
csv_detective/formats/email.py +13 -13
csv_detective/formats/float.py +1 -1
csv_detective/formats/geojson.py +9 -10
csv_detective/formats/insee_ape700.py +10 -8
csv_detective/formats/insee_canton.py +6 -6
csv_detective/formats/int.py +1 -1
csv_detective/formats/iso_country_code_alpha2.py +10 -9
csv_detective/formats/iso_country_code_alpha3.py +2 -9
csv_detective/formats/iso_country_code_numeric.py +2 -9
csv_detective/formats/jour_de_la_semaine.py +11 -12
csv_detective/formats/json.py +5 -0
csv_detective/formats/latitude_l93.py +6 -22
csv_detective/formats/latitude_wgs.py +19 -26
csv_detective/formats/latitude_wgs_fr_metropole.py +2 -26
csv_detective/formats/latlon_wgs.py +26 -26
csv_detective/formats/longitude_l93.py +6 -13
csv_detective/formats/longitude_wgs.py +22 -16
csv_detective/formats/longitude_wgs_fr_metropole.py +2 -16
csv_detective/formats/lonlat_wgs.py +9 -9
csv_detective/formats/mois_de_lannee.py +1 -1
csv_detective/formats/money.py +1 -1
csv_detective/formats/mongo_object_id.py +1 -1
csv_detective/formats/pays.py +11 -13
csv_detective/formats/percent.py +1 -1
csv_detective/formats/region.py +13 -13
csv_detective/formats/sexe.py +1 -1
csv_detective/formats/siren.py +8 -10
csv_detective/formats/siret.py +8 -9
csv_detective/formats/tel_fr.py +7 -13
csv_detective/formats/uai.py +17 -18
csv_detective/formats/url.py +16 -16
csv_detective/formats/username.py +1 -1
csv_detective/formats/uuid.py +1 -1
csv_detective/formats/year.py +6 -12
csv_detective/parsing/text.py +13 -12
{csv_detective-0.10.1.dev2590.dist-info → csv_detective-0.10.1.dev2599.dist-info}/METADATA +1 -1
csv_detective-0.10.1.dev2599.dist-info/RECORD +92 -0
{csv_detective-0.10.1.dev2590.dist-info → csv_detective-0.10.1.dev2599.dist-info}/WHEEL +1 -1
csv_detective-0.10.1.dev2590.dist-info/RECORD +0 -92
{csv_detective-0.10.1.dev2590.dist-info → csv_detective-0.10.1.dev2599.dist-info}/entry_points.txt +0 -0

csv_detective/format.py CHANGED Viewed

@@ -9,7 +9,7 @@ class Format:
         name: str,
         func: Callable[[Any], bool],
         _test_values: dict[bool, list[str]],
-        labels: list[str] = [],
+        labels: dict[str, float] = {},
         proportion: float = 1,
         tags: list[str] = [],
         mandatory_label: bool = False,
@@ -22,14 +22,14 @@ class Format:
             name: the name of the format.
             func: the value test for the format (returns whether a string is valid).
             _test_values: lists of valid and invalid values, used in the tests
-            labels: the list of hint headers for the header score
+            labels: the dict of hint headers and their credibility for the header score (NB: credibility is relative within a single format, should be used to rank the valid labels)
             proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
             tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
         self.func: Callable = func
         self._test_values: dict[bool, list[str]] = _test_values
-        self.labels: list[str] = labels
+        self.labels: dict[str, float] = labels
         self.proportion: float = proportion
         self.tags: list[str] = tags
         self.mandatory_label: bool = mandatory_label

csv_detective/formats/adresse.py CHANGED Viewed

@@ -2,15 +2,15 @@ from csv_detective.parsing.text import _process_text
 proportion = 0.55
 tags = ["fr", "geo"]
-labels = [
-    "adresse",
-    "localisation",
-    "adresse postale",
-    "adresse geographique",
-    "adr",
-    "adresse complete",
-    "adresse station",
-]
+labels = {
+    "adresse": 1,
+    "localisation": 1,
+    "adresse postale": 1,
+    "adresse geographique": 1,
+    "adr": 0.5,
+    "adresse complete": 1,
+    "adresse station": 1,
+}
 voies = {
     "aire ",

csv_detective/formats/binary.py CHANGED Viewed

@@ -3,7 +3,7 @@ import codecs
 proportion = 1
 tags = ["type"]
 python_type = "binary"
-labels = ["bytes", "binary", "image", "encode", "content"]
+labels = {"bytes": 1, "binary": 1, "image": 1, "encode": 1, "content": 1}
 def binary_casting(val: str) -> bytes:

csv_detective/formats/booleen.py CHANGED Viewed

@@ -1,7 +1,7 @@
 proportion = 1
 tags = ["type"]
 python_type = "bool"
-labels = ["is ", "has ", "est "]
+labels = {"is ": 1, "has ": 1, "est ": 1}
 bool_mapping = {
     "1": True,
@@ -22,7 +22,7 @@ bool_mapping = {
 liste_bool = set(bool_mapping.keys())
-def bool_casting(val: str) -> bool:
+def bool_casting(val: str) -> bool | None:
     return bool_mapping.get(val.lower())

csv_detective/formats/code_commune_insee.py CHANGED Viewed

@@ -3,16 +3,17 @@ from frformat import CodeCommuneInsee, Millesime
 proportion = 0.75
 tags = ["fr", "geo"]
 mandatory_label = True
-labels = [
-    "code commune insee",
-    "code insee",
-    "codes insee",
-    "code commune",
-    "code insee commune",
-    "insee",
-    "code com",
-    "com",
-]
+labels = {
+    "code commune insee": 1,
+    "code insee": 1,
+    "codes insee": 1,
+    "code commune": 1,
+    "code insee commune": 1,
+    "insee": 0.75,
+    "code com": 1,
+    "com": 0.5,
+    "code": 0.5,
+}
 _code_commune_insee = CodeCommuneInsee(Millesime.LATEST)

csv_detective/formats/code_csp_insee.py CHANGED Viewed

@@ -4,7 +4,7 @@ from csv_detective.parsing.text import _process_text
 proportion = 1
 tags = ["fr"]
-labels = ["code csp insee", "code csp"]
+labels = {"code csp insee": 1, "code csp": 1}
 def _is(val):

csv_detective/formats/code_departement.py CHANGED Viewed

@@ -3,13 +3,13 @@ from frformat import Millesime, NumeroDepartement, Options
 proportion = 1
 tags = ["fr", "geo"]
 mandatory_label = True
-labels = [
-    "code departement",
-    "code_departement",
-    "dep",
-    "departement",
-    "dept",
-]
+labels = {
+    "code departement": 1,
+    "code_departement": 1,
+    "dep": 0.5,
+    "departement": 1,
+    "dept": 0.75,
+}
 _options = Options(
     ignore_case=True,

csv_detective/formats/code_fantoir.py CHANGED Viewed

@@ -3,11 +3,11 @@ from frformat import CodeFantoir
 proportion = 1
 tags = ["fr", "geo"]
 mandatory_label = True
-labels = [
-    "cadastre1",
-    "code fantoir",
-    "fantoir",
-]
+labels = {
+    "cadastre1": 1,
+    "code fantoir": 1,
+    "fantoir": 1,
+}
 _code_fantoir = CodeFantoir()

csv_detective/formats/code_import.py CHANGED Viewed

@@ -2,7 +2,7 @@ import re
 proportion = 0.9
 tags = ["fr"]
-labels = ["code"]
+labels = {"code": 0.5}
 regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$"

csv_detective/formats/code_postal.py CHANGED Viewed

@@ -3,15 +3,15 @@ from frformat import CodePostal
 proportion = 0.9
 tags = ["fr", "geo"]
 mandatory_label = True
-labels = [
-    "code postal",
-    "postal code",
-    "postcode",
-    "post code",
-    "cp",
-    "codes postaux",
-    "location postcode",
-]
+labels = {
+    "code postal": 1,
+    "postal code": 1,
+    "postcode": 1,
+    "post code": 1,
+    "cp": 0.5,
+    "codes postaux": 1,
+    "location postcode": 1,
+}
 _code_postal = CodePostal()

csv_detective/formats/code_region.py CHANGED Viewed

@@ -3,12 +3,12 @@ from frformat import CodeRegion, Millesime
 proportion = 1
 tags = ["fr", "geo"]
 mandatory_label = True
-labels = [
-    "code region",
-    "reg",
-    "code insee region",
-    "region",
-]
+labels = {
+    "code region": 1,
+    "reg": 0.5,
+    "code insee region": 1,
+    "region": 1,
+}
 _code_region = CodeRegion(Millesime.LATEST)

csv_detective/formats/code_rna.py CHANGED Viewed

@@ -2,12 +2,13 @@ from frformat import CodeRNA
 proportion = 0.9
 tags = ["fr"]
-labels = [
-    "code rna",
-    "rna",
-    "n° inscription association",
-    "identifiant association",
-]
+labels = {
+    "code rna": 1,
+    "rna": 1,
+    "n° inscription association": 1,
+    "identifiant association": 1,
+    "asso": 0.75,
+}
 _code_rna = CodeRNA()

csv_detective/formats/code_waldec.py CHANGED Viewed

@@ -2,7 +2,7 @@ import re
 proportion = 0.9
 tags = ["fr"]
-labels = ["code waldec", "waldec"]
+labels = {"code waldec": 1, "waldec": 1}
 regex = r"^W\d[\dA-Z]\d{7}$"

csv_detective/formats/commune.py CHANGED Viewed

@@ -2,11 +2,11 @@ from frformat import Commune, Millesime, Options
 proportion = 0.8
 tags = ["fr", "geo"]
-labels = [
-    "commune",
-    "ville",
-    "libelle commune",
-]
+labels = {
+    "commune": 1,
+    "ville": 1,
+    "libelle commune": 1,
+}
 _options = Options(
     ignore_case=True,

csv_detective/formats/csp_insee.py CHANGED Viewed

@@ -4,11 +4,12 @@ from csv_detective.parsing.text import _process_text
 proportion = 1
 tags = ["fr"]
-labels = [
-    "csp insee",
-    "csp",
-    "categorie socioprofessionnelle",
-]
+labels = {
+    "csp insee": 1,
+    "csp": 0.75,
+    "categorie socioprofessionnelle": 1,
+    "sociopro": 1,
+}
 f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r")
 codes_insee = f.read().split("\n")

csv_detective/formats/date.py CHANGED Viewed

@@ -8,23 +8,23 @@ from dateutil.parser import parse as dateutil_parser
 proportion = 1
 tags = ["temp", "type"]
 python_type = "date"
-SHARED_DATE_LABELS = [
-    "date",
-    "mise à jour",
-    "modifie",
-    "maj",
-    "datemaj",
-    "update",
-    "created",
-    "modified",
-]
-labels = SHARED_DATE_LABELS + [
-    "jour",
-    "periode",
-    "dpc",
-    "yyyymmdd",
-    "aaaammjj",
-]
+SHARED_DATE_LABELS = {
+    "date": 1,
+    "mise à jour": 1,
+    "modifie": 1,
+    "maj": 0.75,
+    "datemaj": 1,
+    "update": 1,
+    "created": 1,
+    "modified": 1,
+}
+labels = SHARED_DATE_LABELS | {
+    "jour": 0.75,
+    "periode": 0.75,
+    "dpc": 0.5,
+    "yyyymmdd": 1,
+    "aaaammjj": 1,
+}
 def date_casting(val: str) -> datetime | None:

csv_detective/formats/date_fr.py CHANGED Viewed

@@ -4,7 +4,7 @@ from csv_detective.parsing.text import _process_text
 proportion = 1
 tags = ["fr", "temp"]
-labels = ["date"]
+labels = {"date": 1}
 pattern = (
     r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"

csv_detective/formats/datetime_aware.py CHANGED Viewed

@@ -5,7 +5,7 @@ from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, dat
 proportion = 1
 tags = ["temp", "type"]
 python_type = "datetime"
-labels = SHARED_DATE_LABELS + ["datetime", "timestamp"]
+labels = SHARED_DATE_LABELS | {"datetime": 1, "timestamp": 1}
 threshold = 0.7
 pat = (

csv_detective/formats/departement.py CHANGED Viewed

@@ -2,21 +2,21 @@ from frformat import Departement, Millesime, Options
 proportion = 0.9
 tags = ["fr", "geo"]
-labels = [
-    "departement",
-    "libelle du departement",
-    "deplib",
-    "nom dept",
-    "dept",
-    "libdepartement",
-    "nom departement",
-    "libelle dep",
-    "libelle departement",
-    "lb departements",
-    "dep libusage",
-    "lb departement",
-    "nom dep",
-]
+labels = {
+    "departement": 1,
+    "libelle du departement": 1,
+    "deplib": 1,
+    "nom dept": 1,
+    "dept": 0.75,
+    "libdepartement": 1,
+    "nom departement": 1,
+    "libelle dep": 1,
+    "libelle departement": 1,
+    "lb departements": 1,
+    "dep libusage": 1,
+    "lb departement": 1,
+    "nom dep": 1,
+}
 _options = Options(
     ignore_case=True,

csv_detective/formats/email.py CHANGED Viewed

@@ -1,19 +1,19 @@
 import re
 proportion = 0.9
-labels = [
-    "email",
-    "mail",
-    "courriel",
-    "contact",
-    "mel",
-    "lieucourriel",
-    "coordinates.emailcontact",
-    "e mail",
-    "mo mail",
-    "adresse mail",
-    "adresse email",
-]
+labels = {
+    "email": 1,
+    "mail": 1,
+    "courriel": 1,
+    "contact": 1,
+    "mel": 1,
+    "lieucourriel": 1,
+    "coordinates.emailcontact": 1,
+    "e mail": 1,
+    "mo mail": 1,
+    "adresse mail": 1,
+    "adresse email": 1,
+}
 def _is(val):

csv_detective/formats/float.py CHANGED Viewed

@@ -3,7 +3,7 @@ import re
 proportion = 1
 tags = ["type"]
 python_type = "float"
-labels = ["part", "ratio", "taux"]
+labels = {"part": 1, "ratio": 1, "taux": 1}
 scientific_notation_pattern = r"\d+\.\d+[e|E][+|-]?\d+"

csv_detective/formats/geojson.py CHANGED Viewed

@@ -3,16 +3,15 @@ import json
 proportion = 1
 tags = ["geo"]
 python_type = "json"
-labels = [
-    "json geojson",
-    "json",
-    "geojson",
-    "geo shape",
-    "geom",
-    "geometry",
-    "geo shape",
-    "geoshape",
-]
+labels = {
+    "json geojson": 1,
+    "json": 1,
+    "geojson": 1,
+    "geo shape": 1,
+    "geom": 0.75,
+    "geometry": 1,
+    "geoshape": 1,
+}
 def _is(val) -> bool:

csv_detective/formats/insee_ape700.py CHANGED Viewed

@@ -4,14 +4,16 @@ from csv_detective.parsing.text import _process_text
 proportion = 0.8
 tags = ["fr"]
-labels = [
-    "code ape",
-    "code activite (ape)",
-    "code naf",
-    "code naf organisme designe",
-    "code naf organisme designant",
-    "base sirene : code ape de l'etablissement siege",
-]
+labels = {
+    "code ape": 1,
+    "code activite (ape)": 1,
+    "code naf": 1,
+    "code naf organisme designe": 1,
+    "code naf organisme designant": 1,
+    "base sirene : code ape de l'etablissement siege": 1,
+    "naf": 0.75,
+    "ape": 0.5,
+}
 f = open(join(dirname(__file__), "data", "insee_ape700.txt"), "r")
 condes_insee_ape = f.read().split("\n")

csv_detective/formats/insee_canton.py CHANGED Viewed

@@ -2,12 +2,12 @@ from frformat import Canton, Millesime, Options
 proportion = 0.9
 tags = ["fr", "geo"]
-labels = [
-    "insee canton",
-    "canton",
-    "cant",
-    "nom canton",
-]
+labels = {
+    "insee canton": 1,
+    "canton": 1,
+    "cant": 0.5,
+    "nom canton": 1,
+}
 _options = Options(
     ignore_case=True,

csv_detective/formats/int.py CHANGED Viewed

@@ -1,6 +1,6 @@
 tag = ["type"]
 python_type = "int"
-labels = ["nb", "nombre", "nbre"]
+labels = {"nb": 0.75, "nombre": 1, "nbre": 0.75}
 def _is(val):

csv_detective/formats/iso_country_code_alpha2.py CHANGED Viewed

@@ -3,15 +3,16 @@ from os.path import dirname, join
 proportion = 1
 tags = ["geo"]
-labels = [
-    "iso country code",
-    "code pays",
-    "pays",
-    "country",
-    "nation",
-    "pays code",
-    "code pays (iso)",
-]
+labels = {
+    "iso country code": 1,
+    "code pays": 1,
+    "pays": 1,
+    "country": 1,
+    "nation": 1,
+    "pays code": 1,
+    "code pays (iso)": 1,
+    "code": 0.5,
+}
 with open(join(dirname(__file__), "data", "iso_country_code_alpha2.txt"), "r") as iofile:
     liste_pays = iofile.read().split("\n")

csv_detective/formats/iso_country_code_alpha3.py CHANGED Viewed

@@ -1,17 +1,10 @@
 import re
 from os.path import dirname, join
+from csv_detective.formats.iso_country_code_alpha2 import labels  # noqa
 proportion = 1
 tags = ["geo"]
-labels = [
-    "iso country code",
-    "code pays",
-    "pays",
-    "country",
-    "nation",
-    "pays code",
-    "code pays (iso)",
-]
 with open(join(dirname(__file__), "data", "iso_country_code_alpha3.txt"), "r") as iofile:
     liste_pays = iofile.read().split("\n")

csv_detective/formats/iso_country_code_numeric.py CHANGED Viewed

@@ -1,17 +1,10 @@
 import re
 from os.path import dirname, join
+from csv_detective.formats.iso_country_code_alpha2 import labels  # noqa
 proportion = 1
 tags = ["geo"]
-labels = [
-    "iso country code",
-    "code pays",
-    "pays",
-    "country",
-    "nation",
-    "pays code",
-    "code pays (iso)",
-]
 with open(join(dirname(__file__), "data", "iso_country_code_numeric.txt"), "r") as iofile:
     liste_pays = iofile.read().split("\n")

csv_detective/formats/jour_de_la_semaine.py CHANGED Viewed

@@ -1,14 +1,14 @@
 proportion = 0.8
 tags = ["fr", "temp"]
-labels = [
-    "jour semaine",
-    "type jour",
-    "jour de la semaine",
-    "saufjour",
-    "nomjour",
-    "jour",
-    "jour de fermeture",
-]
+labels = {
+    "jour semaine": 1,
+    "type jour": 1,
+    "jour de la semaine": 1,
+    "saufjour": 1,
+    "nomjour": 1,
+    "jour": 0.75,
+    "jour de fermeture": 1,
+}
 jours = {
     "lundi",
@@ -31,11 +31,10 @@ jours = {
 def _is(val):
     if not isinstance(val, str):
         return False
-    val = val.lower()
-    return val in jours
+    return val.lower() in jours
 _test_values = {
     True: ["lundi"],
-    False: ["jour de la biere"],
+    False: ["jour"],
 }

csv_detective/formats/json.py CHANGED Viewed

@@ -4,6 +4,11 @@ from json import JSONDecodeError
 proportion = 1
 python_type = "json"
 tags = ["type"]
+labels = {
+    "list": 1,
+    "dict": 1,
+    "complex": 1,
+}
 def _is(val):

csv-detective 0.10.1.dev2590__py3-none-any.whl → 0.10.1.dev2599__py3-none-any.whl

csv-detective 0.10.1.dev2590__py3-none-any.whl → 0.10.1.dev2599__py3-none-any.whl