csv-detective 0.10.2549__py3-none-any.whl → 0.10.12674__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. csv_detective/detection/formats.py +11 -38
  2. csv_detective/explore_csv.py +3 -2
  3. csv_detective/format.py +11 -4
  4. csv_detective/formats/adresse.py +9 -9
  5. csv_detective/formats/binary.py +2 -1
  6. csv_detective/formats/booleen.py +3 -2
  7. csv_detective/formats/code_commune_insee.py +12 -10
  8. csv_detective/formats/code_csp_insee.py +1 -1
  9. csv_detective/formats/code_departement.py +8 -7
  10. csv_detective/formats/code_fantoir.py +6 -5
  11. csv_detective/formats/code_import.py +1 -1
  12. csv_detective/formats/code_postal.py +10 -9
  13. csv_detective/formats/code_region.py +7 -6
  14. csv_detective/formats/code_rna.py +7 -6
  15. csv_detective/formats/code_waldec.py +1 -1
  16. csv_detective/formats/commune.py +5 -5
  17. csv_detective/formats/csp_insee.py +6 -5
  18. csv_detective/formats/data/insee_ape700.txt +1 -1
  19. csv_detective/formats/data/iso_country_code_alpha2.txt +153 -397
  20. csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
  21. csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
  22. csv_detective/formats/date.py +18 -17
  23. csv_detective/formats/date_fr.py +1 -1
  24. csv_detective/formats/datetime_aware.py +7 -2
  25. csv_detective/formats/datetime_naive.py +3 -0
  26. csv_detective/formats/datetime_rfc822.py +1 -0
  27. csv_detective/formats/departement.py +15 -15
  28. csv_detective/formats/email.py +13 -13
  29. csv_detective/formats/float.py +2 -1
  30. csv_detective/formats/geojson.py +10 -10
  31. csv_detective/formats/insee_ape700.py +10 -8
  32. csv_detective/formats/insee_canton.py +6 -6
  33. csv_detective/formats/int.py +2 -1
  34. csv_detective/formats/iso_country_code_alpha2.py +14 -14
  35. csv_detective/formats/iso_country_code_alpha3.py +6 -13
  36. csv_detective/formats/iso_country_code_numeric.py +2 -9
  37. csv_detective/formats/jour_de_la_semaine.py +11 -12
  38. csv_detective/formats/json.py +6 -0
  39. csv_detective/formats/latitude_l93.py +8 -22
  40. csv_detective/formats/latitude_wgs.py +31 -29
  41. csv_detective/formats/latitude_wgs_fr_metropole.py +7 -30
  42. csv_detective/formats/latlon_wgs.py +30 -28
  43. csv_detective/formats/longitude_l93.py +8 -13
  44. csv_detective/formats/longitude_wgs.py +34 -19
  45. csv_detective/formats/longitude_wgs_fr_metropole.py +6 -19
  46. csv_detective/formats/lonlat_wgs.py +12 -11
  47. csv_detective/formats/mois_de_lannee.py +1 -1
  48. csv_detective/formats/money.py +1 -1
  49. csv_detective/formats/mongo_object_id.py +1 -1
  50. csv_detective/formats/pays.py +11 -13
  51. csv_detective/formats/percent.py +1 -1
  52. csv_detective/formats/region.py +13 -13
  53. csv_detective/formats/sexe.py +1 -1
  54. csv_detective/formats/siren.py +9 -10
  55. csv_detective/formats/siret.py +9 -9
  56. csv_detective/formats/tel_fr.py +7 -13
  57. csv_detective/formats/uai.py +17 -18
  58. csv_detective/formats/url.py +16 -16
  59. csv_detective/formats/username.py +1 -1
  60. csv_detective/formats/uuid.py +1 -1
  61. csv_detective/formats/year.py +7 -12
  62. csv_detective/output/dataframe.py +6 -1
  63. csv_detective/output/profile.py +5 -1
  64. csv_detective/parsing/text.py +13 -12
  65. {csv_detective-0.10.2549.dist-info → csv_detective-0.10.12674.dist-info}/METADATA +2 -2
  66. csv_detective-0.10.12674.dist-info/RECORD +92 -0
  67. {csv_detective-0.10.2549.dist-info → csv_detective-0.10.12674.dist-info}/WHEEL +1 -1
  68. csv_detective-0.10.2549.dist-info/RECORD +0 -92
  69. {csv_detective-0.10.2549.dist-info → csv_detective-0.10.12674.dist-info}/entry_points.txt +0 -0

csv_detective/formats/longitude_l93.py
@@ -2,22 +2,17 @@ from frformat import LongitudeL93
 
 from csv_detective.formats.float import _is as is_float
 from csv_detective.formats.float import float_casting
+from csv_detective.formats.longitude_wgs import SHARED_LONGITUDE_LABELS
 
 proportion = 1
 tags = ["fr", "geo"]
-labels = [
-    "longitude",
-    "lon",
-    "long",
-    "geocodage x gps",
-    "location longitude",
-    "xlongitude",
-    "lng",
-    "xlong",
-    "x",
-    "xf",
-    "xd",
-]
+mandatory_label = True
+python_type = "float"
+labels = SHARED_LONGITUDE_LABELS | {
+    "x l93": 1,
+    "longitude lb93": 1,
+    "lambx": 1,
+}
 
 _longitudel93 = LongitudeL93()
 
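
Across these format modules the flat `labels` lists become `{label: credibility}` mappings, and module-specific entries are merged into shared ones with the dict-union operator. A minimal sketch of the merge used above (only a few SHARED_LONGITUDE_LABELS entries are reproduced; see the longitude_wgs.py hunk below for the full mapping):

# Sketch of the dict-union label pattern (Python 3.9+ "|" operator).
SHARED_LONGITUDE_LABELS = {"longitude": 1, "long": 0.75, "lon": 0.75, "lng": 0.5}

labels = SHARED_LONGITUDE_LABELS | {"x l93": 1, "longitude lb93": 1, "lambx": 1}
# {'longitude': 1, 'long': 0.75, 'lon': 0.75, 'lng': 0.5,
#  'x l93': 1, 'longitude lb93': 1, 'lambx': 1}
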

csv_detective/formats/longitude_wgs.py
@@ -1,32 +1,47 @@
 from csv_detective.formats.float import _is as is_float
+from csv_detective.formats.int import _is as is_int
 
 proportion = 1
 tags = ["geo"]
-labels = [
-    "longitude",
-    "lon",
-    "long",
-    "geocodage x gps",
-    "location longitude",
-    "xlongitude",
-    "lng",
-    "xlong",
-    "x",
-    "xf",
-    "xd",
-]
+mandatory_label = True
+python_type = "float"
+SHARED_LONGITUDE_LABELS = {
+    "longitude": 1,
+    "long": 0.75,
+    "lon": 0.75,
+    "lng": 0.5,
+    "x": 0.5,
+    "xf": 0.5,
+    "xd": 0.5,
+    "coordonnee x": 1,
+    "coord x": 1,
+    "xcoord": 1,
+    "xlon": 1,
+    "xlong": 1,
+}
+labels = SHARED_LONGITUDE_LABELS | {
+    "x gps": 1,
+    "longitude wgs84": 1,
+    "x wgs84": 1,
+    "wsg": 0.75,
+    "gps": 0.5,
+}
 
 
 def _is(val):
     try:
-        return is_float(val) and float(val) >= -180 and float(val) <= 180
-    except ValueError:
-        return False
-    except OverflowError:
+        return (
+            is_float(val)
+            and -180 <= float(val) <= 180
+            # we ideally would like a certain level of decimal precision
+            # but 1.200 is saved as 1.2 in csv so we just discriminate ints
+            and not is_int(val)
+        )
+    except Exception:
         return False
 
 
 _test_values = {
-    True: ["120", "-20.2"],
-    False: ["-200"],
+    True: ["120.8263", "-20.27", "31.0"],
+    False: ["-200", "20"],
 }
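
The reworked check keeps the [-180, 180] bound but additionally rejects integer-looking values: a CSV cannot preserve trailing decimals (1.200 is read back as 1.2), so requiring a decimal part is the only precision filter left, as the in-code comment notes. A standalone sketch of the same logic, with simplified stand-ins for the package's is_float/is_int helpers:

# Standalone sketch of the new longitude check; is_float/is_int are simplified
# stand-ins, not the package's own helpers.
def is_float(val: str) -> bool:
    try:
        float(val)
        return True
    except ValueError:
        return False

def is_int(val: str) -> bool:
    try:
        int(val)
        return True
    except ValueError:
        return False

def looks_like_longitude(val: str) -> bool:
    try:
        # in range and carrying a decimal part
        return is_float(val) and -180 <= float(val) <= 180 and not is_int(val)
    except Exception:
        return False

assert looks_like_longitude("120.8263") and looks_like_longitude("31.0")
assert not looks_like_longitude("-200") and not looks_like_longitude("20")
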

csv_detective/formats/longitude_wgs_fr_metropole.py
@@ -1,32 +1,19 @@
-from csv_detective.formats.float import _is as is_float
+from csv_detective.formats.longitude_wgs import _is as is_longitude, labels  # noqa
 
 proportion = 1
 tags = ["fr", "geo"]
-labels = [
-    "longitude",
-    "lon",
-    "long",
-    "geocodage x gps",
-    "location longitude",
-    "xlongitude",
-    "lng",
-    "xlong",
-    "x",
-    "xf",
-    "xd",
-]
+mandatory_label = True
+python_type = "float"
 
 
 def _is(val):
     try:
-        return is_float(val) and float(val) >= -5.5 and float(val) <= 9.8
-    except ValueError:
-        return False
-    except OverflowError:
+        return is_longitude(val) and -5.5 <= float(val) <= 9.8
+    except Exception:
         return False
 
 
 _test_values = {
-    True: ["-2.5"],
+    True: ["-2.01", "8.0"],
     False: ["12.8"],
 }

csv_detective/formats/lonlat_wgs.py
@@ -4,19 +4,20 @@ from csv_detective.formats.longitude_wgs import _is as is_lon
 
 proportion = 1
 tags = ["geo"]
+mandatory_label = True
 
-specific = [
-    "lonlat",
-    "lon lat",
-    "y x",
-    "yx",
-]
+specific = {
+    "lonlat": 1,
+    "lon lat": 1,
+    "y x": 0.75,
+    "yx": 0.75,
+}
 
 # we aim wide to catch exact matches if possible for the highest possible score
-words = (
+labels = (
     SHARED_COORDS_LABELS
-    + specific
-    + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
+    | specific
+    | {w + sep + suf: 1 for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]}
 )
 
 
@@ -31,6 +32,6 @@ def _is(val):
 
 
 _test_values = {
-    True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
-    False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
+    True: ["-22.6,43.012", "140.0,-10.70", "10.829, -40.71", "[-0.28,12.43]"],
+    False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1", "2,4", "-22, 43.0"],
 }
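
As the in-file comment says, the lonlat label set is deliberately widened: every shared coordinate label is also suffixed with every specific marker, with and without a separating space. A small sketch of that comprehension (the real SHARED_COORDS_LABELS is defined elsewhere in the package, so hypothetical values stand in for it here):

# Sketch of the widened lonlat label construction.
# SHARED_COORDS_LABELS is not shown in this diff; these values are hypothetical.
SHARED_COORDS_LABELS = {"coordonnees": 1, "coords": 0.75}

specific = {"lonlat": 1, "lon lat": 1, "y x": 0.75, "yx": 0.75}

labels = (
    SHARED_COORDS_LABELS
    | specific
    | {w + sep + suf: 1 for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]}
)
# Adds keys such as "coordonneeslonlat", "coordonnees lonlat", "coords y x", ...
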

csv_detective/formats/mois_de_lannee.py
@@ -2,7 +2,7 @@ from unidecode import unidecode
 
 proportion = 1
 tags = ["fr", "temp"]
-labels = ["mois", "month"]
+labels = {"mois": 1, "month": 1}
 
 mois = {
     "janvier",

csv_detective/formats/money.py
@@ -1,7 +1,7 @@
 from csv_detective.formats.float import _is as is_float
 
 proportion = 0.8
-labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
+labels = {"budget": 1, "salaire": 1, "euro": 1, "euros": 1, "prêt": 1, "montant": 1}
 
 currencies = {"€", "$", "£", "¥"}
 

csv_detective/formats/mongo_object_id.py
@@ -1,7 +1,7 @@
 import re
 
 proportion = 0.8
-labels = ["id", "objectid"]
+labels = {"id": 1, "objectid": 1}
 
 
 def _is(val):

csv_detective/formats/pays.py
@@ -2,19 +2,17 @@ from frformat import Millesime, Options, Pays
 
 proportion = 0.6
 tags = ["fr", "geo"]
-labels = [
-    "pays",
-    "payslieu",
-    "paysorg",
-    "country",
-    "pays lib",
-    "lieupays",
-    "pays beneficiaire",
-    "nom du pays",
-    "journey start country",
-    "libelle pays",
-    "journey end country",
-]
+labels = {
+    "pays": 1,
+    "payslieu": 1,
+    "paysorg": 1,
+    "country": 1,
+    "pays lib": 1,
+    "lieupays": 1,
+    "pays beneficiaire": 1,
+    "nom du pays": 1,
+    "libelle pays": 1,
+}
 
 _options = Options(
     ignore_case=True,

csv_detective/formats/percent.py
@@ -1,7 +1,7 @@
 from csv_detective.formats.float import _is as is_float
 
 proportion = 0.8
-labels = []
+labels = {"pourcent": 1, "part": 0.75, "pct": 0.75}
 
 
 def _is(val):

csv_detective/formats/region.py
@@ -2,19 +2,19 @@ from frformat import Millesime, Options, Region
 
 proportion = 1
 tags = ["fr", "geo"]
-labels = [
-    "region",
-    "libelle region",
-    "nom region",
-    "libelle reg",
-    "nom reg",
-    "reg libusage",
-    "nom de la region",
-    "regionorg",
-    "regionlieu",
-    "reg",
-    "nom officiel region",
-]
+labels = {
+    "region": 1,
+    "libelle region": 1,
+    "nom region": 1,
+    "libelle reg": 1,
+    "nom reg": 1,
+    "reg libusage": 1,
+    "nom de la region": 1,
+    "regionorg": 1,
+    "regionlieu": 1,
+    "reg": 0.5,
+    "nom officiel region": 1,
+}
 
 _extra_valid_values_set = frozenset(
     {

csv_detective/formats/sexe.py
@@ -2,7 +2,7 @@ from csv_detective.parsing.text import _process_text
 
 proportion = 1
 tags = ["fr"]
-labels = ["sexe", "sex", "civilite", "genre", "id sexe"]
+labels = {"sexe": 1, "sex": 1, "civilite": 1, "genre": 1}
 
 
 def _is(val):

csv_detective/formats/siren.py
@@ -2,16 +2,15 @@ import re
 
 proportion = 0.9
 tags = ["fr"]
-labels = [
-    "siren",
-    "siren organisme designe",
-    "siren organisme designant",
-    " siren",
-    "siren organisme",
-    "siren titulaire",
-    "numero siren",
-    "epci",
-]
+mandatory_label = True
+labels = {
+    "siren": 1,
+    "siren": 1,
+    "siren organisme": 1,
+    "siren titulaire": 1,
+    "numero siren": 1,
+    "epci": 1,
+}
 
 
 def _is(val):

csv_detective/formats/siret.py
@@ -2,15 +2,15 @@ import re
 
 proportion = 0.8
 tags = ["fr"]
-labels = [
-    "siret",
-    "siret d",
-    "num siret",
-    "siretacheteur",
-    "n° siret",
-    "coll siret",
-    "epci",
-]
+mandatory_label = True
+labels = {
+    "siret": 1,
+    "num siret": 1,
+    "siretacheteur": 1,
+    "n° siret": 1,
+    "coll siret": 1,
+    "epci": 1,
+}
 
 
 def _is(val):

csv_detective/formats/tel_fr.py
@@ -2,19 +2,13 @@ import re
 
 proportion = 0.7
 tags = ["fr"]
-labels = [
-    "telephone",
-    "tel",
-    "tel1",
-    "tel2",
-    "phone",
-    "num tel",
-    "tel mob",
-    "telephone sav",
-    "telephone1",
-    "coordinates.phone",
-    "telephone du lieu",
-]
+labels = {
+    "telephone": 1,
+    "tel": 1,
+    "phone": 1,
+    "num tel": 1,
+    "tel mob": 1,
+}
 
 
 def _is(val):

csv_detective/formats/uai.py
@@ -2,24 +2,23 @@ import re
 
 proportion = 0.8
 tags = ["fr"]
-labels = [
-    "uai",
-    "code etablissement",
-    "code uai",
-    "uai - identifiant",
-    "numero uai",
-    "rne",
-    "numero de l'etablissement",
-    "code rne",
-    "codeetab",
-    "code uai de l'etablissement",
-    "ref uai",
-    "cd rne",
-    "numerouai",
-    "numero d etablissement",
-    "code etablissement",
-    "numero etablissement",
-]
+labels = {
+    "uai": 1,
+    "code etablissement": 1,
+    "code uai": 1,
+    "uai - identifiant": 1,
+    "numero uai": 1,
+    "rne": 0.75,
+    "numero de l'etablissement": 1,
+    "code rne": 1,
+    "codeetab": 1,
+    "code uai de l'etablissement": 1,
+    "ref uai": 1,
+    "cd rne": 1,
+    "numerouai": 1,
+    "numero d etablissement": 1,
+    "numero etablissement": 1,
+}
 
 
 def _is(val):

csv_detective/formats/url.py
@@ -1,22 +1,22 @@
 import re
 
 proportion = 1
-labels = [
-    "url",
-    "url source",
-    "site web",
-    "source url",
-    "site internet",
-    "remote url",
-    "web",
-    "site",
-    "lien",
-    "site data",
-    "lien url",
-    "lien vers le fichier",
-    "sitweb",
-    "interneturl",
-]
+labels = {
+    "url": 1,
+    "url source": 1,
+    "site web": 1,
+    "source url": 1,
+    "site internet": 1,
+    "remote url": 1,
+    "web": 1,
+    "site": 1,
+    "lien": 1,
+    "site data": 1,
+    "lien url": 1,
+    "lien vers le fichier": 1,
+    "sitweb": 1,
+    "interneturl": 1,
+}
 
 pattern = re.compile(
     r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"

csv_detective/formats/username.py
@@ -1,7 +1,7 @@
 import re
 
 proportion = 1
-labels = ["account", "username", "user"]
+labels = {"account": 1, "username": 1, "user": 0.75}
 
 
 def _is(val):

csv_detective/formats/uuid.py
@@ -1,7 +1,7 @@
 import re
 
 proportion = 0.8
-labels = ["id", "identifiant"]
+labels = {"id": 1, "identifiant": 1}
 
 
 def _is(val) -> bool:

csv_detective/formats/year.py
@@ -1,17 +1,12 @@
 proportion = 1
 tags = ["temp"]
-labels = [
-    "year",
-    "annee",
-    "annee depot",
-    "an nais",
-    "exercice",
-    "data year",
-    "annee de publication",
-    "exercice comptable",
-    "annee de naissance",
-    "annee ouverture",
-]
+python_type = "int"
+labels = {
+    "year": 1,
+    "annee": 1,
+    "naissance": 1,
+    "exercice": 1,
+}
 
 
 def _is(val):

csv_detective/output/dataframe.py
@@ -13,11 +13,16 @@ from csv_detective.parsing.csv import CHUNK_SIZE
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def cast(value: str, _type: str) -> str | float | bool | date | datetime | bytes | None:
+def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
     if not isinstance(value, str) or not value:
         # None is the current default value in hydra, should we keep this?
         return None
     match _type:
+        case "string":
+            # not used here, convenience for external use (cc hydra)
+            return value
+        case "int":
+            return int(value)
         case "float":
             return float_casting(value)
         case "bool":
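
cast grows two branches: "string" passes the value through (a convenience for external callers such as hydra, per the comment) and "int" casts directly, matching the new python_type = "int" declared in formats like year.py. A minimal sketch of just those branches (the real function also dispatches to float, bool, date and binary casts):

# Minimal sketch of the added "string" and "int" branches of cast();
# the real function in csv_detective/output/dataframe.py handles more types.
def cast_sketch(value: str, _type: str) -> str | int | None:
    if not isinstance(value, str) or not value:
        # empty or non-string input yields None, as in the original
        return None
    match _type:
        case "string":
            return value
        case "int":
            return int(value)
    return None

assert cast_sketch("2024", "int") == 2024
assert cast_sketch("2024", "string") == "2024"
assert cast_sketch("", "int") is None
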

csv_detective/output/profile.py
@@ -81,7 +81,11 @@ def create_profile(
     del cast_col
     # for all formats we want most frequent values, nb unique values and nb missing values
     tops_bruts = (
-        (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
+        (
+            table[c].value_counts()
+            if _col_values is None
+            else (s := _col_values[c]).loc[s.index.notna()].sort_values(ascending=False)
+        )
         .reset_index(name=_count_col)
         .iloc[:10]
         .to_dict(orient="records")
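
The rewritten expression drops entries whose index is NaN and sorts counts explicitly in descending order, so the ten "most frequent values" no longer include missing values and are no longer taken from the ascending end of the sort. A small pandas sketch of the new selection, assuming _col_values[c] is a Series of counts indexed by value:

import numpy as np
import pandas as pd

# Hypothetical per-column value counts, with a NaN entry for missing values.
counts = pd.Series({"a": 5, "b": 2, np.nan: 7, "c": 9})

# Previous behaviour: ascending sort, NaN-indexed entry kept.
old = counts.sort_values()

# New behaviour: drop the NaN-indexed entry, most frequent values first.
s = counts
new = s.loc[s.index.notna()].sort_values(ascending=False)
print(new.head(10))  # c 9, a 5, b 2
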

csv_detective/parsing/text.py
@@ -36,21 +36,22 @@ def is_word_in_string(word: str, string: str):
     return len(word) > 2 and word in string
 
 
-def header_score(header: str, words_combinations_list: list[str]) -> float:
+def header_score(header: str, valid_headers: dict[str, float]) -> float:
     """Returns:
-    - 1 if the header is exactly in the specified list
-    - 0.5 if any of the words is within the header
+    - the valid header's credibility if the header is exactly in the valid list
+    - 0.5*credibility if any of the words is within the valid list
     - 0 otherwise"""
     processed_header = _process_text(header)
 
-    header_matches_words_combination = float(
-        any(words_combination == processed_header for words_combination in words_combinations_list)
-    )
-    words_combination_in_header = 0.5 * (
-        any(
-            is_word_in_string(words_combination, processed_header)
-            for words_combination in words_combinations_list
-        )
+    header_matches_valid = max(
+        (valid == processed_header) * credibility for valid, credibility in valid_headers.items()
     )
 
-    return max(header_matches_words_combination, words_combination_in_header)
+    return max(
+        header_matches_valid,
+        0.5
+        * max(
+            is_word_in_string(valid, processed_header) * credibility
+            for valid, credibility in valid_headers.items()
+        ),
+    )

{csv_detective-0.10.2549.dist-info → csv_detective-0.10.12674.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: csv-detective
-Version: 0.10.2549
+Version: 0.10.12674
 Summary: Detect tabular files column content
 Keywords: CSV,data processing,encoding,guess,parser,tabular
 Author: data.gouv.fr
@@ -24,7 +24,7 @@ Requires-Dist: pytest>=8.3.0 ; extra == 'dev'
 Requires-Dist: responses>=0.25.0 ; extra == 'dev'
 Requires-Dist: ruff>=0.9.3 ; extra == 'dev'
 Requires-Python: >=3.10, <3.15
-Project-URL: Source, https://github.com/datagouv/csv_detective
+Project-URL: Source, https://github.com/datagouv/csv-detective
 Provides-Extra: dev
 Description-Content-Type: text/markdown