csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.2549__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/__init__.py +0 -0
- csv_detective/detection/columns.py +0 -0
- csv_detective/detection/encoding.py +0 -0
- csv_detective/detection/engine.py +0 -0
- csv_detective/detection/formats.py +38 -13
- csv_detective/detection/headers.py +14 -12
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/separator.py +0 -0
- csv_detective/detection/variables.py +0 -0
- csv_detective/explore_csv.py +6 -18
- csv_detective/format.py +5 -12
- csv_detective/formats/__init__.py +0 -0
- csv_detective/formats/adresse.py +9 -9
- csv_detective/formats/binary.py +1 -2
- csv_detective/formats/booleen.py +2 -3
- csv_detective/formats/code_commune_insee.py +10 -12
- csv_detective/formats/code_csp_insee.py +1 -1
- csv_detective/formats/code_departement.py +7 -8
- csv_detective/formats/code_fantoir.py +5 -6
- csv_detective/formats/code_import.py +1 -1
- csv_detective/formats/code_postal.py +9 -10
- csv_detective/formats/code_region.py +6 -7
- csv_detective/formats/code_rna.py +6 -7
- csv_detective/formats/code_waldec.py +1 -1
- csv_detective/formats/commune.py +5 -5
- csv_detective/formats/csp_insee.py +5 -6
- csv_detective/formats/data/insee_ape700.txt +1 -1
- csv_detective/formats/data/iso_country_code_alpha2.txt +397 -153
- csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
- csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
- csv_detective/formats/date.py +18 -28
- csv_detective/formats/date_fr.py +1 -1
- csv_detective/formats/datetime_aware.py +2 -7
- csv_detective/formats/datetime_naive.py +0 -3
- csv_detective/formats/datetime_rfc822.py +0 -1
- csv_detective/formats/departement.py +15 -15
- csv_detective/formats/email.py +13 -13
- csv_detective/formats/float.py +1 -2
- csv_detective/formats/geojson.py +10 -10
- csv_detective/formats/insee_ape700.py +8 -10
- csv_detective/formats/insee_canton.py +6 -6
- csv_detective/formats/int.py +1 -2
- csv_detective/formats/iso_country_code_alpha2.py +14 -14
- csv_detective/formats/iso_country_code_alpha3.py +13 -6
- csv_detective/formats/iso_country_code_numeric.py +9 -2
- csv_detective/formats/jour_de_la_semaine.py +12 -11
- csv_detective/formats/json.py +0 -6
- csv_detective/formats/latitude_l93.py +22 -8
- csv_detective/formats/latitude_wgs.py +29 -31
- csv_detective/formats/latitude_wgs_fr_metropole.py +30 -7
- csv_detective/formats/latlon_wgs.py +28 -30
- csv_detective/formats/longitude_l93.py +13 -8
- csv_detective/formats/longitude_wgs.py +19 -34
- csv_detective/formats/longitude_wgs_fr_metropole.py +19 -6
- csv_detective/formats/lonlat_wgs.py +11 -12
- csv_detective/formats/mois_de_lannee.py +1 -1
- csv_detective/formats/money.py +1 -1
- csv_detective/formats/mongo_object_id.py +1 -1
- csv_detective/formats/pays.py +13 -11
- csv_detective/formats/percent.py +1 -1
- csv_detective/formats/region.py +13 -13
- csv_detective/formats/sexe.py +1 -1
- csv_detective/formats/siren.py +10 -9
- csv_detective/formats/siret.py +9 -9
- csv_detective/formats/tel_fr.py +13 -7
- csv_detective/formats/uai.py +18 -17
- csv_detective/formats/url.py +16 -16
- csv_detective/formats/username.py +1 -1
- csv_detective/formats/uuid.py +1 -1
- csv_detective/formats/year.py +12 -7
- csv_detective/output/__init__.py +0 -0
- csv_detective/output/dataframe.py +3 -8
- csv_detective/output/example.py +0 -0
- csv_detective/output/profile.py +2 -6
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/parsing/__init__.py +0 -0
- csv_detective/parsing/columns.py +5 -9
- csv_detective/parsing/compression.py +0 -0
- csv_detective/parsing/csv.py +0 -0
- csv_detective/parsing/excel.py +1 -1
- csv_detective/parsing/load.py +12 -11
- csv_detective/parsing/text.py +12 -13
- csv_detective/validate.py +36 -71
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.2549.dist-info}/METADATA +18 -15
- csv_detective-0.10.2549.dist-info/RECORD +92 -0
- csv_detective-0.10.2549.dist-info/WHEEL +4 -0
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.2549.dist-info}/entry_points.txt +1 -0
- csv_detective-0.10.4.dev1.dist-info/RECORD +0 -111
- csv_detective-0.10.4.dev1.dist-info/WHEEL +0 -5
- csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE +0 -21
- csv_detective-0.10.4.dev1.dist-info/top_level.txt +0 -3
- tests/__init__.py +0 -0
- tests/data/a_test_file.csv +0 -407
- tests/data/a_test_file.json +0 -394
- tests/data/b_test_file.csv +0 -7
- tests/data/c_test_file.csv +0 -2
- tests/data/csv_file +0 -7
- tests/data/file.csv.gz +0 -0
- tests/data/file.ods +0 -0
- tests/data/file.xls +0 -0
- tests/data/file.xlsx +0 -0
- tests/data/xlsx_file +0 -0
- tests/test_example.py +0 -67
- tests/test_fields.py +0 -175
- tests/test_file.py +0 -469
- tests/test_labels.py +0 -26
- tests/test_structure.py +0 -45
- tests/test_validation.py +0 -163

csv_detective/detection/__init__.py
File without changes

csv_detective/detection/columns.py
File without changes

csv_detective/detection/encoding.py
File without changes

csv_detective/detection/engine.py
File without changes

csv_detective/detection/formats.py
CHANGED

@@ -11,7 +11,6 @@ from csv_detective.format import Format, FormatsManager
 from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import (
     MAX_NUMBER_CATEGORICAL_VALUES,
-    handle_empty_columns,
     test_col,
     test_col_chunks,
     test_label,

@@ -50,7 +49,6 @@ def detect_formats(
         skipna=skipna,
         verbose=verbose,
     )
-    handle_empty_columns(scores_table_fields)
     res_categorical, _ = detect_categorical_variable(
         table,
         max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,

@@ -84,7 +82,22 @@ def detect_formats(
     # To reduce false positives: ensure these formats are detected only if the label yields
     # a detection (skipping the ones that have been excluded by the users).
     formats_with_mandatory_label = [
-        f …
+        f
+        for f in [
+            "code_departement",
+            "code_commune_insee",
+            "code_postal",
+            "code_fantoir",
+            "latitude_wgs",
+            "longitude_wgs",
+            "latitude_wgs_fr_metropole",
+            "longitude_wgs_fr_metropole",
+            "latitude_l93",
+            "longitude_l93",
+            "siren",
+            "siret",
+        ]
+        if f in scores_table.index
     ]
     scores_table.loc[formats_with_mandatory_label, :] = np.where(
         scores_table_labels.loc[formats_with_mandatory_label, :],

@@ -93,16 +106,32 @@ def detect_formats(
     )
     analysis["columns"] = prepare_output_dict(scores_table, limited_output)

+    metier_to_python_type = {
+        "booleen": "bool",
+        "int": "int",
+        "float": "float",
+        "string": "string",
+        "json": "json",
+        "geojson": "json",
+        "datetime_aware": "datetime",
+        "datetime_naive": "datetime",
+        "datetime_rfc822": "datetime",
+        "date": "date",
+        "latitude_l93": "float",
+        "latitude_wgs": "float",
+        "latitude_wgs_fr_metropole": "float",
+        "longitude_l93": "float",
+        "longitude_wgs": "float",
+        "longitude_wgs_fr_metropole": "float",
+        "binary": "binary",
+    }
+
     if not limited_output:
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
             analysis[detection_method] = {
                 col_name: [
                     {
-                        "python_type": (
-                            "string"
-                            if detection["format"] == "string"
-                            else fmtm.formats[detection["format"]].python_type
-                        ),
+                        "python_type": metier_to_python_type.get(detection["format"], "string"),
                         **detection,
                     }
                     for detection in detections

@@ -113,11 +142,7 @@ def detect_formats(
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
             analysis[detection_method] = {
                 col_name: {
-                    "python_type": (
-                        "string"
-                        if detection["format"] == "string"
-                        else fmtm.formats[detection["format"]].python_type
-                    ),
+                    "python_type": metier_to_python_type.get(detection["format"], "string"),
                     **detection,
                 }
                 for col_name, detection in analysis[detection_method].items()
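
Note on the change above: python_type is no longer read from each Format object; it is resolved through the new metier_to_python_type dict, with "string" as the fallback for any format not listed. A minimal runnable sketch of that lookup (the dict is abridged from the hunk; the helper function is illustrative, not part of csv_detective):

# Abridged copy of the mapping introduced in detect_formats above.
metier_to_python_type = {
    "booleen": "bool",
    "int": "int",
    "float": "float",
    "json": "json",
    "geojson": "json",
    "datetime_aware": "datetime",
    "date": "date",
    "latitude_wgs": "float",
    "binary": "binary",
    # ... remaining entries as in the hunk
}

def python_type_for(detected_format: str) -> str:
    # Illustrative helper only: unknown formats fall back to "string",
    # exactly like the .get(..., "string") call in the diff.
    return metier_to_python_type.get(detected_format, "string")

print(python_type_for("geojson"))  # json
print(python_type_for("siret"))    # string (not in the mapping)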

csv_detective/detection/headers.py
CHANGED

@@ -5,22 +5,24 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time


-def …
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting …
+        logging.info("Detecting headers")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-        … (old lines 17-26 elided in this view)
+        headers = [c for c in row.replace("\n", "").split(sep) if c]
+        if not any(col == "" for col in headers):
+            next_row = file.readline()
+            file.seek(position)
+            if row != next_row:
+                if verbose:
+                    display_logs_depending_process_time(
+                        f"Detected headers in {round(time() - start, 3)}s",
+                        time() - start,
+                    )
+                return i, headers
+    raise ValueError("Could not retrieve headers")

csv_detective/detection/rows.py
CHANGED

@@ -2,7 +2,7 @@ import pandas as pd


 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to …
+    """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):

csv_detective/detection/separator.py
File without changes

csv_detective/detection/variables.py
File without changes

csv_detective/explore_csv.py
CHANGED

@@ -1,6 +1,5 @@
 import logging
 from time import time
-from typing import Iterator

 import pandas as pd

@@ -28,7 +27,7 @@ def routine(
     cast_json: bool = True,
     verbose: bool = False,
     sheet_name: str | int | None = None,
-) -> dict | tuple[dict, …
+) -> dict | tuple[dict, pd.DataFrame]:
     """
     Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.

@@ -116,7 +115,7 @@ def validate_then_detect(
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-) …
+):
     """
     Performs a validation of the given file against the given analysis.
     If the validation fails, performs a full analysis and return it.

@@ -142,19 +141,20 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")

-    is_valid, analysis, col_values = validate(
+    is_valid, table, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if …
-        # if loading failed in validate, we load it from scratch
+    if analysis is None:
+        # if loading failed in validate, we load it from scratch
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
+    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,

@@ -164,18 +164,6 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
-    else:
-        # successful validation means we have a correct analysis and col_values
-        # only need to reload the table, and we already know how
-        table, _ = load_file(
-            file_path=file_path,
-            num_rows=num_rows,
-            verbose=verbose,
-            sep=analysis.get("separator"),
-            encoding=analysis.get("encoding"),
-            engine=analysis.get("engine"),
-            sheet_name=analysis.get("sheet_name"),
-        )
     try:
         return generate_output(
             table=table,

csv_detective/format.py
CHANGED

@@ -9,11 +9,9 @@ class Format:
         name: str,
         func: Callable[[Any], bool],
         _test_values: dict[bool, list[str]],
-        labels: …
+        labels: list[str] = [],
         proportion: float = 1,
         tags: list[str] = [],
-        mandatory_label: bool = False,
-        python_type: str = "string",
     ) -> None:
         """
         Instanciates a Format object.

@@ -22,18 +20,16 @@ class Format:
         name: the name of the format.
         func: the value test for the format (returns whether a string is valid).
         _test_values: lists of valid and invalid values, used in the tests
-        labels: the …
+        labels: the list of hint headers for the header score
         proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable …
+        self.func: Callable = func
         self._test_values: dict[bool, list[str]] = _test_values
-        self.labels: …
+        self.labels: list[str] = labels
         self.proportion: float = proportion
         self.tags: list[str] = tags
-        self.mandatory_label: bool = mandatory_label
-        self.python_type: str = python_type

     def is_valid_label(self, val: str) -> float:
         return header_score(val, self.labels)

@@ -53,7 +49,7 @@ class FormatsManager:
                 _test_values=module._test_values,
                 **{
                     attr: val
-                    for attr in ["labels", "proportion", "tags" …
+                    for attr in ["labels", "proportion", "tags"]
                     if (val := getattr(module, attr, None))
                 },
             )

@@ -67,8 +63,5 @@ class FormatsManager:
             if all(tag in fmt.tags for tag in tags)
         }

-    def get_formats_with_mandatory_label(self) -> dict[str, Format]:
-        return {label: fmt for label, fmt in self.formats.items() if fmt.mandatory_label}
-
     def available_tags(self) -> set[str]:
         return set(tag for format in self.formats.values() for tag in format.tags)
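
A stripped-down sketch of the resulting constructor signature: mandatory_label and python_type are gone and labels is a plain list. This is a stand-in class with a hypothetical example format, not the library's Format:

from typing import Any, Callable

class FormatSketch:
    # Mirrors the slimmed-down signature shown in the hunk above.
    def __init__(
        self,
        name: str,
        func: Callable[[Any], bool],
        _test_values: dict[bool, list[str]],
        labels: list[str] = [],
        proportion: float = 1,
        tags: list[str] = [],
    ) -> None:
        self.name = name
        self.func = func
        self._test_values = _test_values
        self.labels = labels
        self.proportion = proportion
        self.tags = tags

# Hypothetical example format, for illustration only (not the real siren check).
siren = FormatSketch(
    name="siren",
    func=lambda v: v.isdigit() and len(v) == 9,
    _test_values={True: ["130025265"], False: ["not-a-siren"]},
    labels=["siren"],
    tags=["fr"],
)
print(siren.func("130025265"))  # True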

csv_detective/formats/__init__.py
File without changes

csv_detective/formats/adresse.py
CHANGED

@@ -2,15 +2,15 @@ from csv_detective.parsing.text import _process_text

 proportion = 0.55
 tags = ["fr", "geo"]
-labels = …
-    "adresse" …
-    "localisation" …
-    "adresse postale" …
-    "adresse geographique" …
-    "adr" …
-    "adresse complete" …
-    "adresse station" …
-…
+labels = [
+    "adresse",
+    "localisation",
+    "adresse postale",
+    "adresse geographique",
+    "adr",
+    "adresse complete",
+    "adresse station",
+]

 voies = {
     "aire ",
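
The labels list feeds the header score computed elsewhere in the package; header_score itself is not part of this diff, so the snippet below is a hypothetical stand-in that only illustrates how such a list could be matched against a column name:

# Illustrative only: not csv_detective's header_score implementation.
def naive_header_match(column_name: str, labels: list[str]) -> float:
    name = column_name.strip().lower().replace("_", " ")
    return 1.0 if name in labels else 0.0

labels = ["adresse", "localisation", "adresse postale"]
print(naive_header_match("Adresse_Postale", labels))  # 1.0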

csv_detective/formats/binary.py
CHANGED

csv_detective/formats/booleen.py
CHANGED

@@ -1,7 +1,6 @@
 proportion = 1
 tags = ["type"]
-
-labels = {"is ": 1, "has ": 1, "est ": 1}
+labels = ["is ", "has ", "est "]

 bool_mapping = {
     "1": True,

@@ -22,7 +21,7 @@ bool_mapping = {
 liste_bool = set(bool_mapping.keys())


-def bool_casting(val: str) -> bool …
+def bool_casting(val: str) -> bool:
     return bool_mapping.get(val.lower())
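
A quick sketch of the corrected bool_casting helper. Only the "1": True entry of bool_mapping is visible in the hunk above; the other keys below are illustrative guesses:

# Assumed subset of bool_mapping; only "1": True appears in the diff.
bool_mapping = {"1": True, "0": False, "true": True, "false": False}

def bool_casting(val: str) -> bool:
    # dict.get returns None for unmapped values, mirroring the diff's behaviour.
    return bool_mapping.get(val.lower())

print(bool_casting("TRUE"))  # True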

csv_detective/formats/code_commune_insee.py
CHANGED

@@ -2,18 +2,16 @@ from frformat import CodeCommuneInsee, Millesime

 proportion = 0.75
 tags = ["fr", "geo"]
-…
-…
-    "code …
-    "…
-    "…
-    "code commune" …
-    "…
-    "…
-    "…
-…
-    "code": 0.5,
-}
+labels = [
+    "code commune insee",
+    "code insee",
+    "codes insee",
+    "code commune",
+    "code insee commune",
+    "insee",
+    "code com",
+    "com",
+]

 _code_commune_insee = CodeCommuneInsee(Millesime.LATEST)

csv_detective/formats/code_departement.py
CHANGED

@@ -2,14 +2,13 @@ from frformat import Millesime, NumeroDepartement, Options

 proportion = 1
 tags = ["fr", "geo"]
-…
-…
-    "…
-    "…
-    "…
-    "…
-…
-}
+labels = [
+    "code departement",
+    "code_departement",
+    "dep",
+    "departement",
+    "dept",
+]

 _options = Options(
     ignore_case=True,

csv_detective/formats/code_fantoir.py
CHANGED

@@ -2,12 +2,11 @@ from frformat import CodeFantoir

 proportion = 1
 tags = ["fr", "geo"]
-…
-…
-    "…
-    "…
-…
-}
+labels = [
+    "cadastre1",
+    "code fantoir",
+    "fantoir",
+]

 _code_fantoir = CodeFantoir()

csv_detective/formats/code_postal.py
CHANGED

@@ -2,16 +2,15 @@ from frformat import CodePostal

 proportion = 0.9
 tags = ["fr", "geo"]
-…
-…
-    "code …
-    "…
-    "…
-    "…
-    "…
-    "…
-…
-}
+labels = [
+    "code postal",
+    "postal code",
+    "postcode",
+    "post code",
+    "cp",
+    "codes postaux",
+    "location postcode",
+]

 _code_postal = CodePostal()

csv_detective/formats/code_region.py
CHANGED

@@ -2,13 +2,12 @@ from frformat import CodeRegion, Millesime

 proportion = 1
 tags = ["fr", "geo"]
-…
-…
-    "…
-    "…
-    "…
-…
-}
+labels = [
+    "code region",
+    "reg",
+    "code insee region",
+    "region",
+]

 _code_region = CodeRegion(Millesime.LATEST)

csv_detective/formats/code_rna.py
CHANGED

@@ -2,13 +2,12 @@ from frformat import CodeRNA

 proportion = 0.9
 tags = ["fr"]
-labels = …
-    "code rna" …
-    "rna" …
-    "n° inscription association" …
-    "identifiant association" …
-…
-}
+labels = [
+    "code rna",
+    "rna",
+    "n° inscription association",
+    "identifiant association",
+]

 _code_rna = CodeRNA()

csv_detective/formats/commune.py
CHANGED

@@ -2,11 +2,11 @@ from frformat import Commune, Millesime, Options

 proportion = 0.8
 tags = ["fr", "geo"]
-labels = …
-    "commune" …
-    "ville" …
-    "libelle commune" …
-…
+labels = [
+    "commune",
+    "ville",
+    "libelle commune",
+]

 _options = Options(
     ignore_case=True,

csv_detective/formats/csp_insee.py
CHANGED

@@ -4,12 +4,11 @@ from csv_detective.parsing.text import _process_text

 proportion = 1
 tags = ["fr"]
-labels = …
-    "csp insee" …
-    "csp" …
-    "categorie socioprofessionnelle" …
-…
-}
+labels = [
+    "csp insee",
+    "csp",
+    "categorie socioprofessionnelle",
+]

 f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r")
 codes_insee = f.read().split("\n")