csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.12674__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. csv_detective/detection/__init__.py +0 -0
  2. csv_detective/detection/columns.py +0 -0
  3. csv_detective/detection/encoding.py +0 -0
  4. csv_detective/detection/engine.py +0 -0
  5. csv_detective/detection/formats.py +0 -2
  6. csv_detective/detection/headers.py +14 -12
  7. csv_detective/detection/rows.py +1 -1
  8. csv_detective/detection/separator.py +0 -0
  9. csv_detective/detection/variables.py +0 -0
  10. csv_detective/explore_csv.py +4 -15
  11. csv_detective/format.py +1 -1
  12. csv_detective/formats/__init__.py +0 -0
  13. csv_detective/formats/adresse.py +0 -0
  14. csv_detective/formats/binary.py +0 -0
  15. csv_detective/formats/booleen.py +0 -0
  16. csv_detective/formats/code_commune_insee.py +0 -0
  17. csv_detective/formats/code_csp_insee.py +0 -0
  18. csv_detective/formats/code_departement.py +0 -0
  19. csv_detective/formats/code_fantoir.py +0 -0
  20. csv_detective/formats/code_import.py +0 -0
  21. csv_detective/formats/code_postal.py +0 -0
  22. csv_detective/formats/code_region.py +0 -0
  23. csv_detective/formats/code_rna.py +0 -0
  24. csv_detective/formats/code_waldec.py +0 -0
  25. csv_detective/formats/commune.py +0 -0
  26. csv_detective/formats/csp_insee.py +0 -0
  27. csv_detective/formats/date.py +1 -10
  28. csv_detective/formats/date_fr.py +0 -0
  29. csv_detective/formats/datetime_aware.py +0 -0
  30. csv_detective/formats/datetime_naive.py +0 -0
  31. csv_detective/formats/datetime_rfc822.py +0 -0
  32. csv_detective/formats/departement.py +0 -0
  33. csv_detective/formats/email.py +0 -0
  34. csv_detective/formats/float.py +0 -0
  35. csv_detective/formats/geojson.py +0 -0
  36. csv_detective/formats/insee_ape700.py +0 -0
  37. csv_detective/formats/insee_canton.py +0 -0
  38. csv_detective/formats/int.py +0 -0
  39. csv_detective/formats/iso_country_code_alpha2.py +0 -0
  40. csv_detective/formats/iso_country_code_alpha3.py +0 -0
  41. csv_detective/formats/iso_country_code_numeric.py +0 -0
  42. csv_detective/formats/jour_de_la_semaine.py +0 -0
  43. csv_detective/formats/json.py +0 -0
  44. csv_detective/formats/latitude_l93.py +0 -0
  45. csv_detective/formats/latitude_wgs.py +0 -0
  46. csv_detective/formats/latitude_wgs_fr_metropole.py +0 -0
  47. csv_detective/formats/latlon_wgs.py +0 -0
  48. csv_detective/formats/longitude_l93.py +0 -0
  49. csv_detective/formats/longitude_wgs.py +0 -0
  50. csv_detective/formats/longitude_wgs_fr_metropole.py +0 -0
  51. csv_detective/formats/lonlat_wgs.py +0 -0
  52. csv_detective/formats/mois_de_lannee.py +0 -0
  53. csv_detective/formats/money.py +0 -0
  54. csv_detective/formats/mongo_object_id.py +0 -0
  55. csv_detective/formats/pays.py +0 -0
  56. csv_detective/formats/percent.py +0 -0
  57. csv_detective/formats/region.py +0 -0
  58. csv_detective/formats/sexe.py +0 -0
  59. csv_detective/formats/siren.py +0 -0
  60. csv_detective/formats/siret.py +0 -0
  61. csv_detective/formats/tel_fr.py +0 -0
  62. csv_detective/formats/uai.py +0 -0
  63. csv_detective/formats/url.py +0 -0
  64. csv_detective/formats/username.py +0 -0
  65. csv_detective/formats/uuid.py +0 -0
  66. csv_detective/formats/year.py +0 -0
  67. csv_detective/output/__init__.py +0 -0
  68. csv_detective/output/dataframe.py +2 -2
  69. csv_detective/output/example.py +0 -0
  70. csv_detective/output/profile.py +1 -1
  71. csv_detective/output/schema.py +0 -0
  72. csv_detective/output/utils.py +0 -0
  73. csv_detective/parsing/__init__.py +0 -0
  74. csv_detective/parsing/columns.py +5 -9
  75. csv_detective/parsing/compression.py +0 -0
  76. csv_detective/parsing/csv.py +0 -0
  77. csv_detective/parsing/excel.py +1 -1
  78. csv_detective/parsing/load.py +12 -11
  79. csv_detective/validate.py +36 -71
  80. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/METADATA +18 -15
  81. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/RECORD +22 -41
  82. csv_detective-0.10.12674.dist-info/WHEEL +4 -0
  83. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/entry_points.txt +1 -0
  84. csv_detective-0.10.4.dev1.dist-info/WHEEL +0 -5
  85. csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE +0 -21
  86. csv_detective-0.10.4.dev1.dist-info/top_level.txt +0 -3
  87. tests/__init__.py +0 -0
  88. tests/data/a_test_file.csv +0 -407
  89. tests/data/a_test_file.json +0 -394
  90. tests/data/b_test_file.csv +0 -7
  91. tests/data/c_test_file.csv +0 -2
  92. tests/data/csv_file +0 -7
  93. tests/data/file.csv.gz +0 -0
  94. tests/data/file.ods +0 -0
  95. tests/data/file.xls +0 -0
  96. tests/data/file.xlsx +0 -0
  97. tests/data/xlsx_file +0 -0
  98. tests/test_example.py +0 -67
  99. tests/test_fields.py +0 -175
  100. tests/test_file.py +0 -469
  101. tests/test_labels.py +0 -26
  102. tests/test_structure.py +0 -45
  103. tests/test_validation.py +0 -163
csv_detective/detection/formats.py CHANGED
@@ -11,7 +11,6 @@ from csv_detective.format import Format, FormatsManager
 from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import (
     MAX_NUMBER_CATEGORICAL_VALUES,
-    handle_empty_columns,
     test_col,
     test_col_chunks,
     test_label,
@@ -50,7 +49,6 @@ def detect_formats(
         skipna=skipna,
         verbose=verbose,
     )
-    handle_empty_columns(scores_table_fields)
     res_categorical, _ = detect_categorical_variable(
         table,
         max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
csv_detective/detection/headers.py CHANGED
@@ -5,22 +5,24 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def detect_header_position(file: TextIO, verbose: bool = False) -> int:
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting header position")
+        logging.info("Detecting headers")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-        next_row = file.readline()
-        file.seek(position)
-        if row != next_row:
-            if verbose:
-                display_logs_depending_process_time(
-                    f"Detected header position in {round(time() - start, 3)}s",
-                    time() - start,
-                )
-            return i
-    raise ValueError("Could not accurately retrieve headers position")
+        headers = [c for c in row.replace("\n", "").split(sep) if c]
+        if not any(col == "" for col in headers):
+            next_row = file.readline()
+            file.seek(position)
+            if row != next_row:
+                if verbose:
+                    display_logs_depending_process_time(
+                        f"Detected headers in {round(time() - start, 3)}s",
+                        time() - start,
+                    )
+                return i, headers
+    raise ValueError("Could not retrieve headers")
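
detect_headers now returns the header contents along with the row index, and takes the separator so it can split the candidate row itself; rows identical to their successor are skipped, and if no row qualifies within the first 10 the function raises. A minimal usage sketch against the code above (a real call goes through load_file; the outputs follow from the logic shown):

    from io import StringIO

    from csv_detective.detection.headers import detect_headers

    # a well-formed file: the header is the first row and differs from the next
    f = StringIO("col_a;col_b\n1;2\n3;4\n")
    row_idx, headers = detect_headers(f, sep=";")
    # row_idx == 0, headers == ["col_a", "col_b"]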
csv_detective/detection/rows.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
 
 
 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to detect_header_position for csv files, determines how many rows to skip
+    """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):
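
The "Unnamed:" check relies on how pandas names blank header fields: when the first row is empty, the parser autogenerates "Unnamed: N" for every column, which is the signal that the real header sits lower. A self-contained illustration:

    from io import StringIO

    import pandas as pd

    # a blank first row makes pandas autogenerate the column names
    table = pd.read_csv(StringIO(",,\ncol_a,col_b,col_c\n1,2,3\n"))
    print(list(table.columns))  # ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2']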
csv_detective/explore_csv.py CHANGED
@@ -142,19 +142,20 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")
 
-    is_valid, analysis, col_values = validate(
+    is_valid, table, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if not is_valid:
-        # if loading failed in validate, we load it from scratch and initiate an analysis
+    if analysis is None:
+        # if loading failed in validate, we load it from scratch
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
+    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,
@@ -164,18 +165,6 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
-    else:
-        # successful validation means we have a correct analysis and col_values
-        # only need to reload the table, and we already know how
-        table, _ = load_file(
-            file_path=file_path,
-            num_rows=num_rows,
-            verbose=verbose,
-            sep=analysis.get("separator"),
-            encoding=analysis.get("encoding"),
-            engine=analysis.get("engine"),
-            sheet_name=analysis.get("sheet_name"),
-        )
     try:
         return generate_output(
             table=table,
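
Net effect of the two hunks above: validate now returns the table it loaded, so a successful validation no longer triggers a second load_file round-trip, and a failed load is handled separately from a failed validation. A condensed, standalone restatement of the new branching (stub values, not the real API):

    def flow(is_valid, table, analysis, col_values):
        # mirrors the new validate_then_detect branching
        if analysis is None:
            # loading failed inside validate: load from scratch
            table, analysis = "table from load_file", {}
        if not is_valid:
            # validation failed: rerun full format detection on the table
            analysis, col_values = {"detected": True}, {}
        return table, analysis, col_values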
csv_detective/format.py CHANGED
@@ -27,7 +27,7 @@ class Format:
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable[[Any], bool] = func
+        self.func: Callable = func
         self._test_values: dict[bool, list[str]] = _test_values
         self.labels: dict[str, float] = labels
         self.proportion: float = proportion
csv_detective/formats/date.py CHANGED
@@ -57,9 +57,7 @@ string_month_pattern = (
 
 
 def _is(val):
-    # many early stops, to cut processing time
-    # and avoid the costly use of date_casting as much as possible
-    # /!\ timestamps are considered ints, not dates
+    # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
     # if it's a usual date pattern
@@ -72,13 +70,8 @@ def _is(val):
         ]
     ):
         return True
-    if re.match(r"^-?\d+[\.|,]\d+$", val):
-        # regular floats are excluded
-        return False
-    # not enough digits => not a date (slightly arbitrary)
     if sum([char.isdigit() for char in val]) / len(val) < threshold:
         return False
-    # last resort
     res = date_casting(val)
     if not res or res.hour or res.minute or res.second:
         return False
@@ -93,7 +86,6 @@ _test_values = {
         "15 décembre 1985",
         "02 05 2003",
         "20030502",
-        "2003.05.02",
         "1993-12/02",
     ],
     False: [
@@ -104,6 +96,5 @@ _test_values = {
         "12152003",
         "20031512",
         "02052003",
-        "6.27367393749392839",
     ],
 }
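
With the float-excluding regex gone, values like "2003.05.02" and "6.27367393749392839" leave _test_values too, since their classification would now depend on date_casting rather than on an early stop. The _test_values mapping (expected result to sample values) lends itself to a quick self-check; a sketch, assuming the module path from the file list above:

    from csv_detective.formats.date import _is, _test_values

    for expected, values in _test_values.items():
        for value in values:
            assert bool(_is(value)) == expected, f"{value!r} misclassified"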
csv_detective/output/dataframe.py CHANGED
@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time
 
 
 def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
-    if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
-        # STR_NA_VALUES are directly ingested as NaN by pandas, we avoid trying to cast them (into int for instance)
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
        return None
     match _type:
         case "string":
csv_detective/output/profile.py CHANGED
@@ -23,7 +23,7 @@ def create_profile(
     logging.info("Creating profile")
 
     if num_rows > 0:
-        raise ValueError("To create profile `num_rows` must be set to -1")
+        raise ValueError("To create profiles num_rows has to be set to -1")
     if not limited_output:
         columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
csv_detective/parsing/columns.py CHANGED
@@ -13,13 +13,6 @@ from csv_detective.utils import display_logs_depending_process_time
 MAX_NUMBER_CATEGORICAL_VALUES = 25
 
 
-def handle_empty_columns(return_table: pd.DataFrame):
-    # handling that empty columns score 1 everywhere
-    for col in return_table.columns:
-        if sum(return_table[col]) == len(return_table):
-            return_table[col] = 0
-
-
 def test_col_val(
     serie: pd.Series,
     format: Format,
@@ -40,7 +33,7 @@ def test_col_val(
 
     try:
         if skipna:
-            serie = serie.dropna()
+            serie = serie.loc[serie.notnull()]
         ser_len = len(serie)
         if ser_len == 0:
             # being here means the whole column is NaN, so if skipna it's a pass
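
The dropna to .loc[serie.notnull()] swap is behavior-preserving for a Series: both drop the missing entries and keep the surviving index intact. A quick check:

    import pandas as pd

    serie = pd.Series(["a", None, "b"])
    assert serie.dropna().equals(serie.loc[serie.notnull()])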
@@ -229,7 +222,10 @@ def test_col_chunks(
     analysis["categorical"] = [
         col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
     ]
-    handle_empty_columns(return_table)
+    # handling that empty columns score 1 everywhere
+    for col in return_table.columns:
+        if sum(return_table[col]) == len(return_table):
+            return_table[col] = 0
     if verbose:
         display_logs_depending_process_time(
             f"Done testing chunks in {round(time() - start, 3)}s", time() - start
csv_detective/parsing/excel.py CHANGED
@@ -23,7 +23,7 @@ def parse_excel(
     file_path: str,
     num_rows: int = -1,
     engine: str | None = None,
-    sheet_name: str | int | None = None,
+    sheet_name: str | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py CHANGED
@@ -1,4 +1,3 @@
-import codecs
 from io import BytesIO, StringIO
 
 import pandas as pd
@@ -11,7 +10,7 @@ from csv_detective.detection.engine import (
     EXCEL_ENGINES,
     detect_engine,
 )
-from csv_detective.detection.headers import detect_header_position
+from csv_detective.detection.headers import detect_headers
 from csv_detective.detection.separator import detect_separator
 from csv_detective.parsing.compression import unzip
 from csv_detective.parsing.csv import parse_csv
@@ -28,12 +27,12 @@ def load_file(
     encoding: str | None = None,
     sep: str | None = None,
     verbose: bool = False,
-    engine: str | None = None,
     sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
-    if ("." not in file_name or not file_name.endswith("csv")) and engine is None and sep is None:
-        # file has no extension and we don't have insights from arguments, we'll investigate how to read it
+    engine = None
+    if "." not in file_name or not file_name.endswith("csv"):
+        # file has no extension, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)
 
     if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
@@ -46,6 +45,9 @@ def load_file(
         )
     if table.empty:
         raise ValueError("Table seems to be empty")
+    header = table.columns.to_list()
+    if any(col.startswith("Unnamed") for col in header):
+        raise ValueError("Could not retrieve headers")
     analysis = {
         "engine": engine,
         "sheet_name": sheet_name,
@@ -67,20 +69,21 @@ def load_file(
         binary_file.seek(0)
     # decoding and reading file
     if is_url(file_path) or engine in COMPRESSION_ENGINES:
-        decoder = codecs.getincrementaldecoder(encoding)()
         str_file = StringIO()
         while True:
             chunk = binary_file.read(1024**2)
             if not chunk:
                 break
-            str_file.write(decoder.decode(chunk))
+            str_file.write(chunk.decode(encoding=encoding))
         del binary_file
         str_file.seek(0)
     else:
         str_file = open(file_path, "r", encoding=encoding)
     if sep is None:
         sep = detect_separator(str_file, verbose=verbose)
-    header_row_idx = detect_header_position(str_file, verbose=verbose)
+    header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
+    if header is None or (isinstance(header, list) and any([h is None for h in header])):
+        raise ValueError("Could not retrieve headers")
     heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
     trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
     table, total_lines, nb_duplicates = parse_csv(
@@ -97,11 +100,9 @@ def load_file(
     }
     if engine is not None:
         analysis["compression"] = engine
-    if any(not isinstance(col, str) or col.startswith("Unnamed:") for col in table.columns):
-        raise ValueError("Could not accurately detect the file's columns")
     analysis |= {
         "header_row_idx": header_row_idx,
-        "header": list(table.columns),
+        "header": header,
     }
     if total_lines is not None:
         analysis["total_lines"] = total_lines