csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
csv_detective/output/utils.py (new file)
@@ -0,0 +1,74 @@
+import pandas as pd
+
+
+def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
+    # -> dict[str, dict | list[dict]] (to be added when upgrading to python>=3.10)
+    return_dict_cols = return_table.to_dict("dict")
+    output_dict = {}
+    for column_name in return_dict_cols:
+        # keep only formats with a non-zero score
+        output_dict[column_name] = [
+            {
+                "format": detected_value_type,
+                "score": return_dict_cols[column_name][detected_value_type],
+            }
+            for detected_value_type in return_dict_cols[column_name]
+            if return_dict_cols[column_name][detected_value_type] > 0
+        ]
+        priorities = [
+            # no need to specify int and float everywhere, they are deprioritized anyway
+            ("int", ("float",)),
+            # bool over everything
+            (
+                "booleen",
+                (
+                    "latitude_l93",
+                    "latitude_wgs",
+                    "latitude_wgs_fr_metropole",
+                    "longitude_l93",
+                    "longitude_wgs",
+                    "longitude_wgs_fr_metropole",
+                ),
+            ),
+            ("geojson", ("json",)),
+            # latlon over lonlat if no longitude column allows us to discriminate
+            ("latlon_wgs", ("json", "lonlat_wgs")),
+            ("lonlat_wgs", ("json",)),
+            ("latitude_wgs_fr_metropole", ("latitude_l93", "latitude_wgs")),
+            ("longitude_wgs_fr_metropole", ("longitude_l93", "longitude_wgs")),
+            ("latitude_wgs", ("latitude_l93",)),
+            ("longitude_wgs", ("longitude_l93",)),
+            ("code_region", ("code_departement",)),
+            ("datetime_rfc822", ("datetime_aware",)),
+        ]
+        detected_formats = set(x["format"] for x in output_dict[column_name])
+        formats_to_remove = set()
+        # Deprioritize float and int detection vs other formats
+        if len(detected_formats - {"float", "int"}) > 0:
+            formats_to_remove = formats_to_remove.union({"float", "int"})
+        # Deprioritize less specific formats if:
+        # the secondary score is equal or worse,
+        # or the priority score is at least 1 (the maximum field score)
+        for prio_format, secondary_formats in priorities:
+            if prio_format in detected_formats:
+                for secondary in secondary_formats:
+                    if secondary in detected_formats and (
+                        return_dict_cols[column_name][prio_format]
+                        >= return_dict_cols[column_name][secondary]
+                        or return_dict_cols[column_name][prio_format] >= 1
+                    ):
+                        formats_to_remove.add(secondary)
+
+        formats_to_keep = detected_formats - formats_to_remove
+
+        detections = [x for x in output_dict[column_name] if x["format"] in formats_to_keep]
+        if not limited_output:
+            output_dict[column_name] = detections
+        else:
+            output_dict[column_name] = (
+                max(detections, key=lambda x: x["score"])
+                if len(detections) > 0
+                else {"format": "string", "score": 1.0}
+            )
+
+    return output_dict
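
As a quick illustration (not part of the released code), here is a minimal sketch of how the new prepare_output_dict helper could be exercised, with made-up scores; rows of the toy table are format labels and its single column is a CSV column named "dep":

    import pandas as pd

    from csv_detective.output.utils import prepare_output_dict

    # toy score table: the "dep" column scores 1.0 for several competing formats
    return_table = pd.DataFrame(
        {"dep": {"code_departement": 1.0, "code_region": 1.0, "int": 1.0}}
    )
    # limited_output=True keeps only the single best format per column;
    # int is deprioritized and code_region outranks code_departement
    print(prepare_output_dict(return_table, limited_output=True))
    # {'dep': {'format': 'code_region', 'score': 1.0}}
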
csv_detective/parsing/columns.py (new file)
@@ -0,0 +1,235 @@
+import logging
+from time import time
+from typing import Callable
+
+import pandas as pd
+from more_itertools import peekable
+
+from csv_detective.format import Format
+from csv_detective.parsing.csv import CHUNK_SIZE
+from csv_detective.utils import display_logs_depending_process_time
+
+# above this threshold, a column is not considered categorical
+MAX_NUMBER_CATEGORICAL_VALUES = 25
+
+
+def test_col_val(
+    serie: pd.Series,
+    format: Format,
+    skipna: bool = True,
+    limited_output: bool = False,
+    verbose: bool = False,
+) -> float:
+    """Tests the values of the series using the format's test function.
+    - skipna: if True, NaNs are considered valid values
+    for the series to be detected as a certain format
+    """
+    if verbose:
+        start = time()
+
+    # TODO: change for a cleaner method and only test columns in modules labels
+    def apply_test_func(serie: pd.Series, test_func: Callable, _range: int):
+        return serie.sample(n=_range).apply(test_func)
+
+    try:
+        if skipna:
+            serie = serie.loc[serie.notnull()]
+        ser_len = len(serie)
+        if ser_len == 0:
+            # being here means the whole column is NaN, so if skipna it's a pass
+            return 1.0 if skipna else 0.0
+        if not limited_output or format.proportion < 1:
+            # we want or have to go through the whole column to have the proportion
+            result: float = serie.apply(format.func).sum() / ser_len
+            return result if result >= format.proportion else 0.0
+        else:
+            # the whole column has to be valid so we have early stops (1 then 5 rows)
+            # to not waste time if directly unsuccessful
+            for _range in [
+                min(1, ser_len),
+                min(5, ser_len),
+                ser_len,
+            ]:
+                if not all(apply_test_func(serie, format.func, _range)):
+                    return 0.0
+            return 1.0
+    finally:
+        if verbose and time() - start > 3:
+            display_logs_depending_process_time(
+                f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
+                time() - start,
+            )
+
+
+def test_col(
+    table: pd.DataFrame,
+    formats: dict[str, Format],
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+):
+    if verbose:
+        start = time()
+        logging.info("Testing columns to get formats")
+    return_table = pd.DataFrame(columns=table.columns)
+    for idx, (label, format) in enumerate(formats.items()):
+        if verbose:
+            start_type = time()
+            logging.info(f"\t- Starting with format '{label}'")
+        # possible improvement: run the longest tests last and only if previous tests are not satisfactory
+        # => the following needs to change, "apply" means all columns are tested for one type at once
+        return_table.loc[label] = table.apply(
+            lambda serie: test_col_val(
+                serie,
+                format,
+                skipna=skipna,
+                limited_output=limited_output,
+                verbose=verbose,
+            )
+        )
+        if verbose:
+            display_logs_depending_process_time(
+                f'\t> Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
+                time() - start_type,
+            )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing columns in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table
+
+
+def test_label(
+    columns: list[str], formats: dict[str, Format], limited_output: bool, verbose: bool = False
+):
+    if verbose:
+        start = time()
+        logging.info("Testing labels to get types")
+
+    return_table = pd.DataFrame(columns=columns)
+    for idx, (label, format) in enumerate(formats.items()):
+        if verbose:
+            start_type = time()
+        return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns]
+        if verbose:
+            display_logs_depending_process_time(
+                f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
+                time() - start_type,
+            )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing labels in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table
+
+
+def test_col_chunks(
+    table: pd.DataFrame,
+    file_path: str,
+    analysis: dict,
+    formats: dict[str, Format],
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, dict, dict[str, pd.Series]]:
+    def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]:
+        # returns a dict with the table's columns as keys and the list of remaining format labels to apply
+        return {
+            col: [
+                fmt_label
+                for fmt_label in return_table.index
+                if return_table.loc[fmt_label, col] > 0
+            ]
+            for col in return_table.columns
+        }
+
+    if verbose:
+        start = time()
+        logging.info("Testing columns to get formats on chunks")
+
+    # analysing the sample to get a first guess
+    return_table = test_col(table, formats, limited_output, skipna=skipna, verbose=verbose)
+    remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = table.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: table[col].value_counts(dropna=False) for col in table.columns}
+
+    # only csv files can end up here, can't chunk excel
+    chunks = pd.read_csv(
+        file_path,
+        dtype=str,
+        encoding=analysis["encoding"],
+        sep=analysis["separator"],
+        skiprows=analysis["header_row_idx"],
+        compression=analysis.get("compression"),
+        chunksize=CHUNK_SIZE,
+    )
+    analysis["total_lines"] = CHUNK_SIZE
+    batch, batch_number = [], 1
+    iterator = peekable(enumerate(chunks))
+    while iterator:
+        idx, chunk = next(iterator)
+        if idx == 0:
+            # we have read and analysed the first chunk already
+            continue
+        if len(batch) < 10:
+            # it's too slow to process chunks directly, but we want to keep the first analysis
+            # on a "small" chunk, so partial analyses are done on batches of chunks
+            batch.append(chunk)
+            # we don't know when the chunks end, and doing one additional step
+            # for the final batch is ugly
+            try:
+                iterator.peek()
+                continue
+            except StopIteration:
+                pass
+        if verbose:
+            logging.info(f"> Testing batch number {batch_number}")
+        batch = pd.concat(batch, ignore_index=True)
+        analysis["total_lines"] += len(batch)
+        row_hashes_count = row_hashes_count.add(
+            batch.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
+        )
+        for col in batch.columns:
+            col_values[col] = col_values[col].add(
+                batch[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        if not any(remaining_tests for remaining_tests in remaining_tests_per_col.values()):
+            # no more potential tests to do on any column, early stop
+            break
+        for col, fmt_labels in remaining_tests_per_col.items():
+            # testing each column with the tests that are still competing
+            # after previous batches' analyses
+            for label in fmt_labels:
+                batch_col_test = test_col_val(
+                    batch[col],
+                    formats[label],
+                    limited_output=limited_output,
+                    skipna=skipna,
+                )
+                return_table.loc[label, col] = (
+                    # if this batch's column tested 0 then test fails overall
+                    0
+                    if batch_col_test == 0
+                    # otherwise updating the score with weighted average
+                    else ((return_table.loc[label, col] * idx + batch_col_test) / (idx + 1))
+                )
+        remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+        batch, batch_number = [], batch_number + 1
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
+    # handling that empty columns score 1 everywhere
+    for col in return_table.columns:
+        if sum(return_table[col]) == len(return_table):
+            return_table[col] = 0
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing chunks in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table, analysis, col_values
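
As a quick illustration (not part of the released code), here is a sketch of the per-column scoring; a SimpleNamespace stands in for a real Format instance, since test_col_val only reads the func and proportion attributes here:

    from types import SimpleNamespace

    import pandas as pd

    from csv_detective.parsing.columns import test_col_val

    # stand-in format: a value test and the minimum proportion of valid values
    fake_format = SimpleNamespace(func=lambda v: v.isdigit(), proportion=1)
    serie = pd.Series(["1", "2", None, "x"], name="col")
    # with skipna=True the None is ignored; "x" fails, so only 2/3 of values pass,
    # which is below proportion=1 and the score collapses to 0.0
    print(test_col_val(serie, fake_format, skipna=True))  # 0.0
    # with a lower threshold the raw proportion is returned instead
    fake_format.proportion = 0.5
    print(test_col_val(serie, fake_format, skipna=True))  # ~0.67
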
csv_detective/parsing/compression.py (new file)
@@ -0,0 +1,11 @@
+import gzip
+from io import BytesIO
+
+
+def unzip(binary_file: BytesIO, engine: str) -> BytesIO:
+    if engine == "gzip":
+        with gzip.open(binary_file, mode="rb") as binary_file:
+            file_content = binary_file.read()
+    else:
+        raise NotImplementedError(f"{engine} is not yet supported")
+    return BytesIO(file_content)
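
As a quick illustration (not part of the released code), the gzip branch on an in-memory payload:

    import gzip
    from io import BytesIO

    from csv_detective.parsing.compression import unzip

    # build a small gzipped CSV in memory, then get back a plain BytesIO
    compressed = BytesIO(gzip.compress(b"a;b\n1;2\n"))
    print(unzip(binary_file=compressed, engine="gzip").read())  # b'a;b\n1;2\n'
    # any engine other than "gzip" raises NotImplementedError
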
csv_detective/parsing/csv.py (new file)
@@ -0,0 +1,56 @@
+import logging
+from time import time
+from typing import TextIO
+
+import pandas as pd
+
+from csv_detective.utils import display_logs_depending_process_time
+
+# the number of rows for the first analysis, and the number of rows per chunk of the df iterator
+CHUNK_SIZE = int(1e4)
+
+
+def parse_csv(
+    the_file: TextIO,
+    encoding: str,
+    sep: str,
+    num_rows: int,
+    skiprows: int,
+    random_state: int = 42,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, int | None, int | None]:
+    if verbose:
+        start = time()
+        logging.info("Parsing table")
+
+    if not isinstance(the_file, str):
+        the_file.seek(0)
+
+    try:
+        table = pd.read_csv(
+            the_file,
+            sep=sep,
+            dtype=str,
+            encoding=encoding,
+            skiprows=skiprows,
+            nrows=CHUNK_SIZE,
+        )
+        total_lines = len(table)
+        # branch between small and big files starts here
+        if total_lines == CHUNK_SIZE:
+            if verbose:
+                logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
+            total_lines, nb_duplicates = None, None
+        else:
+            nb_duplicates = len(table.loc[table.duplicated()])
+            if num_rows > 0:
+                num_rows = min(num_rows, total_lines or len(table))
+                table = table.sample(num_rows, random_state=random_state)
+    except Exception as e:
+        raise ValueError("Could not load file") from e
+    if verbose:
+        display_logs_depending_process_time(
+            f"Table parsed successfully in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return table, total_lines, nb_duplicates
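
As a quick illustration (not part of the released code), a sketch of calling the CSV parser directly on a hypothetical local file; load_file normally supplies the handle, encoding, separator and header row from the detection steps:

    from csv_detective.parsing.csv import parse_csv

    with open("example.csv", "r", encoding="utf-8") as f:
        table, total_lines, nb_duplicates = parse_csv(
            f, encoding="utf-8", sep=";", num_rows=500, skiprows=0
        )
    # for files shorter than CHUNK_SIZE both counts are known immediately,
    # otherwise they come back as None and are refined chunk by chunk later
    print(total_lines, nb_duplicates)
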
csv_detective/parsing/excel.py (new file)
@@ -0,0 +1,167 @@
+from io import BytesIO
+from time import time
+
+import openpyxl
+import pandas as pd
+import requests
+import xlrd
+
+from csv_detective.detection.engine import engine_to_file
+from csv_detective.detection.rows import remove_empty_first_rows
+from csv_detective.utils import (
+    display_logs_depending_process_time,
+    is_url,
+)
+
+NEW_EXCEL_EXT = [".xlsx", ".xlsm", ".xltx", ".xltm"]
+OLD_EXCEL_EXT = [".xls"]
+OPEN_OFFICE_EXT = [".odf", ".ods", ".odt"]
+XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
+
+
+def parse_excel(
+    file_path: str,
+    num_rows: int = -1,
+    engine: str | None = None,
+    sheet_name: str | None = None,
+    random_state: int = 42,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, int, int, str, str, int]:
+    """Excel-like parsing is really slow, speeding it up could be a good improvement for future development."""
+    if verbose:
+        start = time()
+    no_sheet_specified = sheet_name is None
+
+    if engine in ["openpyxl", "xlrd"] or any(
+        [file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT]
+    ):
+        remote_content = None
+        if is_url(file_path):
+            r = requests.get(file_path)
+            r.raise_for_status()
+            remote_content = BytesIO(r.content)
+        if not engine:
+            if any([file_path.endswith(k) for k in NEW_EXCEL_EXT]):
+                engine = "openpyxl"
+            else:
+                engine = "xlrd"
+        if sheet_name is None:
+            if verbose:
+                display_logs_depending_process_time(
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
+                    time() - start,
+                )
+            try:
+                if engine == "openpyxl":
+                    # openpyxl doesn't want to open files that don't have a valid extension
+                    # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
+                    # if the file is remote, we have a remote content anyway so it's fine
+                    if not remote_content and "." not in file_path.split("/")[-1]:
+                        with open(file_path, "rb") as f:
+                            remote_content = BytesIO(f.read())
+                    # faster than loading all sheets
+                    wb = openpyxl.load_workbook(remote_content or file_path, read_only=True)
+                    try:
+                        sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
+                    except TypeError:
+                        # sometimes read_only can't get the info, so we have to open the file for real
+                        # this takes more time but it's for a limited number of files
+                        # and it's this or nothing
+                        wb = openpyxl.load_workbook(remote_content or file_path)
+                        sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
+                else:
+                    if remote_content:
+                        wb = xlrd.open_workbook(file_contents=remote_content.read())
+                    else:
+                        wb = xlrd.open_workbook(file_path)
+                    sizes = {s.name: s.nrows * s.ncols for s in wb.sheets()}
+                sheet_name = max(sizes, key=sizes.get)
+            except xlrd.biffh.XLRDError:
+                # sometimes an xls file is recognized as ods
+                if verbose:
+                    display_logs_depending_process_time(
+                        "Could not read file with classic xls reader, trying with ODS",
+                        time() - start,
+                    )
+                engine = "odf"
+
+    if engine == "odf" or any([file_path.endswith(k) for k in OPEN_OFFICE_EXT]):
+        # for ODS files, no way to get sheets' sizes without
+        # loading the file one way or another (pandas or pure odfpy)
+        # so all in one
+        engine = "odf"
+        if sheet_name is None:
+            if verbose:
+                display_logs_depending_process_time(
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
+                    time() - start,
+                )
+            tables = pd.read_excel(
+                file_path,
+                engine="odf",
+                sheet_name=None,
+                dtype=str,
+            )
+            sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
+            sheet_name = max(sizes, key=sizes.get)
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Going forwards with sheet "{sheet_name}"',
+                    time() - start,
+                )
+            table = tables[sheet_name]
+        else:
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
+                    time() - start,
+                )
+            table = pd.read_excel(
+                file_path,
+                engine="odf",
+                sheet_name=sheet_name,
+                dtype=str,
+            )
+        table, header_row_idx = remove_empty_first_rows(table)
+        total_lines = len(table)
+        nb_duplicates = len(table.loc[table.duplicated()])
+        if num_rows > 0:
+            num_rows = min(num_rows - 1, total_lines)
+            table = table.sample(num_rows, random_state=random_state)
+        if verbose:
+            display_logs_depending_process_time(
+                f"Table parsed successfully in {round(time() - start, 3)}s",
+                time() - start,
+            )
+        return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
+
+    # so here we end up with (old and new) excel files only
+    if verbose:
+        if no_sheet_specified:
+            display_logs_depending_process_time(
+                f'Going forwards with sheet "{sheet_name}"',
+                time() - start,
+            )
+        else:
+            display_logs_depending_process_time(
+                f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
+                time() - start,
+            )
+    table = pd.read_excel(
+        file_path,
+        engine=engine,
+        sheet_name=sheet_name,
+        dtype=str,
+    )
+    table, header_row_idx = remove_empty_first_rows(table)
+    total_lines = len(table)
+    nb_duplicates = len(table.loc[table.duplicated()])
+    if num_rows > 0:
+        num_rows = min(num_rows - 1, total_lines)
+        table = table.sample(num_rows, random_state=random_state)
+    if verbose:
+        display_logs_depending_process_time(
+            f"Table parsed successfully in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
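
As a quick illustration (not part of the released code), a sketch of the Excel entry point on a hypothetical workbook; with no sheet_name given, the largest sheet is selected as described above:

    from csv_detective.parsing.excel import parse_excel

    table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
        file_path="example.xlsx",
        num_rows=500,
        verbose=True,
    )
    print(sheet_name, engine, total_lines, header_row_idx)
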
csv_detective/parsing/load.py (new file)
@@ -0,0 +1,111 @@
+from io import BytesIO, StringIO
+
+import pandas as pd
+import requests
+
+from csv_detective.detection.columns import detect_heading_columns, detect_trailing_columns
+from csv_detective.detection.encoding import detect_encoding
+from csv_detective.detection.engine import (
+    COMPRESSION_ENGINES,
+    EXCEL_ENGINES,
+    detect_engine,
+)
+from csv_detective.detection.headers import detect_headers
+from csv_detective.detection.separator import detect_separator
+from csv_detective.parsing.compression import unzip
+from csv_detective.parsing.csv import parse_csv
+from csv_detective.parsing.excel import (
+    XLS_LIKE_EXT,
+    parse_excel,
+)
+from csv_detective.utils import is_url
+
+
+def load_file(
+    file_path: str,
+    num_rows: int = 500,
+    encoding: str | None = None,
+    sep: str | None = None,
+    verbose: bool = False,
+    sheet_name: str | int | None = None,
+) -> tuple[pd.DataFrame, dict]:
+    file_name = file_path.split("/")[-1]
+    engine = None
+    if "." not in file_name or not file_name.endswith("csv"):
+        # file has no extension or is not a csv, we'll investigate how to read it
+        engine = detect_engine(file_path, verbose=verbose)
+
+    if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
+        table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
+            file_path=file_path,
+            num_rows=num_rows,
+            engine=engine,
+            sheet_name=sheet_name,
+            verbose=verbose,
+        )
+        if table.empty:
+            raise ValueError("Table seems to be empty")
+        header = table.columns.to_list()
+        if any(col.startswith("Unnamed") for col in header):
+            raise ValueError("Could not retrieve headers")
+        analysis = {
+            "engine": engine,
+            "sheet_name": sheet_name,
+        }
+    else:
+        # fetching or reading file as binary
+        if is_url(file_path):
+            r = requests.get(file_path, allow_redirects=True)
+            r.raise_for_status()
+            binary_file = BytesIO(r.content)
+        else:
+            binary_file = open(file_path, "rb")
+        # handling compression
+        if engine in COMPRESSION_ENGINES:
+            binary_file: BytesIO = unzip(binary_file=binary_file, engine=engine)
+        # detecting encoding if not specified
+        if encoding is None:
+            encoding: str = detect_encoding(binary_file, verbose=verbose)
+            binary_file.seek(0)
+        # decoding and reading file
+        if is_url(file_path) or engine in COMPRESSION_ENGINES:
+            str_file = StringIO()
+            while True:
+                chunk = binary_file.read(1024**2)
+                if not chunk:
+                    break
+                str_file.write(chunk.decode(encoding=encoding))
+            del binary_file
+            str_file.seek(0)
+        else:
+            str_file = open(file_path, "r", encoding=encoding)
+        if sep is None:
+            sep = detect_separator(str_file, verbose=verbose)
+        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
+        if header is None or (isinstance(header, list) and any([h is None for h in header])):
+            raise ValueError("Could not retrieve headers")
+        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
+        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
+        table, total_lines, nb_duplicates = parse_csv(
+            str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
+        )
+        del str_file
+        if table.empty:
+            raise ValueError("Table seems to be empty")
+        analysis = {
+            "encoding": encoding,
+            "separator": sep,
+            "heading_columns": heading_columns,
+            "trailing_columns": trailing_columns,
+        }
+        if engine is not None:
+            analysis["compression"] = engine
+    analysis |= {
+        "header_row_idx": header_row_idx,
+        "header": header,
+    }
+    if total_lines is not None:
+        analysis["total_lines"] = total_lines
+    if nb_duplicates is not None:
+        analysis["nb_duplicates"] = nb_duplicates
+    return table, analysis
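
As a quick illustration (not part of the released code), a sketch of the top-level loader on a hypothetical file; load_file routes to the CSV or Excel path and returns the parsed table together with the detection metadata collected along the way:

    from csv_detective.parsing.load import load_file

    table, analysis = load_file("example.csv", num_rows=500, verbose=True)
    # for a csv, analysis holds encoding, separator, header info and line counts
    print(analysis["encoding"], analysis["separator"], analysis["header"])
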