csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
csv_detective/explore_csv.py
@@ -1,413 +1,187 @@
- """
- Ce script analyse les premières lignes d'un CSV pour essayer de déterminer le
- contenu possible des champs
- """
-
- from typing import Dict, List, Literal, Union
- import json
- import numpy as np
- import os
- import tempfile
- from pkg_resources import resource_string
  import logging
  from time import time

- # flake8: noqa
- from csv_detective import detect_fields
- from csv_detective import detect_labels
- from csv_detective.s3_utils import download_from_minio, upload_to_minio
- from csv_detective.schema_generation import generate_table_schema
- from csv_detective.utils import test_col, test_label, prepare_output_dict, display_logs_depending_process_time
- from .detection import (
-     detect_separator,
-     detect_encoding,
-     detect_headers,
-     detect_heading_columns,
-     detect_trailing_columns,
-     parse_table,
-     create_profile,
-     detetect_categorical_variable,
-     # detect_continuous_variable,
- )
+ import pandas as pd

+ from csv_detective.detection.formats import detect_formats
+ from csv_detective.output import generate_output
+ from csv_detective.parsing.load import load_file
+ from csv_detective.utils import display_logs_depending_process_time, is_url
+ from csv_detective.validate import validate

  logging.basicConfig(level=logging.INFO)

- def return_all_tests(user_input_tests, detect_type="detect_fields"):
-     """
-     returns all tests that have a method _is and are listed in the user_input_tests
-     the function can select a sub_package from csv_detective
-     """
-     all_packages = resource_string(__name__, "all_packages.txt")
-     all_packages = all_packages.decode().split("\n")
-     all_packages.remove("")
-     all_packages.remove("csv_detective")
-     all_packages = [x.replace("csv_detective.", "") for x in all_packages]
-
-     if user_input_tests is None:
-         return []
-
-     if isinstance(user_input_tests, str):
-         assert user_input_tests[0] != "-"
-         if user_input_tests == "ALL":
-             tests_to_do = [detect_type]
-         else:
-             tests_to_do = [detect_type + "." + user_input_tests]
-         tests_to_not_do = []
-     elif isinstance(user_input_tests, list):
-         if "ALL" in user_input_tests:
-             tests_to_do = [detect_type]
-         else:
-             tests_to_do = [
-                 detect_type + "." + x for x in user_input_tests if x[0] != "-"
-             ]
-         tests_to_not_do = [
-             detect_type + "." + x[1:] for x in user_input_tests if x[0] == "-"
-         ]
-
-     all_fields = [
-         x
-         for x in all_packages
-         if any([y == x[: len(y)] for y in tests_to_do])
-         and all([y != x[: len(y)] for y in tests_to_not_do])
-     ]
-     all_tests = [eval(field) for field in all_fields]
-     all_tests = [
-         test for test in all_tests if "_is" in dir(test)
-     ]  # TODO : Fix this shit
-     return all_tests
-

  def routine(
-     csv_file_path: str,
+     file_path: str,
      num_rows: int = 500,
-     user_input_tests: Union[str, List[str]] = "ALL",
-     output_mode: Literal["ALL", "LIMITED"] = "LIMITED",
-     save_results: bool = True,
-     encoding: str = None,
-     sep: str = None,
+     tags: list[str] | None = None,
+     limited_output: bool = True,
+     save_results: bool | str = True,
+     encoding: str | None = None,
+     sep: str | None = None,
+     skipna: bool = True,
      output_profile: bool = False,
      output_schema: bool = False,
-     verbose: bool = False
- ):
-     """Returns a dict with information about the csv table and possible
-     column contents.
+     output_df: bool = False,
+     cast_json: bool = True,
+     verbose: bool = False,
+     sheet_name: str | int | None = None,
+ ) -> dict | tuple[dict, pd.DataFrame]:
+     """
+     Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.

      Args:
-         csv_file_path: local path to CSV file if not using Minio
-         num_rows: number of rows to sample from the file for analysis ; -1 for analysis
-             of the whole file
-         user_input_tests: tests to run on the file
-         output_mode: LIMITED or ALL, whether or not to return all possible types or only
-             the most likely one for each column
-         save_results: whether or not to save the results in a json file
+         file_path: local path or URL to file
+         num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+         tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
+         limited_output: whether or not to return all possible types or only the most likely one for each column
+         save_results: whether or not to save the results in a json file, or the path where to dump the output
          output_profile: whether or not to add the 'profile' field to the output
          output_schema: whether or not to add the 'schema' field to the output (tableschema)
-         verbose: whether or not to print process logs in console
+         output_df: whether or not to return the loaded DataFrame along with the analysis report
+         cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+         verbose: whether or not to print process logs in console
+         sheet_name: if reading a multi-sheet file (xls-like), which sheet to consider
+         skipna: whether to skip NaN (empty cells) in the tests

      Returns:
          dict: a dict with information about the csv and possible types for each column
      """
-     if verbose:
-         start_routine = time()
-     if csv_file_path is None:
-         raise ValueError("csv_file_path is required.")
-
-     if encoding is None:
-         binary_file = open(csv_file_path, mode="rb")
-         encoding = detect_encoding(binary_file, verbose=verbose)
-
-     with open(csv_file_path, "r", encoding=encoding) as str_file:
-         if sep is None:
-             sep = detect_separator(str_file, verbose=verbose)
-         header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-         if header is None:
-             return_dict = {"error": True}
-             return return_dict
-         elif isinstance(header, list):
-             if any([x is None for x in header]):
-                 return_dict = {"error": True}
-                 return return_dict
-         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
-         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
-         table, total_lines, nb_duplicates = parse_table(
-             str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
-         )
-
-     if table.empty:
-         res_categorical = []
-         # res_continuous = []
-     else:
-         # Detects columns that are categorical
-         res_categorical, categorical_mask = detetect_categorical_variable(table, verbose=verbose)
-         res_categorical = list(res_categorical)
-         # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-         # res_continuous = list(
-         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-         # )
-
-     # Creating return dictionary
-     return_dict = dict()
-     return_dict["encoding"] = encoding
-     return_dict["separator"] = sep
-     return_dict["header_row_idx"] = header_row_idx
-     return_dict["header"] = header
-     return_dict["total_lines"] = total_lines
-     return_dict["nb_duplicates"] = nb_duplicates
-
-     return_dict["heading_columns"] = heading_columns
-     return_dict["trailing_columns"] = trailing_columns

-     # return_dict["continuous"] = res_continuous
-     return_dict["categorical"] = res_categorical
-
-     # list testing to be performed
-     all_tests_fields = return_all_tests(
-         user_input_tests, detect_type="detect_fields"
-     )  # list all tests for the fields
-     all_tests_labels = return_all_tests(
-         user_input_tests, detect_type="detect_labels"
-     )  # list all tests for the labels
-
-     # if no testing then return
-     if not all_tests_fields and not all_tests_labels:
-         return return_dict
-
-     # Perform testing on fields
-     return_table_fields = test_col(table, all_tests_fields, output_mode, verbose=verbose)
-     return_dict_cols_fields = prepare_output_dict(return_table_fields, output_mode)
-     return_dict["columns_fields"] = return_dict_cols_fields
+     if not (
+         isinstance(save_results, bool)
+         or (isinstance(save_results, str) and save_results.endswith(".json"))
+     ):
+         raise ValueError("`save_results` must be a bool or a valid path to a json file.")

-     # Perform testing on labels
-     return_table_labels = test_label(table, all_tests_labels, output_mode, verbose=verbose)
-     return_dict_cols_labels = prepare_output_dict(return_table_labels, output_mode)
-     return_dict["columns_labels"] = return_dict_cols_labels
+     if verbose:
+         start_routine = time()
+     if is_url(file_path):
+         logging.info("Path recognized as a URL")

-     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-     # This is because the fields are more important than the labels and yields a max
-     # of 1.5 for the final score.
-     return_table = return_table_fields * (
-         1
-         + return_table_labels.reindex(
-             index=return_table_fields.index, fill_value=0
-         ).values
-         / 2
+     table, analysis = load_file(
+         file_path=file_path,
+         num_rows=num_rows,
+         encoding=encoding,
+         sep=sep,
+         verbose=verbose,
+         sheet_name=sheet_name,
      )

-     # To reduce false positives: ensure these formats are detected only if the label yields
-     # a detection.
-     formats_with_mandatory_label = [
-         "code_departement",
-         "code_commune_insee",
-         "code_postal",
-         "latitude_wgs",
-         "longitude_wgs",
-         "latitude_wgs_fr_metropole",
-         "longitude_wgs_fr_metropole",
-         "latitude_l93",
-         "longitude_l93",
-     ]
-     return_table.loc[formats_with_mandatory_label, :] = np.where(
-         return_table_labels.loc[formats_with_mandatory_label, :],
-         return_table.loc[formats_with_mandatory_label, :],
-         0,
+     analysis, _col_values = detect_formats(
+         table=table,
+         analysis=analysis,
+         file_path=file_path,
+         tags=tags,
+         limited_output=limited_output,
+         skipna=skipna,
+         verbose=verbose,
      )
-     return_dict_cols = prepare_output_dict(return_table, output_mode)
-     return_dict["columns"] = return_dict_cols

-     metier_to_python_type = {
-         "booleen": "bool",
-         "int": "int",
-         "float": "float",
-         "string": "string",
-         "json": "json",
-         "json_geojson": "json",
-         "datetime": "datetime",
-         "date": "date",
-         "latitude": "float",
-         "latitude_l93": "float",
-         "latitude_wgs": "float",
-         "latitude_wgs_fr_metropole": "float",
-         "longitude": "float",
-         "longitude_l93": "float",
-         "longitude_wgs": "float",
-         "longitude_wgs_fr_metropole": "float",
-     }
-
-     if output_mode == "ALL":
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             return_dict[detection_method] = {
-                 col_name: [
-                     {
-                         "python_type": metier_to_python_type.get(
-                             detection["format"], "string"
-                         ),
-                         **detection,
-                     }
-                     for detection in detections
-                 ]
-                 for col_name, detections in return_dict[detection_method].items()
-             }
-     if output_mode == "LIMITED":
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             return_dict[detection_method] = {
-                 col_name: {
-                     "python_type": metier_to_python_type.get(
-                         detection["format"], "string"
-                     ),
-                     **detection,
-                 }
-                 for col_name, detection in return_dict[detection_method].items()
-             }
-
-     # Add detection with formats as keys
-     return_dict["formats"] = {
-         column_metadata["format"]: []
-         for column_metadata in return_dict["columns"].values()
-     }
-     for header, col_metadata in return_dict["columns"].items():
-         return_dict["formats"][col_metadata["format"]].append(header)
-
-     if output_profile:
-         return_dict["profile"] = create_profile(
-             table, return_dict["columns"],
-             sep,
-             encoding,
-             num_rows,
-             header_row_idx,
-             verbose=verbose
-         )
-
-     if save_results:
-         # Write your file as json
-         output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
-         with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-             json.dump(return_dict, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
-
-     if output_schema and output_mode != "ALL":
-         return_dict["schema"] = generate_table_schema(
-             return_dict,
-             save_file=False,
-             verbose=verbose
-         )
-     if verbose:
-         display_logs_depending_process_time(
-             f'Routine completed in {round(time() - start_routine, 3)}s',
-             time() - start_routine
+     try:
+         return generate_output(
+             table=table,
+             analysis=analysis,
+             file_path=file_path,
+             num_rows=num_rows,
+             limited_output=limited_output,
+             save_results=save_results,
+             output_profile=output_profile,
+             output_schema=output_schema,
+             output_df=output_df,
+             cast_json=cast_json,
+             verbose=verbose,
+             sheet_name=sheet_name,
+             _col_values=_col_values,
          )
-     return return_dict
+     finally:
+         if verbose:
+             display_logs_depending_process_time(
+                 f"Routine completed in {round(time() - start_routine, 3)}s", time() - start_routine
+             )


- def routine_minio(
-     csv_minio_location: Dict[str, str],
-     output_minio_location: Dict[str, str],
-     tableschema_minio_location: Dict[str, str],
-     minio_user: str,
-     minio_pwd: str,
+ def validate_then_detect(
+     file_path: str,
+     previous_analysis: dict,
      num_rows: int = 500,
-     user_input_tests: Union[str, List[str]] = "ALL",
-     encoding: str = None,
-     sep: str = None,
+     tags: list[str] | None = None,
+     limited_output: bool = True,
+     save_results: bool | str = True,
+     skipna: bool = True,
+     output_profile: bool = False,
+     output_schema: bool = False,
+     output_df: bool = False,
+     cast_json: bool = True,
+     verbose: bool = False,
  ):
-     """Returns a dict with information about the csv table and possible
-     column contents.
+     """
+     Performs a validation of the given file against the given analysis.
+     If the validation fails, performs a full analysis and returns it.
+     Otherwise returns the previous analysis (which is therefore still valid).
+     NB: if asked, the profile is recreated in both cases.

      Args:
-         csv_minio_location: dict with Minio URL, bucket and key of the CSV file
-         output_minio_location: Minio URL, bucket and key to store output file. None if
-             not uploading to Minio.
-         tableschema_minio_location: Minio URL, bucket and key to store tableschema file.
-             None if not uploading the tableschema to Minio.
-         minio_user: user name for the minio instance
-         minio_pwd: password for the minio instance
-         num_rows: number of rows to sample from the file for analysis ; -1 for analysis of
-             the whole file
-         user_input_tests: tests to run on the file
-         output_mode: LIMITED or ALL, whether or not to return all possible types or only
-             the most likely one for each column
-
-     Returns:
-         dict: a dict with information about the csv and possible types for each column
+         file_path: the path of the file to validate.
+         previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
+         num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+         tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
+         limited_output: whether or not to return all possible types or only the most likely one for each column
+         save_results: whether or not to save the results in a json file, or the path where to dump the output
+         skipna: whether to ignore NaN values in the checks
+         output_profile: whether or not to add the 'profile' field to the output
+         output_schema: whether or not to add the 'schema' field to the output (tableschema)
+         output_df: whether or not to return the loaded DataFrame along with the analysis report
+         cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+         verbose: whether the code displays the steps it's going through
      """
-
-     if (
-         (
-             any(
-                 [
-                     location_dict is not None
-                     for location_dict in [
-                         csv_minio_location,
-                         output_minio_location,
-                         tableschema_minio_location,
-                     ]
-                 ]
-             )
-         )
-         and (minio_user is None)
-         or (minio_pwd is None)
-     ):
-         raise ValueError("Minio credentials are required if using Minio")
-
-     for location_dict in [
-         csv_minio_location,
-         output_minio_location,
-         tableschema_minio_location,
-     ]:
-         if location_dict is not None:
-             if any(
-                 [
-                     (location_key not in location_dict)
-                     or (location_dict[location_key] is None)
-                     for location_key in ["netloc", "bucket", "key"]
-                 ]
-             ):
-                 raise ValueError("Minio location dict must contain url, bucket and key")
-
-     csv_file_path = tempfile.NamedTemporaryFile(delete=False).name
-     download_from_minio(
-         netloc=csv_minio_location["netloc"],
-         bucket=csv_minio_location["bucket"],
-         key=csv_minio_location["key"],
-         filepath=csv_file_path,
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
-     )
-
-     return_dict = routine(
-         csv_file_path,
-         num_rows,
-         user_input_tests,
-         output_mode="LIMITED",
-         save_results=True,
-         encoding=encoding,
-         sep=sep,
-     )
-
-     # Write report JSON file.
-     output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
-     with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-         json.dump(return_dict, fp, indent=4, separators=(",", ": "))
-
-     upload_to_minio(
-         netloc=output_minio_location["netloc"],
-         bucket=output_minio_location["bucket"],
-         key=output_minio_location["key"],
-         filepath=output_path_to_store_minio_file,
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
-     )
-
-     os.remove(output_path_to_store_minio_file)
-     os.remove(csv_file_path)
-
-     generate_table_schema(
-         return_dict,
-         True,
-         netloc=tableschema_minio_location["netloc"],
-         bucket=tableschema_minio_location["bucket"],
-         key=tableschema_minio_location["key"],
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
+     if verbose:
+         start_routine = time()
+     if is_url(file_path):
+         logging.info("Path recognized as a URL")
+
+     is_valid, table, analysis, col_values = validate(
+         file_path=file_path,
+         previous_analysis=previous_analysis,
+         verbose=verbose,
+         skipna=skipna,
      )
-
-     return return_dict
+     if analysis is None:
+         # if loading failed in validate, we load it from scratch
+         table, analysis = load_file(
+             file_path=file_path,
+             num_rows=num_rows,
+             verbose=verbose,
+         )
+     if not is_valid:
+         analysis, col_values = detect_formats(
+             table=table,
+             analysis=analysis,
+             file_path=file_path,
+             tags=tags,
+             limited_output=limited_output,
+             skipna=skipna,
+             verbose=verbose,
+         )
+     try:
+         return generate_output(
+             table=table,
+             analysis=analysis,
+             file_path=file_path,
+             num_rows=num_rows,
+             limited_output=limited_output,
+             save_results=save_results,
+             output_profile=output_profile,
+             output_schema=output_schema,
+             output_df=output_df,
+             cast_json=cast_json,
+             verbose=verbose,
+             sheet_name=analysis.get("sheet_name"),
+             _col_values=col_values,
+         )
+     finally:
+         if verbose:
+             display_logs_depending_process_time(
+                 f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
+             )
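
The public entry point thus changes shape between these versions: routine() now takes a local path or a URL, a tags filter instead of user_input_tests, and can hand back the typed DataFrame alongside the report. Below is a minimal sketch of the new calls, assuming routine and validate_then_detect are importable from csv_detective.explore_csv as the diff shows; the file name "data.csv" is hypothetical and not taken from this diff.

from csv_detective.explore_csv import routine, validate_then_detect

# Analyse the first 500 rows; output_df=True makes routine return the
# analysis dict together with the DataFrame cast to the detected formats.
analysis, df = routine(
    file_path="data.csv",   # hypothetical file; a URL is also accepted now
    num_rows=500,
    save_results=False,     # or a path ending in ".json" to dump the report
    output_df=True,
)

# On later runs, validate the stored report first and only re-detect on failure.
report = validate_then_detect(
    file_path="data.csv",
    previous_analysis=analysis,
    save_results=False,
)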
csv_detective/format.py
@@ -0,0 +1,67 @@
+ from typing import Any, Callable
+
+ from csv_detective.parsing.text import header_score
+
+
+ class Format:
+     def __init__(
+         self,
+         name: str,
+         func: Callable[[Any], bool],
+         _test_values: dict[bool, list[str]],
+         labels: list[str] = [],
+         proportion: float = 1,
+         tags: list[str] = [],
+     ) -> None:
+         """
+         Instantiates a Format object.
+
+         Args:
+             name: the name of the format.
+             func: the value test for the format (returns whether a string is valid).
+             _test_values: lists of valid and invalid values, used in the tests
+             labels: the list of hint headers for the header score
+             proportion: the tolerance (between 0 and 1) to say a column is valid for a format (1 => 100% of the column has to pass the func check for the column to be considered valid)
+             tags: to allow users to submit a file to only a subset of formats
+         """
+         self.name: str = name
+         self.func: Callable = func
+         self._test_values: dict[bool, list[str]] = _test_values
+         self.labels: list[str] = labels
+         self.proportion: float = proportion
+         self.tags: list[str] = tags
+
+     def is_valid_label(self, val: str) -> float:
+         return header_score(val, self.labels)
+
+
+ class FormatsManager:
+     formats: dict[str, Format]
+
+     def __init__(self) -> None:
+         import csv_detective.formats as formats
+
+         format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))]
+         self.formats = {
+             label: Format(
+                 name=label,
+                 func=(module := getattr(formats, label))._is,
+                 _test_values=module._test_values,
+                 **{
+                     attr: val
+                     for attr in ["labels", "proportion", "tags"]
+                     if (val := getattr(module, attr, None))
+                 },
+             )
+             for label in format_labels
+         }
+
+     def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]:
+         return {
+             label: fmt
+             for label, fmt in self.formats.items()
+             if all(tag in fmt.tags for tag in tags)
+         }
+
+     def available_tags(self) -> set[str]:
+         return set(tag for format in self.formats.values() for tag in format.tags)
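
This Format/FormatsManager pair replaces the old return_all_tests() plumbing: every module under csv_detective.formats exposing an _is check is wrapped into a Format at import time. A short sketch of how the registry might be queried, using only the methods defined above; the "geo" tag value is borrowed from the routine docstring and the printed attributes are purely illustrative.

from csv_detective.format import FormatsManager

manager = FormatsManager()       # wraps every csv_detective.formats module that defines _is
print(manager.available_tags())  # the set of tags declared across all format modules

# Keep only formats carrying all the requested tags, as routine(tags=...) does.
geo_formats = manager.get_formats_from_tags(["geo"])
for name, fmt in geo_formats.items():
    # fmt.func is the module's _is test; fmt.proportion is the share of a
    # column that must pass it for the column to be declared this format.
    print(name, fmt.proportion, fmt.labels)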
csv_detective/formats/__init__.py
@@ -0,0 +1,9 @@
+ import importlib
+ import os
+
+ for file in os.listdir(os.path.dirname(__file__)):
+     if file.endswith(".py") and not file.startswith("_"):
+         module_name = file[:-3]
+         module = importlib.import_module(f"csv_detective.formats.{module_name}")
+         globals()[module_name] = module
+         del module
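
Together with FormatsManager above, this import loop turns csv_detective/formats/ into a plugin directory: dropping a new module there is enough to register it. The hypothetical module below shows the attributes the manager looks for (_is and _test_values are required; labels, proportion and tags are optional); this exact module does not exist in the package.

# csv_detective/formats/my_code.py -- hypothetical example module
import re

labels = ["mon code", "my code"]  # optional: header hints scored by is_valid_label
proportion = 0.9                  # optional: 90% of values must pass _is (default 1)
tags = ["demo"]                   # optional: lets routine(tags=["demo"]) select it

_pattern = re.compile(r"^[A-Z]{2}\d{4}$")

def _is(val: str) -> bool:
    # value-level check: FormatsManager only registers modules that define _is
    return bool(_pattern.match(val))

_test_values = {
    True: ["AB1234", "ZZ0001"],   # values expected to pass, used by the test suite
    False: ["ab1234", "12345"],   # values expected to fail
}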