csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179)
  1. csv_detective/detection/formats.py +12 -15
  2. csv_detective/explore_csv.py +28 -9
  3. csv_detective/format.py +67 -0
  4. csv_detective/formats/__init__.py +9 -0
  5. csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
  6. csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
  7. csv_detective/formats/code_commune_insee.py +26 -0
  8. csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
  9. csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
  10. csv_detective/formats/code_fantoir.py +21 -0
  11. csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
  12. csv_detective/formats/code_postal.py +25 -0
  13. csv_detective/formats/code_region.py +22 -0
  14. csv_detective/formats/code_rna.py +29 -0
  15. csv_detective/formats/code_waldec.py +17 -0
  16. csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
  17. csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
  18. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  19. csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
  20. csv_detective/formats/date_fr.py +22 -0
  21. csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
  22. csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
  23. csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
  24. csv_detective/formats/departement.py +37 -0
  25. csv_detective/formats/email.py +28 -0
  26. csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
  27. csv_detective/formats/geojson.py +36 -0
  28. csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
  29. csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
  30. csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
  31. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  32. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  33. csv_detective/formats/iso_country_code_numeric.py +31 -0
  34. csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
  35. csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
  36. csv_detective/formats/latitude_l93.py +48 -0
  37. csv_detective/formats/latitude_wgs.py +42 -0
  38. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  39. csv_detective/formats/latlon_wgs.py +53 -0
  40. csv_detective/formats/longitude_l93.py +39 -0
  41. csv_detective/formats/longitude_wgs.py +32 -0
  42. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  43. csv_detective/formats/lonlat_wgs.py +36 -0
  44. csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
  45. csv_detective/formats/money.py +18 -0
  46. csv_detective/formats/mongo_object_id.py +14 -0
  47. csv_detective/formats/pays.py +35 -0
  48. csv_detective/formats/percent.py +16 -0
  49. csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
  50. csv_detective/formats/sexe.py +17 -0
  51. csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
  52. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
  53. csv_detective/formats/tel_fr.py +36 -0
  54. csv_detective/formats/uai.py +36 -0
  55. csv_detective/formats/url.py +45 -0
  56. csv_detective/formats/username.py +14 -0
  57. csv_detective/formats/uuid.py +16 -0
  58. csv_detective/formats/year.py +28 -0
  59. csv_detective/output/__init__.py +3 -4
  60. csv_detective/output/dataframe.py +3 -3
  61. csv_detective/output/profile.py +2 -3
  62. csv_detective/output/schema.py +2 -2
  63. csv_detective/parsing/columns.py +35 -50
  64. csv_detective/parsing/csv.py +2 -2
  65. csv_detective/parsing/load.py +4 -5
  66. csv_detective/validate.py +9 -4
  67. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/METADATA +6 -5
  68. csv_detective-0.9.3.dev2348.dist-info/RECORD +102 -0
  69. tests/test_fields.py +39 -364
  70. tests/test_file.py +1 -1
  71. tests/test_labels.py +5 -3
  72. tests/test_structure.py +40 -36
  73. csv_detective/detect_fields/FR/__init__.py +0 -0
  74. csv_detective/detect_fields/FR/geo/__init__.py +0 -0
  75. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
  76. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
  77. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
  78. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
  79. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
  80. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
  81. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  82. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
  83. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  84. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
  85. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  86. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  87. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
  88. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
  89. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  90. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
  91. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
  92. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  93. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  94. csv_detective/detect_fields/__init__.py +0 -112
  95. csv_detective/detect_fields/geo/__init__.py +0 -0
  96. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  97. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  98. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  99. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
  100. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  101. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
  102. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  103. csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
  104. csv_detective/detect_fields/other/__init__.py +0 -0
  105. csv_detective/detect_fields/other/email/__init__.py +0 -10
  106. csv_detective/detect_fields/other/money/__init__.py +0 -11
  107. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  108. csv_detective/detect_fields/other/percent/__init__.py +0 -9
  109. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  110. csv_detective/detect_fields/other/url/__init__.py +0 -14
  111. csv_detective/detect_fields/other/uuid/__init__.py +0 -10
  112. csv_detective/detect_fields/temp/__init__.py +0 -0
  113. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  114. csv_detective/detect_labels/FR/__init__.py +0 -0
  115. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  116. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
  117. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
  118. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
  119. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
  120. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
  121. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
  122. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
  123. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
  124. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
  125. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
  126. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
  127. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
  128. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
  129. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
  130. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
  131. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  132. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
  133. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
  134. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
  135. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
  136. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
  137. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
  138. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
  139. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
  140. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
  141. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
  142. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
  143. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  144. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
  145. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
  146. csv_detective/detect_labels/__init__.py +0 -94
  147. csv_detective/detect_labels/geo/__init__.py +0 -0
  148. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
  149. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
  150. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
  151. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
  152. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
  153. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
  154. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
  155. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
  156. csv_detective/detect_labels/other/__init__.py +0 -0
  157. csv_detective/detect_labels/other/booleen/__init__.py +0 -8
  158. csv_detective/detect_labels/other/email/__init__.py +0 -20
  159. csv_detective/detect_labels/other/float/__init__.py +0 -8
  160. csv_detective/detect_labels/other/int/__init__.py +0 -8
  161. csv_detective/detect_labels/other/money/__init__.py +0 -8
  162. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
  163. csv_detective/detect_labels/other/twitter/__init__.py +0 -8
  164. csv_detective/detect_labels/other/url/__init__.py +0 -23
  165. csv_detective/detect_labels/other/uuid/__init__.py +0 -8
  166. csv_detective/detect_labels/temp/__init__.py +0 -0
  167. csv_detective/detect_labels/temp/date/__init__.py +0 -28
  168. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
  169. csv_detective/detect_labels/temp/year/__init__.py +0 -19
  170. csv_detective/load_tests.py +0 -59
  171. csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
  172. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  173. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  174. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  175. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
  176. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/WHEEL +0 -0
  177. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/entry_points.txt +0 -0
  178. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/licenses/LICENSE +0 -0
  179. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/top_level.txt +0 -0
csv_detective/detection/formats.py
@@ -7,7 +7,7 @@ from csv_detective.detection.variables import (
     detect_categorical_variable,
     # detect_continuous_variable,
 )
-from csv_detective.load_tests import return_all_tests
+from csv_detective.format import Format, FormatsManager
 from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import (
     MAX_NUMBER_CATEGORICAL_VALUES,
@@ -16,12 +16,14 @@ from csv_detective.parsing.columns import (
     test_label,
 )
 
+fmtm = FormatsManager()
+
 
 def detect_formats(
     table: pd.DataFrame,
     analysis: dict,
     file_path: str,
-    user_input_tests: str | list[str] = "ALL",
+    tags: list[str] | None = None,
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,
@@ -29,15 +31,12 @@ def detect_formats(
     in_chunks = analysis.get("total_lines") is None
 
     # list testing to be performed
-    all_tests_fields = return_all_tests(
-        user_input_tests, detect_type="detect_fields"
-    )  # list all tests for the fields
-    all_tests_labels = return_all_tests(
-        user_input_tests, detect_type="detect_labels"
-    )  # list all tests for the labels
+    formats: dict[str, Format] = (
+        fmtm.get_formats_from_tags(tags) if tags is not None else fmtm.formats
+    )
 
     # if no testing then return
-    if not all_tests_fields and not all_tests_labels:
+    if len(formats) == 0:
         return analysis, None
 
     # Perform testing on fields
@@ -45,7 +44,7 @@ def detect_formats(
         # table is small enough to be tested in one go
         scores_table_fields = test_col(
             table=table,
-            all_tests=all_tests_fields,
+            formats=formats,
            limited_output=limited_output,
            skipna=skipna,
            verbose=verbose,
@@ -62,7 +61,7 @@ def detect_formats(
            table=table,
            file_path=file_path,
            analysis=analysis,
-            all_tests=all_tests_fields,
+            formats=formats,
            limited_output=limited_output,
            skipna=skipna,
            verbose=verbose,
@@ -70,9 +69,7 @@ def detect_formats(
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
-    scores_table_labels = test_label(
-        analysis["header"], all_tests_labels, limited_output, verbose=verbose
-    )
+    scores_table_labels = test_label(analysis["header"], formats, limited_output, verbose=verbose)
     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
 
     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
@@ -115,7 +112,7 @@ def detect_formats(
         "float": "float",
         "string": "string",
         "json": "json",
-        "json_geojson": "json",
+        "geojson": "json",
         "datetime_aware": "datetime",
         "datetime_naive": "datetime",
         "datetime_rfc822": "datetime",
csv_detective/explore_csv.py
@@ -15,7 +15,7 @@ logging.basicConfig(level=logging.INFO)
 def routine(
     file_path: str,
     num_rows: int = 500,
-    user_input_tests: str | list[str] = "ALL",
+    tags: list[str] | None = None,
     limited_output: bool = True,
     save_results: bool | str = True,
     encoding: str | None = None,
@@ -28,14 +28,13 @@ def routine(
     verbose: bool = False,
     sheet_name: str | int | None = None,
 ) -> dict | tuple[dict, pd.DataFrame]:
-    """Returns a dict with information about the table and possible
-    column contents, and if requested the DataFrame with columns cast according to analysis.
+    """
+    Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.
 
     Args:
         file_path: local path or URL to file
-        num_rows: number of rows to sample from the file for analysis ; -1 for analysis
-        of the whole file
-        user_input_tests: tests to run on the file
+        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+        tags: tags to filter formats (for instance ["geo", "fr] to run only the checks related to geo and French formats)
         limited_output: whether or not to return all possible types or only the most likely one for each column
         save_results: whether or not to save the results in a json file, or the path where to dump the output
         output_profile: whether or not to add the 'profile' field to the output
@@ -74,7 +73,7 @@ def routine(
         table=table,
         analysis=analysis,
         file_path=file_path,
-        user_input_tests=user_input_tests,
+        tags=tags,
         limited_output=limited_output,
         skipna=skipna,
         verbose=verbose,
@@ -107,7 +106,7 @@ def validate_then_detect(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    user_input_tests: str | list[str] = "ALL",
+    tags: list[str] | None = None,
     limited_output: bool = True,
     save_results: bool | str = True,
     skipna: bool = True,
@@ -117,6 +116,26 @@ def validate_then_detect(
     cast_json: bool = True,
     verbose: bool = False,
 ):
+    """
+    Performs a validation of the given file against the given analysis.
+    If the validation fails, performs a full analysis and return it.
+    Otherwise return the previous analysis (which is therefore still valid).
+    NB: if asked, the profile is recreated in both cases.
+
+    Args:
+        file_path: the path of the file to validate.
+        previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
+        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+        tags: tags to filter formats (for instance ["geo", "fr] to run only the checks related to geo and French formats)
+        limited_output: whether or not to return all possible types or only the most likely one for each column
+        save_results: whether or not to save the results in a json file, or the path where to dump the output
+        skipna: whether to ignore NaN values in the checks
+        output_profile: whether or not to add the 'profile' field to the output
+        output_schema: whether or not to add the 'schema' field to the output (tableschema)
+        output_df: whether or not to return the loaded DataFrame along with the analysis report
+        cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+        verbose: whether the code displays the steps it's going through
+    """
     if verbose:
         start_routine = time()
     if is_url(file_path):
@@ -140,7 +159,7 @@ def validate_then_detect(
         table=table,
         analysis=analysis,
         file_path=file_path,
-        user_input_tests=user_input_tests,
+        tags=tags,
         limited_output=limited_output,
         skipna=skipna,
         verbose=verbose,
csv_detective/format.py
@@ -0,0 +1,67 @@
+from typing import Any, Callable
+
+from csv_detective.parsing.text import header_score
+
+
+class Format:
+    def __init__(
+        self,
+        name: str,
+        func: Callable[[Any], bool],
+        _test_values: dict[bool, list[str]],
+        labels: list[str] = [],
+        proportion: float = 1,
+        tags: list[str] = [],
+    ) -> None:
+        """
+        Instanciates a Format object.
+
+        Args:
+            name: the name of the format.
+            func: the value test for the format (returns whether a string is valid).
+            _test_values: lists of valid and invalid values, used in the tests
+            labels: the list of hint headers for the header score
+            proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
+            tags: to allow users to submit a file to only a subset of formats
+        """
+        self.name: str = name
+        self.func: Callable = func
+        self._test_values: dict[bool, list[str]] = _test_values
+        self.labels: list[str] = labels
+        self.proportion: float = proportion
+        self.tags: list[str] = tags
+
+    def is_valid_label(self, val: str) -> float:
+        return header_score(val, self.labels)
+
+
+class FormatsManager:
+    formats: dict[str, Format]
+
+    def __init__(self) -> None:
+        import csv_detective.formats as formats
+
+        format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))]
+        self.formats = {
+            label: Format(
+                name=label,
+                func=(module := getattr(formats, label))._is,
+                _test_values=module._test_values,
+                **{
+                    attr: val
+                    for attr in ["labels", "proportion", "tags"]
+                    if (val := getattr(module, attr, None))
+                },
+            )
+            for label in format_labels
+        }
+
+    def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]:
+        return {
+            label: fmt
+            for label, fmt in self.formats.items()
+            if all(tag in fmt.tags for tag in tags)
+        }
+
+    def available_tags(self) -> set[str]:
+        return set(tag for format in self.formats.values() for tag in format.tags)
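As a rough illustration of how the FormatsManager added above is meant to be queried (tag names taken from the format modules shown later in this diff):

from csv_detective.format import FormatsManager

fmtm = FormatsManager()

# Every tag declared by at least one format module (e.g. "fr", "geo", "type").
print(fmtm.available_tags())

# get_formats_from_tags keeps only the formats carrying *all* requested tags,
# so ["fr", "geo"] selects French geographic formats such as code_departement.
for name, fmt in fmtm.get_formats_from_tags(["fr", "geo"]).items():
    # fmt.func is the module's _is check; fmt.proportion is the validity threshold.
    print(name, fmt.proportion, fmt.func("75"))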
csv_detective/formats/__init__.py
@@ -0,0 +1,9 @@
+import importlib
+import os
+
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith(".py") and not file.startswith("_"):
+        module_name = file[:-3]
+        module = importlib.import_module(f"csv_detective.formats.{module_name}")
+        globals()[module_name] = module
+        del module
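Given this loader and the conventions visible in the format modules below, a new format could presumably be added by dropping a module of this shape into csv_detective/formats/; the module name, regex and samples here are hypothetical, for illustration only.

# csv_detective/formats/code_exemple.py -- hypothetical module, not part of the package
import re

proportion = 1             # share of the column that must pass _is
tags = ["fr"]              # lets users select this format via the tags filter
labels = ["code exemple"]  # header hints used for the label score

_pattern = re.compile(r"^[A-Z]{2}\d{3}$")


def _is(val):
    return isinstance(val, str) and bool(_pattern.match(val))


# valid/invalid samples consumed by the test suite
_test_values = {
    True: ["AB123"],
    False: ["ab123", "AB12"],
}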
csv_detective/detect_fields/FR/geo/adresse/__init__.py → csv_detective/formats/adresse.py
@@ -1,100 +1,116 @@
-from csv_detective.parsing.text import _process_text
-
-PROPORTION = 0.55
-# ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long
-voies = {
-    "aire ",
-    "allee ",
-    "avenue ",
-    "base ",
-    "boulevard ",
-    "cami ",
-    "carrefour ",
-    "chemin ",
-    "cheminement ",
-    "chaussee ",
-    "cite ",
-    "clos ",
-    "coin ",
-    "corniche ",
-    "cote ",
-    "cour ",
-    "cours ",
-    "domaine ",
-    "descente ",
-    "ecart ",
-    "esplanade ",
-    "faubourg ",
-    "gare ",
-    "grande rue",
-    "hameau ",
-    "halle ",
-    "ilot ",
-    "impasse ",
-    "lieu dit",
-    "lotissement ",
-    "marche ",
-    "montee ",
-    "parc ",
-    "passage ",
-    "place ",
-    "plan ",
-    "plaine ",
-    "plateau ",
-    "pont ",
-    "port ",
-    "promenade ",
-    "parvis ",
-    "quartier ",
-    "quai ",
-    "residence ",
-    "ruelle ",
-    "rocade ",
-    "rond point",
-    "route ",
-    "rue ",
-    # 'sente - sentier',
-    "square ",
-    "tour ",
-    # 'terre-plein',
-    "traverse ",
-    "villa ",
-    "village ",
-    "voie ",
-    "zone artisanale",
-    "zone d’amenagement concerte",
-    "zone d’amenagement differe",
-    "zone industrielle",
-    "zone ",
-    # 'r',
-    "av ",
-    "pl ",
-    "bd ",
-    "cami ",
-    # 'che',
-    "chs ",
-    "dom ",
-    "ham ",
-    "ld ",
-    # 'pro',
-    # 'rte',
-    "vlge ",
-    "za ",
-    "zac ",
-    "zad ",
-    "zi ",
-    # 'car',
-    "fg ",
-    # 'lot',
-    "imp ",
-    # 'qu',
-    "mte",
-}
-
-
-def _is(val):
-    """Repere des adresses"""
-    if not isinstance(val, str) or len(val) > 150:
-        return False
-    val = _process_text(val)
-    return any(x in val for x in voies)
+from csv_detective.parsing.text import _process_text
+
+proportion = 0.55
+tags = ["fr", "geo"]
+labels = [
+    "adresse",
+    "localisation",
+    "adresse postale",
+    "adresse geographique",
+    "adr",
+    "adresse complete",
+    "adresse station",
+]
+
+voies = {
+    "aire ",
+    "allee ",
+    "avenue ",
+    "base ",
+    "boulevard ",
+    "cami ",
+    "carrefour ",
+    "chemin ",
+    "cheminement ",
+    "chaussee ",
+    "cite ",
+    "clos ",
+    "coin ",
+    "corniche ",
+    "cote ",
+    "cour ",
+    "cours ",
+    "domaine ",
+    "descente ",
+    "ecart ",
+    "esplanade ",
+    "faubourg ",
+    "gare ",
+    "grande rue",
+    "hameau ",
+    "halle ",
+    "ilot ",
+    "impasse ",
+    "lieu dit",
+    "lotissement ",
+    "marche ",
+    "montee ",
+    "parc ",
+    "passage ",
+    "place ",
+    "plan ",
+    "plaine ",
+    "plateau ",
+    "pont ",
+    "port ",
+    "promenade ",
+    "parvis ",
+    "quartier ",
+    "quai ",
+    "residence ",
+    "ruelle ",
+    "rocade ",
+    "rond point",
+    "route ",
+    "rue ",
+    # 'sente - sentier',
+    "square ",
+    "tour ",
+    # 'terre-plein',
+    "traverse ",
+    "villa ",
+    "village ",
+    "voie ",
+    "zone artisanale",
+    "zone d’amenagement concerte",
+    "zone d’amenagement differe",
+    "zone industrielle",
+    "zone ",
+    # 'r',
+    "av ",
+    "pl ",
+    "bd ",
+    "cami ",
+    # 'che',
+    "chs ",
+    "dom ",
+    "ham ",
+    "ld ",
+    # 'pro',
+    # 'rte',
+    "vlge ",
+    "za ",
+    "zac ",
+    "zad ",
+    "zi ",
+    # 'car',
+    "fg ",
+    # 'lot',
+    "imp ",
+    # 'qu',
+    "mte",
+}
+
+
+def _is(val):
+    """Repere des adresses"""
+    if not isinstance(val, str) or len(val) > 150:
+        return False
+    val = _process_text(val)
+    return any(x in val for x in voies)
+
+
+_test_values = {
+    True: ["rue du martyr"],
+    False: ["un batiment"],
+}
csv_detective/detect_fields/other/booleen/__init__.py → csv_detective/formats/booleen.py
@@ -1,27 +1,35 @@
-PROPORTION = 1
-bool_mapping = {
-    "1": True,
-    "0": False,
-    "vrai": True,
-    "faux": False,
-    "true": True,
-    "false": False,
-    "oui": True,
-    "non": False,
-    "yes": True,
-    "no": False,
-    "y": True,
-    "n": False,
-    "o": True,
-}
-
-liste_bool = set(bool_mapping.keys())
-
-
-def bool_casting(val: str) -> bool:
-    return bool_mapping.get(val.lower())
-
-
-def _is(val: str) -> bool:
-    """Détecte les booléens"""
-    return isinstance(val, str) and val.lower() in liste_bool
+proportion = 1
+tags = ["type"]
+labels = ["is ", "has ", "est "]
+
+bool_mapping = {
+    "1": True,
+    "0": False,
+    "vrai": True,
+    "faux": False,
+    "true": True,
+    "false": False,
+    "oui": True,
+    "non": False,
+    "yes": True,
+    "no": False,
+    "y": True,
+    "n": False,
+    "o": True,
+}
+
+liste_bool = set(bool_mapping.keys())
+
+
+def bool_casting(val: str) -> bool:
+    return bool_mapping.get(val.lower())
+
+
+def _is(val):
+    return isinstance(val, str) and val.lower() in liste_bool
+
+
+_test_values = {
+    True: ["oui", "0", "1", "yes", "false", "True"],
+    False: ["nein", "ja", "2", "-0"],
+}
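The _test_values dictionaries that every format module now carries could plausibly be exercised by a single parametrized test along these lines; this is only a sketch, the real tests/test_fields.py may be organized differently.

import pytest

from csv_detective.format import FormatsManager

_formats = FormatsManager().formats


@pytest.mark.parametrize("name", sorted(_formats))
def test_format_test_values(name):
    fmt = _formats[name]
    for expected, values in fmt._test_values.items():
        for value in values:
            # every listed sample must match its declared validity
            assert bool(fmt.func(value)) is expected, f"{name}: {value!r}"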
csv_detective/formats/code_commune_insee.py
@@ -0,0 +1,26 @@
+from frformat import CodeCommuneInsee, Millesime
+
+proportion = 0.75
+tags = ["fr", "geo"]
+labels = [
+    "code commune insee",
+    "code insee",
+    "codes insee",
+    "code commune",
+    "code insee commune",
+    "insee",
+    "code com",
+    "com",
+]
+
+_code_commune_insee = CodeCommuneInsee(Millesime.LATEST)
+
+
+def _is(val):
+    return isinstance(val, str) and _code_commune_insee.is_valid(val)
+
+
+_test_values = {
+    True: ["91471", "01053"],
+    False: ["914712", "01000"],
+}
csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py → csv_detective/formats/code_csp_insee.py
@@ -1,29 +1,36 @@
-import re
-
-from csv_detective.parsing.text import _process_text
-
-PROPORTION = 1
-
-
-def _is(val):
-    """Repère les code csp telles que définies par l'INSEE"""
-    if not isinstance(val, str):
-        return False
-    val = _process_text(val)
-    if len(val) != 4:
-        return False
-    a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val))
-    b = val in {
-        "7100",
-        "7200",
-        "7400",
-        "7500",
-        "7700",
-        "7800",
-        "8100",
-        "8300",
-        "8400",
-        "8500",
-        "8600",
-    }
-    return a or b
+import re
+
+from csv_detective.parsing.text import _process_text
+
+proportion = 1
+tags = ["fr"]
+labels = ["code csp insee", "code csp"]
+
+
+def _is(val):
+    if not isinstance(val, str):
+        return False
+    val = _process_text(val)
+    if len(val) != 4:
+        return False
+    a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val))
+    b = val in {
+        "7100",
+        "7200",
+        "7400",
+        "7500",
+        "7700",
+        "7800",
+        "8100",
+        "8300",
+        "8400",
+        "8500",
+        "8600",
+    }
+    return a or b
+
+
+_test_values = {
+    True: ["121f"],
+    False: ["121x"],
+}
csv_detective/detect_fields/FR/geo/code_departement/__init__.py → csv_detective/formats/code_departement.py
@@ -1,15 +1,29 @@
-from frformat import Millesime, NumeroDepartement, Options
-
-PROPORTION = 1
-
-_options = Options(
-    ignore_case=True,
-    ignore_accents=True,
-    replace_non_alphanumeric_with_space=True,
-    ignore_extra_whitespace=True,
-)
-_numero_departement = NumeroDepartement(Millesime.LATEST, _options)
-
-
-def _is(val):
-    return isinstance(val, str) and _numero_departement.is_valid(val)
+from frformat import Millesime, NumeroDepartement, Options
+
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "code departement",
+    "code_departement",
+    "dep",
+    "departement",
+    "dept",
+]
+
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+_numero_departement = NumeroDepartement(Millesime.LATEST, _options)
+
+
+def _is(val):
+    return isinstance(val, str) and _numero_departement.is_valid(val)
+
+
+_test_values = {
+    True: ["75", "2A", "2b", "974", "01"],
+    False: ["00", "96", "101"],
+}