csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2241__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +2 -1
- csv_detective/detection/engine.py +1 -1
- csv_detective/detection/formats.py +39 -95
- csv_detective/detection/variables.py +2 -2
- csv_detective/explore_csv.py +5 -7
- csv_detective/load_tests.py +11 -4
- csv_detective/output/__init__.py +8 -4
- csv_detective/output/dataframe.py +37 -0
- csv_detective/output/example.py +3 -1
- csv_detective/output/profile.py +65 -21
- csv_detective/parsing/columns.py +133 -35
- csv_detective/parsing/csv.py +26 -23
- csv_detective/parsing/load.py +21 -8
- csv_detective/validate.py +86 -40
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/METADATA +29 -6
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/RECORD +24 -24
- tests/test_fields.py +9 -13
- tests/test_file.py +85 -35
- tests/test_structure.py +4 -1
- tests/test_validation.py +9 -4
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/top_level.txt +0 -0
csv_detective/parsing/columns.py
CHANGED
@@ -3,10 +3,13 @@ from time import time
 from typing import Callable

 import pandas as pd
+from more_itertools import peekable

+from csv_detective.parsing.csv import CHUNK_SIZE
 from csv_detective.utils import display_logs_depending_process_time

-
+# above this threshold, a column is not considered categorical
+MAX_NUMBER_CATEGORICAL_VALUES = 25


 def test_col_val(
@@ -34,28 +37,24 @@ def test_col_val(
     serie = serie[serie.notnull()]
     ser_len = len(serie)
     if ser_len == 0:
-
+        # being here means the whole column is NaN, so if skipna it's a pass
+        return 1.0 if skipna else 0.0
     if not limited_output:
         result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
         return result if result >= proportion else 0.0
     else:
-        if proportion == 1:
+        if proportion == 1:
+            # early stops (1 then 5 rows) to not waste time if directly unsuccessful
             for _range in [
                 min(1, ser_len),
                 min(5, ser_len),
                 ser_len,
-            ]:
-
-                if all(apply_test_func(serie, test_func, _range)):
-                    pass
-                else:
+            ]:
+                if not all(apply_test_func(serie, test_func, _range)):
                     return 0.0
             return 1.0
         else:
-
-            # (arbitrary number) and get a significant result
-            to_analyse = min(ser_len, MAX_ROWS_ANALYSIS)
-            result = apply_test_func(serie, test_func, to_analyse).sum() / to_analyse
+            result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
             return result if result >= proportion else 0.0
     finally:
         if verbose and time() - start > 3:
@@ -81,7 +80,7 @@ def test_col_label(

 def test_col(
     table: pd.DataFrame,
-    all_tests:
+    all_tests: dict[str, dict],
     limited_output: bool,
     skipna: bool = True,
     verbose: bool = False,
@@ -89,25 +88,18 @@ def test_col(
     if verbose:
         start = time()
         logging.info("Testing columns to get types")
-    test_funcs = {
-        test.__name__.split(".")[-1]: {
-            "func": test._is,
-            "prop": test.PROPORTION,
-        }
-        for test in all_tests
-    }
     return_table = pd.DataFrame(columns=table.columns)
-    for idx, (
+    for idx, (name, attributes) in enumerate(all_tests.items()):
         if verbose:
             start_type = time()
-            logging.info(f"\t- Starting with type '{
+            logging.info(f"\t- Starting with type '{name}'")
         # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
         # => the following needs to change, "apply" means all columns are tested for one type at once
-        return_table.loc[
+        return_table.loc[name] = table.apply(
             lambda serie: test_col_val(
                 serie,
-
-
+                attributes["func"],
+                attributes["prop"],
                 skipna=skipna,
                 limited_output=limited_output,
                 verbose=verbose,
@@ -115,7 +107,7 @@ def test_col(
         )
         if verbose:
             display_logs_depending_process_time(
-                f'\t> Done with type "{
+                f'\t> Done with type "{name}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})',
                 time() - start_type,
             )
     if verbose:
@@ -125,26 +117,24 @@ def test_col(
     return return_table


-def test_label(
+def test_label(
+    columns: list[str], all_tests: dict[str, dict], limited_output: bool, verbose: bool = False
+):
     if verbose:
         start = time()
         logging.info("Testing labels to get types")
-    test_funcs = dict()
-    for test in all_tests:
-        name = test.__name__.split(".")[-1]
-        test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}

-    return_table = pd.DataFrame(columns=
-    for idx, (key, value) in enumerate(
+    return_table = pd.DataFrame(columns=columns)
+    for idx, (key, value) in enumerate(all_tests.items()):
         if verbose:
             start_type = time()
         return_table.loc[key] = [
             test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output)
-            for col_name in
+            for col_name in columns
         ]
         if verbose:
             display_logs_depending_process_time(
-                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(
+                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})',
                 time() - start_type,
             )
     if verbose:
@@ -152,3 +142,111 @@ def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbo
             f"Done testing labels in {round(time() - start, 3)}s", time() - start
         )
     return return_table
+
+
+def test_col_chunks(
+    table: pd.DataFrame,
+    file_path: str,
+    analysis: dict,
+    all_tests: list,
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, dict, dict[str, pd.Series]]:
+    def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]:
+        return {
+            col: [test for test in return_table.index if return_table.loc[test, col] > 0]
+            for col in return_table.columns
+        }
+
+    if verbose:
+        start = time()
+        logging.info("Testing columns to get types on chunks")
+
+    # analysing the sample to get a first guess
+    return_table = test_col(table, all_tests, limited_output, skipna=skipna, verbose=verbose)
+    remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = table.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: table[col].value_counts(dropna=False) for col in table.columns}
+
+    # only csv files can end up here, can't chunk excel
+    chunks = pd.read_csv(
+        file_path,
+        dtype=str,
+        encoding=analysis["encoding"],
+        sep=analysis["separator"],
+        skiprows=analysis["header_row_idx"],
+        compression=analysis.get("compression"),
+        chunksize=CHUNK_SIZE,
+    )
+    analysis["total_lines"] = CHUNK_SIZE
+    batch, batch_number = [], 1
+    iterator = peekable(enumerate(chunks))
+    while iterator:
+        idx, chunk = next(iterator)
+        if idx == 0:
+            # we have read and analysed the first chunk already
+            continue
+        if len(batch) < 10:
+            # it's too slow to process chunks directly, but we want to keep the first analysis
+            # on a "small" chunk, so partial analyses are done on batches of chunks
+            batch.append(chunk)
+            # we don't know when the chunks end, and doing one additionnal step
+            # for the final batch is ugly
+            try:
+                iterator.peek()
+                continue
+            except StopIteration:
+                pass
+        if verbose:
+            logging.info(f"> Testing batch number {batch_number}")
+        batch = pd.concat(batch, ignore_index=True)
+        analysis["total_lines"] += len(batch)
+        row_hashes_count = row_hashes_count.add(
+            batch.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
+        )
+        for col in batch.columns:
+            col_values[col] = col_values[col].add(
+                batch[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        if not any(remaining_tests for remaining_tests in remaining_tests_per_col.values()):
+            # no more potential tests to do on any column, early stop
+            break
+        for col, tests in remaining_tests_per_col.items():
+            # testing each column with the tests that are still competing
+            # after previous batchs analyses
+            for test in tests:
+                batch_col_test = test_col_val(
+                    batch[col],
+                    all_tests[test]["func"],
+                    all_tests[test]["prop"],
+                    limited_output=limited_output,
+                    skipna=skipna,
+                )
+                return_table.loc[test, col] = (
+                    # if this batch's column tested 0 then test fails overall
+                    0
+                    if batch_col_test == 0
+                    # otherwise updating the score with weighted average
+                    else ((return_table.loc[test, col] * idx + batch_col_test) / (idx + 1))
+                )
+        remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+        batch, batch_number = [], batch_number + 1
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
+    # handling that empty columns score 1 everywhere
+    for col in return_table.columns:
+        if sum(return_table[col]) == len(return_table):
+            return_table[col] = 0
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing chunks in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table, analysis, col_values
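For readers of the new `test_col_chunks` above: per-column scores are refined batch by batch, and a single zero-scoring batch eliminates a format for that column. Below is a minimal standalone sketch of that running-average update; `update_score` and the sample scores are made up for illustration and are not part of the package.

```python
def update_score(current: float, batch_score: float, idx: int) -> float:
    # a batch scoring 0 eliminates the format for that column altogether
    if batch_score == 0:
        return 0.0
    # otherwise fold the batch into a running weighted average, as in
    # (return_table.loc[test, col] * idx + batch_col_test) / (idx + 1)
    return (current * idx + batch_score) / (idx + 1)

score = 1.0  # hypothetical score obtained on the first chunk
for idx, batch_score in enumerate([0.9, 1.0, 0.8], start=1):
    score = update_score(score, batch_score, idx)
    print(f"after batch {idx}: {score:.3f}")
print(update_score(score, 0.0, 4))  # a failing batch drops the format: 0.0
```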
csv_detective/parsing/csv.py
CHANGED
@@ -1,11 +1,14 @@
 import logging
 from time import time
-from typing import TextIO
+from typing import Optional, TextIO

 import pandas as pd

 from csv_detective.utils import display_logs_depending_process_time

+# the number of rows for the first analysis, and the number of rows per chunk of the df iterator
+CHUNK_SIZE = int(1e4)
+

 def parse_csv(
     the_file: TextIO,
@@ -15,36 +18,36 @@ def parse_csv(
     skiprows: int,
     random_state: int = 42,
     verbose: bool = False,
-) -> tuple[pd.DataFrame, int, int]:
+) -> tuple[pd.DataFrame, Optional[int], Optional[int]]:
     if verbose:
         start = time()
         logging.info("Parsing table")
-    table = None

     if not isinstance(the_file, str):
         the_file.seek(0)

-
-
-
-
-
-
-
-
-
-
+    try:
+        table = pd.read_csv(
+            the_file,
+            sep=sep,
+            dtype=str,
+            encoding=encoding,
+            skiprows=skiprows,
+            nrows=CHUNK_SIZE,
+        )
+        total_lines = len(table)
+        # branch between small and big files starts here
+        if total_lines == CHUNK_SIZE:
+            if verbose:
+                logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
+            total_lines, nb_duplicates = None, None
+        else:
             nb_duplicates = len(table.loc[table.duplicated()])
-
-
-
-
-
-        except TypeError:
-            print("Trying encoding : {encoding}".format(encoding=encoding))
-
-    if table is None:
-        raise ValueError("Could not load file")
+        if num_rows > 0:
+            num_rows = min(num_rows, total_lines or len(table))
+            table = table.sample(num_rows, random_state=random_state)
+    except Exception as e:
+        raise ValueError("Could not load file") from e
     if verbose:
         display_logs_depending_process_time(
             f"Table parsed successfully in {round(time() - start, 3)}s",
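The reworked `parse_csv` reads at most `CHUNK_SIZE` rows and returns `None` for `total_lines` and `nb_duplicates` when that cap is hit, which signals the chunked analysis path downstream. A self-contained sketch of that contract follows; the `parse_sample` helper and the inline CSV are illustrative, not the package's code.

```python
from io import StringIO

import pandas as pd

CHUNK_SIZE = int(1e4)  # mirrors csv_detective.parsing.csv.CHUNK_SIZE

def parse_sample(file_like, sep: str = ","):
    # read at most CHUNK_SIZE rows; hitting the cap means the file may be bigger
    table = pd.read_csv(file_like, sep=sep, dtype=str, nrows=CHUNK_SIZE)
    if len(table) == CHUNK_SIZE:
        return table, None, None  # totals unknown: switch to chunked analysis
    return table, len(table), len(table.loc[table.duplicated()])

table, total_lines, nb_duplicates = parse_sample(StringIO("a;b\n1;x\n2;y\n2;y\n"), sep=";")
print(total_lines, nb_duplicates)  # 3 1
```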
csv_detective/parsing/load.py
CHANGED
@@ -44,6 +44,8 @@ def load_file(
             sheet_name=sheet_name,
             verbose=verbose,
         )
+        if table.empty:
+            raise ValueError("Table seems to be empty")
         header = table.columns.to_list()
         analysis = {
             "engine": engine,
@@ -66,34 +68,45 @@ def load_file(
         binary_file.seek(0)
         # decoding and reading file
         if is_url(file_path) or engine in COMPRESSION_ENGINES:
-            str_file = StringIO(
+            str_file = StringIO()
+            while True:
+                chunk = binary_file.read(1024**2)
+                if not chunk:
+                    break
+                str_file.write(chunk.decode(encoding=encoding))
+            del binary_file
+            str_file.seek(0)
         else:
             str_file = open(file_path, "r", encoding=encoding)
         if sep is None:
             sep = detect_separator(str_file, verbose=verbose)
         header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None:
-
-        elif isinstance(header, list):
-            if any([x is None for x in header]):
-                return {"error": True}
+        if header is None or (isinstance(header, list) and any([h is None for h in header])):
+            raise ValueError("Could not retrieve headers")
         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
         table, total_lines, nb_duplicates = parse_csv(
             str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
         )
+        del str_file
+        if table.empty:
+            raise ValueError("Table seems to be empty")
         analysis = {
             "encoding": encoding,
             "separator": sep,
             "heading_columns": heading_columns,
             "trailing_columns": trailing_columns,
         }
+        if engine is not None:
+            analysis["compression"] = engine
         analysis.update(
             {
                 "header_row_idx": header_row_idx,
                 "header": header,
-                "total_lines": total_lines,
-                "nb_duplicates": nb_duplicates,
             }
         )
+        if total_lines is not None:
+            analysis["total_lines"] = total_lines
+        if nb_duplicates is not None:
+            analysis["nb_duplicates"] = nb_duplicates
     return table, analysis
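The `StringIO` change above replaces a one-shot decode of the whole binary file with an incremental one. Here is a standalone sketch of the same pattern, using an in-memory `BytesIO` in place of a downloaded or decompressed file:

```python
from io import BytesIO, StringIO

binary_file = BytesIO("col1;col2\n1;foo\n2;bar\n".encode("utf-8"))

str_file = StringIO()
while True:
    chunk = binary_file.read(1024**2)  # 1 MiB of raw bytes per iteration
    if not chunk:
        break
    str_file.write(chunk.decode(encoding="utf-8"))
del binary_file  # free the raw buffer as early as possible
str_file.seek(0)  # rewind so separator/header detection can re-read the text

print(str_file.readline().strip())  # col1;col2
```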
csv_detective/validate.py
CHANGED
@@ -3,76 +3,121 @@ import logging
 import pandas as pd

 from csv_detective.load_tests import return_all_tests
-from csv_detective.parsing.columns import test_col_val
-from csv_detective.parsing.load import load_file
+from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val

+VALIDATION_CHUNK_SIZE = int(1e5)
 logging.basicConfig(level=logging.INFO)

-tests =
-    t.__name__.split(".")[-1]: {
-        "func": t._is,
-        "prop": t.PROPORTION,
-    }
-    for t in return_all_tests("ALL", "detect_fields")
-}
+tests = return_all_tests("ALL", "detect_fields")


 def validate(
     file_path: str,
     previous_analysis: dict,
-    num_rows: int = 500,
-    encoding: str | None = None,
-    sep: str | None = None,
     verbose: bool = False,
     skipna: bool = True,
-
-) -> tuple[bool, pd.DataFrame | None, dict | None]:
+) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
     """
-    Verify is the given file has the same fields and types as in the
+    Verify is the given file has the same fields and types as in the given analysis.
     """
     try:
-
-
-
-
-
-
-
+        if previous_analysis.get("separator"):
+            # loading the table in chunks
+            chunks = pd.read_csv(
+                file_path,
+                dtype=str,
+                sep=previous_analysis["separator"],
+                encoding=previous_analysis["encoding"],
+                skiprows=previous_analysis["header_row_idx"],
+                compression=previous_analysis.get("compression"),
+                chunksize=VALIDATION_CHUNK_SIZE,
+            )
+            analysis = {
+                k: v
+                for k, v in previous_analysis.items()
+                if k
+                in ["encoding", "separator", "compression", "heading_columns", "trailing_columns"]
+                and v is not None
+            }
+        else:
+            # or chunks-like if not chunkable
+            chunks = iter(
+                [
+                    pd.read_excel(
+                        file_path,
+                        dtype=str,
+                        engine=previous_analysis["engine"],
+                        sheet_name=previous_analysis["sheet_name"],
+                    )
+                ]
+            )
+            analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
+        first_chunk = next(chunks)
+        analysis.update(
+            {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
         )
     except Exception as e:
         if verbose:
             logging.warning(f"> Could not load the file with previous analysis values: {e}")
-        return False, None, None
+        return False, None, None, None
     if verbose:
         logging.info("Comparing table with the previous analysis")
         logging.info("- Checking if all columns match")
-    if
-
+    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
+        list(first_chunk.columns)[k] != previous_analysis["header"][k]
+        for k in range(len(previous_analysis["header"]))
     ):
         if verbose:
             logging.warning("> Columns do not match, proceeding with full analysis")
-        return False, None, None
-
+        return False, None, None, None
+    if verbose:
+        logging.info(
+            f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
+        )
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = first_chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
+    analysis["total_lines"] = 0
+    for idx, chunk in enumerate([first_chunk, *chunks]):
         if verbose:
-            logging.info(f"
-
-
-
-
-            serie=table[col_name],
-            test_func=tests[args["format"]]["func"],
-            proportion=tests[args["format"]]["prop"],
-            skipna=skipna,
+            logging.info(f"> Testing chunk number {idx}")
+        analysis["total_lines"] += len(chunk)
+        row_hashes_count = row_hashes_count.add(
+            chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
         )
-
+        for col in chunk.columns:
+            col_values[col] = col_values[col].add(
+                chunk[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        for col_name, args in previous_analysis["columns"].items():
             if verbose:
-                logging.
-
+                logging.info(f"- Testing {col_name} for {args['format']}")
+            if args["format"] == "string":
+                # no test for columns that have not been recognized as a specific format
+                continue
+            test_result: float = test_col_val(
+                serie=chunk[col_name],
+                test_func=tests[args["format"]]["func"],
+                proportion=tests[args["format"]]["prop"],
+                skipna=skipna,
+            )
+            if not bool(test_result):
+                if verbose:
+                    logging.warning("> Test failed, proceeding with full analysis")
+                return False, first_chunk, analysis, None
     if verbose:
         logging.info("> All checks successful")
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
     return (
         True,
-
+        first_chunk,
         analysis
         | {
             k: previous_analysis[k]
@@ -84,4 +129,5 @@ def validate(
                 "formats",
             ]
         },
+        col_values,
     )
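`validate` now returns four elements instead of three. A hedged usage sketch follows; the fallback to a full `routine` run is one possible way to handle an invalid result, not something the package mandates, and `check_against_previous` is an illustrative helper.

```python
from csv_detective import routine, validate

def check_against_previous(file_path: str, previous_analysis: dict) -> dict:
    # validity flag, first chunk of the table, refreshed analysis, per-column value counts
    is_valid, first_chunk, analysis, col_values = validate(file_path, previous_analysis)
    if is_valid:
        # analysis now also carries total_lines, nb_duplicates and categorical columns
        return analysis
    # columns or formats no longer match: run the full detection again
    return routine(file_path=file_path, num_rows=-1)
```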
{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.3.dev2215
+Version: 0.9.3.dev2241
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
@@ -22,6 +22,7 @@ Requires-Dist: python-magic==0.4.27
 Requires-Dist: frformat==0.4.0
 Requires-Dist: Faker>=33.0.0
 Requires-Dist: rstr==3.2.2
+Requires-Dist: more-itertools>=10.8.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
@@ -30,7 +31,7 @@ Dynamic: license-file

 # CSV Detective

-This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks to see for each column if it matches with various content types.
+This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to see for each column if it matches with various content types.

 Currently supported file types: csv, xls, xlsx, ods.

@@ -50,7 +51,7 @@ pip install csv-detective

 Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:

-```
+```python
 # Import the csv_detective package
 from csv_detective import routine
 import os # for this example only
@@ -158,13 +159,26 @@ The program creates a `Python` dictionnary with the following information :
 ```

 The output slightly differs depending on the file format:
-- csv files have `encoding` and `separator`
+- csv files have `encoding` and `separator` (and `compression` if relevant)
 - xls, xls, ods files have `engine` and `sheet_name`

+You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
+- the analysis (as described above)
+- an iteror of `pd.DataFrame`s which contain the columns cast with the detected types (which can be used with `pd.concat` or in a loop):
+```python
+inspection, df_chunks = routine(
+    file_path=file_path,
+    num_rows=-1,
+    output_df=True,
+)
+cast_df = pd.concat(df_chunks, ignore_index=True)
+# if "col1" has been detected as a float, then cast_df["col1"] contains floats
+```
+
 ### What Formats Can Be Detected

 Includes :
-
+- types (float, int, dates, datetimes, JSON) and more specific (latitude, longitude, geoJSON...)
 - Communes, Départements, Régions, Pays
 - Codes Communes, Codes Postaux, Codes Departement, ISO Pays
 - Codes CSP, Description CSP, SIREN
@@ -172,6 +186,16 @@ Includes :
 - Years, Dates, Jours de la Semaine FR
 - UUIDs, Mongo ObjectIds

+### Validation
+If you have a pre-made analysis of a file, you can check whether an other file conforms to the same analysis:
+```python
+from csv_detective import validate
+is_valid, *_ = validate(
+    file_path,
+    previous_analysis, # exactly as it came out from the routine function
+)
+```
+
 ### Format detection and scoring
 For each column, 3 scores are computed for each format, the higher the score, the more likely the format:
 - the field score based on the values contained in the column (0.0 to 1.0).
@@ -199,7 +223,6 @@ Only the format with highest score is present in the output.
 Related ideas:

 - store column names to make a learning model based on column names for (possible pre-screen)
-- normalising data based on column prediction
 - entity resolution (good luck...)

 ## Why Could This Be of Any Use ?
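The README addition above shows the `pd.concat` way of consuming `df_chunks` and mentions processing the chunks in a loop. A small sketch of that loop variant; the `count_rows_per_chunk` helper is illustrative only.

```python
from csv_detective import routine

def count_rows_per_chunk(file_path: str) -> list[int]:
    inspection, df_chunks = routine(file_path=file_path, num_rows=-1, output_df=True)
    sizes = []
    for chunk in df_chunks:
        # each chunk is a pd.DataFrame whose columns are already cast to the detected types
        sizes.append(len(chunk))
    return sizes
```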