PyPI - csv-detective - Versions diffs - 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1228__py3-none-any.whl - Mend

csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1228__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

csv_detective/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
 from .explore_csv import routine, routine_minio  # noqa
-from .create_example import create_example_csv_file  # noqa
+from .output.example import create_example_csv_file  # noqa
 __version__ = '0.7.5.dev'

csv_detective/detect_fields/FR/geo/adresse/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.55
 # ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long

csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 import re
 PROPORTION = 1

csv_detective/detect_fields/FR/other/csp_insee/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from os.path import dirname, join
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 1
 f = open(join(dirname(__file__), 'csp_insee.txt'), 'r')

csv_detective/detect_fields/FR/other/insee_ape700/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from os.path import dirname, join
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 1
 f = open(join(dirname(__file__), 'insee_ape700.txt'), 'r')

csv_detective/detect_fields/FR/other/sexe/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 1

csv_detective/detect_fields/other/float/__init__.py CHANGED Viewed

@@ -12,7 +12,7 @@ def _is(val):
         if (
             not isinstance(val, str)
             or any([k in val for k in ['_', '+', 'e', 'E']])
-            or (val.startswith('0') and len(val) > 1)
+            or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
         ):
             return False
         float_casting(val)

csv_detective/detect_labels/FR/geo/adresse/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/code_departement/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/code_postal/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/code_region/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/commune/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/departement/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/insee_canton/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/pays/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/geo/region/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/code_rna/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/code_waldec/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/csp_insee/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/date_fr/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/insee_ape700/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/sexe/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/siren/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/siret/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/tel_fr/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/other/uai/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/geo/json_geojson/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/geo/latitude_wgs/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/geo/latlon_wgs/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/geo/longitude_wgs/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/other/booleen/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/other/email/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/other/float/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/other/int/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/other/mongo_object_id/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/other/twitter/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/other/url/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/other/uuid/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/temp/date/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/temp/datetime_iso/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/temp/datetime_rfc822/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detect_labels/temp/year/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from csv_detective.utils import full_word_strictly_inside_string
-from csv_detective.process_text import _process_text
+from csv_detective.parsing.text import _process_text
 PROPORTION = 0.5

csv_detective/detection/columns.py ADDED Viewed

@@ -0,0 +1,89 @@
+import logging
+from typing import TextIO
+from time import time
+from csv_detective.utils import display_logs_depending_process_time
+def detect_extra_columns(file: TextIO, sep: str):
+    """Count the trailing separators shared by the first 10 lines of `file`.
+
+    The stream is rewound to its start before reading. Lines are inspected
+    until 10 have been read or the end of the file is reached, whichever
+    comes first. A blank line counts as a line without trailing separator.
+
+    Args:
+        file: text stream to inspect.
+        sep: column separator, compared character by character.
+
+    Returns:
+        A `(nb_useless_col, retour)` tuple: the smallest number of trailing
+        separators found on the inspected lines (0 as soon as one line has
+        none, or when the file is empty), and whether a line ending with a
+        newline had been seen when the function returned.
+    """
+    file.seek(0)
+    retour = False
+    nb_useless_col = None
+    for _ in range(10):
+        line = file.readline()
+        if not line:
+            # readline() returns "" at end of file: shorter files are valid input
+            break
+        has_newline = line.endswith("\n")
+        retour = retour or has_newline
+        # Only drop the last character when it really is the line terminator,
+        # so a final line without newline keeps its last separator
+        content = line[:-1] if has_newline else line
+        # Count the separators found at the very end of the line
+        k = 0
+        for sign in reversed(content):
+            if sign != sep:
+                break
+            k += 1
+        if k == 0:
+            return 0, retour
+        nb_useless_col = k if nb_useless_col is None else min(k, nb_useless_col)
+    if nb_useless_col is None:
+        # No line could be read at all (empty file)
+        return 0, retour
+    return nb_useless_col, retour
+def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int:
+    """Count the empty heading columns shared by the first 10 lines of `file`.
+
+    The stream is rewound to its start before reading. Lines are inspected
+    until 10 have been read or the end of the file is reached, whichever
+    comes first.
+
+    Args:
+        file: text stream to inspect.
+        sep: column separator.
+        verbose: when True, log the outcome and the time it took.
+
+    Returns:
+        The smallest number of leading separators found on the inspected
+        lines; 0 as soon as one line has none, or when the file is empty.
+    """
+    if verbose:
+        start = time()
+        logging.info("Detecting heading columns")
+    file.seek(0)
+    return_int = None
+    for _ in range(10):
+        line = file.readline()
+        if not line:
+            # readline() returns "" at end of file; it must not be counted as
+            # a line with 0 heading column, otherwise short files always yield 0
+            break
+        # lstrip (not strip): trailing separators of a last line that has no
+        # newline must not be counted as heading columns
+        leading = len(line) - len(line.lstrip(sep))
+        return_int = leading if return_int is None else min(return_int, leading)
+        if return_int == 0:
+            break
+    if not return_int:
+        # Either a line without heading separator was found, or the file is empty
+        if verbose:
+            display_logs_depending_process_time(
+                f'No heading column detected in {round(time() - start, 3)}s',
+                time() - start,
+            )
+        return 0
+    if verbose:
+        display_logs_depending_process_time(
+            f'{return_int} heading columns detected in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return return_int
+def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
+    """Count the empty trailing columns shared by the first 10 lines of `file`.
+
+    The stream is rewound to its start before reading. Lines are inspected
+    until 10 have been read or the end of the file is reached, whichever
+    comes first.
+
+    Args:
+        file: text stream to inspect.
+        sep: column separator.
+        heading_columns: number of empty heading columns already detected;
+            used to avoid counting them twice on rows made only of separators.
+        verbose: when True, log the outcome and the time it took.
+
+    Returns:
+        The smallest number of trailing separators found on the inspected
+        lines; 0 as soon as one line has none, or when the file is empty.
+        The result is never negative.
+    """
+    if verbose:
+        start = time()
+        logging.info("Detecting trailing columns")
+    file.seek(0)
+    return_int = None
+    for _ in range(10):
+        line = file.readline()
+        if not line:
+            # readline() returns "" at end of file; it must not be counted as
+            # a line, otherwise short files yield 0 or a negative count
+            break
+        content = line.replace("\n", "")
+        # Count separators at the end of the row only, so rows with more
+        # leading separators than `heading_columns` are not over-counted
+        trailing = len(content) - len(content.rstrip(sep))
+        if trailing == len(content):
+            # The row only contains separators (or is blank): the heading
+            # ones are not trailing columns
+            trailing = max(trailing - heading_columns, 0)
+        return_int = trailing if return_int is None else min(return_int, trailing)
+        if return_int == 0:
+            break
+    if not return_int:
+        # Either a line without trailing separator was found, or the file is empty
+        if verbose:
+            display_logs_depending_process_time(
+                f'No trailing column detected in {round(time() - start, 3)}s',
+                time() - start,
+            )
+        return 0
+    if verbose:
+        display_logs_depending_process_time(
+            f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return return_int

csv_detective/detection/encoding.py ADDED Viewed

@@ -0,0 +1,27 @@
+import logging
+from time import time
+from io import BytesIO
+from cchardet import detect
+from csv_detective.utils import display_logs_depending_process_time
+def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
+    """
+    Guess the encoding of a binary stream with faust-cchardet
+    (forked from the original cchardet).
+
+    Args:
+        binary_file: binary stream; it is read from its current position to the end.
+        verbose: when True, log the detected encoding, the confidence and the time it took.
+
+    Returns:
+        The name of the detected encoding.
+
+    Raises:
+        ValueError: if no encoding could be detected.
+    """
+    if verbose:
+        start = time()
+        logging.info("Detecting encoding")
+    raw_content = binary_file.read()
+    detection = detect(raw_content)
+    encoding = detection["encoding"]
+    if not encoding:
+        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
+    if not verbose:
+        return encoding
+    elapsed = round(time() - start, 3)
+    confidence = round(detection["confidence"]*100)
+    message = f'Detected encoding: "{encoding}" in {elapsed}s (confidence: {confidence}%)'
+    display_logs_depending_process_time(message, time() - start)
+    return encoding

csv_detective/detection/engine.py ADDED Viewed

@@ -0,0 +1,46 @@
+from time import time
+from typing import Optional
+import magic
+import requests
+from csv_detective.utils import display_logs_depending_process_time, is_url
+# Engine names for compressed files (labelled "csv.gz" in engine_to_file).
+COMPRESSION_ENGINES = ["gzip"]
+# Engine names for spreadsheet files (labelled "Excel", "old Excel" and
+# "OpenOffice" in engine_to_file).
+EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
+# Human-readable file kind for each engine; detect_engine uses it to build
+# its verbose log message.
+engine_to_file = {
+    "openpyxl": "Excel",
+    "xlrd": "old Excel",
+    "odf": "OpenOffice",
+    "gzip": "csv.gz",
+}
+def detect_engine(file_path: str, verbose=False) -> Optional[str]:
+    """Pick the engine to read `file_path` with, based on its MIME type.
+
+    Args:
+        file_path: local path or URL of the file; a URL is downloaded and
+            its content is sniffed, a local path is sniffed in place.
+        verbose: when True, log the outcome and the time it took.
+
+    Returns:
+        The engine name matching the detected MIME type, or None when the
+        MIME type is not in the table (the file is then processed as a csv).
+    """
+    if verbose:
+        start = time()
+    mime_to_engine = {
+        "application/gzip": "gzip",
+        "application/x-gzip": "gzip",
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
+        'application/vnd.ms-excel': 'xlrd',
+        'application/vnd.oasis.opendocument.spreadsheet': 'odf',
+        # all these files could be recognized as zip, may need to check all cases then
+        'application/zip': 'openpyxl',
+    }
+    if is_url(file_path):
+        # NOTE(review): requests.get is called without a timeout and the whole
+        # response body is downloaded before sniffing - confirm this is intended
+        mime_type = magic.from_buffer(requests.get(file_path).content, mime=True)
+    else:
+        mime_type = magic.from_file(file_path, mime=True)
+    # if the MIME type is none of the above, we move forwards with the csv process
+    engine = mime_to_engine.get(mime_type)
+    if verbose:
+        if engine:
+            message = f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
+        else:
+            message = "Processing the file as a csv"
+        display_logs_depending_process_time(message, time() - start)
+    return engine

csv_detective/detection/headers.py ADDED Viewed

@@ -0,0 +1,32 @@
+import logging
+from time import time
+from typing import Optional, TextIO
+from csv_detective.utils import display_logs_depending_process_time
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
+    """Look for the header row among the first 10 rows (in case header is not 1st row).
+
+    The stream is rewound to its start before reading. A row is accepted as
+    the header when it has at least one non-empty cell and differs from the
+    row that follows it. Blank rows and rows made only of separators are
+    skipped, and the search stops at the end of the file.
+
+    Args:
+        file: text stream to inspect.
+        sep: column separator.
+        verbose: when True, log the outcome and the time it took.
+
+    Returns:
+        A `(position, cells)` tuple: the 0-based index of the header line
+        (skipped lines are counted) and its non-empty cells, or `(0, None)`
+        when no header was found.
+    """
+    if verbose:
+        start = time()
+        logging.info("Detecting headers")
+    file.seek(0)
+    for i in range(10):
+        header = file.readline()
+        if not header:
+            # readline() returns "" at end of file: nothing left to inspect
+            break
+        position = file.tell()
+        chaine = [c for c in header.replace("\n", "").split(sep) if c]
+        if not chaine:
+            # No usable cell on this row; indexing chaine[-1] would raise IndexError
+            continue
+        # Peek at the following row, then come back right after the candidate
+        next_row = file.readline()
+        file.seek(position)
+        if header != next_row:
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Detected headers in {round(time() - start, 3)}s',
+                    time() - start,
+                )
+            return i, chaine
+    if verbose:
+        logging.info('No header detected')
+    return 0, None

csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1228__py3-none-any.whl

csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1228__py3-none-any.whl