PyPI - csv-detective - Versions diffs - 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl - Mend

csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79) hide show

csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py CHANGED Viewed

@@ -2,37 +2,37 @@ from unidecode import unidecode
 PROPORTION = 1
 mois = {
-    'janvier',
-    'fevrier',
-    'mars',
-    'avril',
-    'mai',
-    'juin',
-    'juillet',
-    'aout',
-    'septembre',
-    'octobre',
-    'novembre',
-    'decembre',
-    'jan',
-    'fev',
-    'mar',
-    'avr',
-    'mai',
-    'jun',
-    'jui',
-    'juil',
-    'aou',
-    'sep',
-    'sept',
-    'oct',
-    'nov',
-    'dec'
+    "janvier",
+    "fevrier",
+    "mars",
+    "avril",
+    "mai",
+    "juin",
+    "juillet",
+    "aout",
+    "septembre",
+    "octobre",
+    "novembre",
+    "decembre",
+    "jan",
+    "fev",
+    "mar",
+    "avr",
+    "mai",
+    "jun",
+    "jui",
+    "juil",
+    "aou",
+    "sep",
+    "sept",
+    "oct",
+    "nov",
+    "dec",
 }
 def _is(val):
-    '''Renvoie True si les champs peuvent être des mois de l'année'''
+    """Renvoie True si les champs peuvent être des mois de l'année"""
     if not isinstance(val, str):
         return False
     val = unidecode(val.lower())

csv_detective/detect_fields/__init__.py CHANGED Viewed

@@ -1,61 +1,112 @@
-# flake8: noqa
-from .FR.other import (
-    code_csp_insee,
-    csp_insee,
-    sexe,
-    siren,
-    tel_fr,
-    uai,
-    siret,
-    insee_ape700,
-    date_fr,
-    code_import,
-    code_waldec,
-    code_rna,
-)
-from .other import (
-    email,
-    url,
-    booleen,
-    money,
-    mongo_object_id,
-    percent,
-    twitter,
-    float,
-    int,
-    uuid,
-    json,
-)
 from .FR.geo import (
     adresse,
     code_commune_insee,
-    code_postal,
-    commune,
-    departement,
-    pays,
-    region,
     code_departement,
     code_fantoir,
-    longitude_wgs_fr_metropole,
-    latitude_wgs_fr_metropole,
+    code_postal,
     code_region,
+    commune,
+    departement,
+    insee_canton,
     latitude_l93,
+    latitude_wgs_fr_metropole,
     longitude_l93,
-    insee_canton,
+    longitude_wgs_fr_metropole,
+    pays,
+    region,
 )
+from .FR.other import (
+    code_csp_insee,
+    code_import,
+    code_rna,
+    code_waldec,
+    csp_insee,
+    date_fr,
+    insee_ape700,
+    sexe,
+    siren,
+    siret,
+    tel_fr,
+    uai,
+)
+from .FR.temp import jour_de_la_semaine, mois_de_annee
 from .geo import (
     iso_country_code_alpha2,
     iso_country_code_alpha3,
     iso_country_code_numeric,
+    json_geojson,
     latitude_wgs,
-    longitude_wgs,
     latlon_wgs,
+    longitude_wgs,
     lonlat_wgs,
-    json_geojson,
 )
+from .other import (
+    booleen,
+    email,
+    float,
+    int,
+    json,
+    money,
+    mongo_object_id,
+    percent,
+    twitter,
+    url,
+    uuid,
+)
+from .temp import date, datetime_aware, datetime_naive, datetime_rfc822, year
-from .FR.temp import jour_de_la_semaine, mois_de_annee
-from .temp import year, date, datetime_aware, datetime_naive, datetime_rfc822
+__all__ = [
+    "adresse",
+    "code_commune_insee",
+    "code_departement",
+    "code_fantoir",
+    "code_postal",
+    "code_region",
+    "commune",
+    "departement",
+    "insee_canton",
+    "latitude_l93",
+    "latitude_wgs_fr_metropole",
+    "longitude_l93",
+    "longitude_wgs_fr_metropole",
+    "pays",
+    "region",
+    "code_csp_insee",
+    "code_import",
+    "code_rna",
+    "code_waldec",
+    "csp_insee",
+    "date_fr",
+    "insee_ape700",
+    "sexe",
+    "siren",
+    "siret",
+    "tel_fr",
+    "uai",
+    "jour_de_la_semaine",
+    "mois_de_annee",
+    "iso_country_code_alpha2",
+    "iso_country_code_alpha3",
+    "iso_country_code_numeric",
+    "json_geojson",
+    "latitude_wgs",
+    "latlon_wgs",
+    "longitude_wgs",
+    "lonlat_wgs",
+    "booleen",
+    "email",
+    "float",
+    "int",
+    "json",
+    "money",
+    "mongo_object_id",
+    "percent",
+    "twitter",
+    "url",
+    "uuid",
+    "date",
+    "datetime_aware",
+    "datetime_naive",
+    "datetime_rfc822",
+    "year",
+]

csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py CHANGED Viewed

@@ -1,15 +1,15 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 PROPORTION = 1
-with open(join(dirname(__file__), 'iso_country_code_alpha2.txt'), 'r') as iofile:
-    liste_pays = iofile.read().split('\n')
+with open(join(dirname(__file__), "iso_country_code_alpha2.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 liste_pays = set(liste_pays)
 def _is(val):
-    '''Renvoie True si val peut etre un code iso pays alpha-2, False sinon'''
-    if not isinstance(val, str) or not bool(re.match(r'[A-Z]{2}$', val)):
+    """Renvoie True si val peut etre un code iso pays alpha-2, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)):
         return False
     return val in liste_pays

csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py CHANGED Viewed

@@ -1,14 +1,14 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 PROPORTION = 1
-with open(join(dirname(__file__), 'iso_country_code_alpha3.txt'), 'r') as iofile:
-    liste_pays = iofile.read().split('\n')
+with open(join(dirname(__file__), "iso_country_code_alpha3.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 def _is(val):
-    '''Renvoie True si val peut etre un code iso pays alpha-3, False sinon'''
-    if not isinstance(val, str) or not bool(re.match(r'[A-Z]{3}$', val)):
+    """Renvoie True si val peut etre un code iso pays alpha-3, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)):
         return False
     return val in set(liste_pays)

csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py CHANGED Viewed

@@ -1,15 +1,15 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 PROPORTION = 1
-with open(join(dirname(__file__), 'iso_country_code_numeric.txt'), 'r') as iofile:
-    liste_pays = iofile.read().split('\n')
+with open(join(dirname(__file__), "iso_country_code_numeric.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 liste_pays = set(liste_pays)
 def _is(val):
-    '''Renvoie True si val peut etre un code iso pays numerique, False sinon'''
-    if not isinstance(val, str) or not bool(re.match(r'[0-9]{3}$', val)):
+    """Renvoie True si val peut etre un code iso pays numerique, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[0-9]{3}$", val)):
         return False
     return val in liste_pays

csv_detective/detect_fields/geo/latitude_wgs/__init__.py CHANGED Viewed

@@ -4,7 +4,7 @@ PROPORTION = 0.9
 def _is(val):
-    '''Renvoie True si val peut etre une latitude'''
+    """Renvoie True si val peut etre une latitude"""
     try:
         return is_float(val) and float(val) >= -90 and float(val) <= 90
     except ValueError:

csv_detective/detect_fields/geo/longitude_wgs/__init__.py CHANGED Viewed

@@ -4,7 +4,7 @@ PROPORTION = 0.9
 def _is(val):
-    '''Renvoie True si val peut etre une longitude'''
+    """Renvoie True si val peut etre une longitude"""
     try:
         return is_float(val) and float(val) >= -180 and float(val) <= 180
     except ValueError:

csv_detective/detect_fields/other/booleen/__init__.py CHANGED Viewed

@@ -23,5 +23,5 @@ def bool_casting(val: str) -> bool:
 def _is(val: str) -> bool:
-    '''Détecte les booléens'''
+    """Détecte les booléens"""
     return isinstance(val, str) and val.lower() in liste_bool

csv_detective/detect_fields/other/email/__init__.py CHANGED Viewed

@@ -4,5 +4,7 @@ PROPORTION = 1
 def _is(val):
-    '''Detects e-mails'''
-    return isinstance(val, str) and bool(re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$', val))
+    """Detects e-mails"""
+    return isinstance(val, str) and bool(
+        re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
+    )

csv_detective/detect_fields/other/int/__init__.py CHANGED Viewed

@@ -2,11 +2,11 @@ PROPORTION = 1
 def _is(val):
-    '''Detects integers'''
+    """Detects integers"""
     if (
         not isinstance(val, str)
-        or any([v in val for v in ['.', '_', '+']])
-        or (val.startswith('0') and len(val) > 1)
+        or any([v in val for v in [".", "_", "+"]])
+        or (val.startswith("0") and len(val) > 1)
     ):
         return False
     try:

csv_detective/detect_fields/other/mongo_object_id/__init__.py CHANGED Viewed

@@ -4,5 +4,5 @@ PROPORTION = 0.8
 def _is(val):
-    '''Detects Mongo ObjectIds'''
-    return isinstance(val, str) and bool(re.match(r'^[0-9a-fA-F]{24}$', val))
+    """Detects Mongo ObjectIds"""
+    return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val))

csv_detective/detect_fields/other/twitter/__init__.py CHANGED Viewed

@@ -4,5 +4,5 @@ PROPORTION = 1
 def _is(val):
-    '''Detects twitter accounts'''
-    return isinstance(val, str) and bool(re.match(r'^@[A-Za-z0-9_]+$', val))
+    """Detects twitter accounts"""
+    return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val))

csv_detective/detect_fields/other/uuid/__init__.py CHANGED Viewed

@@ -4,8 +4,7 @@ PROPORTION = 0.8
 def _is(val):
-    '''Detects UUIDs'''
-    return isinstance(val, str) and bool(re.match(
-        r'^[{]?[0-9a-fA-F]{8}' + '-?([0-9a-fA-F]{4}-?)' + '{3}[0-9a-fA-F]{12}[}]?$',
-        val
-    ))
+    """Detects UUIDs"""
+    return isinstance(val, str) and bool(
+        re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val)
+    )

csv_detective/detect_fields/temp/date/__init__.py CHANGED Viewed

@@ -2,7 +2,8 @@ from datetime import datetime
 from typing import Optional
 from dateparser import parse as date_parser
-from dateutil.parser import parse as dateutil_parser, ParserError
+from dateutil.parser import ParserError
+from dateutil.parser import parse as dateutil_parser
 PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils
@@ -22,7 +23,7 @@ threshold = 0.3
 def _is(val):
-    '''Renvoie True si val peut être une date, False sinon'''
+    """Renvoie True si val peut être une date, False sinon"""
     # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False

csv_detective/detect_fields/temp/datetime_rfc822/__init__.py CHANGED Viewed

@@ -4,15 +4,15 @@ PROPORTION = 1
 def _is(val):
-    '''Renvoie True si val peut être une date au format rfc822, False sinon
-    Exemple: Tue, 19 Dec 2023 15:30:45 +0000'''
+    """Renvoie True si val peut être une date au format rfc822, False sinon
+    Exemple: Tue, 19 Dec 2023 15:30:45 +0000"""
     return isinstance(val, str) and bool(
         re.match(
-            r'^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} '
-            r'([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) '
-            r'(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$',
+            r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
+            r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
+            r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
             val.lower(),
-            re.IGNORECASE
+            re.IGNORECASE,
         )
     )

csv_detective/detect_fields/temp/year/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@ PROPORTION = 1
 def _is(val):
-    '''Returns True if val can be a year'''
+    """Returns True if val can be a year"""
     try:
         val = int(val)
     except ValueError:

csv_detective/detect_labels/FR/other/tel_fr/__init__.py CHANGED Viewed

@@ -4,7 +4,6 @@ PROPORTION = 0.5
 def _is(header: str) -> float:
     words_combinations_list = [
         "telephone",
         "tel",

csv_detective/detect_labels/__init__.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# flake8: noqa
 from .FR.geo import (
     adresse,
     code_commune_insee,
@@ -42,3 +41,54 @@ from .geo import (
 )
 from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid
 from .temp import date, datetime_rfc822, year
+__all__ = [
+    "adresse",
+    "code_commune_insee",
+    "code_departement",
+    "code_fantoir",
+    "code_postal",
+    "code_region",
+    "commune",
+    "departement",
+    "insee_canton",
+    "latitude_l93",
+    "latitude_wgs_fr_metropole",
+    "longitude_l93",
+    "longitude_wgs_fr_metropole",
+    "pays",
+    "region",
+    "code_csp_insee",
+    "code_rna",
+    "code_waldec",
+    "csp_insee",
+    "date_fr",
+    "insee_ape700",
+    "sexe",
+    "siren",
+    "siret",
+    "tel_fr",
+    "uai",
+    "iso_country_code_alpha2",
+    "iso_country_code_alpha3",
+    "iso_country_code_numeric",
+    "json_geojson",
+    "latitude_wgs",
+    "latlon_wgs",
+    "longitude_wgs",
+    "lonlat_wgs",
+    "jour_de_la_semaine",
+    "mois_de_annee",
+    "booleen",
+    "email",
+    "float",
+    "int",
+    "money",
+    "mongo_object_id",
+    "twitter",
+    "url",
+    "uuid",
+    "date",
+    "datetime_rfc822",
+    "year",
+]

csv_detective/detect_labels/geo/lonlat_wgs/__init__.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from csv_detective.parsing.text import header_score
 from ..latlon_wgs import COMMON_COORDS_LABELS
 PROPORTION = 0.5

csv_detective/detect_labels/other/mongo_object_id/__init__.py CHANGED Viewed

@@ -4,5 +4,5 @@ PROPORTION = 0.5
 def _is(header: str) -> float:
-    words_combinations_list = ['id', 'objectid']
+    words_combinations_list = ["id", "objectid"]
     return header_score(header, words_combinations_list)

csv_detective/detection/columns.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
-from typing import TextIO
 from time import time
+from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time
@@ -47,19 +47,21 @@ def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int
         if return_int == 0:
             if verbose:
                 display_logs_depending_process_time(
-                    f'No heading column detected in {round(time() - start, 3)}s',
+                    f"No heading column detected in {round(time() - start, 3)}s",
                     time() - start,
                 )
             return 0
     if verbose:
         display_logs_depending_process_time(
-            f'{return_int} heading columns detected in {round(time() - start, 3)}s',
+            f"{return_int} heading columns detected in {round(time() - start, 3)}s",
             time() - start,
         )
     return return_int
-def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
+def detect_trailing_columns(
+    file: TextIO, sep: str, heading_columns: int, verbose: bool = False
+) -> int:
     """Tests first 10 lines to see if there are empty trailing columns"""
     if verbose:
         start = time()
@@ -70,20 +72,18 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
         line = file.readline()
         return_int = min(
             return_int,
-            len(line.replace("\n", ""))
-            - len(line.replace("\n", "").strip(sep))
-            - heading_columns,
+            len(line.replace("\n", "")) - len(line.replace("\n", "").strip(sep)) - heading_columns,
         )
         if return_int == 0:
             if verbose:
                 display_logs_depending_process_time(
-                    f'No trailing column detected in {round(time() - start, 3)}s',
+                    f"No trailing column detected in {round(time() - start, 3)}s",
                     time() - start,
                 )
             return 0
     if verbose:
         display_logs_depending_process_time(
-            f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
+            f"{return_int} trailing columns detected in {round(time() - start, 3)}s",
             time() - start,
         )
     return return_int

csv_detective/detection/encoding.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
-from time import time
 from io import BytesIO
+from time import time
 from cchardet import detect
@@ -16,12 +16,14 @@ def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
         logging.info("Detecting encoding")
     encoding_dict = detect(binary_file.read())
     if not encoding_dict["encoding"]:
-        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
+        raise ValueError(
+            "Could not detect the file's encoding. Consider specifying it in the routine call."
+        )
     if verbose:
         message = f'Detected encoding: "{encoding_dict["encoding"]}"'
-        message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
+        message += f" in {round(time() - start, 3)}s (confidence: {round(encoding_dict['confidence'] * 100)}%)"
         display_logs_depending_process_time(
             message,
             time() - start,
         )
-    return encoding_dict['encoding']
+    return encoding_dict["encoding"]

csv_detective/detection/engine.py CHANGED Viewed

@@ -22,11 +22,11 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
     mapping = {
         "application/gzip": "gzip",
         "application/x-gzip": "gzip",
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
-        'application/vnd.ms-excel': 'xlrd',
-        'application/vnd.oasis.opendocument.spreadsheet': 'odf',
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
+        "application/vnd.ms-excel": "xlrd",
+        "application/vnd.oasis.opendocument.spreadsheet": "odf",
         # all these files could be recognized as zip, may need to check all cases then
-        'application/zip': 'openpyxl',
+        "application/zip": "openpyxl",
     }
     # if none of the above, we move forwards with the csv process
     if is_url(file_path):
@@ -37,7 +37,8 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
     if verbose:
         message = (
             f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
-            if engine else "Processing the file as a csv"
+            if engine
+            else "Processing the file as a csv"
         )
         display_logs_depending_process_time(
             message,

csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl

csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl