csv-detective 0.9.2.dev1896__py3-none-any.whl → 0.9.3.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +2 -2
- csv_detective/detect_fields/temp/date/__init__.py +1 -2
- csv_detective/detect_fields/temp/datetime_aware/__init__.py +7 -6
- csv_detective/detect_fields/temp/datetime_naive/__init__.py +4 -8
- csv_detective/detection/engine.py +1 -2
- csv_detective/detection/formats.py +14 -8
- csv_detective/detection/headers.py +2 -2
- csv_detective/explore_csv.py +11 -119
- csv_detective/load_tests.py +1 -2
- csv_detective/output/__init__.py +4 -5
- csv_detective/output/dataframe.py +1 -2
- csv_detective/output/example.py +12 -12
- csv_detective/output/schema.py +7 -86
- csv_detective/parsing/excel.py +2 -3
- csv_detective/parsing/load.py +3 -4
- csv_detective/utils.py +1 -2
- csv_detective/validate.py +4 -5
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA +18 -26
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD +33 -35
- tests/test_fields.py +37 -4
- tests/test_file.py +18 -0
- venv/bin/activate_this.py +1 -1
- csv_detective/s3_utils.py +0 -44
- venv/bin/jp.py +0 -54
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED
csv_detective/detect_fields/other/email/__init__.py
CHANGED

@@ -1,10 +1,10 @@
 import re

-PROPORTION =
+PROPORTION = 0.9


 def _is(val):
     """Detects e-mails"""
     return isinstance(val, str) and bool(
-        re.match(r"^[a-
+        re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
     )
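The new pattern keeps the detector strict about structure (local part, domain, TLD of at least two letters) but makes the match case-insensitive via re.IGNORECASE. A minimal standalone sketch of the same check (check_email is an illustrative helper, not part of the package), using values added to tests/test_fields.py further down:

```python
import re

# Same regex as the updated detector; IGNORECASE lets uppercase addresses through.
EMAIL_PATTERN = r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$"

def check_email(value) -> bool:
    # non-strings are rejected before the regex runs, as in the detector
    return isinstance(value, str) and bool(re.match(EMAIL_PATTERN, value, re.IGNORECASE))

assert check_email("cdo_intern@data.gouv.fr")
assert check_email("P.NOM@CIE.LONGDOMAIN")  # uppercase address, now accepted
assert not check_email("cdo@@gouv.sfd")     # double @ still rejected
```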
csv_detective/detect_fields/temp/date/__init__.py
CHANGED

@@ -1,6 +1,5 @@
 import re
 from datetime import datetime
-from typing import Optional

 from dateparser import parse as date_parser
 from dateutil.parser import ParserError
@@ -10,7 +9,7 @@ PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils


-def date_casting(val: str) ->
+def date_casting(val: str) -> datetime | None:
     """For performance reasons, we try first with dateutil and fallback on dateparser"""
     try:
         return dateutil_parser(val)
csv_detective/detect_fields/temp/datetime_aware/__init__.py
CHANGED

@@ -1,24 +1,25 @@
 import re
-from typing import Any
+from typing import Any

 from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting

 PROPORTION = 1
 threshold = 0.7

-# matches AAAA-MM-JJTHH:MM:SS(.dddddd)±HH:MM with any of the listed separators for the date OR NO SEPARATOR
+# matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR
 pat = (
     aaaammjj_pattern.replace("$", "")
-    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
+    + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
 )


-def _is(val:
+def _is(val: Any | None) -> bool:
     """Detects timezone-aware datetimes only"""
     # early stops, to cut processing time
-    #
+    # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
     # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
-    if not isinstance(val, str) or len(val) > 35 or len(val) <
+    if not isinstance(val, str) or len(val) > 35 or len(val) < 16:
         return False
     # if usual format, no need to parse
     if bool(re.match(pat, val)):
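The tail of the pattern now accepts either a numeric UTC offset or a literal Z, which is what lets values such as "2000-12-21 10:20:10.1Z" (added to the tests below) be detected as timezone-aware. A rough sketch of the behaviour, using a hypothetical stand-in for aaaammjj_pattern since that prefix is not shown in this diff:

```python
import re

# Hypothetical stand-in for aaaammjj_pattern (not shown in the diff): an ISO-like date
# with an optional separator that must repeat between year/month and month/day.
date_prefix = r"^(\d{4})([-/_ ]?)(0[1-9]|1[0-2])\2(0[1-9]|[12]\d|3[01])"

# Same time/offset tail as the new datetime_aware pattern: offset is now (±HH:MM | Z).
pat = (
    date_prefix
    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
    + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
)

assert re.match(pat, "2000-12-21 10:20:10.1Z")      # trailing 'Z' now matches
assert re.match(pat, "2025-08-20T14:30:00+02:00")   # numeric offsets still match
assert not re.match(pat, "2021-06-22 10:20:10")     # naive datetime: no offset, no match
```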
csv_detective/detect_fields/temp/datetime_naive/__init__.py
CHANGED

@@ -1,5 +1,5 @@
 import re
-from typing import Any
+from typing import Any

 from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting

@@ -9,11 +9,11 @@ threshold = 0.7
 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
 pat = (
     aaaammjj_pattern.replace("$", "")
-    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?$"
 )


-def _is(val:
+def _is(val: Any | None) -> bool:
     """Detects naive datetimes only"""
     # early stops, to cut processing time
     # 15 is the minimal length of a datetime format YYMMDDTHH:MM:SS
@@ -26,8 +26,4 @@ def _is(val: Optional[Any]) -> bool:
     if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)
-    return (
-        res is not None
-        and bool(res.hour or res.minute or res.second or res.microsecond)
-        and not bool(res.tzinfo)
-    )
+    return res is not None and not bool(res.tzinfo)
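Dropping the hour/minute/second/microsecond check means a parseable value now only has to be tz-naive to pass, so midnight timestamps such as "2030/06-22 00:00:00" (added to the True cases in the tests below) are accepted. A simplified sketch of that final condition using dateutil directly, without the package's regex pre-filtering or dateparser fallback:

```python
from dateutil import parser as dateutil_parser

def looks_naive(value: str) -> bool:
    # simplified: parse with dateutil only, then keep the new tz-only condition
    try:
        res = dateutil_parser.parse(value)
    except (ValueError, OverflowError):
        return False
    return res.tzinfo is None

assert looks_naive("2021-06-22 10:20:10")
assert looks_naive("2030-06-22 00:00:00")                   # midnight: accepted after this change
assert not looks_naive("2024-12-19T10:53:36.428000+00:00")  # timezone-aware
```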
csv_detective/detection/engine.py
CHANGED

@@ -1,5 +1,4 @@
 from time import time
-from typing import Optional

 import magic
 import requests
@@ -16,7 +15,7 @@ engine_to_file = {
 }


-def detect_engine(file_path: str, verbose=False) ->
+def detect_engine(file_path: str, verbose=False) -> str | None:
     if verbose:
         start = time()
     mapping = {
csv_detective/detection/formats.py
CHANGED

@@ -1,6 +1,5 @@
 import logging
 from collections import defaultdict
-from typing import Union

 import numpy as np
 import pandas as pd
@@ -22,7 +21,7 @@ def detect_formats(
     table: pd.DataFrame,
     analysis: dict,
     file_path: str,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,
@@ -30,7 +29,7 @@ def detect_formats(
     on_sample = len(table) > MAX_ROWS_ANALYSIS
     if on_sample:
         if verbose:
-            logging.warning(f"File is too long, analysing
+            logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
         table = build_sample(table)

     if table.empty:
@@ -183,13 +182,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
     samples = pd.concat(
         [
             # one row with the minimum of the column
-            table.loc[table[col] ==
+            table.loc[table[col] == val].iloc[[0]]
             for col in table.columns
+            if not pd.isna(val := table[col].dropna().min())
         ]
         + [
             # one row with the maximum of the column
-            table.loc[table[col] ==
+            table.loc[table[col] == val].iloc[[0]]
             for col in table.columns
+            if not pd.isna(val := table[col].dropna().max())
         ]
         + [
             # one row with a NaN value if the column has any
@@ -199,7 +200,12 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
         ],
         ignore_index=True,
     )
-    return
-
-
+    return (
+        pd.concat(
+            [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+            ignore_index=True,
+        )
+        # this is very unlikely but we never know
+        if len(samples) <= MAX_ROWS_ANALYSIS
+        else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
     )
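The walrus-operator guards added to build_sample skip any column whose minimum or maximum is NaN, i.e. columns containing no values at all, which is the case exercised by the new test_full_nan_column further down. A minimal standalone illustration of the guarded comprehension (the toy DataFrame is hypothetical):

```python
import pandas as pd

# A column that is entirely NaN: its dropna().min() is NaN, so the guard skips it
# instead of trying to select a row that does not exist.
table = pd.DataFrame({"only_nan": [float("nan")] * 3, "second_col": [3, 1, 2]})

min_rows = [
    table.loc[table[col] == val].iloc[[0]]
    for col in table.columns
    if not pd.isna(val := table[col].dropna().min())
]

assert len(min_rows) == 1                      # only "second_col" contributes a row
assert min_rows[0]["second_col"].iloc[0] == 1  # the row holding the column minimum
```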
csv_detective/detection/headers.py
CHANGED

@@ -1,11 +1,11 @@
 import logging
 from time import time
-from typing import
+from typing import TextIO

 from csv_detective.utils import display_logs_depending_process_time


-def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
csv_detective/explore_csv.py
CHANGED
@@ -1,16 +1,11 @@
-import json
 import logging
-import os
-import tempfile
 from time import time
-from typing import Optional, Union

 import pandas as pd

 from csv_detective.detection.formats import detect_formats
-from csv_detective.output import generate_output
+from csv_detective.output import generate_output
 from csv_detective.parsing.load import load_file
-from csv_detective.s3_utils import download_from_minio, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time, is_url
 from csv_detective.validate import validate

@@ -20,24 +15,24 @@ logging.basicConfig(level=logging.INFO)
 def routine(
     file_path: str,
     num_rows: int = 500,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
-    save_results:
-    encoding:
-    sep:
+    save_results: bool | str = True,
+    encoding: str | None = None,
+    sep: str | None = None,
     skipna: bool = True,
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-    sheet_name:
-) ->
-    """Returns a dict with information about the
+    sheet_name: str | int | None = None,
+) -> dict | tuple[dict, pd.DataFrame]:
+    """Returns a dict with information about the table and possible
     column contents, and if requested the DataFrame with columns cast according to analysis.

     Args:
-        file_path: local path to
+        file_path: local path or URL to file
         num_rows: number of rows to sample from the file for analysis ; -1 for analysis
             of the whole file
         user_input_tests: tests to run on the file
@@ -111,9 +106,9 @@ def validate_then_detect(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
-    save_results:
+    save_results: bool | str = True,
     skipna: bool = True,
     output_profile: bool = False,
     output_schema: bool = False,
@@ -173,106 +168,3 @@ def validate_then_detect(
     display_logs_depending_process_time(
         f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
     )
-
-
-def routine_minio(
-    csv_minio_location: dict[str, str],
-    output_minio_location: dict[str, str],
-    tableschema_minio_location: dict[str, str],
-    minio_user: str,
-    minio_pwd: str,
-    **kwargs,
-):
-    """Returns a dict with information about the csv table and possible
-    column contents.
-
-    Args:
-        csv_minio_location: dict with Minio URL, bucket and key of the CSV file
-        output_minio_location: Minio URL, bucket and key to store output file. None if
-            not uploading to Minio.
-        tableschema_minio_location: Minio URL, bucket and key to store tableschema file.
-            None if not uploading the tableschema to Minio.
-        minio_user: user name for the minio instance
-        minio_pwd: password for the minio instance
-        kwargs: arguments for routine
-
-    Returns:
-        dict: a dict with information about the csv and possible types for each column
-    """
-
-    if (
-        (
-            any(
-                [
-                    location_dict is not None
-                    for location_dict in [
-                        csv_minio_location,
-                        output_minio_location,
-                        tableschema_minio_location,
-                    ]
-                ]
-            )
-        )
-        and (minio_user is None)
-        or (minio_pwd is None)
-    ):
-        raise ValueError("Minio credentials are required if using Minio")
-
-    for location_dict in [
-        csv_minio_location,
-        output_minio_location,
-        tableschema_minio_location,
-    ]:
-        if location_dict is not None:
-            if any(
-                [
-                    (location_key not in location_dict) or (location_dict[location_key] is None)
-                    for location_key in ["netloc", "bucket", "key"]
-                ]
-            ):
-                raise ValueError("Minio location dict must contain url, bucket and key")
-
-    file_path = tempfile.NamedTemporaryFile(delete=False).name
-    download_from_minio(
-        netloc=csv_minio_location["netloc"],
-        bucket=csv_minio_location["bucket"],
-        key=csv_minio_location["key"],
-        filepath=file_path,
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    analysis = routine(
-        file_path,
-        save_results=True,
-        **kwargs,
-    )
-
-    # Write report JSON file.
-    output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
-    with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-        json.dump(analysis, fp, indent=4, separators=(",", ": "))
-
-    upload_to_minio(
-        netloc=output_minio_location["netloc"],
-        bucket=output_minio_location["bucket"],
-        key=output_minio_location["key"],
-        filepath=output_path_to_store_minio_file,
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    os.remove(output_path_to_store_minio_file)
-    os.remove(file_path)
-
-    generate_table_schema(
-        analysis_report=analysis,
-        save_file=True,
-        netloc=tableschema_minio_location["netloc"],
-        bucket=tableschema_minio_location["bucket"],
-        key=tableschema_minio_location["key"],
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    return analysis
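With routine_minio and the S3 helpers gone, routine() is called directly on a local path or URL. A hedged usage sketch with the new keyword defaults ("data.csv" is a placeholder path; the keyword values mirror the calls in the updated tests):

```python
from csv_detective import routine

# num_rows=-1 analyses the whole file; save_results can be False, True,
# or a string path where the JSON report should be written.
analysis = routine(
    file_path="data.csv",
    num_rows=-1,
    save_results=False,
    output_schema=True,  # embeds the generated table schema in the report
)
print(analysis["columns"])
```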
csv_detective/load_tests.py
CHANGED
@@ -1,5 +1,4 @@
 import os
-from typing import Union

 from csv_detective import detect_fields, detect_labels  # noqa

@@ -18,7 +17,7 @@ def get_all_packages(detect_type) -> list:


 def return_all_tests(
-    user_input_tests:
+    user_input_tests: str | list,
     detect_type: str,
 ) -> list:
     """
csv_detective/output/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 import json
 import os
-from typing import Optional, Union

 import pandas as pd

@@ -17,14 +16,14 @@ def generate_output(
     file_path: str,
     num_rows: int = 500,
     limited_output: bool = True,
-    save_results:
+    save_results: bool | str = True,
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-    sheet_name:
-) ->
+    sheet_name: str | int | None = None,
+) -> dict | tuple[dict, pd.DataFrame]:
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -51,7 +50,7 @@ def generate_output(
     )

     if output_schema:
-        analysis["schema"] = generate_table_schema(analysis,
+        analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)

     if output_df:
         return analysis, cast_df(
csv_detective/output/dataframe.py
CHANGED

@@ -1,7 +1,6 @@
 import json
 from datetime import date, datetime
 from time import time
-from typing import Optional, Union

 import pandas as pd

@@ -11,7 +10,7 @@ from csv_detective.detect_fields.temp.date import date_casting
 from csv_detective.utils import display_logs_depending_process_time


-def cast(value: str, _type: str) ->
+def cast(value: str, _type: str) -> str | float | bool | date | datetime | None:
     if not isinstance(value, str) or not value:
         # None is the current default value in hydra, should we keep this?
         return None
csv_detective/output/example.py
CHANGED
@@ -3,7 +3,7 @@ import random
 import string
 import uuid
 from datetime import datetime
-from typing import Any,
+from typing import Any, Type

 import pandas as pd
 import requests
@@ -14,10 +14,10 @@ fake = Faker()


 def create_example_csv_file(
-    fields:
-    schema_path:
+    fields: dict | None = None,
+    schema_path: str | None = None,
     file_length: int = 10,
-    output_name:
+    output_name: str | None = "example_file.csv",
     output_sep: str = ";",
     encoding: str = "utf-8",
     ignore_required: bool = False,
@@ -49,8 +49,8 @@ def create_example_csv_file(
     def _string(
         length: int = 10,
         required: bool = True,
-        pattern:
-        enum:
+        pattern: str | None = None,
+        enum: str | None = None,
     ) -> str:
         if potential_skip(required):
             return ""
@@ -70,7 +70,7 @@ def create_example_csv_file(
         return str(uuid.uuid4())

     def _date(
-        date_range:
+        date_range: list[str] | None = None,
         format: str = "%Y-%m-%d",
         required: bool = True,
     ) -> str:
@@ -99,7 +99,7 @@ def create_example_csv_file(
         return fake.time(format)

     def _datetime(
-        datetime_range:
+        datetime_range: list[str] | None = None,
         format: str = "%Y-%m-%d %H-%M-%S",
         required: bool = True,
     ) -> str:
@@ -123,11 +123,11 @@ def create_example_csv_file(
         return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"

     def _number(
-        num_type: Type[
-        num_range:
-        enum:
+        num_type: Type[int | float] = int,
+        num_range: list[float] | None = None,
+        enum: list | None = None,
         required: bool = True,
-    ) ->
+    ) -> int | float:
         assert num_range is None or len(num_range) == 2
         if potential_skip(required):
             return ""
csv_detective/output/schema.py
CHANGED
@@ -1,14 +1,8 @@
 import json
 import logging
-import os
-import tempfile
 from datetime import datetime
 from time import time
-from typing import Optional

-from botocore.exceptions import ClientError
-
-from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time


@@ -202,25 +196,14 @@ def get_constraints(format: str) -> dict:

 def generate_table_schema(
     analysis_report: dict,
-
-    netloc: Optional[str] = None,
-    bucket: Optional[str] = None,
-    key: Optional[str] = None,
-    minio_user: Optional[str] = None,
-    minio_pwd: Optional[str] = None,
+    save_results: bool | str = True,
     verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report

     Args:
         analysis_report (dict): The analysis report from csv_detective
-
-        netloc (str): The netloc of the minio instance to upload the tableschema
-        bucket (str): The bucket to save the schema in
-        key (str): The key to save the schema in (without extension as we will append
-            version number and extension)
-        minio_user (str): The minio user
-        minio_pwd (str): The minio password
+        save_results (bool or str): whether and where to save the results

     Returns:
     """
@@ -277,71 +260,9 @@ def generate_table_schema(
         f"Created schema in {round(time() - start, 3)}s", time() - start
     )

-    if
-
-
-
-        if not all([netloc, key, bucket, minio_user, minio_pwd]):
-            raise Exception(
-                "To save schema into minio, parameters : netloc, key, bucket, "
-                "minio_user, minio_pwd should be provided"
-            )
-
-        # Create bucket if does not exist
-        client = get_s3_client(netloc, minio_user, minio_pwd)
-        try:
-            client.head_bucket(Bucket=bucket)
-        except ClientError:
-            client.create_bucket(Bucket=bucket)
-
-        tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
-        if "Contents" in tableschema_objects:
-            tableschema_keys = [
-                tableschema["Key"]
-                for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
-                    "Contents"
-                ]
-            ]
-            tableschema_versions = [
-                os.path.splitext(tableschema_key)[0].split("_")[-1]
-                for tableschema_key in tableschema_keys
-            ]
-            latest_version = max(tableschema_versions)
+    if save_results:
+        output_path = save_results if isinstance(save_results, str) else "schema.json"
+        with open(output_path, "w", encoding="utf8") as fp:
+            json.dump(schema, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str)

-
-            with open(latest_schema_file.name, "w") as fp:
-                download_from_minio(
-                    netloc,
-                    bucket,
-                    f"{key}_{latest_version}.json",
-                    latest_schema_file.name,
-                    minio_user,
-                    minio_pwd,
-                )
-            # Check if files are different
-            with open(latest_schema_file.name, "r") as fp:
-                latest_schema = json.load(fp)
-            if latest_schema["fields"] != fields:
-                latest_version_split = latest_version.split(".")
-                new_version = (
-                    latest_version_split[0]
-                    + "."
-                    + latest_version_split[1]
-                    + "."
-                    + str(int(latest_version_split[2]) + 1)
-                )
-            else:
-                return None
-
-            schema["version"] = new_version
-
-            tableschema_file = tempfile.NamedTemporaryFile(delete=False)
-            with open(tableschema_file.name, "w") as fp:
-                json.dump(schema, fp, indent=4)
-
-            new_version_key = f"{key}_{new_version}.json"
-            upload_to_minio(
-                netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
-            )
-            os.unlink(tableschema_file.name)
-            return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
+    return schema
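After this change the schema is returned and, when save_results is truthy, written locally (to the given path, or schema.json by default) instead of being versioned and pushed to Minio. A hedged usage sketch, assuming generate_table_schema is importable from csv_detective.output.schema and that the report comes from routine() on a placeholder file:

```python
from csv_detective import routine
from csv_detective.output.schema import generate_table_schema  # assumed import path

report = routine(file_path="data.csv", save_results=False)  # "data.csv" is a placeholder

# save_results=False: just get the tableschema dict back
schema = generate_table_schema(report, save_results=False)

# save_results as a string: also dump the schema JSON to that path
generate_table_schema(report, save_results="my_table_schema.json")
```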
csv_detective/parsing/excel.py
CHANGED
@@ -1,6 +1,5 @@
 from io import BytesIO
 from time import time
-from typing import Optional

 import openpyxl
 import pandas as pd
@@ -23,8 +22,8 @@ XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
 def parse_excel(
     file_path: str,
     num_rows: int = -1,
-    engine:
-    sheet_name:
+    engine: str | None = None,
+    sheet_name: str | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py
CHANGED
@@ -1,5 +1,4 @@
 from io import BytesIO, StringIO
-from typing import Optional, Union

 import pandas as pd
 import requests
@@ -26,10 +25,10 @@ from .excel import (
 def load_file(
     file_path: str,
     num_rows: int = 500,
-    encoding:
-    sep:
+    encoding: str | None = None,
+    sep: str | None = None,
     verbose: bool = False,
-    sheet_name:
+    sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
     engine = None
csv_detective/utils.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional, Union

 import pandas as pd

@@ -31,7 +30,7 @@ def is_url(file_path: str) -> bool:
     return file_path.startswith("http")


-def cast_prevent_nan(value: float, _type: str) ->
+def cast_prevent_nan(value: float, _type: str) -> float | int | None:
     if _type not in {"int", "float"}:
         raise ValueError(f"Invalid type was passed: {_type}")
     return None if pd.isna(value) else eval(_type)(value)
csv_detective/validate.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional, Union

 import pandas as pd

@@ -22,12 +21,12 @@ def validate(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    encoding:
-    sep:
+    encoding: str | None = None,
+    sep: str | None = None,
     verbose: bool = False,
     skipna: bool = True,
-    sheet_name:
-) -> tuple[bool,
+    sheet_name: str | int | None = None,
+) -> tuple[bool, pd.DataFrame | None, dict | None]:
     """
     Verify is the given file has the same fields and types as in the previous analysis.
     """
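A pattern running through all of these hunks: typing.Optional and typing.Union imports are dropped in favour of PEP 604 unions (str | None, dict | tuple[dict, pd.DataFrame]), which is consistent with the Requires-Python: <3.14,>=3.10 line in the METADATA diff below. A minimal illustration (the helper is hypothetical, not from the package):

```python
# Requires Python >= 3.10: PEP 604 union syntax replaces typing.Optional / typing.Union
def sep_or_default(sep: str | None = None) -> str:
    """Hypothetical helper: fall back to ';' when no separator is given."""
    return sep if sep is not None else ";"

assert sep_or_default() == ";"
assert sep_or_default("\t") == "\t"
```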
{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA
CHANGED

@@ -1,15 +1,14 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.
+Version: 0.9.3.dev0
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
 Project-URL: Source, https://github.com/datagouv/csv_detective
 Keywords: CSV,data processing,encoding,guess,parser,tabular
-Requires-Python: <3.14,>=3.
+Requires-Python: <3.14,>=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: boto3<2,>=1.34.0
 Requires-Dist: dateparser<2,>=1.2.0
 Requires-Dist: faust-cchardet==2.1.19
 Requires-Dist: pandas<3,>=2.2.0
@@ -26,7 +25,6 @@ Requires-Dist: rstr==3.2.2
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
-Requires-Dist: bumpx>=0.3.10; extra == "dev"
 Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file

@@ -221,32 +219,26 @@ ruff check --fix .
 ruff format .
 ```

-
+### 🏷️ Release

-The release process uses `
+The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.

-```
-
-
-
-### Process
-
-1. `bumpx` will handle bumping the version according to your command (patch, minor, major)
-2. It will update the CHANGELOG according to the new version being published
-3. It will push a tag with the given version to github
-4. CircleCI will pickup this tag, build the package and publish it to pypi
-5. `bumpx` will have everything ready for the next version (version, changelog...)
+```bash
+# Create a new release
+./tag_version.sh <version>

-
+# Example
+./tag_version.sh 2.5.0

-
-
+# Dry run to see what would happen
+./tag_version.sh 2.5.0 --dry-run
 ```

-
-
-This will release a patch version:
+**Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.

-
-
-
+The script automatically:
+- Updates the version in pyproject.toml
+- Extracts commits since the last tag and formats them for CHANGELOG.md
+- Identifies breaking changes (commits with `!:` in the subject)
+- Creates a git tag and pushes it to the remote repository
+- Creates a GitHub release with the changelog content
{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD
CHANGED

@@ -1,10 +1,9 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=
-csv_detective/load_tests.py,sha256=
-csv_detective/
-csv_detective/
-csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
+csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
+csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
+csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
+csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
 csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,10 +16,10 @@ csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=9pR2tVS4J2Kryt
 csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=5vw4zjlmWaR2djxuQOUrmwsNIc9HgAE-zdxwerVR3S0,380
 csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=UsMEW1EVVgnw-daOc1jBkEaGKvqTONSAGnj1s3QgM8w,400
 csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=YsAGiblFexBxvu_E3XaXhy_bordc6c1oKPgDzTsDeXw,374
-csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=
+csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=7ONo0MxrJY1gPWRwyPCX4ZDbCINmxnKRV85zscADxT8,435
+csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=lIgWebNapfrnPt0XeNUMs78Xa_csGNAtTk8VEk9wXXo,342
+csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=YXTWSymmcXW9eD2OfiSlmX7N-IUtZkDrNYHd6vTnJTc,439
+csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=7tZ8sgIkQ9zuSOZ-vGYBkH04Vv1xlPlJDM78xYfD57Y,342
 csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=85y-5qNRAWJrKqL0wh9iPMUBQjvPwc9lv1cYB2m0daQ,364
 csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=6mJRaGsCPBY5JHHe8EWxEjDpAOIfvBPTaZKJb3_n3gU,1077
 csv_detective/detect_fields/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,13 +49,13 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.
 csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=AnAridM4C8hcm4PeNdr8969czgrzM4KemGVZWAJSM1U,436
 csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
 csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
-csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=PI-wlTJmPk6nznzu_Fou_SSCET90wIf78mXwb1W1K70,325
 csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
-csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=B7YFfvEI48DfAn8xbc-vpVERQaKh9_59ERfieo2D6OY,328
 csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
-csv_detective/detect_fields/other/email/__init__.py,sha256=
+csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
 csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
@@ -67,9 +66,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
 csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=
+csv_detective/detect_fields/temp/date/__init__.py,sha256=j066luXADCti4Mbb-jvznrL1jf3p5TpEpVzW8vThRDE,2124
+csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=oDaZIhkW0SXSYeuK5R5TIzajvSmu-XjUn8GpqITFLnY,1250
+csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=z5wpuHiDl8j7ZeQjfZ5wO9lG6H9Ps6X218ANNw19Dag,1073
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
 csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
 csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -131,38 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
-csv_detective/detection/engine.py,sha256=
-csv_detective/detection/formats.py,sha256=
-csv_detective/detection/headers.py,sha256=
+csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
+csv_detective/detection/formats.py,sha256=QXdxdECU5uC_ytLBT_6-xe0VAiaMptXF4KYiShRUVCA,7702
+csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
-csv_detective/output/__init__.py,sha256=
-csv_detective/output/dataframe.py,sha256=
-csv_detective/output/example.py,sha256=
+csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
+csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
+csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
 csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
-csv_detective/output/schema.py,sha256=
+csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
-csv_detective/parsing/excel.py,sha256=
-csv_detective/parsing/load.py,sha256
+csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
+csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.
+csv_detective-0.9.3.dev0.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
+tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
 tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
-venv/bin/activate_this.py,sha256=
-venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
+venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
+csv_detective-0.9.3.dev0.dist-info/METADATA,sha256=Xga9fj8KjfrMOhp5ZIoXsJLcAI2Jz31yNsdfFJca2DU,9928
+csv_detective-0.9.3.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev0.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev0.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev0.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -79,6 +79,7 @@ from csv_detective.detection.variables import (
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.dataframe import cast
 from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col as col_test  # to prevent pytest from testing it


 def test_all_tests_return_bool():
@@ -284,7 +285,7 @@ fields = {
         False: ["nein", "ja", "2", "-0"],
     },
     email: {
-        True: ["cdo_intern@data.gouv.fr"],
+        True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
         False: ["cdo@@gouv.sfd"],
     },
     json: {
@@ -356,17 +357,25 @@ fields = {
         True: [
             "2021-06-22 10:20:10-04:00",
             "2030-06-22 00:00:00.0028+02:00",
+            "2000-12-21 10:20:10.1Z",
             "2024-12-19T10:53:36.428000+00:00",
             "1996/06/22 10:20:10 GMT",
         ],
         False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
     },
     datetime_naive: {
-        True: [
+        True: [
+            "2021-06-22 10:20:10",
+            "2030/06-22 00:00:00",
+            "2030/06/22 00:00:00.0028",
+        ],
         False: [
             "2021-06-22T30:20:10",
             "Sun, 06 Nov 1994 08:49:37 GMT",
             "2021-06-44 10:20:10+02:00",
+            "1999-12-01T00:00:00Z",
+            "2021-06-44",
+            "15 décembre 1985",
         ],
     },
     datetime_rfc822: {
@@ -451,8 +460,8 @@ def test_priority(args):
         ("28/01/2000", date),
         ("2025-08-20T14:30:00+02:00", datetime_aware),
         ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
-        ("1925_12_20T14:30:00.
-        ("1925 12 20 14:30:00Z",
+        ("1925_12_20T14:30:00.2763", datetime_naive),
+        ("1925 12 20 14:30:00Z", datetime_aware),
     ),
 )
 def test_early_detection(args):
@@ -461,3 +470,27 @@ def test_early_detection(args):
     res = module._is(value)
     assert res
     mock_func.assert_not_called()
+
+
+def test_all_proportion_1():
+    all_tests = return_all_tests("ALL", "detect_fields")
+    prop_1 = {
+        t.__name__.split(".")[-1]: eval(
+            t.__name__.split(".")[-1]
+            if t.__name__.split(".")[-1] not in ["int", "float"]
+            else "test_" + t.__name__.split(".")[-1]
+        )
+        for t in all_tests
+        if t.PROPORTION == 1
+    }
+    # building a table that uses only correct values for these formats, except on one row
+    table = pd.DataFrame(
+        {
+            test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"]
+            for test_name, test_module in prop_1.items()
+        }
+    )
+    # testing columns for all formats
+    returned_table = col_test(table, all_tests, limited_output=True)
+    # the analysis should have found no match on any format
+    assert all(returned_table[col].sum() == 0 for col in table.columns)
tests/test_file.py
CHANGED
@@ -6,6 +6,7 @@ import responses

 from csv_detective import routine
 from csv_detective.output.profile import create_profile
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS


 @pytest.mark.parametrize(
@@ -343,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
         save_results=False,
     )
     assert analysis["columns"][col_name]["format"] == "int"
+
+
+def test_full_nan_column(mocked_responses):
+    # we want a file that needs sampling
+    expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
+    mocked_responses.get(
+        "http://example.com/test.csv",
+        body=expected_content,
+        status=200,
+    )
+    # just testing it doesn't fail
+    routine(
+        file_path="http://example.com/test.csv",
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+    )
venv/bin/activate_this.py
CHANGED
@@ -29,7 +29,7 @@ os.environ["VIRTUAL_ENV_PROMPT"] = '' or os.path.basename(base)

 # add the virtual environments libraries to the host python import mechanism
 prev_length = len(sys.path)
-for lib in '../lib/python3.
+for lib in '../lib/python3.11/site-packages'.split(os.pathsep):
     path = os.path.realpath(os.path.join(bin_dir, lib))
     site.addsitedir(path.decode("utf-8") if '' else path)
 sys.path[:] = sys.path[prev_length:] + sys.path[0:prev_length]
csv_detective/s3_utils.py
DELETED
@@ -1,44 +0,0 @@
-import logging
-
-import boto3
-from botocore.client import Config
-from botocore.exceptions import ClientError
-
-
-def get_minio_url(netloc: str, bucket: str, key: str) -> str:
-    """Returns location of given resource in minio once it is saved"""
-    return netloc + "/" + bucket + "/" + key
-
-
-def get_s3_client(url: str, minio_user: str, minio_pwd: str) -> boto3.client:
-    return boto3.client(
-        "s3",
-        endpoint_url=url,
-        aws_access_key_id=minio_user,
-        aws_secret_access_key=minio_pwd,
-        config=Config(signature_version="s3v4"),
-    )
-
-
-def download_from_minio(
-    netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
-) -> None:
-    logging.info("Downloading from minio")
-    s3 = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        s3.download_file(bucket, key, filepath)
-        logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
-    except ClientError as e:
-        logging.error(e)
-
-
-def upload_to_minio(
-    netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
-) -> None:
-    logging.info("Saving to minio")
-    s3 = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        s3.upload_file(filepath, bucket, key)
-        logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
-    except ClientError as e:
-        logging.error(e)
venv/bin/jp.py
DELETED
@@ -1,54 +0,0 @@
-#!/home/circleci/project/venv/bin/python
-
-import sys
-import json
-import argparse
-from pprint import pformat
-
-import jmespath
-from jmespath import exceptions
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('expression')
-    parser.add_argument('-f', '--filename',
-                        help=('The filename containing the input data. '
-                              'If a filename is not given then data is '
-                              'read from stdin.'))
-    parser.add_argument('--ast', action='store_true',
-                        help=('Pretty print the AST, do not search the data.'))
-    args = parser.parse_args()
-    expression = args.expression
-    if args.ast:
-        # Only print the AST
-        expression = jmespath.compile(args.expression)
-        sys.stdout.write(pformat(expression.parsed))
-        sys.stdout.write('\n')
-        return 0
-    if args.filename:
-        with open(args.filename, 'r') as f:
-            data = json.load(f)
-    else:
-        data = sys.stdin.read()
-        data = json.loads(data)
-    try:
-        sys.stdout.write(json.dumps(
-            jmespath.search(expression, data), indent=4, ensure_ascii=False))
-        sys.stdout.write('\n')
-    except exceptions.ArityError as e:
-        sys.stderr.write("invalid-arity: %s\n" % e)
-        return 1
-    except exceptions.JMESPathTypeError as e:
-        sys.stderr.write("invalid-type: %s\n" % e)
-        return 1
-    except exceptions.UnknownFunctionError as e:
-        sys.stderr.write("unknown-function: %s\n" % e)
-        return 1
-    except exceptions.ParseError as e:
-        sys.stderr.write("syntax-error: %s\n" % e)
-        return 1
-
-
-if __name__ == '__main__':
-    sys.exit(main())
{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL
RENAMED
File without changes

{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt
RENAMED
File without changes

{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE
RENAMED
File without changes

{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt
RENAMED
File without changes