csv-detective 0.9.2.dev1874__py3-none-any.whl → 0.9.3.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +2 -2
- csv_detective/detect_fields/temp/date/__init__.py +1 -2
- csv_detective/detect_fields/temp/datetime_aware/__init__.py +7 -6
- csv_detective/detect_fields/temp/datetime_naive/__init__.py +4 -8
- csv_detective/detection/engine.py +1 -2
- csv_detective/detection/formats.py +14 -8
- csv_detective/detection/headers.py +2 -2
- csv_detective/explore_csv.py +11 -119
- csv_detective/load_tests.py +1 -2
- csv_detective/output/__init__.py +11 -14
- csv_detective/output/dataframe.py +1 -2
- csv_detective/output/example.py +12 -12
- csv_detective/output/profile.py +13 -10
- csv_detective/output/schema.py +7 -86
- csv_detective/parsing/excel.py +2 -3
- csv_detective/parsing/load.py +3 -4
- csv_detective/utils.py +4 -3
- csv_detective/validate.py +4 -5
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA +18 -26
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD +34 -36
- tests/test_fields.py +37 -4
- tests/test_file.py +68 -0
- venv/bin/activate_this.py +1 -1
- csv_detective/s3_utils.py +0 -44
- venv/bin/jp.py +0 -54
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt +0 -0
csv_detective/output/schema.py
CHANGED
@@ -1,14 +1,8 @@
 import json
 import logging
-import os
-import tempfile
 from datetime import datetime
 from time import time
-from typing import Optional
 
-from botocore.exceptions import ClientError
-
-from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time
 
 
@@ -202,25 +196,14 @@ def get_constraints(format: str) -> dict:
 
 def generate_table_schema(
     analysis_report: dict,
-
-    netloc: Optional[str] = None,
-    bucket: Optional[str] = None,
-    key: Optional[str] = None,
-    minio_user: Optional[str] = None,
-    minio_pwd: Optional[str] = None,
+    save_results: bool | str = True,
     verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report
 
     Args:
         analysis_report (dict): The analysis report from csv_detective
-
-        netloc (str): The netloc of the minio instance to upload the tableschema
-        bucket (str): The bucket to save the schema in
-        key (str): The key to save the schema in (without extension as we will append
-            version number and extension)
-        minio_user (str): The minio user
-        minio_pwd (str): The minio password
+        save_results (bool or str): whether and where to save the results
 
     Returns:
     """
@@ -277,71 +260,9 @@ def generate_table_schema(
         f"Created schema in {round(time() - start, 3)}s", time() - start
     )
 
-    if
-
-
-
-    if not all([netloc, key, bucket, minio_user, minio_pwd]):
-        raise Exception(
-            "To save schema into minio, parameters : netloc, key, bucket, "
-            "minio_user, minio_pwd should be provided"
-        )
-
-    # Create bucket if does not exist
-    client = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        client.head_bucket(Bucket=bucket)
-    except ClientError:
-        client.create_bucket(Bucket=bucket)
-
-    tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
-    if "Contents" in tableschema_objects:
-        tableschema_keys = [
-            tableschema["Key"]
-            for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
-                "Contents"
-            ]
-        ]
-        tableschema_versions = [
-            os.path.splitext(tableschema_key)[0].split("_")[-1]
-            for tableschema_key in tableschema_keys
-        ]
-        latest_version = max(tableschema_versions)
+    if save_results:
+        output_path = save_results if isinstance(save_results, str) else "schema.json"
+        with open(output_path, "w", encoding="utf8") as fp:
+            json.dump(schema, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str)
 
-
-        with open(latest_schema_file.name, "w") as fp:
-            download_from_minio(
-                netloc,
-                bucket,
-                f"{key}_{latest_version}.json",
-                latest_schema_file.name,
-                minio_user,
-                minio_pwd,
-            )
-        # Check if files are different
-        with open(latest_schema_file.name, "r") as fp:
-            latest_schema = json.load(fp)
-        if latest_schema["fields"] != fields:
-            latest_version_split = latest_version.split(".")
-            new_version = (
-                latest_version_split[0]
-                + "."
-                + latest_version_split[1]
-                + "."
-                + str(int(latest_version_split[2]) + 1)
-            )
-        else:
-            return None
-
-        schema["version"] = new_version
-
-        tableschema_file = tempfile.NamedTemporaryFile(delete=False)
-        with open(tableschema_file.name, "w") as fp:
-            json.dump(schema, fp, indent=4)
-
-        new_version_key = f"{key}_{new_version}.json"
-        upload_to_minio(
-            netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
-        )
-        os.unlink(tableschema_file.name)
-    return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
+    return schema
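Note: with the minio-specific parameters gone, saving is controlled by the single `save_results` argument. A minimal usage sketch based on the new signature, assuming `generate_table_schema` accepts the report produced by `routine` directly (the `routine` arguments mirror those used in tests/test_file.py; the file names are hypothetical):

```python
from csv_detective import routine
from csv_detective.output.schema import generate_table_schema

# Produce an analysis report first (save_results=False keeps routine from writing files).
analysis_report = routine(file_path="data.csv", num_rows=-1, save_results=False)

# save_results=True writes to the default "schema.json"; a string is used as the
# output path; a falsy value skips writing. The schema dict is returned either way.
schema = generate_table_schema(analysis_report, save_results="my_schema.json")
```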
csv_detective/parsing/excel.py
CHANGED
@@ -1,6 +1,5 @@
 from io import BytesIO
 from time import time
-from typing import Optional
 
 import openpyxl
 import pandas as pd
@@ -23,8 +22,8 @@ XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
 def parse_excel(
     file_path: str,
     num_rows: int = -1,
-    engine: Optional[str] = None,
-    sheet_name: Optional[str] = None,
+    engine: str | None = None,
+    sheet_name: str | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py
CHANGED
@@ -1,5 +1,4 @@
 from io import BytesIO, StringIO
-from typing import Optional, Union
 
 import pandas as pd
 import requests
@@ -26,10 +25,10 @@ from .excel import (
 def load_file(
     file_path: str,
     num_rows: int = 500,
-    encoding: Optional[str] = None,
-    sep: Optional[str] = None,
+    encoding: str | None = None,
+    sep: str | None = None,
     verbose: bool = False,
-    sheet_name: Optional[Union[str, int]] = None,
+    sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
     engine = None
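For reference, a hedged usage sketch of the updated signature (the name `file_info` for the returned dict is illustrative; its keys are not shown in this diff):

```python
from csv_detective.parsing.load import load_file

# encoding and sep are now typed as str | None and, when left as None,
# are presumably detected; sheet_name (str | int | None) applies to Excel-like files.
df, file_info = load_file("data.csv", num_rows=500, verbose=True)
```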
csv_detective/utils.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional
 
 import pandas as pd
 
@@ -31,5 +30,7 @@ def is_url(file_path: str) -> bool:
     return file_path.startswith("http")
 
 
-def
-
+def cast_prevent_nan(value: float, _type: str) -> float | int | None:
+    if _type not in {"int", "float"}:
+        raise ValueError(f"Invalid type was passed: {_type}")
+    return None if pd.isna(value) else eval(_type)(value)
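The new helper keeps NaN from leaking through numeric casts. Its behavior follows directly from the body added above:

```python
import pandas as pd

from csv_detective.utils import cast_prevent_nan

cast_prevent_nan(3.0, "int")           # 3
cast_prevent_nan(3.5, "float")         # 3.5
cast_prevent_nan(float("nan"), "int")  # None instead of NaN
cast_prevent_nan(pd.NA, "float")       # None (pd.isna also covers pandas NA)
cast_prevent_nan(3.0, "str")           # raises ValueError: Invalid type was passed: str
```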
csv_detective/validate.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional, Union
 
 import pandas as pd
 
@@ -22,12 +21,12 @@ def validate(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    encoding: Optional[str] = None,
-    sep: Optional[str] = None,
+    encoding: str | None = None,
+    sep: str | None = None,
     verbose: bool = False,
     skipna: bool = True,
-    sheet_name: Optional[Union[str, int]] = None,
-) -> tuple[bool, Optional[pd.DataFrame], Optional[dict]]:
+    sheet_name: str | int | None = None,
+) -> tuple[bool, pd.DataFrame | None, dict | None]:
     """
     Verify is the given file has the same fields and types as in the previous analysis.
     """
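A hedged sketch of calling the updated `validate` (the unpacked names are illustrative; the diff only shows that the second and third return values are a dataframe and a dict, or None):

```python
from csv_detective import routine
from csv_detective.validate import validate

previous_analysis = routine(file_path="data.csv", num_rows=-1, save_results=False)

# Checks that the file still has the same fields and types as in the previous analysis.
is_valid, df, analysis = validate("data.csv", previous_analysis=previous_analysis)
```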
{csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA
CHANGED
@@ -1,15 +1,14 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.2.dev1874
+Version: 0.9.3.dev0
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
 Project-URL: Source, https://github.com/datagouv/csv_detective
 Keywords: CSV,data processing,encoding,guess,parser,tabular
-Requires-Python: <3.14,>=3.
+Requires-Python: <3.14,>=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: boto3<2,>=1.34.0
 Requires-Dist: dateparser<2,>=1.2.0
 Requires-Dist: faust-cchardet==2.1.19
 Requires-Dist: pandas<3,>=2.2.0
@@ -26,7 +25,6 @@ Requires-Dist: rstr==3.2.2
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
-Requires-Dist: bumpx>=0.3.10; extra == "dev"
 Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file
 
@@ -221,32 +219,26 @@ ruff check --fix .
 ruff format .
 ```
 
-
+### 🏷️ Release
 
-The release process uses `
+The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.
 
-```
-
-
-
-### Process
-
-1. `bumpx` will handle bumping the version according to your command (patch, minor, major)
-2. It will update the CHANGELOG according to the new version being published
-3. It will push a tag with the given version to github
-4. CircleCI will pickup this tag, build the package and publish it to pypi
-5. `bumpx` will have everything ready for the next version (version, changelog...)
+```bash
+# Create a new release
+./tag_version.sh <version>
 
-
+# Example
+./tag_version.sh 2.5.0
 
-
-
+# Dry run to see what would happen
+./tag_version.sh 2.5.0 --dry-run
 ```
 
-
-
-This will release a patch version:
+**Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
 
-
-
-
+The script automatically:
+- Updates the version in pyproject.toml
+- Extracts commits since the last tag and formats them for CHANGELOG.md
+- Identifies breaking changes (commits with `!:` in the subject)
+- Creates a git tag and pushes it to the remote repository
+- Creates a GitHub release with the changelog content
{csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD
CHANGED
@@ -1,10 +1,9 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=
-csv_detective/load_tests.py,sha256=
-csv_detective/
-csv_detective/
-csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
+csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
+csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
+csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
+csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
 csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,10 +16,10 @@ csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=9pR2tVS4J2Kryt
 csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=5vw4zjlmWaR2djxuQOUrmwsNIc9HgAE-zdxwerVR3S0,380
 csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=UsMEW1EVVgnw-daOc1jBkEaGKvqTONSAGnj1s3QgM8w,400
 csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=YsAGiblFexBxvu_E3XaXhy_bordc6c1oKPgDzTsDeXw,374
-csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=
+csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=7ONo0MxrJY1gPWRwyPCX4ZDbCINmxnKRV85zscADxT8,435
+csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=lIgWebNapfrnPt0XeNUMs78Xa_csGNAtTk8VEk9wXXo,342
+csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=YXTWSymmcXW9eD2OfiSlmX7N-IUtZkDrNYHd6vTnJTc,439
+csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=7tZ8sgIkQ9zuSOZ-vGYBkH04Vv1xlPlJDM78xYfD57Y,342
 csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=85y-5qNRAWJrKqL0wh9iPMUBQjvPwc9lv1cYB2m0daQ,364
 csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=6mJRaGsCPBY5JHHe8EWxEjDpAOIfvBPTaZKJb3_n3gU,1077
 csv_detective/detect_fields/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,13 +49,13 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.
 csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=AnAridM4C8hcm4PeNdr8969czgrzM4KemGVZWAJSM1U,436
 csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
 csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
-csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=PI-wlTJmPk6nznzu_Fou_SSCET90wIf78mXwb1W1K70,325
 csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
-csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=B7YFfvEI48DfAn8xbc-vpVERQaKh9_59ERfieo2D6OY,328
 csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
-csv_detective/detect_fields/other/email/__init__.py,sha256=
+csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
 csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
@@ -67,9 +66,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
 csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=
+csv_detective/detect_fields/temp/date/__init__.py,sha256=j066luXADCti4Mbb-jvznrL1jf3p5TpEpVzW8vThRDE,2124
+csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=oDaZIhkW0SXSYeuK5R5TIzajvSmu-XjUn8GpqITFLnY,1250
+csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=z5wpuHiDl8j7ZeQjfZ5wO9lG6H9Ps6X218ANNw19Dag,1073
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
 csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
 csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -131,38 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
-csv_detective/detection/engine.py,sha256=
-csv_detective/detection/formats.py,sha256=
-csv_detective/detection/headers.py,sha256=
+csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
+csv_detective/detection/formats.py,sha256=QXdxdECU5uC_ytLBT_6-xe0VAiaMptXF4KYiShRUVCA,7702
+csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
-csv_detective/output/__init__.py,sha256=
-csv_detective/output/dataframe.py,sha256=
-csv_detective/output/example.py,sha256=
-csv_detective/output/profile.py,sha256=
-csv_detective/output/schema.py,sha256=
+csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
+csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
+csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
+csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
+csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
-csv_detective/parsing/excel.py,sha256=
-csv_detective/parsing/load.py,sha256
+csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
+csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.
+csv_detective-0.9.3.dev0.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
+tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
 tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
-venv/bin/activate_this.py,sha256=
-venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
+venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
+csv_detective-0.9.3.dev0.dist-info/METADATA,sha256=Xga9fj8KjfrMOhp5ZIoXsJLcAI2Jz31yNsdfFJca2DU,9928
+csv_detective-0.9.3.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev0.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev0.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev0.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -79,6 +79,7 @@ from csv_detective.detection.variables import (
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.dataframe import cast
 from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col as col_test  # to prevent pytest from testing it
 
 
 def test_all_tests_return_bool():
@@ -284,7 +285,7 @@ fields = {
         False: ["nein", "ja", "2", "-0"],
     },
     email: {
-        True: ["cdo_intern@data.gouv.fr"],
+        True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
         False: ["cdo@@gouv.sfd"],
     },
     json: {
@@ -356,17 +357,25 @@ fields = {
         True: [
             "2021-06-22 10:20:10-04:00",
             "2030-06-22 00:00:00.0028+02:00",
+            "2000-12-21 10:20:10.1Z",
             "2024-12-19T10:53:36.428000+00:00",
             "1996/06/22 10:20:10 GMT",
         ],
         False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
     },
     datetime_naive: {
-        True: [
+        True: [
+            "2021-06-22 10:20:10",
+            "2030/06-22 00:00:00",
+            "2030/06/22 00:00:00.0028",
+        ],
         False: [
             "2021-06-22T30:20:10",
             "Sun, 06 Nov 1994 08:49:37 GMT",
             "2021-06-44 10:20:10+02:00",
+            "1999-12-01T00:00:00Z",
+            "2021-06-44",
+            "15 décembre 1985",
        ],
     },
     datetime_rfc822: {
@@ -451,8 +460,8 @@ def test_priority(args):
         ("28/01/2000", date),
         ("2025-08-20T14:30:00+02:00", datetime_aware),
         ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
-        ("1925_12_20T14:30:00.
-        ("1925 12 20 14:30:00Z",
+        ("1925_12_20T14:30:00.2763", datetime_naive),
+        ("1925 12 20 14:30:00Z", datetime_aware),
     ),
 )
 def test_early_detection(args):
@@ -461,3 +470,27 @@ def test_early_detection(args):
     res = module._is(value)
     assert res
     mock_func.assert_not_called()
+
+
+def test_all_proportion_1():
+    all_tests = return_all_tests("ALL", "detect_fields")
+    prop_1 = {
+        t.__name__.split(".")[-1]: eval(
+            t.__name__.split(".")[-1]
+            if t.__name__.split(".")[-1] not in ["int", "float"]
+            else "test_" + t.__name__.split(".")[-1]
+        )
+        for t in all_tests
+        if t.PROPORTION == 1
+    }
+    # building a table that uses only correct values for these formats, except on one row
+    table = pd.DataFrame(
+        {
+            test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"]
+            for test_name, test_module in prop_1.items()
+        }
+    )
+    # testing columns for all formats
+    returned_table = col_test(table, all_tests, limited_output=True)
+    # the analysis should have found no match on any format
+    assert all(returned_table[col].sum() == 0 for col in table.columns)
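One detail worth noting in this file's first hunk: pytest collects module-level callables whose names start with `test_`, so `test_col` is imported under the alias `col_test` to keep the helper from being run as a test. A minimal illustration of the pattern (import from this diff, test body hypothetical):

```python
# Importing under the real name would make pytest treat test_col as a test case.
from csv_detective.parsing.columns import test_col as col_test


def test_uses_the_helper():
    # col_test can now be called freely inside real tests.
    assert callable(col_test)
```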
tests/test_file.py
CHANGED
@@ -5,6 +5,8 @@ import pytest
 import responses
 
 from csv_detective import routine
+from csv_detective.output.profile import create_profile
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
 
 
 @pytest.mark.parametrize(
@@ -97,6 +99,55 @@ def test_profile_with_num_rows():
 )
 
 
+@pytest.mark.parametrize(
+    "params",
+    (
+        (
+            True,
+            {
+                "int_with_nan": {"format": "int", "python_type": "int"},
+                "date": {"format": "date", "python_type": "date"},
+            },
+        ),
+        (
+            False,
+            {
+                "int_with_nan": [{"format": "int", "python_type": "int"}],
+                "date": [{"format": "date", "python_type": "date"}],
+            },
+        ),
+    ),
+)
+def test_profile_specific_cases(params):
+    limited_output, columns = params
+    table = pd.DataFrame(
+        {
+            "int_with_nan": ["1", pd.NA, pd.NA],
+            "date": ["1996-01-02", "1996-01-02", "2024-11-12"],
+        }
+    )
+    profile = create_profile(
+        table=table,
+        columns=columns,
+        limited_output=limited_output,
+        num_rows=-1,
+    )
+    assert profile["int_with_nan"] == {
+        "min": 1,
+        "max": 1,
+        "mean": 1,
+        "std": None,
+        "tops": [{"count": 1, "value": "1"}],
+        "nb_distinct": 1,
+        "nb_missing_values": 2,
+    }
+    assert profile["date"] == {
+        "tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
+        "nb_distinct": 2,
+        "nb_missing_values": 0,
+    }
+
+
 def test_exception_different_number_of_columns():
     """
     A ValueError should be raised if the number of columns differs between the first rows
@@ -293,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
         save_results=False,
     )
     assert analysis["columns"][col_name]["format"] == "int"
+
+
+def test_full_nan_column(mocked_responses):
+    # we want a file that needs sampling
+    expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
+    mocked_responses.get(
+        "http://example.com/test.csv",
+        body=expected_content,
+        status=200,
+    )
+    # just testing it doesn't fail
+    routine(
+        file_path="http://example.com/test.csv",
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+    )
venv/bin/activate_this.py
CHANGED
@@ -29,7 +29,7 @@ os.environ["VIRTUAL_ENV_PROMPT"] = '' or os.path.basename(base)
 
 # add the virtual environments libraries to the host python import mechanism
 prev_length = len(sys.path)
-for lib in '../lib/python3.
+for lib in '../lib/python3.11/site-packages'.split(os.pathsep):
     path = os.path.realpath(os.path.join(bin_dir, lib))
     site.addsitedir(path.decode("utf-8") if '' else path)
 sys.path[:] = sys.path[prev_length:] + sys.path[0:prev_length]
csv_detective/s3_utils.py
DELETED
@@ -1,44 +0,0 @@
-import logging
-
-import boto3
-from botocore.client import Config
-from botocore.exceptions import ClientError
-
-
-def get_minio_url(netloc: str, bucket: str, key: str) -> str:
-    """Returns location of given resource in minio once it is saved"""
-    return netloc + "/" + bucket + "/" + key
-
-
-def get_s3_client(url: str, minio_user: str, minio_pwd: str) -> boto3.client:
-    return boto3.client(
-        "s3",
-        endpoint_url=url,
-        aws_access_key_id=minio_user,
-        aws_secret_access_key=minio_pwd,
-        config=Config(signature_version="s3v4"),
-    )
-
-
-def download_from_minio(
-    netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
-) -> None:
-    logging.info("Downloading from minio")
-    s3 = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        s3.download_file(bucket, key, filepath)
-        logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
-    except ClientError as e:
-        logging.error(e)
-
-
-def upload_to_minio(
-    netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
-) -> None:
-    logging.info("Saving to minio")
-    s3 = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        s3.upload_file(filepath, bucket, key)
-        logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
-    except ClientError as e:
-        logging.error(e)