csv-detective 0.9.2.dev1874__py3-none-any.whl → 0.9.3.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. csv_detective/__init__.py +1 -2
  2. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  6. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  7. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  8. csv_detective/detect_fields/other/email/__init__.py +2 -2
  9. csv_detective/detect_fields/temp/date/__init__.py +1 -2
  10. csv_detective/detect_fields/temp/datetime_aware/__init__.py +7 -6
  11. csv_detective/detect_fields/temp/datetime_naive/__init__.py +4 -8
  12. csv_detective/detection/engine.py +1 -2
  13. csv_detective/detection/formats.py +14 -8
  14. csv_detective/detection/headers.py +2 -2
  15. csv_detective/explore_csv.py +11 -119
  16. csv_detective/load_tests.py +1 -2
  17. csv_detective/output/__init__.py +11 -14
  18. csv_detective/output/dataframe.py +1 -2
  19. csv_detective/output/example.py +12 -12
  20. csv_detective/output/profile.py +13 -10
  21. csv_detective/output/schema.py +7 -86
  22. csv_detective/parsing/excel.py +2 -3
  23. csv_detective/parsing/load.py +3 -4
  24. csv_detective/utils.py +4 -3
  25. csv_detective/validate.py +4 -5
  26. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA +18 -26
  27. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD +34 -36
  28. tests/test_fields.py +37 -4
  29. tests/test_file.py +68 -0
  30. venv/bin/activate_this.py +1 -1
  31. csv_detective/s3_utils.py +0 -44
  32. venv/bin/jp.py +0 -54
  33. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL +0 -0
  34. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt +0 -0
  35. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE +0 -0
  36. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,8 @@
1
1
  import json
2
2
  import logging
3
- import os
4
- import tempfile
5
3
  from datetime import datetime
6
4
  from time import time
7
- from typing import Optional
8
5
 
9
- from botocore.exceptions import ClientError
10
-
11
- from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
12
6
  from csv_detective.utils import display_logs_depending_process_time
13
7
 
14
8
 
@@ -202,25 +196,14 @@ def get_constraints(format: str) -> dict:
202
196
 
203
197
  def generate_table_schema(
204
198
  analysis_report: dict,
205
- save_file: bool,
206
- netloc: Optional[str] = None,
207
- bucket: Optional[str] = None,
208
- key: Optional[str] = None,
209
- minio_user: Optional[str] = None,
210
- minio_pwd: Optional[str] = None,
199
+ save_results: bool | str = True,
211
200
  verbose: bool = False,
212
201
  ) -> dict:
213
202
  """Generates a table schema from the analysis report
214
203
 
215
204
  Args:
216
205
  analysis_report (dict): The analysis report from csv_detective
217
- save_file (bool): indicate if schema should be saved into minio or just returned
218
- netloc (str): The netloc of the minio instance to upload the tableschema
219
- bucket (str): The bucket to save the schema in
220
- key (str): The key to save the schema in (without extension as we will append
221
- version number and extension)
222
- minio_user (str): The minio user
223
- minio_pwd (str): The minio password
206
+ save_results (bool or str): whether and where to save the results
224
207
 
225
208
  Returns:
226
209
  """
@@ -277,71 +260,9 @@ def generate_table_schema(
277
260
  f"Created schema in {round(time() - start, 3)}s", time() - start
278
261
  )
279
262
 
280
- if not save_file:
281
- return schema
282
-
283
- if save_file:
284
- if not all([netloc, key, bucket, minio_user, minio_pwd]):
285
- raise Exception(
286
- "To save schema into minio, parameters : netloc, key, bucket, "
287
- "minio_user, minio_pwd should be provided"
288
- )
289
-
290
- # Create bucket if does not exist
291
- client = get_s3_client(netloc, minio_user, minio_pwd)
292
- try:
293
- client.head_bucket(Bucket=bucket)
294
- except ClientError:
295
- client.create_bucket(Bucket=bucket)
296
-
297
- tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
298
- if "Contents" in tableschema_objects:
299
- tableschema_keys = [
300
- tableschema["Key"]
301
- for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
302
- "Contents"
303
- ]
304
- ]
305
- tableschema_versions = [
306
- os.path.splitext(tableschema_key)[0].split("_")[-1]
307
- for tableschema_key in tableschema_keys
308
- ]
309
- latest_version = max(tableschema_versions)
263
+ if save_results:
264
+ output_path = save_results if isinstance(save_results, str) else "schema.json"
265
+ with open(output_path, "w", encoding="utf8") as fp:
266
+ json.dump(schema, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str)
310
267
 
311
- with tempfile.NamedTemporaryFile() as latest_schema_file:
312
- with open(latest_schema_file.name, "w") as fp:
313
- download_from_minio(
314
- netloc,
315
- bucket,
316
- f"{key}_{latest_version}.json",
317
- latest_schema_file.name,
318
- minio_user,
319
- minio_pwd,
320
- )
321
- # Check if files are different
322
- with open(latest_schema_file.name, "r") as fp:
323
- latest_schema = json.load(fp)
324
- if latest_schema["fields"] != fields:
325
- latest_version_split = latest_version.split(".")
326
- new_version = (
327
- latest_version_split[0]
328
- + "."
329
- + latest_version_split[1]
330
- + "."
331
- + str(int(latest_version_split[2]) + 1)
332
- )
333
- else:
334
- return None
335
-
336
- schema["version"] = new_version
337
-
338
- tableschema_file = tempfile.NamedTemporaryFile(delete=False)
339
- with open(tableschema_file.name, "w") as fp:
340
- json.dump(schema, fp, indent=4)
341
-
342
- new_version_key = f"{key}_{new_version}.json"
343
- upload_to_minio(
344
- netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
345
- )
346
- os.unlink(tableschema_file.name)
347
- return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
268
+ return schema
@@ -1,6 +1,5 @@
1
1
  from io import BytesIO
2
2
  from time import time
3
- from typing import Optional
4
3
 
5
4
  import openpyxl
6
5
  import pandas as pd
@@ -23,8 +22,8 @@ XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
23
22
  def parse_excel(
24
23
  file_path: str,
25
24
  num_rows: int = -1,
26
- engine: Optional[str] = None,
27
- sheet_name: Optional[str] = None,
25
+ engine: str | None = None,
26
+ sheet_name: str | None = None,
28
27
  random_state: int = 42,
29
28
  verbose: bool = False,
30
29
  ) -> tuple[pd.DataFrame, int, int, str, str, int]:
@@ -1,5 +1,4 @@
1
1
  from io import BytesIO, StringIO
2
- from typing import Optional, Union
3
2
 
4
3
  import pandas as pd
5
4
  import requests
@@ -26,10 +25,10 @@ from .excel import (
26
25
  def load_file(
27
26
  file_path: str,
28
27
  num_rows: int = 500,
29
- encoding: Optional[str] = None,
30
- sep: Optional[str] = None,
28
+ encoding: str | None = None,
29
+ sep: str | None = None,
31
30
  verbose: bool = False,
32
- sheet_name: Optional[Union[str, int]] = None,
31
+ sheet_name: str | int | None = None,
33
32
  ) -> tuple[pd.DataFrame, dict]:
34
33
  file_name = file_path.split("/")[-1]
35
34
  engine = None
csv_detective/utils.py CHANGED
@@ -1,5 +1,4 @@
1
1
  import logging
2
- from typing import Optional
3
2
 
4
3
  import pandas as pd
5
4
 
@@ -31,5 +30,7 @@ def is_url(file_path: str) -> bool:
31
30
  return file_path.startswith("http")
32
31
 
33
32
 
34
- def prevent_nan(value: float) -> Optional[float]:
35
- return None if pd.isna(value) else value
33
+ def cast_prevent_nan(value: float, _type: str) -> float | int | None:
34
+ if _type not in {"int", "float"}:
35
+ raise ValueError(f"Invalid type was passed: {_type}")
36
+ return None if pd.isna(value) else eval(_type)(value)
csv_detective/validate.py CHANGED
@@ -1,5 +1,4 @@
1
1
  import logging
2
- from typing import Optional, Union
3
2
 
4
3
  import pandas as pd
5
4
 
@@ -22,12 +21,12 @@ def validate(
22
21
  file_path: str,
23
22
  previous_analysis: dict,
24
23
  num_rows: int = 500,
25
- encoding: Optional[str] = None,
26
- sep: Optional[str] = None,
24
+ encoding: str | None = None,
25
+ sep: str | None = None,
27
26
  verbose: bool = False,
28
27
  skipna: bool = True,
29
- sheet_name: Optional[Union[str, int]] = None,
30
- ) -> tuple[bool, Optional[pd.DataFrame], Optional[dict]]:
28
+ sheet_name: str | int | None = None,
29
+ ) -> tuple[bool, pd.DataFrame | None, dict | None]:
31
30
  """
32
31
  Verify is the given file has the same fields and types as in the previous analysis.
33
32
  """
@@ -1,15 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.2.dev1874
3
+ Version: 0.9.3.dev0
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
7
7
  Project-URL: Source, https://github.com/datagouv/csv_detective
8
8
  Keywords: CSV,data processing,encoding,guess,parser,tabular
9
- Requires-Python: <3.14,>=3.9
9
+ Requires-Python: <3.14,>=3.10
10
10
  Description-Content-Type: text/markdown
11
11
  License-File: LICENSE
12
- Requires-Dist: boto3<2,>=1.34.0
13
12
  Requires-Dist: dateparser<2,>=1.2.0
14
13
  Requires-Dist: faust-cchardet==2.1.19
15
14
  Requires-Dist: pandas<3,>=2.2.0
@@ -26,7 +25,6 @@ Requires-Dist: rstr==3.2.2
26
25
  Provides-Extra: dev
27
26
  Requires-Dist: pytest>=8.3.0; extra == "dev"
28
27
  Requires-Dist: responses>=0.25.0; extra == "dev"
29
- Requires-Dist: bumpx>=0.3.10; extra == "dev"
30
28
  Requires-Dist: ruff>=0.9.3; extra == "dev"
31
29
  Dynamic: license-file
32
30
 
@@ -221,32 +219,26 @@ ruff check --fix .
221
219
  ruff format .
222
220
  ```
223
221
 
224
- ## Release
222
+ ### 🏷️ Release
225
223
 
226
- The release process uses `bumpx`.
224
+ The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.
227
225
 
228
- ```shell
229
- pip install -e .[dev]
230
- ```
231
-
232
- ### Process
233
-
234
- 1. `bumpx` will handle bumping the version according to your command (patch, minor, major)
235
- 2. It will update the CHANGELOG according to the new version being published
236
- 3. It will push a tag with the given version to github
237
- 4. CircleCI will pickup this tag, build the package and publish it to pypi
238
- 5. `bumpx` will have everything ready for the next version (version, changelog...)
226
+ ```bash
227
+ # Create a new release
228
+ ./tag_version.sh <version>
239
229
 
240
- ### Dry run
230
+ # Example
231
+ ./tag_version.sh 2.5.0
241
232
 
242
- ```shell
243
- bumpx -d -v
233
+ # Dry run to see what would happen
234
+ ./tag_version.sh 2.5.0 --dry-run
244
235
  ```
245
236
 
246
- ### Release
247
-
248
- This will release a patch version:
237
+ **Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
249
238
 
250
- ```shell
251
- bumpx -v
252
- ```
239
+ The script automatically:
240
+ - Updates the version in pyproject.toml
241
+ - Extracts commits since the last tag and formats them for CHANGELOG.md
242
+ - Identifies breaking changes (commits with `!:` in the subject)
243
+ - Creates a git tag and pushes it to the remote repository
244
+ - Creates a GitHub release with the changelog content
@@ -1,10 +1,9 @@
1
- csv_detective/__init__.py,sha256=FsL6q5F-gKLMnWy05-1CJpa4cz9tquheZ2LS1tjkVgI,162
1
+ csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
2
2
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
3
- csv_detective/explore_csv.py,sha256=sEMza4Z27ac88fGq7tUiK1zlfvuftztHhHVoa0c2EVU,9191
4
- csv_detective/load_tests.py,sha256=uVKweLq3cf-yB5ZZI-m9tBVs_SWNcOw8sDJa97TOJGo,2266
5
- csv_detective/s3_utils.py,sha256=z1KTVVkdurMv21o-rZu7_aluMJnSi-d5uxnQbqT2NoI,1407
6
- csv_detective/utils.py,sha256=u9I1tsyMfVr2eIYiGCD7Iu30d55H3za44-N3cV2nj8M,1013
7
- csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
3
+ csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
4
+ csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
5
+ csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
6
+ csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
8
7
  csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
9
8
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
9
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,10 +16,10 @@ csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=9pR2tVS4J2Kryt
17
16
  csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=5vw4zjlmWaR2djxuQOUrmwsNIc9HgAE-zdxwerVR3S0,380
18
17
  csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=UsMEW1EVVgnw-daOc1jBkEaGKvqTONSAGnj1s3QgM8w,400
19
18
  csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=YsAGiblFexBxvu_E3XaXhy_bordc6c1oKPgDzTsDeXw,374
20
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=RjkDSZzIbp4nnvDpa5GomDpyIJGvwErX7TgC4dlBJ14,437
21
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=7xmYpTYoHvFfcuocAhm6dP_j4sMII_hG1PMSrWId4FY,344
22
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=JbKuGK5UmUGAQKPFpN4RSLf3axJ5D1aCjzRXYHW-iXU,441
23
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=5VWDaHZvGhJAJu5XQrj6gLx5CVA9dNOE30eTXQ3pSf0,344
19
+ csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=7ONo0MxrJY1gPWRwyPCX4ZDbCINmxnKRV85zscADxT8,435
20
+ csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=lIgWebNapfrnPt0XeNUMs78Xa_csGNAtTk8VEk9wXXo,342
21
+ csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=YXTWSymmcXW9eD2OfiSlmX7N-IUtZkDrNYHd6vTnJTc,439
22
+ csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=7tZ8sgIkQ9zuSOZ-vGYBkH04Vv1xlPlJDM78xYfD57Y,342
24
23
  csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=85y-5qNRAWJrKqL0wh9iPMUBQjvPwc9lv1cYB2m0daQ,364
25
24
  csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=6mJRaGsCPBY5JHHe8EWxEjDpAOIfvBPTaZKJb3_n3gU,1077
26
25
  csv_detective/detect_fields/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,13 +49,13 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.
50
49
  csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=AnAridM4C8hcm4PeNdr8969czgrzM4KemGVZWAJSM1U,436
51
50
  csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
52
51
  csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
53
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=sdor-L1WDHv5opg1Le13mru4ImSA-yEbxchlWENuUFE,327
52
+ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=PI-wlTJmPk6nznzu_Fou_SSCET90wIf78mXwb1W1K70,325
54
53
  csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
55
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=gPnNTe-L9xjBVE-30VCJiK6IVZttj6Cy6zu1IL5907Y,330
54
+ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=B7YFfvEI48DfAn8xbc-vpVERQaKh9_59ERfieo2D6OY,328
56
55
  csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
57
56
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
57
  csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
59
- csv_detective/detect_fields/other/email/__init__.py,sha256=p235wILf0fR9TeSEuyuPgoysAv9zg23a4vzdy3YJlxE,192
58
+ csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
60
59
  csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
61
60
  csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
62
61
  csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
@@ -67,9 +66,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
67
66
  csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
68
67
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
69
68
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
- csv_detective/detect_fields/temp/date/__init__.py,sha256=JtWaK8hkzBaIUc-fu0G7lIFpWqCfraRh6l0Mo65U3b0,2155
71
- csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=ZDNUcbU0ZJzaxUt0Utc1Y9dRrq4HHW9uCbcnOuz5Sfk,1247
72
- csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=QoVOA98lT_GVSGO_mQwKtAy2o-REs8C9d6JB9d_L_B4,1189
69
+ csv_detective/detect_fields/temp/date/__init__.py,sha256=j066luXADCti4Mbb-jvznrL1jf3p5TpEpVzW8vThRDE,2124
70
+ csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=oDaZIhkW0SXSYeuK5R5TIzajvSmu-XjUn8GpqITFLnY,1250
71
+ csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=z5wpuHiDl8j7ZeQjfZ5wO9lG6H9Ps6X218ANNw19Dag,1073
73
72
  csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
74
73
  csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
75
74
  csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -131,38 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
131
130
  csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
132
131
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
133
132
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
134
- csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
135
- csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
136
- csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
133
+ csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
134
+ csv_detective/detection/formats.py,sha256=QXdxdECU5uC_ytLBT_6-xe0VAiaMptXF4KYiShRUVCA,7702
135
+ csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
137
136
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
138
137
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
139
138
  csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
140
- csv_detective/output/__init__.py,sha256=02F5D5TODMiImyZzjnX-vIkMPkUC0ioIryqdBm6xT-w,2056
141
- csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
142
- csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
143
- csv_detective/output/profile.py,sha256=k-t--uVHkrt3MRLnRAthiaF069jGc1jsQnfcOoBchrU,2524
144
- csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
139
+ csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
140
+ csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
141
+ csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
142
+ csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
143
+ csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
145
144
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
146
145
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
147
146
  csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
148
147
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
149
148
  csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
150
- csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
151
- csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
149
+ csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
150
+ csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
152
151
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
153
- csv_detective-0.9.2.dev1874.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
152
+ csv_detective-0.9.3.dev0.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
154
153
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
154
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
156
- tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
157
- tests/test_file.py,sha256=NBLwPCFN2skZHLkckPZ0M0ZvanEdL88KVK1Vi9GhSaU,8925
155
+ tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
156
+ tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
158
157
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
159
158
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
160
159
  tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
161
- venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
162
- venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
160
+ venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
163
161
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
164
- csv_detective-0.9.2.dev1874.dist-info/METADATA,sha256=uQd1XKNz46xPglKQ-wEpbAjy0CAXOgPCYR-NCeBckS0,9767
165
- csv_detective-0.9.2.dev1874.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
- csv_detective-0.9.2.dev1874.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
- csv_detective-0.9.2.dev1874.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
- csv_detective-0.9.2.dev1874.dist-info/RECORD,,
162
+ csv_detective-0.9.3.dev0.dist-info/METADATA,sha256=Xga9fj8KjfrMOhp5ZIoXsJLcAI2Jz31yNsdfFJca2DU,9928
163
+ csv_detective-0.9.3.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
+ csv_detective-0.9.3.dev0.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
+ csv_detective-0.9.3.dev0.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
166
+ csv_detective-0.9.3.dev0.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -79,6 +79,7 @@ from csv_detective.detection.variables import (
79
79
  from csv_detective.load_tests import return_all_tests
80
80
  from csv_detective.output.dataframe import cast
81
81
  from csv_detective.output.utils import prepare_output_dict
82
+ from csv_detective.parsing.columns import test_col as col_test # to prevent pytest from testing it
82
83
 
83
84
 
84
85
  def test_all_tests_return_bool():
@@ -284,7 +285,7 @@ fields = {
284
285
  False: ["nein", "ja", "2", "-0"],
285
286
  },
286
287
  email: {
287
- True: ["cdo_intern@data.gouv.fr"],
288
+ True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
288
289
  False: ["cdo@@gouv.sfd"],
289
290
  },
290
291
  json: {
@@ -356,17 +357,25 @@ fields = {
356
357
  True: [
357
358
  "2021-06-22 10:20:10-04:00",
358
359
  "2030-06-22 00:00:00.0028+02:00",
360
+ "2000-12-21 10:20:10.1Z",
359
361
  "2024-12-19T10:53:36.428000+00:00",
360
362
  "1996/06/22 10:20:10 GMT",
361
363
  ],
362
364
  False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
363
365
  },
364
366
  datetime_naive: {
365
- True: ["2021-06-22 10:20:10", "2030/06/22 00:00:00.0028"],
367
+ True: [
368
+ "2021-06-22 10:20:10",
369
+ "2030/06-22 00:00:00",
370
+ "2030/06/22 00:00:00.0028",
371
+ ],
366
372
  False: [
367
373
  "2021-06-22T30:20:10",
368
374
  "Sun, 06 Nov 1994 08:49:37 GMT",
369
375
  "2021-06-44 10:20:10+02:00",
376
+ "1999-12-01T00:00:00Z",
377
+ "2021-06-44",
378
+ "15 décembre 1985",
370
379
  ],
371
380
  },
372
381
  datetime_rfc822: {
@@ -451,8 +460,8 @@ def test_priority(args):
451
460
  ("28/01/2000", date),
452
461
  ("2025-08-20T14:30:00+02:00", datetime_aware),
453
462
  ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
454
- ("1925_12_20T14:30:00.2763Z", datetime_naive),
455
- ("1925 12 20 14:30:00Z", datetime_naive),
463
+ ("1925_12_20T14:30:00.2763", datetime_naive),
464
+ ("1925 12 20 14:30:00Z", datetime_aware),
456
465
  ),
457
466
  )
458
467
  def test_early_detection(args):
@@ -461,3 +470,27 @@ def test_early_detection(args):
461
470
  res = module._is(value)
462
471
  assert res
463
472
  mock_func.assert_not_called()
473
+
474
+
475
+ def test_all_proportion_1():
476
+ all_tests = return_all_tests("ALL", "detect_fields")
477
+ prop_1 = {
478
+ t.__name__.split(".")[-1]: eval(
479
+ t.__name__.split(".")[-1]
480
+ if t.__name__.split(".")[-1] not in ["int", "float"]
481
+ else "test_" + t.__name__.split(".")[-1]
482
+ )
483
+ for t in all_tests
484
+ if t.PROPORTION == 1
485
+ }
486
+ # building a table that uses only correct values for these formats, except on one row
487
+ table = pd.DataFrame(
488
+ {
489
+ test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"]
490
+ for test_name, test_module in prop_1.items()
491
+ }
492
+ )
493
+ # testing columns for all formats
494
+ returned_table = col_test(table, all_tests, limited_output=True)
495
+ # the analysis should have found no match on any format
496
+ assert all(returned_table[col].sum() == 0 for col in table.columns)
tests/test_file.py CHANGED
@@ -5,6 +5,8 @@ import pytest
5
5
  import responses
6
6
 
7
7
  from csv_detective import routine
8
+ from csv_detective.output.profile import create_profile
9
+ from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
8
10
 
9
11
 
10
12
  @pytest.mark.parametrize(
@@ -97,6 +99,55 @@ def test_profile_with_num_rows():
97
99
  )
98
100
 
99
101
 
102
+ @pytest.mark.parametrize(
103
+ "params",
104
+ (
105
+ (
106
+ True,
107
+ {
108
+ "int_with_nan": {"format": "int", "python_type": "int"},
109
+ "date": {"format": "date", "python_type": "date"},
110
+ },
111
+ ),
112
+ (
113
+ False,
114
+ {
115
+ "int_with_nan": [{"format": "int", "python_type": "int"}],
116
+ "date": [{"format": "date", "python_type": "date"}],
117
+ },
118
+ ),
119
+ ),
120
+ )
121
+ def test_profile_specific_cases(params):
122
+ limited_output, columns = params
123
+ table = pd.DataFrame(
124
+ {
125
+ "int_with_nan": ["1", pd.NA, pd.NA],
126
+ "date": ["1996-01-02", "1996-01-02", "2024-11-12"],
127
+ }
128
+ )
129
+ profile = create_profile(
130
+ table=table,
131
+ columns=columns,
132
+ limited_output=limited_output,
133
+ num_rows=-1,
134
+ )
135
+ assert profile["int_with_nan"] == {
136
+ "min": 1,
137
+ "max": 1,
138
+ "mean": 1,
139
+ "std": None,
140
+ "tops": [{"count": 1, "value": "1"}],
141
+ "nb_distinct": 1,
142
+ "nb_missing_values": 2,
143
+ }
144
+ assert profile["date"] == {
145
+ "tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
146
+ "nb_distinct": 2,
147
+ "nb_missing_values": 0,
148
+ }
149
+
150
+
100
151
  def test_exception_different_number_of_columns():
101
152
  """
102
153
  A ValueError should be raised if the number of columns differs between the first rows
@@ -293,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
293
344
  save_results=False,
294
345
  )
295
346
  assert analysis["columns"][col_name]["format"] == "int"
347
+
348
+
349
+ def test_full_nan_column(mocked_responses):
350
+ # we want a file that needs sampling
351
+ expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
352
+ mocked_responses.get(
353
+ "http://example.com/test.csv",
354
+ body=expected_content,
355
+ status=200,
356
+ )
357
+ # just testing it doesn't fail
358
+ routine(
359
+ file_path="http://example.com/test.csv",
360
+ num_rows=-1,
361
+ output_profile=False,
362
+ save_results=False,
363
+ )
venv/bin/activate_this.py CHANGED
@@ -29,7 +29,7 @@ os.environ["VIRTUAL_ENV_PROMPT"] = '' or os.path.basename(base)
29
29
 
30
30
  # add the virtual environments libraries to the host python import mechanism
31
31
  prev_length = len(sys.path)
32
- for lib in '../lib/python3.9/site-packages'.split(os.pathsep):
32
+ for lib in '../lib/python3.11/site-packages'.split(os.pathsep):
33
33
  path = os.path.realpath(os.path.join(bin_dir, lib))
34
34
  site.addsitedir(path.decode("utf-8") if '' else path)
35
35
  sys.path[:] = sys.path[prev_length:] + sys.path[0:prev_length]
csv_detective/s3_utils.py DELETED
@@ -1,44 +0,0 @@
1
- import logging
2
-
3
- import boto3
4
- from botocore.client import Config
5
- from botocore.exceptions import ClientError
6
-
7
-
8
- def get_minio_url(netloc: str, bucket: str, key: str) -> str:
9
- """Returns location of given resource in minio once it is saved"""
10
- return netloc + "/" + bucket + "/" + key
11
-
12
-
13
- def get_s3_client(url: str, minio_user: str, minio_pwd: str) -> boto3.client:
14
- return boto3.client(
15
- "s3",
16
- endpoint_url=url,
17
- aws_access_key_id=minio_user,
18
- aws_secret_access_key=minio_pwd,
19
- config=Config(signature_version="s3v4"),
20
- )
21
-
22
-
23
- def download_from_minio(
24
- netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
25
- ) -> None:
26
- logging.info("Downloading from minio")
27
- s3 = get_s3_client(netloc, minio_user, minio_pwd)
28
- try:
29
- s3.download_file(bucket, key, filepath)
30
- logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
31
- except ClientError as e:
32
- logging.error(e)
33
-
34
-
35
- def upload_to_minio(
36
- netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
37
- ) -> None:
38
- logging.info("Saving to minio")
39
- s3 = get_s3_client(netloc, minio_user, minio_pwd)
40
- try:
41
- s3.upload_file(filepath, bucket, key)
42
- logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
43
- except ClientError as e:
44
- logging.error(e)