csv-detective 0.9.2.dev1874__py3-none-any.whl → 0.9.3.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +2 -2
- csv_detective/detect_fields/temp/date/__init__.py +1 -2
- csv_detective/detect_fields/temp/datetime_aware/__init__.py +7 -6
- csv_detective/detect_fields/temp/datetime_naive/__init__.py +4 -8
- csv_detective/detection/engine.py +1 -2
- csv_detective/detection/formats.py +14 -8
- csv_detective/detection/headers.py +2 -2
- csv_detective/explore_csv.py +11 -119
- csv_detective/load_tests.py +1 -2
- csv_detective/output/__init__.py +11 -14
- csv_detective/output/dataframe.py +1 -2
- csv_detective/output/example.py +12 -12
- csv_detective/output/profile.py +13 -10
- csv_detective/output/schema.py +7 -86
- csv_detective/parsing/excel.py +2 -3
- csv_detective/parsing/load.py +3 -4
- csv_detective/utils.py +4 -3
- csv_detective/validate.py +4 -5
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA +18 -26
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD +34 -36
- tests/test_fields.py +37 -4
- tests/test_file.py +68 -0
- venv/bin/activate_this.py +1 -1
- csv_detective/s3_utils.py +0 -44
- venv/bin/jp.py +0 -54
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt +0 -0
csv_detective/output/schema.py
CHANGED
@@ -1,14 +1,8 @@
 import json
 import logging
-import os
-import tempfile
 from datetime import datetime
 from time import time
-from typing import Optional
 
-from botocore.exceptions import ClientError
-
-from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time
 
 
@@ -202,25 +196,14 @@ def get_constraints(format: str) -> dict:
 
 def generate_table_schema(
     analysis_report: dict,
-
-    netloc: Optional[str] = None,
-    bucket: Optional[str] = None,
-    key: Optional[str] = None,
-    minio_user: Optional[str] = None,
-    minio_pwd: Optional[str] = None,
+    save_results: bool | str = True,
     verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report
 
     Args:
         analysis_report (dict): The analysis report from csv_detective
-
-        netloc (str): The netloc of the minio instance to upload the tableschema
-        bucket (str): The bucket to save the schema in
-        key (str): The key to save the schema in (without extension as we will append
-            version number and extension)
-        minio_user (str): The minio user
-        minio_pwd (str): The minio password
+        save_results (bool or str): whether and where to save the results
 
     Returns:
     """
@@ -277,71 +260,9 @@ def generate_table_schema(
         f"Created schema in {round(time() - start, 3)}s", time() - start
     )
 
-    if
-
-
-
-    if not all([netloc, key, bucket, minio_user, minio_pwd]):
-        raise Exception(
-            "To save schema into minio, parameters : netloc, key, bucket, "
-            "minio_user, minio_pwd should be provided"
-        )
-
-    # Create bucket if does not exist
-    client = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        client.head_bucket(Bucket=bucket)
-    except ClientError:
-        client.create_bucket(Bucket=bucket)
-
-    tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
-    if "Contents" in tableschema_objects:
-        tableschema_keys = [
-            tableschema["Key"]
-            for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
-                "Contents"
-            ]
-        ]
-        tableschema_versions = [
-            os.path.splitext(tableschema_key)[0].split("_")[-1]
-            for tableschema_key in tableschema_keys
-        ]
-        latest_version = max(tableschema_versions)
+    if save_results:
+        output_path = save_results if isinstance(save_results, str) else "schema.json"
+        with open(output_path, "w", encoding="utf8") as fp:
+            json.dump(schema, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str)
 
-
-        with open(latest_schema_file.name, "w") as fp:
-            download_from_minio(
-                netloc,
-                bucket,
-                f"{key}_{latest_version}.json",
-                latest_schema_file.name,
-                minio_user,
-                minio_pwd,
-            )
-        # Check if files are different
-        with open(latest_schema_file.name, "r") as fp:
-            latest_schema = json.load(fp)
-        if latest_schema["fields"] != fields:
-            latest_version_split = latest_version.split(".")
-            new_version = (
-                latest_version_split[0]
-                + "."
-                + latest_version_split[1]
-                + "."
-                + str(int(latest_version_split[2]) + 1)
-            )
-        else:
-            return None
-
-        schema["version"] = new_version
-
-        tableschema_file = tempfile.NamedTemporaryFile(delete=False)
-        with open(tableschema_file.name, "w") as fp:
-            json.dump(schema, fp, indent=4)
-
-        new_version_key = f"{key}_{new_version}.json"
-        upload_to_minio(
-            netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
-        )
-        os.unlink(tableschema_file.name)
-    return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
+    return schema
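Note: with the minio-specific parameters gone, saving is controlled by the single `save_results` argument. A minimal usage sketch based on the new signature, assuming `generate_table_schema` accepts the report produced by `routine` directly (the `routine` arguments mirror those used in tests/test_file.py; the file names are hypothetical):

```python
from csv_detective import routine
from csv_detective.output.schema import generate_table_schema

# Produce an analysis report first (save_results=False keeps routine from writing files).
analysis_report = routine(file_path="data.csv", num_rows=-1, save_results=False)

# save_results=True writes to the default "schema.json"; a string is used as the
# output path; a falsy value skips writing. The schema dict is returned either way.
schema = generate_table_schema(analysis_report, save_results="my_schema.json")
```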
csv_detective/parsing/excel.py
CHANGED
@@ -1,6 +1,5 @@
 from io import BytesIO
 from time import time
-from typing import Optional
 
 import openpyxl
 import pandas as pd
@@ -23,8 +22,8 @@ XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
 def parse_excel(
     file_path: str,
     num_rows: int = -1,
-    engine: Optional[str] = None,
-    sheet_name: Optional[str] = None,
+    engine: str | None = None,
+    sheet_name: str | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py
CHANGED
@@ -1,5 +1,4 @@
 from io import BytesIO, StringIO
-from typing import Optional, Union
 
 import pandas as pd
 import requests
@@ -26,10 +25,10 @@ from .excel import (
 def load_file(
     file_path: str,
     num_rows: int = 500,
-    encoding: Optional[str] = None,
-    sep: Optional[str] = None,
+    encoding: str | None = None,
+    sep: str | None = None,
     verbose: bool = False,
-    sheet_name: Optional[Union[str, int]] = None,
+    sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
     engine = None
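For reference, a hedged usage sketch of the updated signature (the name `file_info` for the returned dict is illustrative; its keys are not shown in this diff):

```python
from csv_detective.parsing.load import load_file

# encoding and sep are now typed as str | None and, when left as None,
# are presumably detected; sheet_name (str | int | None) applies to Excel-like files.
df, file_info = load_file("data.csv", num_rows=500, verbose=True)
```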
csv_detective/utils.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional
 
 import pandas as pd
 
@@ -31,5 +30,7 @@ def is_url(file_path: str) -> bool:
     return file_path.startswith("http")
 
 
-def
-
+def cast_prevent_nan(value: float, _type: str) -> float | int | None:
+    if _type not in {"int", "float"}:
+        raise ValueError(f"Invalid type was passed: {_type}")
+    return None if pd.isna(value) else eval(_type)(value)
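The new helper keeps NaN from leaking through numeric casts. Its behavior follows directly from the body added above:

```python
import pandas as pd

from csv_detective.utils import cast_prevent_nan

cast_prevent_nan(3.0, "int")           # 3
cast_prevent_nan(3.5, "float")         # 3.5
cast_prevent_nan(float("nan"), "int")  # None instead of NaN
cast_prevent_nan(pd.NA, "float")       # None (pd.isna also covers pandas NA)
cast_prevent_nan(3.0, "str")           # raises ValueError: Invalid type was passed: str
```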
csv_detective/validate.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional, Union
 
 import pandas as pd
 
@@ -22,12 +21,12 @@ def validate(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    encoding: Optional[str] = None,
-    sep: Optional[str] = None,
+    encoding: str | None = None,
+    sep: str | None = None,
     verbose: bool = False,
     skipna: bool = True,
-    sheet_name: Optional[Union[str, int]] = None,
-) -> tuple[bool, Optional[pd.DataFrame], Optional[dict]]:
+    sheet_name: str | int | None = None,
+) -> tuple[bool, pd.DataFrame | None, dict | None]:
     """
     Verify is the given file has the same fields and types as in the previous analysis.
     """
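A hedged sketch of calling the updated `validate` (the unpacked names are illustrative; the diff only shows that the second and third return values are a dataframe and a dict, or None):

```python
from csv_detective import routine
from csv_detective.validate import validate

previous_analysis = routine(file_path="data.csv", num_rows=-1, save_results=False)

# Checks that the file still has the same fields and types as in the previous analysis.
is_valid, df, analysis = validate("data.csv", previous_analysis=previous_analysis)
```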
{csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA
CHANGED
@@ -1,15 +1,14 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.2.dev1874
+Version: 0.9.3.dev0
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
 Project-URL: Source, https://github.com/datagouv/csv_detective
 Keywords: CSV,data processing,encoding,guess,parser,tabular
-Requires-Python: <3.14,>=3.
+Requires-Python: <3.14,>=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: boto3<2,>=1.34.0
 Requires-Dist: dateparser<2,>=1.2.0
 Requires-Dist: faust-cchardet==2.1.19
 Requires-Dist: pandas<3,>=2.2.0
@@ -26,7 +25,6 @@ Requires-Dist: rstr==3.2.2
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
-Requires-Dist: bumpx>=0.3.10; extra == "dev"
 Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file
 
@@ -221,32 +219,26 @@ ruff check --fix .
 ruff format .
 ```
 
-
+### 🏷️ Release
 
-The release process uses `
+The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.
 
-```
-
-
-
-### Process
-
-1. `bumpx` will handle bumping the version according to your command (patch, minor, major)
-2. It will update the CHANGELOG according to the new version being published
-3. It will push a tag with the given version to github
-4. CircleCI will pickup this tag, build the package and publish it to pypi
-5. `bumpx` will have everything ready for the next version (version, changelog...)
+```bash
+# Create a new release
+./tag_version.sh <version>
 
-
+# Example
+./tag_version.sh 2.5.0
 
-
-
+# Dry run to see what would happen
+./tag_version.sh 2.5.0 --dry-run
 ```
 
-
-
-This will release a patch version:
+**Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
 
-
-
-
+The script automatically:
+- Updates the version in pyproject.toml
+- Extracts commits since the last tag and formats them for CHANGELOG.md
+- Identifies breaking changes (commits with `!:` in the subject)
+- Creates a git tag and pushes it to the remote repository
+- Creates a GitHub release with the changelog content
{csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD
CHANGED
@@ -1,10 +1,9 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=
-csv_detective/load_tests.py,sha256=
-csv_detective/
-csv_detective/
-csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
+csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
+csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
+csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
+csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
 csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,10 +16,10 @@ csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=9pR2tVS4J2Kryt
 csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=5vw4zjlmWaR2djxuQOUrmwsNIc9HgAE-zdxwerVR3S0,380
 csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=UsMEW1EVVgnw-daOc1jBkEaGKvqTONSAGnj1s3QgM8w,400
 csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=YsAGiblFexBxvu_E3XaXhy_bordc6c1oKPgDzTsDeXw,374
-csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=
+csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=7ONo0MxrJY1gPWRwyPCX4ZDbCINmxnKRV85zscADxT8,435
+csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=lIgWebNapfrnPt0XeNUMs78Xa_csGNAtTk8VEk9wXXo,342
+csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=YXTWSymmcXW9eD2OfiSlmX7N-IUtZkDrNYHd6vTnJTc,439
+csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=7tZ8sgIkQ9zuSOZ-vGYBkH04Vv1xlPlJDM78xYfD57Y,342
 csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=85y-5qNRAWJrKqL0wh9iPMUBQjvPwc9lv1cYB2m0daQ,364
 csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=6mJRaGsCPBY5JHHe8EWxEjDpAOIfvBPTaZKJb3_n3gU,1077
 csv_detective/detect_fields/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,13 +49,13 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.
 csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=AnAridM4C8hcm4PeNdr8969czgrzM4KemGVZWAJSM1U,436
 csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
 csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
-csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=PI-wlTJmPk6nznzu_Fou_SSCET90wIf78mXwb1W1K70,325
 csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
-csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=B7YFfvEI48DfAn8xbc-vpVERQaKh9_59ERfieo2D6OY,328
 csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
-csv_detective/detect_fields/other/email/__init__.py,sha256=
+csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
 csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
@@ -67,9 +66,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
 csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=
+csv_detective/detect_fields/temp/date/__init__.py,sha256=j066luXADCti4Mbb-jvznrL1jf3p5TpEpVzW8vThRDE,2124
+csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=oDaZIhkW0SXSYeuK5R5TIzajvSmu-XjUn8GpqITFLnY,1250
+csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=z5wpuHiDl8j7ZeQjfZ5wO9lG6H9Ps6X218ANNw19Dag,1073
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
 csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
 csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -131,38 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
-csv_detective/detection/engine.py,sha256=
-csv_detective/detection/formats.py,sha256=
-csv_detective/detection/headers.py,sha256=
+csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
+csv_detective/detection/formats.py,sha256=QXdxdECU5uC_ytLBT_6-xe0VAiaMptXF4KYiShRUVCA,7702
+csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
-csv_detective/output/__init__.py,sha256=
-csv_detective/output/dataframe.py,sha256=
-csv_detective/output/example.py,sha256=
-csv_detective/output/profile.py,sha256=
-csv_detective/output/schema.py,sha256=
+csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
+csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
+csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
+csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
+csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
-csv_detective/parsing/excel.py,sha256=
-csv_detective/parsing/load.py,sha256
+csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
+csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.
+csv_detective-0.9.3.dev0.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
+tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
 tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
-venv/bin/activate_this.py,sha256=
-venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
+venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
+csv_detective-0.9.3.dev0.dist-info/METADATA,sha256=Xga9fj8KjfrMOhp5ZIoXsJLcAI2Jz31yNsdfFJca2DU,9928
+csv_detective-0.9.3.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev0.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev0.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev0.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -79,6 +79,7 @@ from csv_detective.detection.variables import (
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.dataframe import cast
 from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col as col_test  # to prevent pytest from testing it
 
 
 def test_all_tests_return_bool():
@@ -284,7 +285,7 @@ fields = {
         False: ["nein", "ja", "2", "-0"],
     },
     email: {
-        True: ["cdo_intern@data.gouv.fr"],
+        True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
         False: ["cdo@@gouv.sfd"],
     },
     json: {
@@ -356,17 +357,25 @@ fields = {
         True: [
             "2021-06-22 10:20:10-04:00",
             "2030-06-22 00:00:00.0028+02:00",
+            "2000-12-21 10:20:10.1Z",
             "2024-12-19T10:53:36.428000+00:00",
             "1996/06/22 10:20:10 GMT",
         ],
         False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
     },
     datetime_naive: {
-        True: [
+        True: [
+            "2021-06-22 10:20:10",
+            "2030/06-22 00:00:00",
+            "2030/06/22 00:00:00.0028",
+        ],
         False: [
             "2021-06-22T30:20:10",
             "Sun, 06 Nov 1994 08:49:37 GMT",
             "2021-06-44 10:20:10+02:00",
+            "1999-12-01T00:00:00Z",
+            "2021-06-44",
+            "15 décembre 1985",
        ],
     },
     datetime_rfc822: {
@@ -451,8 +460,8 @@ def test_priority(args):
         ("28/01/2000", date),
         ("2025-08-20T14:30:00+02:00", datetime_aware),
         ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
-        ("1925_12_20T14:30:00.
-        ("1925 12 20 14:30:00Z",
+        ("1925_12_20T14:30:00.2763", datetime_naive),
+        ("1925 12 20 14:30:00Z", datetime_aware),
     ),
 )
 def test_early_detection(args):
@@ -461,3 +470,27 @@ def test_early_detection(args):
     res = module._is(value)
     assert res
     mock_func.assert_not_called()
+
+
+def test_all_proportion_1():
+    all_tests = return_all_tests("ALL", "detect_fields")
+    prop_1 = {
+        t.__name__.split(".")[-1]: eval(
+            t.__name__.split(".")[-1]
+            if t.__name__.split(".")[-1] not in ["int", "float"]
+            else "test_" + t.__name__.split(".")[-1]
+        )
+        for t in all_tests
+        if t.PROPORTION == 1
+    }
+    # building a table that uses only correct values for these formats, except on one row
+    table = pd.DataFrame(
+        {
+            test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"]
+            for test_name, test_module in prop_1.items()
+        }
+    )
+    # testing columns for all formats
+    returned_table = col_test(table, all_tests, limited_output=True)
+    # the analysis should have found no match on any format
+    assert all(returned_table[col].sum() == 0 for col in table.columns)
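One detail worth noting in this file's first hunk: pytest collects module-level callables whose names start with `test_`, so `test_col` is imported under the alias `col_test` to keep the helper from being run as a test. A minimal illustration of the pattern (import from this diff, test body hypothetical):

```python
# Importing under the real name would make pytest treat test_col as a test case.
from csv_detective.parsing.columns import test_col as col_test


def test_uses_the_helper():
    # col_test can now be called freely inside real tests.
    assert callable(col_test)
```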
tests/test_file.py
CHANGED
@@ -5,6 +5,8 @@ import pytest
 import responses
 
 from csv_detective import routine
+from csv_detective.output.profile import create_profile
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
 
 
 @pytest.mark.parametrize(
@@ -97,6 +99,55 @@ def test_profile_with_num_rows():
 )
 
 
+@pytest.mark.parametrize(
+    "params",
+    (
+        (
+            True,
+            {
+                "int_with_nan": {"format": "int", "python_type": "int"},
+                "date": {"format": "date", "python_type": "date"},
+            },
+        ),
+        (
+            False,
+            {
+                "int_with_nan": [{"format": "int", "python_type": "int"}],
+                "date": [{"format": "date", "python_type": "date"}],
+            },
+        ),
+    ),
+)
+def test_profile_specific_cases(params):
+    limited_output, columns = params
+    table = pd.DataFrame(
+        {
+            "int_with_nan": ["1", pd.NA, pd.NA],
+            "date": ["1996-01-02", "1996-01-02", "2024-11-12"],
+        }
+    )
+    profile = create_profile(
+        table=table,
+        columns=columns,
+        limited_output=limited_output,
+        num_rows=-1,
+    )
+    assert profile["int_with_nan"] == {
+        "min": 1,
+        "max": 1,
+        "mean": 1,
+        "std": None,
+        "tops": [{"count": 1, "value": "1"}],
+        "nb_distinct": 1,
+        "nb_missing_values": 2,
+    }
+    assert profile["date"] == {
+        "tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
+        "nb_distinct": 2,
+        "nb_missing_values": 0,
+    }
+
+
 def test_exception_different_number_of_columns():
     """
     A ValueError should be raised if the number of columns differs between the first rows
@@ -293,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
         save_results=False,
     )
     assert analysis["columns"][col_name]["format"] == "int"
+
+
+def test_full_nan_column(mocked_responses):
+    # we want a file that needs sampling
+    expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
+    mocked_responses.get(
+        "http://example.com/test.csv",
+        body=expected_content,
+        status=200,
+    )
+    # just testing it doesn't fail
+    routine(
+        file_path="http://example.com/test.csv",
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+    )
venv/bin/activate_this.py
CHANGED
@@ -29,7 +29,7 @@ os.environ["VIRTUAL_ENV_PROMPT"] = '' or os.path.basename(base)
 
 # add the virtual environments libraries to the host python import mechanism
 prev_length = len(sys.path)
-for lib in '../lib/python3.
+for lib in '../lib/python3.11/site-packages'.split(os.pathsep):
     path = os.path.realpath(os.path.join(bin_dir, lib))
     site.addsitedir(path.decode("utf-8") if '' else path)
 sys.path[:] = sys.path[prev_length:] + sys.path[0:prev_length]
csv_detective/s3_utils.py
DELETED
@@ -1,44 +0,0 @@
-import logging
-
-import boto3
-from botocore.client import Config
-from botocore.exceptions import ClientError
-
-
-def get_minio_url(netloc: str, bucket: str, key: str) -> str:
-    """Returns location of given resource in minio once it is saved"""
-    return netloc + "/" + bucket + "/" + key
-
-
-def get_s3_client(url: str, minio_user: str, minio_pwd: str) -> boto3.client:
-    return boto3.client(
-        "s3",
-        endpoint_url=url,
-        aws_access_key_id=minio_user,
-        aws_secret_access_key=minio_pwd,
-        config=Config(signature_version="s3v4"),
-    )
-
-
-def download_from_minio(
-    netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
-) -> None:
-    logging.info("Downloading from minio")
-    s3 = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        s3.download_file(bucket, key, filepath)
-        logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
-    except ClientError as e:
-        logging.error(e)
-
-
-def upload_to_minio(
-    netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
-) -> None:
-    logging.info("Saving to minio")
-    s3 = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        s3.upload_file(filepath, bucket, key)
-        logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
-    except ClientError as e:
-        logging.error(e)