csv-detective 0.8.1.dev1500__py3-none-any.whl → 0.8.1.dev1526__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/__init__.py +1 -1
- csv_detective/detect_fields/temp/date/__init__.py +1 -1
- csv_detective/detect_fields/temp/datetime_aware/__init__.py +21 -0
- csv_detective/detect_fields/temp/{datetime → datetime_naive}/__init__.py +7 -5
- csv_detective/detection/formats.py +2 -1
- csv_detective/output/schema.py +2 -0
- csv_detective/output/utils.py +4 -0
- csv_detective/parsing/columns.py +1 -1
- {csv_detective-0.8.1.dev1500.data → csv_detective-0.8.1.dev1526.data}/data/share/csv_detective/CHANGELOG.md +2 -1
- {csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/RECORD +19 -18
- tests/test_fields.py +17 -5
- tests/test_file.py +4 -5
- {csv_detective-0.8.1.dev1500.data → csv_detective-0.8.1.dev1526.data}/data/share/csv_detective/LICENSE +0 -0
- {csv_detective-0.8.1.dev1500.data → csv_detective-0.8.1.dev1526.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
|
|
3
|
+
from csv_detective.detect_fields.temp.date import date_casting
|
|
4
|
+
|
|
5
|
+
PROPORTION = 1
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _is(val: Optional[Any]) -> bool:
|
|
9
|
+
"""Detects timezone-aware datetimes only"""
|
|
10
|
+
# early stops, to cut processing time
|
|
11
|
+
if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
|
|
12
|
+
return False
|
|
13
|
+
threshold = 0.7
|
|
14
|
+
if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
|
|
15
|
+
return False
|
|
16
|
+
res = date_casting(val)
|
|
17
|
+
return (
|
|
18
|
+
res is not None
|
|
19
|
+
and bool(res.hour or res.minute or res.second or res.microsecond)
|
|
20
|
+
and bool(res.tzinfo)
|
|
21
|
+
)
|
|
@@ -6,14 +6,16 @@ PROPORTION = 1
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def _is(val: Optional[Any]) -> bool:
|
|
9
|
-
|
|
9
|
+
"""Detects naive datetimes only"""
|
|
10
10
|
# early stops, to cut processing time
|
|
11
11
|
if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
|
|
12
12
|
return False
|
|
13
13
|
threshold = 0.7
|
|
14
|
-
if sum([char.isdigit() for char in val]) / len(val) < threshold:
|
|
14
|
+
if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
|
|
15
15
|
return False
|
|
16
16
|
res = date_casting(val)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
return (
|
|
18
|
+
res is not None
|
|
19
|
+
and bool(res.hour or res.minute or res.second or res.microsecond)
|
|
20
|
+
and not bool(res.tzinfo)
|
|
21
|
+
)
|
|
@@ -106,8 +106,9 @@ def detect_formats(
|
|
|
106
106
|
"string": "string",
|
|
107
107
|
"json": "json",
|
|
108
108
|
"json_geojson": "json",
|
|
109
|
-
"datetime": "datetime",
|
|
109
|
+
"datetime_aware": "datetime",
|
|
110
110
|
"datetime_iso": "datetime",
|
|
111
|
+
"datetime_naive": "datetime",
|
|
111
112
|
"datetime_rfc822": "datetime",
|
|
112
113
|
"date": "date",
|
|
113
114
|
"latitude": "float",
|
csv_detective/output/schema.py
CHANGED
|
@@ -106,7 +106,9 @@ def get_validata_type(format: str) -> str:
|
|
|
106
106
|
"float": "number",
|
|
107
107
|
"string": "string",
|
|
108
108
|
"date": "date",
|
|
109
|
+
"datetime_aware": "datetime",
|
|
109
110
|
"datetime_iso": "datetime",
|
|
111
|
+
"datetime_naive": "datetime",
|
|
110
112
|
"datetime_rfc822": "datetime",
|
|
111
113
|
"json_geojson": "geojson",
|
|
112
114
|
"latitude": "number",
|
csv_detective/output/utils.py
CHANGED
|
@@ -34,6 +34,10 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
|
|
|
34
34
|
formats_to_remove.add("longitude_l93")
|
|
35
35
|
if "code_region" in formats_detected:
|
|
36
36
|
formats_to_remove.add("code_departement")
|
|
37
|
+
if "datetime_iso" in formats_detected:
|
|
38
|
+
formats_to_remove.add("datetime_naive")
|
|
39
|
+
if "datetime_rfc822" in formats_detected:
|
|
40
|
+
formats_to_remove.add("datetime_aware")
|
|
37
41
|
|
|
38
42
|
formats_to_keep = formats_detected - formats_to_remove
|
|
39
43
|
|
csv_detective/parsing/columns.py
CHANGED
|
@@ -5,8 +5,9 @@
|
|
|
5
5
|
- Refactor label testing [#119](https://github.com/datagouv/csv-detective/pull/119)
|
|
6
6
|
- Refactor repo metadata and requirements [#120](https://github.com/datagouv/csv-detective/pull/120) [#122](https://github.com/datagouv/csv-detective/pull/122)
|
|
7
7
|
- Better URL detection [#121](https://github.com/datagouv/csv-detective/pull/121)
|
|
8
|
-
- For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124)
|
|
8
|
+
- For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124) [#129](https://github.com/datagouv/csv-detective/pull/129)
|
|
9
9
|
- Fix imports [#125](https://github.com/datagouv/csv-detective/pull/125) [#126](https://github.com/datagouv/csv-detective/pull/126) [#127](https://github.com/datagouv/csv-detective/pull/127) [#128](https://github.com/datagouv/csv-detective/pull/128)
|
|
10
|
+
- Split aware and naive datetimes for hydra to cast them separately [#130](https://github.com/datagouv/csv-detective/pull/130)
|
|
10
11
|
|
|
11
12
|
## 0.8.0 (2025-05-20)
|
|
12
13
|
|
|
@@ -5,7 +5,7 @@ csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2
|
|
|
5
5
|
csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
|
|
6
6
|
csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
|
|
7
7
|
csv_detective/validate.py,sha256=d_4Phmjk6Y0Z0YYVw4vpoZy8E79K370reGgkpzx1mcQ,2644
|
|
8
|
-
csv_detective/detect_fields/__init__.py,sha256=
|
|
8
|
+
csv_detective/detect_fields/__init__.py,sha256=HYSy0P_aH6R8Z8Hvd8aMaBAQaZ1QwcsWHT0YPm0iYs0,998
|
|
9
9
|
csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
11
|
csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=NqV8ULf9gY9iFnA1deKR-1Yobr96WwCsn5JfbP_MjiY,1675
|
|
@@ -66,9 +66,10 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
|
|
|
66
66
|
csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
|
|
67
67
|
csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
|
|
68
68
|
csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
69
|
-
csv_detective/detect_fields/temp/date/__init__.py,sha256=
|
|
70
|
-
csv_detective/detect_fields/temp/
|
|
69
|
+
csv_detective/detect_fields/temp/date/__init__.py,sha256=VC4_C5lQbjqTweC4T2p9GZAIO64zERhAuf53CPfXgw4,983
|
|
70
|
+
csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=Xi3fWiqm_S09AaMeHVrgx6bSieX1gEdjjM7GYsKqEx8,667
|
|
71
71
|
csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
|
|
72
|
+
csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=q5Ow1yH9nCz8aY4uOHIKv8CCYIEPLUZlHzg8Nr59kBo,662
|
|
72
73
|
csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
|
|
73
74
|
csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
|
|
74
75
|
csv_detective/detect_labels/__init__.py,sha256=BJjWlwTnnDe9nomABDUreu9EMu6IFG3T47d7YCJZbRc,878
|
|
@@ -131,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
|
|
|
131
132
|
csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
|
|
132
133
|
csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
|
|
133
134
|
csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
|
|
134
|
-
csv_detective/detection/formats.py,sha256=
|
|
135
|
+
csv_detective/detection/formats.py,sha256=LDrstnAJccDeOEvGbWA5Ppx4gdlJrKbqd7qqWRG2tHI,6382
|
|
135
136
|
csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
|
|
136
137
|
csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
|
|
137
138
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
@@ -140,28 +141,28 @@ csv_detective/output/__init__.py,sha256=5KTevPfp_4MRxByJyOntQjToNfeG7dPQn-_13wSq
|
|
|
140
141
|
csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
|
|
141
142
|
csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5fA,8649
|
|
142
143
|
csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
|
|
143
|
-
csv_detective/output/schema.py,sha256=
|
|
144
|
-
csv_detective/output/utils.py,sha256=
|
|
144
|
+
csv_detective/output/schema.py,sha256=WxgajFuLfUTQQtmEdlO8ve2ULDzw2BYfz8QFwUsdDh0,13558
|
|
145
|
+
csv_detective/output/utils.py,sha256=qFYhxJmkKrTUefdH7Owh-liZijswomCafic4cXYSyCg,2506
|
|
145
146
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
146
|
-
csv_detective/parsing/columns.py,sha256=
|
|
147
|
+
csv_detective/parsing/columns.py,sha256=VzgG9Nwph5C_fLW_TuQC5BZVlPmOyjrH7Plvm_c8kWc,5675
|
|
147
148
|
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
148
149
|
csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,1626
|
|
149
150
|
csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
|
|
150
151
|
csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
|
|
151
152
|
csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
|
|
152
|
-
csv_detective-0.8.1.
|
|
153
|
-
csv_detective-0.8.1.
|
|
154
|
-
csv_detective-0.8.1.
|
|
155
|
-
csv_detective-0.8.1.
|
|
153
|
+
csv_detective-0.8.1.dev1526.data/data/share/csv_detective/CHANGELOG.md,sha256=QBkuYfCNZtm-waJYz1YEITwR8kCMDKKZH6-ef7oj8tQ,9161
|
|
154
|
+
csv_detective-0.8.1.dev1526.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
155
|
+
csv_detective-0.8.1.dev1526.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
|
|
156
|
+
csv_detective-0.8.1.dev1526.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
156
157
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
157
158
|
tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
|
|
158
|
-
tests/test_fields.py,sha256=
|
|
159
|
-
tests/test_file.py,sha256=
|
|
159
|
+
tests/test_fields.py,sha256=zeEQbHs0ougLzydmZLZs1l2UdrhKBEtdCCK64B4dhSU,10700
|
|
160
|
+
tests/test_file.py,sha256=0bHV9wx9mSRoav_DVF19g694yohb1p0bw7rtcBeKG-8,8451
|
|
160
161
|
tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
|
|
161
162
|
tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
|
|
162
163
|
tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
|
|
163
|
-
csv_detective-0.8.1.
|
|
164
|
-
csv_detective-0.8.1.
|
|
165
|
-
csv_detective-0.8.1.
|
|
166
|
-
csv_detective-0.8.1.
|
|
167
|
-
csv_detective-0.8.1.
|
|
164
|
+
csv_detective-0.8.1.dev1526.dist-info/METADATA,sha256=6w8386meaPhTcYjmslsOqjkqvpLPZme5ikCsx7zJizo,10443
|
|
165
|
+
csv_detective-0.8.1.dev1526.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
166
|
+
csv_detective-0.8.1.dev1526.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
167
|
+
csv_detective-0.8.1.dev1526.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
168
|
+
csv_detective-0.8.1.dev1526.dist-info/RECORD,,
|
tests/test_fields.py
CHANGED
|
@@ -58,7 +58,14 @@ from csv_detective.detect_fields.other import (
|
|
|
58
58
|
int as test_int,
|
|
59
59
|
float as test_float,
|
|
60
60
|
)
|
|
61
|
-
from csv_detective.detect_fields.temp import
|
|
61
|
+
from csv_detective.detect_fields.temp import (
|
|
62
|
+
date,
|
|
63
|
+
datetime_aware,
|
|
64
|
+
datetime_iso,
|
|
65
|
+
datetime_naive,
|
|
66
|
+
datetime_rfc822,
|
|
67
|
+
year,
|
|
68
|
+
)
|
|
62
69
|
from csv_detective.detection.variables import (
|
|
63
70
|
detect_continuous_variable,
|
|
64
71
|
detect_categorical_variable,
|
|
@@ -70,7 +77,7 @@ from csv_detective.output.dataframe import cast
|
|
|
70
77
|
def test_all_tests_return_bool():
|
|
71
78
|
all_tests = return_all_tests("ALL", "detect_fields")
|
|
72
79
|
for test in all_tests:
|
|
73
|
-
for tmp in ["a", "9", "3.14", "[]", float("nan")]:
|
|
80
|
+
for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
|
|
74
81
|
assert isinstance(test._is(tmp), bool)
|
|
75
82
|
|
|
76
83
|
|
|
@@ -337,9 +344,13 @@ fields = {
|
|
|
337
344
|
"02052003",
|
|
338
345
|
],
|
|
339
346
|
},
|
|
340
|
-
|
|
341
|
-
True: ["2021-06-
|
|
342
|
-
False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT"],
|
|
347
|
+
datetime_aware: {
|
|
348
|
+
True: ["2021-06-22 10:20:10-04:00", "2030-06-22 00:00:00.0028+02:00", "1996/06/22 10:20:10 GMT"],
|
|
349
|
+
False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
|
|
350
|
+
},
|
|
351
|
+
datetime_naive: {
|
|
352
|
+
True: ["2021-06-22 10:20:10", "2030/06/22 00:00:00.0028"],
|
|
353
|
+
False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10+02:00"],
|
|
343
354
|
},
|
|
344
355
|
datetime_iso: {
|
|
345
356
|
True: ["2021-06-22T10:20:10"],
|
|
@@ -388,6 +399,7 @@ def test_fields_with_values(args):
|
|
|
388
399
|
('{"a": 1}', "json", dict),
|
|
389
400
|
("2022-08-01", "date", _date),
|
|
390
401
|
("2024-09-23 17:32:07", "datetime", _datetime),
|
|
402
|
+
("2024-09-23 17:32:07+02:00", "datetime", _datetime),
|
|
391
403
|
),
|
|
392
404
|
)
|
|
393
405
|
def test_cast(args):
|
tests/test_file.py
CHANGED
|
@@ -7,12 +7,11 @@ from csv_detective import routine
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
@pytest.mark.parametrize(
|
|
10
|
-
"patched",
|
|
11
|
-
(
|
|
10
|
+
"max_rows_analysis",
|
|
11
|
+
(100, int(1e5)),
|
|
12
12
|
)
|
|
13
|
-
def test_columns_output_on_file(patched):
|
|
14
|
-
|
|
15
|
-
with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", patched):
|
|
13
|
+
def test_columns_output_on_file(max_rows_analysis):
|
|
14
|
+
with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", max_rows_analysis):
|
|
16
15
|
output = routine(
|
|
17
16
|
file_path="tests/data/a_test_file.csv",
|
|
18
17
|
num_rows=-1,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/top_level.txt
RENAMED
|
File without changes
|