csv-detective 0.8.1.dev1509__py3-none-any.whl → 0.8.1.dev1526__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -57,4 +57,4 @@ from .geo import (
57
57
  )
58
58
 
59
59
  from .FR.temp import jour_de_la_semaine, mois_de_annee
60
- from .temp import year, date, datetime, datetime_iso, datetime_rfc822
60
+ from .temp import year, date, datetime_aware, datetime_iso, datetime_naive, datetime_rfc822
@@ -14,7 +14,7 @@ def date_casting(val: str) -> Optional[datetime]:
14
14
  return dateutil_parser(val)
15
15
  except ParserError:
16
16
  return date_parser(val)
17
- except OverflowError:
17
+ except Exception:
18
18
  return None
19
19
 
20
20
 
@@ -0,0 +1,21 @@
1
+ from typing import Any, Optional
2
+
3
+ from csv_detective.detect_fields.temp.date import date_casting
4
+
5
+ PROPORTION = 1
6
+
7
+
8
+ def _is(val: Optional[Any]) -> bool:
9
+ """Detects timezone-aware datetimes only"""
10
+ # early stops, to cut processing time
11
+ if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
12
+ return False
13
+ threshold = 0.7
14
+ if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
15
+ return False
16
+ res = date_casting(val)
17
+ return (
18
+ res is not None
19
+ and bool(res.hour or res.minute or res.second or res.microsecond)
20
+ and bool(res.tzinfo)
21
+ )
@@ -6,14 +6,16 @@ PROPORTION = 1
6
6
 
7
7
 
8
8
  def _is(val: Optional[Any]) -> bool:
9
- '''Renvoie True si val peut être un datetime, False sinon'''
9
+ """Detects naive datetimes only"""
10
10
  # early stops, to cut processing time
11
11
  if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
12
12
  return False
13
13
  threshold = 0.7
14
- if sum([char.isdigit() for char in val]) / len(val) < threshold:
14
+ if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
15
15
  return False
16
16
  res = date_casting(val)
17
- if res and (res.hour or res.minute or res.second):
18
- return True
19
- return False
17
+ return (
18
+ res is not None
19
+ and bool(res.hour or res.minute or res.second or res.microsecond)
20
+ and not bool(res.tzinfo)
21
+ )
@@ -106,8 +106,9 @@ def detect_formats(
106
106
  "string": "string",
107
107
  "json": "json",
108
108
  "json_geojson": "json",
109
- "datetime": "datetime",
109
+ "datetime_aware": "datetime",
110
110
  "datetime_iso": "datetime",
111
+ "datetime_naive": "datetime",
111
112
  "datetime_rfc822": "datetime",
112
113
  "date": "date",
113
114
  "latitude": "float",
@@ -106,7 +106,9 @@ def get_validata_type(format: str) -> str:
106
106
  "float": "number",
107
107
  "string": "string",
108
108
  "date": "date",
109
+ "datetime_aware": "datetime",
109
110
  "datetime_iso": "datetime",
111
+ "datetime_naive": "datetime",
110
112
  "datetime_rfc822": "datetime",
111
113
  "json_geojson": "geojson",
112
114
  "latitude": "number",
@@ -34,6 +34,10 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
34
34
  formats_to_remove.add("longitude_l93")
35
35
  if "code_region" in formats_detected:
36
36
  formats_to_remove.add("code_departement")
37
+ if "datetime_iso" in formats_detected:
38
+ formats_to_remove.add("datetime_naive")
39
+ if "datetime_rfc822" in formats_detected:
40
+ formats_to_remove.add("datetime_aware")
37
41
 
38
42
  formats_to_keep = formats_detected - formats_to_remove
39
43
 
@@ -7,6 +7,7 @@
7
7
  - Better URL detection [#121](https://github.com/datagouv/csv-detective/pull/121)
8
8
  - For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124) [#129](https://github.com/datagouv/csv-detective/pull/129)
9
9
  - Fix imports [#125](https://github.com/datagouv/csv-detective/pull/125) [#126](https://github.com/datagouv/csv-detective/pull/126) [#127](https://github.com/datagouv/csv-detective/pull/127) [#128](https://github.com/datagouv/csv-detective/pull/128)
10
+ - Split aware and naive datetimes for hydra to cast them separately [#130](https://github.com/datagouv/csv-detective/pull/130)
10
11
 
11
12
  ## 0.8.0 (2025-05-20)
12
13
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv_detective
3
- Version: 0.8.1.dev1509
3
+ Version: 0.8.1.dev1526
4
4
  Summary: Detect tabular files column content
5
5
  Home-page: https://github.com/datagouv/csv_detective
6
6
  Author: Etalab
@@ -5,7 +5,7 @@ csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2
5
5
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
6
6
  csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
7
7
  csv_detective/validate.py,sha256=d_4Phmjk6Y0Z0YYVw4vpoZy8E79K370reGgkpzx1mcQ,2644
8
- csv_detective/detect_fields/__init__.py,sha256=7Tz0Niaz0BboA3YVsp_6WPA6ywciwDN4-lOy_Ie_0Y8,976
8
+ csv_detective/detect_fields/__init__.py,sha256=HYSy0P_aH6R8Z8Hvd8aMaBAQaZ1QwcsWHT0YPm0iYs0,998
9
9
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=NqV8ULf9gY9iFnA1deKR-1Yobr96WwCsn5JfbP_MjiY,1675
@@ -66,9 +66,10 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
66
66
  csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
67
67
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
68
68
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
- csv_detective/detect_fields/temp/date/__init__.py,sha256=1a_Ra9fmT4wgGMrcknXP7eN7A2QiaMF0Yjy0-BMihtA,987
70
- csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
69
+ csv_detective/detect_fields/temp/date/__init__.py,sha256=VC4_C5lQbjqTweC4T2p9GZAIO64zERhAuf53CPfXgw4,983
70
+ csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=Xi3fWiqm_S09AaMeHVrgx6bSieX1gEdjjM7GYsKqEx8,667
71
71
  csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
72
+ csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=q5Ow1yH9nCz8aY4uOHIKv8CCYIEPLUZlHzg8Nr59kBo,662
72
73
  csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
73
74
  csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
74
75
  csv_detective/detect_labels/__init__.py,sha256=BJjWlwTnnDe9nomABDUreu9EMu6IFG3T47d7YCJZbRc,878
@@ -131,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
131
132
  csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
132
133
  csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
133
134
  csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
134
- csv_detective/detection/formats.py,sha256=5ZW7gmhyQt6BB7xLcVVhui17oGn1udAWI9w22EAOHy4,6337
135
+ csv_detective/detection/formats.py,sha256=LDrstnAJccDeOEvGbWA5Ppx4gdlJrKbqd7qqWRG2tHI,6382
135
136
  csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
136
137
  csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
137
138
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -140,8 +141,8 @@ csv_detective/output/__init__.py,sha256=5KTevPfp_4MRxByJyOntQjToNfeG7dPQn-_13wSq
140
141
  csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
141
142
  csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5fA,8649
142
143
  csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
143
- csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
144
- csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
144
+ csv_detective/output/schema.py,sha256=WxgajFuLfUTQQtmEdlO8ve2ULDzw2BYfz8QFwUsdDh0,13558
145
+ csv_detective/output/utils.py,sha256=qFYhxJmkKrTUefdH7Owh-liZijswomCafic4cXYSyCg,2506
145
146
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
147
  csv_detective/parsing/columns.py,sha256=VzgG9Nwph5C_fLW_TuQC5BZVlPmOyjrH7Plvm_c8kWc,5675
147
148
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
@@ -149,19 +150,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
149
150
  csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
150
151
  csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
151
152
  csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
152
- csv_detective-0.8.1.dev1509.data/data/share/csv_detective/CHANGELOG.md,sha256=em97ZkXiZcpRQnOj7zCgZRtGL6sbaQvyPN6C1UvT4Dk,9034
153
- csv_detective-0.8.1.dev1509.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
154
- csv_detective-0.8.1.dev1509.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
155
- csv_detective-0.8.1.dev1509.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
+ csv_detective-0.8.1.dev1526.data/data/share/csv_detective/CHANGELOG.md,sha256=QBkuYfCNZtm-waJYz1YEITwR8kCMDKKZH6-ef7oj8tQ,9161
154
+ csv_detective-0.8.1.dev1526.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
155
+ csv_detective-0.8.1.dev1526.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
156
+ csv_detective-0.8.1.dev1526.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
156
157
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
157
158
  tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
158
- tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
159
+ tests/test_fields.py,sha256=zeEQbHs0ougLzydmZLZs1l2UdrhKBEtdCCK64B4dhSU,10700
159
160
  tests/test_file.py,sha256=0bHV9wx9mSRoav_DVF19g694yohb1p0bw7rtcBeKG-8,8451
160
161
  tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
161
162
  tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
162
163
  tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
163
- csv_detective-0.8.1.dev1509.dist-info/METADATA,sha256=V_xchinM2b_RyUVCxSmEB4UA66Q0IdIjkMfSi2bwf6E,10443
164
- csv_detective-0.8.1.dev1509.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
165
- csv_detective-0.8.1.dev1509.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
166
- csv_detective-0.8.1.dev1509.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
167
- csv_detective-0.8.1.dev1509.dist-info/RECORD,,
164
+ csv_detective-0.8.1.dev1526.dist-info/METADATA,sha256=6w8386meaPhTcYjmslsOqjkqvpLPZme5ikCsx7zJizo,10443
165
+ csv_detective-0.8.1.dev1526.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ csv_detective-0.8.1.dev1526.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
+ csv_detective-0.8.1.dev1526.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
168
+ csv_detective-0.8.1.dev1526.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -58,7 +58,14 @@ from csv_detective.detect_fields.other import (
58
58
  int as test_int,
59
59
  float as test_float,
60
60
  )
61
- from csv_detective.detect_fields.temp import date, datetime, datetime_iso, datetime_rfc822, year
61
+ from csv_detective.detect_fields.temp import (
62
+ date,
63
+ datetime_aware,
64
+ datetime_iso,
65
+ datetime_naive,
66
+ datetime_rfc822,
67
+ year,
68
+ )
62
69
  from csv_detective.detection.variables import (
63
70
  detect_continuous_variable,
64
71
  detect_categorical_variable,
@@ -70,7 +77,7 @@ from csv_detective.output.dataframe import cast
70
77
  def test_all_tests_return_bool():
71
78
  all_tests = return_all_tests("ALL", "detect_fields")
72
79
  for test in all_tests:
73
- for tmp in ["a", "9", "3.14", "[]", float("nan")]:
80
+ for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
74
81
  assert isinstance(test._is(tmp), bool)
75
82
 
76
83
 
@@ -337,9 +344,13 @@ fields = {
337
344
  "02052003",
338
345
  ],
339
346
  },
340
- datetime: {
341
- True: ["2021-06-22T10:20:10"],
342
- False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT"],
347
+ datetime_aware: {
348
+ True: ["2021-06-22 10:20:10-04:00", "2030-06-22 00:00:00.0028+02:00", "1996/06/22 10:20:10 GMT"],
349
+ False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
350
+ },
351
+ datetime_naive: {
352
+ True: ["2021-06-22 10:20:10", "2030/06/22 00:00:00.0028"],
353
+ False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10+02:00"],
343
354
  },
344
355
  datetime_iso: {
345
356
  True: ["2021-06-22T10:20:10"],
@@ -388,6 +399,7 @@ def test_fields_with_values(args):
388
399
  ('{"a": 1}', "json", dict),
389
400
  ("2022-08-01", "date", _date),
390
401
  ("2024-09-23 17:32:07", "datetime", _datetime),
402
+ ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
391
403
  ),
392
404
  )
393
405
  def test_cast(args):