csv-detective 0.9.1.dev1801__py3-none-any.whl → 0.9.1.dev1830__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ import re
1
2
  from datetime import datetime
2
3
  from typing import Optional
3
4
 
@@ -19,6 +20,23 @@ def date_casting(val: str) -> Optional[datetime]:
19
20
  return None
20
21
 
21
22
 
23
+ seps = r"[\s/\-\*_\|;.,]"
24
+ # matches JJ-MM-AAAA with any of the listed separators
25
+ jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
26
+ "SEP", seps
27
+ )
28
+ # matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
29
+ aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
30
+ "SEP", seps + "?"
31
+ )
32
+ # matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
33
+ string_month_pattern = (
34
+ r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
35
+ r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
36
+ r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP"
37
+ r"([0-9]{2}$|(19|20)[0-9]{2}$)"
38
+ ).replace("SEP", seps + "?")
39
+
22
40
  threshold = 0.3
23
41
 
24
42
 
@@ -27,6 +45,16 @@ def _is(val):
27
45
  # early stops, to cut processing time
28
46
  if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
29
47
  return False
48
+ # if it's a usual date pattern
49
+ if any(
50
+ # the `or` chain short-circuits: as soon as one pattern matches, the remaining ones are not evaluated
51
+ [
52
+ bool(re.match(jjmmaaaa_pattern, val))
53
+ or bool(re.match(aaaammjj_pattern, val))
54
+ or bool(re.match(string_month_pattern, val, re.IGNORECASE))
55
+ ]
56
+ ):
57
+ return True
30
58
  if sum([char.isdigit() for char in val]) / len(val) < threshold:
31
59
  return False
32
60
  res = date_casting(val)
@@ -1,8 +1,16 @@
1
+ import re
1
2
  from typing import Any, Optional
2
3
 
3
- from csv_detective.detect_fields.temp.date import date_casting
4
+ from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
4
5
 
5
6
  PROPORTION = 1
7
+ threshold = 0.7
8
+
9
+ # matches AAAA-MM-JJTHH:MM:SS(.dddddd)±HH:MM with any of the listed separators for the date OR NO SEPARATOR
10
+ pat = (
11
+ aaaammjj_pattern.replace("$", "")
12
+ + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?[+-](0\d|1[0-9]|2[0-3]):([0-5][0-9])$"
13
+ )
6
14
 
7
15
 
8
16
  def _is(val: Optional[Any]) -> bool:
@@ -12,7 +20,9 @@ def _is(val: Optional[Any]) -> bool:
12
20
  # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
13
21
  if not isinstance(val, str) or len(val) > 35 or len(val) < 21:
14
22
  return False
15
- threshold = 0.7
23
+ # if usual format, no need to parse
24
+ if bool(re.match(pat, val)):
25
+ return True
16
26
  if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
17
27
  return False
18
28
  res = date_casting(val)
@@ -1,8 +1,16 @@
1
+ import re
1
2
  from typing import Any, Optional
2
3
 
3
- from csv_detective.detect_fields.temp.date import date_casting
4
+ from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
4
5
 
5
6
  PROPORTION = 1
7
+ threshold = 0.7
8
+
9
+ # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
10
+ pat = (
11
+ aaaammjj_pattern.replace("$", "")
12
+ + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?Z$"
13
+ )
6
14
 
7
15
 
8
16
  def _is(val: Optional[Any]) -> bool:
@@ -12,7 +20,9 @@ def _is(val: Optional[Any]) -> bool:
12
20
  # 26 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd, keeping some slack
13
21
  if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
14
22
  return False
15
- threshold = 0.7
23
+ # if usual format, no need to parse
24
+ if bool(re.match(pat, val)):
25
+ return True
16
26
  if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
17
27
  return False
18
28
  res = date_casting(val)
@@ -33,27 +33,23 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datet
33
33
  def cast_df(
34
34
  df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
35
35
  ) -> pd.DataFrame:
36
+ # for efficiency this modifies the dataframe in place as we don't need it anymore afterwards
36
37
  if verbose:
37
38
  start = time()
38
- output_df = pd.DataFrame()
39
39
  for col_name, detection in columns.items():
40
40
  if detection["python_type"] == "string" or (
41
41
  detection["python_type"] == "json" and not cast_json
42
42
  ):
43
43
  # no change if detected type is string
44
- output_df[col_name] = df[col_name].copy()
44
+ continue
45
45
  elif detection["python_type"] == "int":
46
46
  # to allow having ints and NaN in the same column
47
- output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
47
+ df[col_name] = df[col_name].astype(pd.Int64Dtype())
48
48
  else:
49
- output_df[col_name] = df[col_name].apply(
50
- lambda col: cast(col, _type=detection["python_type"])
51
- )
52
- # to save RAM
53
- del df[col_name]
49
+ df[col_name] = df[col_name].apply(lambda col: cast(col, _type=detection["python_type"]))
54
50
  if verbose:
55
51
  display_logs_depending_process_time(
56
52
  f"Casting columns completed in {round(time() - start, 3)}s",
57
53
  time() - start,
58
54
  )
59
- return output_df
55
+ return df
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.1.dev1801
3
+ Version: 0.9.1.dev1830
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -67,9 +67,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
67
67
  csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
68
68
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
69
69
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
- csv_detective/detect_fields/temp/date/__init__.py,sha256=uVOszufihKqiQmS0wz7nUuQ2Dz-Tq9fSk1nf3S00mg4,1010
71
- csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=bEfWvXx_GNCRUxMGJYqfOK4wRDr3WMaGVAmIa_C2pXE,853
72
- csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=GtQo55SrrXfoT-L7ZXW63jrlAYvNT5m56wMfhuY3pyI,836
70
+ csv_detective/detect_fields/temp/date/__init__.py,sha256=JtWaK8hkzBaIUc-fu0G7lIFpWqCfraRh6l0Mo65U3b0,2155
71
+ csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=ZDNUcbU0ZJzaxUt0Utc1Y9dRrq4HHW9uCbcnOuz5Sfk,1247
72
+ csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=QoVOA98lT_GVSGO_mQwKtAy2o-REs8C9d6JB9d_L_B4,1189
73
73
  csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
74
74
  csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
75
75
  csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -138,7 +138,7 @@ csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2V
138
138
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
139
139
  csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
140
140
  csv_detective/output/__init__.py,sha256=f-UFv_iULpVF_Fy39H4sfACEnrthjK4N3mCAVPkjnKw,1860
141
- csv_detective/output/dataframe.py,sha256=UpLuSxx_SFbKpem1n-xY7jF16MXGpKQYEWjaSMIiB4s,2215
141
+ csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
142
142
  csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
143
143
  csv_detective/output/profile.py,sha256=Jeh0mrfH_hAVxV2E5I4XzdCm7ZAGAV_Xj3AXOi77lcA,3130
144
144
  csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
@@ -150,10 +150,10 @@ csv_detective/parsing/csv.py,sha256=qZFLOT3YCPoHF0svfVfQBnS8eHtucjDZ7dFITAPgLhc,
150
150
  csv_detective/parsing/excel.py,sha256=ULUDw76z6hs1Xm2yL9KBM0EOvIsfBLkxwqTZfDEx6aE,7045
151
151
  csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
152
152
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
153
- csv_detective-0.9.1.dev1801.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
+ csv_detective-0.9.1.dev1830.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
154
154
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
156
- tests/test_fields.py,sha256=Y2mBfV9ZdxTHYwHnkzGbpo1k_qJRLC8nU-zzAUxFmAE,11964
156
+ tests/test_fields.py,sha256=VhhQny2Jqy_Z6SplpnN_qAXqBRQCuA42IgSNu37R2cc,12560
157
157
  tests/test_file.py,sha256=YuVbSfeo_ASPiLT8CyxXqJENcDpj4wAFXzLwu_GzsOA,8437
158
158
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
159
159
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
@@ -161,8 +161,8 @@ tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
161
161
  venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
162
162
  venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
163
163
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
164
- csv_detective-0.9.1.dev1801.dist-info/METADATA,sha256=v6wVh2pCJfMUKK3tKjDm23UXJ1tKMAfnaLSrHFUMrKI,9767
165
- csv_detective-0.9.1.dev1801.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
- csv_detective-0.9.1.dev1801.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
- csv_detective-0.9.1.dev1801.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
- csv_detective-0.9.1.dev1801.dist-info/RECORD,,
164
+ csv_detective-0.9.1.dev1830.dist-info/METADATA,sha256=eYNe6QPycRGL5VnIyx_kj0e79azipmi7qu5jh766OD0,9767
165
+ csv_detective-0.9.1.dev1830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ csv_detective-0.9.1.dev1830.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
+ csv_detective-0.9.1.dev1830.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
+ csv_detective-0.9.1.dev1830.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from datetime import date as _date
2
2
  from datetime import datetime as _datetime
3
+ from unittest.mock import patch
3
4
 
4
5
  import pandas as pd
5
6
  import pytest
@@ -441,3 +442,22 @@ def test_priority(args):
441
442
  col = "col1"
442
443
  output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
443
444
  assert output[col]["format"] == expected
445
+
446
+
447
+ @pytest.mark.parametrize(
448
+ "args",
449
+ (
450
+ ("1996-02-13", date),
451
+ ("28/01/2000", date),
452
+ ("2025-08-20T14:30:00+02:00", datetime_aware),
453
+ ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
454
+ ("1925_12_20T14:30:00.2763Z", datetime_naive),
455
+ ("1925 12 20 14:30:00Z", datetime_naive),
456
+ ),
457
+ )
458
+ def test_early_detection(args):
459
+ value, module = args
460
+ with patch("csv_detective.detect_fields.temp.date.date_casting") as mock_func:
461
+ res = module._is(value)
462
+ assert res
463
+ mock_func.assert_not_called()