csv-detective 0.10.3.dev2__py3-none-any.whl → 0.10.3.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,24 +5,22 @@ from typing import TextIO
5
5
  from csv_detective.utils import display_logs_depending_process_time
6
6
 
7
7
 
8
- def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
8
+ def detect_header_position(file: TextIO, verbose: bool = False) -> int:
9
9
  """Tests 10 first rows for possible header (in case header is not 1st row)"""
10
10
  if verbose:
11
11
  start = time()
12
- logging.info("Detecting headers")
12
+ logging.info("Detecting header position")
13
13
  file.seek(0)
14
14
  for i in range(10):
15
15
  row = file.readline()
16
16
  position = file.tell()
17
- headers = [c for c in row.replace("\n", "").split(sep) if c]
18
- if not any(col == "" for col in headers):
19
- next_row = file.readline()
20
- file.seek(position)
21
- if row != next_row:
22
- if verbose:
23
- display_logs_depending_process_time(
24
- f"Detected headers in {round(time() - start, 3)}s",
25
- time() - start,
26
- )
27
- return i, headers
17
+ next_row = file.readline()
18
+ file.seek(position)
19
+ if row != next_row:
20
+ if verbose:
21
+ display_logs_depending_process_time(
22
+ f"Detected header position in {round(time() - start, 3)}s",
23
+ time() - start,
24
+ )
25
+ return i
28
26
  raise ValueError("Could not retrieve headers")
@@ -2,7 +2,7 @@ import pandas as pd
2
2
 
3
3
 
4
4
  def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
5
- """Analog process to detect_headers for csv files, determines how many rows to skip
5
+ """Analog process to detect_header_position for csv files, determines how many rows to skip
6
6
  to end up with the header at the right place"""
7
7
  idx = 0
8
8
  if all([str(c).startswith("Unnamed:") for c in table.columns]):
@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time
14
14
 
15
15
 
16
16
  def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
17
- if not isinstance(value, str) or not value:
18
- # None is the current default value in hydra, should we keep this?
17
+ if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
18
+ # STR_NA_VALUES are directly ingested as NaN by pandas, so we avoid trying to cast them (into int for instance)
19
19
  return None
20
20
  match _type:
21
21
  case "string":
@@ -11,7 +11,7 @@ from csv_detective.detection.engine import (
11
11
  EXCEL_ENGINES,
12
12
  detect_engine,
13
13
  )
14
- from csv_detective.detection.headers import detect_headers
14
+ from csv_detective.detection.headers import detect_header_position
15
15
  from csv_detective.detection.separator import detect_separator
16
16
  from csv_detective.parsing.compression import unzip
17
17
  from csv_detective.parsing.csv import parse_csv
@@ -83,9 +83,7 @@ def load_file(
83
83
  str_file = open(file_path, "r", encoding=encoding)
84
84
  if sep is None:
85
85
  sep = detect_separator(str_file, verbose=verbose)
86
- header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
87
- if header is None or (isinstance(header, list) and any([h is None for h in header])):
88
- raise ValueError("Could not retrieve headers")
86
+ header_row_idx = detect_header_position(str_file, verbose=verbose)
89
87
  heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
90
88
  trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
91
89
  table, total_lines, nb_duplicates = parse_csv(
@@ -102,9 +100,11 @@ def load_file(
102
100
  }
103
101
  if engine is not None:
104
102
  analysis["compression"] = engine
103
+ if any(col.startswith("Unnamed:") for col in table.columns):
104
+ raise ValueError("Columns are not properly set")
105
105
  analysis |= {
106
106
  "header_row_idx": header_row_idx,
107
- "header": header,
107
+ "header": list(table.columns),
108
108
  }
109
109
  if total_lines is not None:
110
110
  analysis["total_lines"] = total_lines
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.10.3.dev2
3
+ Version: 0.10.3.dev4
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -9,8 +9,8 @@ csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvca
9
9
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
10
10
  csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
11
11
  csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
12
- csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
13
- csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
12
+ csv_detective/detection/headers.py,sha256=R-Cuo67laOrj6l9Jq_SCFLkRMj3CEzb2HMo_Sa50g9A,879
13
+ csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
14
14
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
15
15
  csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
16
16
  csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
@@ -74,7 +74,7 @@ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=mLt_qcQ6D8hfy9zdi7
74
74
  csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=XFPdGBsyZCBg4D8IDn6VgwsycCwYVfuqPbyHfNeqGv0,1003
75
75
  csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7prThkihHrf7mwB6j5uEHpxGvLFE,1003
76
76
  csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
77
- csv_detective/output/dataframe.py,sha256=QX5vplx0AOKgnwwJ6dKvDHWRX9IGPStax-svXEyweJ8,3584
77
+ csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
78
78
  csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
79
79
  csv_detective/output/profile.py,sha256=ADr5DwuvwcBYxugjN38fHm11l6ivfzGHXPd8a87Ht-s,4985
80
80
  csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
@@ -84,13 +84,13 @@ csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcO
84
84
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
85
85
  csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
86
86
  csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
87
- csv_detective/parsing/load.py,sha256=orW6PV5XUsHA093yVSxXkJl33LEUUArr3hP81U9Bzd4,4386
87
+ csv_detective/parsing/load.py,sha256=uWX4r_2K9bf-9qKL6IaiGNkiSPnSSNqng420zdrFkDg,4371
88
88
  csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
89
- csv_detective-0.10.3.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
89
+ csv_detective-0.10.3.dev4.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
90
90
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
91
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
92
- tests/test_fields.py,sha256=EuD2F1JUR8y88Hm-AYuJ5X7AKkGSyLIQfsGdxYgIWng,5680
93
- tests/test_file.py,sha256=_ftEymft5-1keUVE5AUdF2XkVcChJo6oBjws3ye06FE,14543
92
+ tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
93
+ tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
94
94
  tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
95
95
  tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
96
96
  tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
104
104
  tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
105
105
  tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
106
106
  tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
107
- csv_detective-0.10.3.dev2.dist-info/METADATA,sha256=QhAD5N5OZx1L_9ajLLuEjhSYSz6q05eAEwVd6_kDPFc,11082
108
- csv_detective-0.10.3.dev2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
109
- csv_detective-0.10.3.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
110
- csv_detective-0.10.3.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
111
- csv_detective-0.10.3.dev2.dist-info/RECORD,,
107
+ csv_detective-0.10.3.dev4.dist-info/METADATA,sha256=FRE6DkDtXSuva4aVTV1ws8tj53w03fjesToRlgh_78s,11082
108
+ csv_detective-0.10.3.dev4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
109
+ csv_detective-0.10.3.dev4.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
110
+ csv_detective-0.10.3.dev4.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
111
+ csv_detective-0.10.3.dev4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.10.1)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/test_fields.py CHANGED
@@ -104,11 +104,17 @@ def test_fields_with_values(args):
104
104
  ("2022-08-01", "date", _date),
105
105
  ("2024-09-23 17:32:07", "datetime", _datetime),
106
106
  ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
107
+ ("N/A", "int", None),
108
+ ("nan", "bool", None),
109
+ ("", "date", None), # all NaN-like values should be cast as None for all types
107
110
  ),
108
111
  )
109
112
  def test_cast(args):
110
113
  value, detected_type, cast_type = args
111
- assert isinstance(cast(value, detected_type), cast_type)
114
+ if cast_type is None:
115
+ assert cast(value, detected_type) is None
116
+ else:
117
+ assert isinstance(cast(value, detected_type), cast_type)
112
118
 
113
119
 
114
120
  @pytest.mark.parametrize(
tests/test_file.py CHANGED
@@ -9,6 +9,12 @@ from csv_detective.output.profile import create_profile
9
9
  from csv_detective.parsing.csv import CHUNK_SIZE
10
10
 
11
11
 
12
+ @pytest.fixture
13
+ def mocked_responses():
14
+ with responses.RequestsMock() as rsps:
15
+ yield rsps
16
+
17
+
12
18
  @pytest.mark.parametrize(
13
19
  "chunk_size",
14
20
  (100, 404, int(1e5)),
@@ -165,6 +171,26 @@ def test_exception_different_number_of_columns():
165
171
  )
166
172
 
167
173
 
174
+ def test_exception_malformed_columns(mocked_responses):
175
+ """
176
+ A ValueError should be raised if any column is Unnamed
177
+ """
178
+ url = f"http://example.com/bad_cols.csv"
179
+ expected_content = b"col1,col2,\n1,2,\n3,4,"
180
+ mocked_responses.get(
181
+ url,
182
+ body=expected_content,
183
+ status=200,
184
+ )
185
+ with patch("urllib.request.urlopen") as mock_urlopen:
186
+ mock_response = MagicMock()
187
+ mock_response.read.return_value = expected_content
188
+ mock_response.__enter__.return_value = mock_response
189
+ mock_urlopen.return_value = mock_response
190
+ with pytest.raises(ValueError):
191
+ routine(file_path=url)
192
+
193
+
168
194
  def test_code_dep_reg_on_file():
169
195
  output = routine(
170
196
  file_path="tests/data/b_test_file.csv",
@@ -237,12 +263,6 @@ def test_non_csv_files(params):
237
263
  assert _[k] == v
238
264
 
239
265
 
240
- @pytest.fixture
241
- def mocked_responses():
242
- with responses.RequestsMock() as rsps:
243
- yield rsps
244
-
245
-
246
266
  @pytest.mark.parametrize(
247
267
  "params",
248
268
  # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib