csv-detective 0.10.3.dev3__py3-none-any.whl → 0.10.3.dev4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that public registry.
csv_detective/detection/headers.py CHANGED
@@ -5,24 +5,22 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
+def detect_header_position(file: TextIO, verbose: bool = False) -> int:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting headers")
+        logging.info("Detecting header position")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-        headers = [c for c in row.replace("\n", "").split(sep) if c]
-        if not any(col == "" for col in headers):
-            next_row = file.readline()
-            file.seek(position)
-            if row != next_row:
-                if verbose:
-                    display_logs_depending_process_time(
-                        f"Detected headers in {round(time() - start, 3)}s",
-                        time() - start,
-                    )
-                return i, headers
+        next_row = file.readline()
+        file.seek(position)
+        if row != next_row:
+            if verbose:
+                display_logs_depending_process_time(
+                    f"Detected header position in {round(time() - start, 3)}s",
+                    time() - start,
+                )
+            return i
     raise ValueError("Could not retrieve headers")
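Note (illustrative, not part of the packaged diff): the rename above also narrows the function's contract. detect_headers(file, sep) returned both the header row index and the header names, whereas detect_header_position(file) returns only the index; callers now read the column names from the parsed table instead, as the csv_detective/parsing/load.py hunks below do. A minimal before/after sketch of a call site, reusing the variable names that appear in this diff:

    # Before (0.10.3.dev3): index and names were detected together from the raw text
    # header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)

    # After (0.10.3.dev4): only the header position is detected from the raw text
    from csv_detective.detection.headers import detect_header_position

    header_row_idx = detect_header_position(str_file, verbose=verbose)
    # ...parse the file into a DataFrame named `table`, then:
    # header = list(table.columns)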
csv_detective/detection/rows.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
 
 
 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to detect_headers for csv files, determines how many rows to skip
+    """Analog process to detect_header_position for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):
csv_detective/parsing/load.py CHANGED
@@ -11,7 +11,7 @@ from csv_detective.detection.engine import (
     EXCEL_ENGINES,
     detect_engine,
 )
-from csv_detective.detection.headers import detect_headers
+from csv_detective.detection.headers import detect_header_position
 from csv_detective.detection.separator import detect_separator
 from csv_detective.parsing.compression import unzip
 from csv_detective.parsing.csv import parse_csv
@@ -83,9 +83,7 @@ def load_file(
         str_file = open(file_path, "r", encoding=encoding)
         if sep is None:
             sep = detect_separator(str_file, verbose=verbose)
-        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None or (isinstance(header, list) and any([h is None for h in header])):
-            raise ValueError("Could not retrieve headers")
+        header_row_idx = detect_header_position(str_file, verbose=verbose)
         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
         table, total_lines, nb_duplicates = parse_csv(
@@ -102,9 +100,11 @@ def load_file(
     }
     if engine is not None:
         analysis["compression"] = engine
+    if any(col.startswith("Unnamed:") for col in table.columns):
+        raise ValueError("Columns are not properly set")
     analysis |= {
         "header_row_idx": header_row_idx,
-        "header": header,
+        "header": list(table.columns),
     }
     if total_lines is not None:
         analysis["total_lines"] = total_lines
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.10.3.dev3
+Version: 0.10.3.dev4
 Summary: Detect tabular files column content
 Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
 License: MIT
@@ -9,8 +9,8 @@ csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvca
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
 csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
-csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
-csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
+csv_detective/detection/headers.py,sha256=R-Cuo67laOrj6l9Jq_SCFLkRMj3CEzb2HMo_Sa50g9A,879
+csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
 csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
@@ -84,13 +84,13 @@ csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcO
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
 csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
-csv_detective/parsing/load.py,sha256=orW6PV5XUsHA093yVSxXkJl33LEUUArr3hP81U9Bzd4,4386
+csv_detective/parsing/load.py,sha256=uWX4r_2K9bf-9qKL6IaiGNkiSPnSSNqng420zdrFkDg,4371
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.3.dev3.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.10.3.dev4.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
 tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
-tests/test_file.py,sha256=_ftEymft5-1keUVE5AUdF2XkVcChJo6oBjws3ye06FE,14543
+tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
 tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.3.dev3.dist-info/METADATA,sha256=aVDOx1LTRqvJIGf4wu4krkEjPl8HDyCICBbswfn_Wvg,11082
-csv_detective-0.10.3.dev3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-csv_detective-0.10.3.dev3.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.10.3.dev3.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
-csv_detective-0.10.3.dev3.dist-info/RECORD,,
+csv_detective-0.10.3.dev4.dist-info/METADATA,sha256=FRE6DkDtXSuva4aVTV1ws8tj53w03fjesToRlgh_78s,11082
+csv_detective-0.10.3.dev4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+csv_detective-0.10.3.dev4.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.3.dev4.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.3.dev4.dist-info/RECORD,,
tests/test_file.py CHANGED
@@ -9,6 +9,12 @@ from csv_detective.output.profile import create_profile
 from csv_detective.parsing.csv import CHUNK_SIZE
 
 
+@pytest.fixture
+def mocked_responses():
+    with responses.RequestsMock() as rsps:
+        yield rsps
+
+
 @pytest.mark.parametrize(
     "chunk_size",
     (100, 404, int(1e5)),
@@ -165,6 +171,26 @@ def test_exception_different_number_of_columns():
     )
 
 
+def test_exception_malformed_columns(mocked_responses):
+    """
+    A ValueError should be raised if any column is Unnamed
+    """
+    url = f"http://example.com/bad_cols.csv"
+    expected_content = b"col1,col2,\n1,2,\n3,4,"
+    mocked_responses.get(
+        url,
+        body=expected_content,
+        status=200,
+    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        with pytest.raises(ValueError):
+            routine(file_path=url)
+
+
 def test_code_dep_reg_on_file():
     output = routine(
         file_path="tests/data/b_test_file.csv",
@@ -237,12 +263,6 @@ def test_non_csv_files(params):
         assert _[k] == v
 
 
-@pytest.fixture
-def mocked_responses():
-    with responses.RequestsMock() as rsps:
-        yield rsps
-
-
 @pytest.mark.parametrize(
     "params",
     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib