csv-detective 0.9.3.dev2232__py3-none-any.whl → 0.9.3.dev2258__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,19 +12,17 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
12
12
  logging.info("Detecting headers")
13
13
  file.seek(0)
14
14
  for i in range(10):
15
- header = file.readline()
15
+ row = file.readline()
16
16
  position = file.tell()
17
- chaine = [c for c in header.replace("\n", "").split(sep) if c]
18
- if chaine[-1] not in ["", "\n"] and all([mot not in ["", "\n"] for mot in chaine[1:-1]]):
17
+ headers = [c for c in row.replace("\n", "").split(sep) if c]
18
+ if not any(col == "" for col in headers):
19
19
  next_row = file.readline()
20
20
  file.seek(position)
21
- if header != next_row:
21
+ if row != next_row:
22
22
  if verbose:
23
23
  display_logs_depending_process_time(
24
24
  f"Detected headers in {round(time() - start, 3)}s",
25
25
  time() - start,
26
26
  )
27
- return i, chaine
28
- if verbose:
29
- logging.info("No header detected")
30
- return 0, None
27
+ return i, headers
28
+ raise ValueError("Could not retrieve headers")
@@ -30,6 +30,10 @@ def create_profile(
30
30
  k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
31
31
  for k, v in columns.items()
32
32
  }
33
+ # value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
34
+ _count_col = "count"
35
+ while _count_col in table.columns:
36
+ _count_col = "_" + _count_col
33
37
  profile = defaultdict(dict)
34
38
  for c in table.columns:
35
39
  # for numerical formats we want min, max, mean, std
@@ -79,14 +83,14 @@ def create_profile(
79
83
  # for all formats we want most frequent values, nb unique values and nb missing values
80
84
  tops_bruts = (
81
85
  (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
82
- .reset_index()
86
+ .reset_index(name=_count_col)
83
87
  .iloc[:10]
84
88
  .to_dict(orient="records")
85
89
  )
86
90
  profile[c].update(
87
91
  tops=[
88
92
  {
89
- "count": tb["count"],
93
+ "count": tb[_count_col],
90
94
  "value": tb[c],
91
95
  }
92
96
  for tb in tops_bruts
@@ -47,6 +47,8 @@ def load_file(
47
47
  if table.empty:
48
48
  raise ValueError("Table seems to be empty")
49
49
  header = table.columns.to_list()
50
+ if any(col.startswith("Unnamed") for col in header):
51
+ raise ValueError("Could not retrieve headers")
50
52
  analysis = {
51
53
  "engine": engine,
52
54
  "sheet_name": sheet_name,
@@ -99,12 +101,10 @@ def load_file(
99
101
  }
100
102
  if engine is not None:
101
103
  analysis["compression"] = engine
102
- analysis.update(
103
- {
104
- "header_row_idx": header_row_idx,
105
- "header": header,
106
- }
107
- )
104
+ analysis |= {
105
+ "header_row_idx": header_row_idx,
106
+ "header": header,
107
+ }
108
108
  if total_lines is not None:
109
109
  analysis["total_lines"] = total_lines
110
110
  if nb_duplicates is not None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.3.dev2232
3
+ Version: 0.9.3.dev2258
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -132,14 +132,14 @@ csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvca
132
132
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
133
133
  csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
134
134
  csv_detective/detection/formats.py,sha256=92ZZafoJFZHDSxSUR6rDCFjLGpD4f4IZorqqVgxwFY8,5595
135
- csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
135
+ csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
136
136
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
137
137
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
138
138
  csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
139
139
  csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
140
140
  csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
141
141
  csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
142
- csv_detective/output/profile.py,sha256=oWIuHchiZ72VzGLB9q3mW-hhWq1VxiU1Z09VWjAU-JM,4696
142
+ csv_detective/output/profile.py,sha256=ZGKMSeVfmQerAfVhViWXVU9j4jbCrv5K484SQNep7Xw,4920
143
143
  csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
144
144
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
145
145
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -147,20 +147,20 @@ csv_detective/parsing/columns.py,sha256=X5v3_1zgZXadnxjUG3__xLjOIvNU4n9LOiWZbix4
147
147
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
148
148
  csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,1742
149
149
  csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
150
- csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
150
+ csv_detective/parsing/load.py,sha256=EHVWQqV9TmWOiVNLCyHr9V8x4PI_53O0iTVluzIqw78,4256
151
151
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
152
- csv_detective-0.9.3.dev2232.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
152
+ csv_detective-0.9.3.dev2258.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
153
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
154
154
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
155
155
  tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
156
- tests/test_file.py,sha256=bYP-NzPoGEXPwNZLD1EjJlviT9a_27IY6cb0shdiR4U,12329
156
+ tests/test_file.py,sha256=EKFW08W96VA5nVwNPvN1v7zXDL0qEEuGWnUqfJJdMh4,13130
157
157
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
158
158
  tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
159
159
  tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
160
160
  venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
161
161
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
162
- csv_detective-0.9.3.dev2232.dist-info/METADATA,sha256=q8o2SRFri-iFmUgOp3tL5jGlIsuXB-TDyUj7BOaCPhg,10845
163
- csv_detective-0.9.3.dev2232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
- csv_detective-0.9.3.dev2232.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
- csv_detective-0.9.3.dev2232.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
166
- csv_detective-0.9.3.dev2232.dist-info/RECORD,,
162
+ csv_detective-0.9.3.dev2258.dist-info/METADATA,sha256=gGV63RWaQSVUSaJl24mR9Ynk2BOV1tK6LWyNsq-AYkA,10845
163
+ csv_detective-0.9.3.dev2258.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
+ csv_detective-0.9.3.dev2258.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
+ csv_detective-0.9.3.dev2258.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
166
+ csv_detective-0.9.3.dev2258.dist-info/RECORD,,
tests/test_file.py CHANGED
@@ -370,22 +370,44 @@ def test_almost_uniform_column(mocked_responses):
370
370
 
371
371
  def test_full_nan_column(mocked_responses):
372
372
  # we want a file that needs sampling
373
- expected_content = "only_nan,second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
373
+ col_name = "only_nan"
374
+ expected_content = f"{col_name},second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
374
375
  mocked_responses.get(
375
376
  "http://example.com/test.csv",
376
377
  body=expected_content,
377
378
  status=200,
378
379
  )
379
380
  with patch("urllib.request.urlopen") as mock_urlopen:
380
- # Create a mock HTTP response object
381
381
  mock_response = MagicMock()
382
382
  mock_response.read.return_value = expected_content.encode("utf-8")
383
383
  mock_response.__enter__.return_value = mock_response
384
384
  mock_urlopen.return_value = mock_response
385
- # just testing it doesn't fail
386
- routine(
385
+ # only NaNs should return "string"
386
+ analysis = routine(
387
387
  file_path="http://example.com/test.csv",
388
388
  num_rows=-1,
389
389
  output_profile=False,
390
390
  save_results=False,
391
391
  )
392
+ assert analysis["columns"][col_name]["format"] == "string"
393
+
394
+
395
+ def test_count_column(mocked_responses):
396
+ expected_content = "count,_count\n" + "a,1\n" * 100
397
+ mocked_responses.get(
398
+ "http://example.com/test.csv",
399
+ body=expected_content,
400
+ status=200,
401
+ )
402
+ with patch("urllib.request.urlopen") as mock_urlopen:
403
+ mock_response = MagicMock()
404
+ mock_response.read.return_value = expected_content.encode("utf-8")
405
+ mock_response.__enter__.return_value = mock_response
406
+ mock_urlopen.return_value = mock_response
407
+ # only testing it doesn't fail with output_profile=True
408
+ routine(
409
+ file_path="http://example.com/test.csv",
410
+ num_rows=-1,
411
+ output_profile=True,
412
+ save_results=False,
413
+ )