csv-detective 0.9.3.dev2232__py3-none-any.whl → 0.9.3.dev2241__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,6 +30,10 @@ def create_profile(
30
30
  k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
31
31
  for k, v in columns.items()
32
32
  }
33
+ # value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
34
+ _count_col = "count"
35
+ while _count_col in table.columns:
36
+ _count_col = "_" + _count_col
33
37
  profile = defaultdict(dict)
34
38
  for c in table.columns:
35
39
  # for numerical formats we want min, max, mean, std
@@ -79,14 +83,14 @@ def create_profile(
79
83
  # for all formats we want most frequent values, nb unique values and nb missing values
80
84
  tops_bruts = (
81
85
  (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
82
- .reset_index()
86
+ .reset_index(name=_count_col)
83
87
  .iloc[:10]
84
88
  .to_dict(orient="records")
85
89
  )
86
90
  profile[c].update(
87
91
  tops=[
88
92
  {
89
- "count": tb["count"],
93
+ "count": tb[_count_col],
90
94
  "value": tb[c],
91
95
  }
92
96
  for tb in tops_bruts
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.3.dev2232
3
+ Version: 0.9.3.dev2241
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -139,7 +139,7 @@ csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0Jze
139
139
  csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
140
140
  csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
141
141
  csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
142
- csv_detective/output/profile.py,sha256=oWIuHchiZ72VzGLB9q3mW-hhWq1VxiU1Z09VWjAU-JM,4696
142
+ csv_detective/output/profile.py,sha256=ZGKMSeVfmQerAfVhViWXVU9j4jbCrv5K484SQNep7Xw,4920
143
143
  csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
144
144
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
145
145
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -149,18 +149,18 @@ csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,
149
149
  csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
150
150
  csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
151
151
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
152
- csv_detective-0.9.3.dev2232.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
152
+ csv_detective-0.9.3.dev2241.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
153
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
154
154
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
155
155
  tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
156
- tests/test_file.py,sha256=bYP-NzPoGEXPwNZLD1EjJlviT9a_27IY6cb0shdiR4U,12329
156
+ tests/test_file.py,sha256=EKFW08W96VA5nVwNPvN1v7zXDL0qEEuGWnUqfJJdMh4,13130
157
157
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
158
158
  tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
159
159
  tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
160
160
  venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
161
161
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
162
- csv_detective-0.9.3.dev2232.dist-info/METADATA,sha256=q8o2SRFri-iFmUgOp3tL5jGlIsuXB-TDyUj7BOaCPhg,10845
163
- csv_detective-0.9.3.dev2232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
- csv_detective-0.9.3.dev2232.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
- csv_detective-0.9.3.dev2232.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
166
- csv_detective-0.9.3.dev2232.dist-info/RECORD,,
162
+ csv_detective-0.9.3.dev2241.dist-info/METADATA,sha256=Cy0R4v1C7Lg-KRgD1_pBP4uO_huMU0158VorSXw8b2w,10845
163
+ csv_detective-0.9.3.dev2241.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
+ csv_detective-0.9.3.dev2241.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
+ csv_detective-0.9.3.dev2241.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
166
+ csv_detective-0.9.3.dev2241.dist-info/RECORD,,
tests/test_file.py CHANGED
@@ -370,22 +370,44 @@ def test_almost_uniform_column(mocked_responses):
370
370
 
371
371
  def test_full_nan_column(mocked_responses):
372
372
  # we want a file that needs sampling
373
- expected_content = "only_nan,second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
373
+ col_name = "only_nan"
374
+ expected_content = f"{col_name},second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
374
375
  mocked_responses.get(
375
376
  "http://example.com/test.csv",
376
377
  body=expected_content,
377
378
  status=200,
378
379
  )
379
380
  with patch("urllib.request.urlopen") as mock_urlopen:
380
- # Create a mock HTTP response object
381
381
  mock_response = MagicMock()
382
382
  mock_response.read.return_value = expected_content.encode("utf-8")
383
383
  mock_response.__enter__.return_value = mock_response
384
384
  mock_urlopen.return_value = mock_response
385
- # just testing it doesn't fail
386
- routine(
385
+ # only NaNs should return "string"
386
+ analysis = routine(
387
387
  file_path="http://example.com/test.csv",
388
388
  num_rows=-1,
389
389
  output_profile=False,
390
390
  save_results=False,
391
391
  )
392
+ assert analysis["columns"][col_name]["format"] == "string"
393
+
394
+
395
+ def test_count_column(mocked_responses):
396
+ expected_content = "count,_count\n" + "a,1\n" * 100
397
+ mocked_responses.get(
398
+ "http://example.com/test.csv",
399
+ body=expected_content,
400
+ status=200,
401
+ )
402
+ with patch("urllib.request.urlopen") as mock_urlopen:
403
+ mock_response = MagicMock()
404
+ mock_response.read.return_value = expected_content.encode("utf-8")
405
+ mock_response.__enter__.return_value = mock_response
406
+ mock_urlopen.return_value = mock_response
407
+ # only testing it doesn't fail with output_profile=True
408
+ routine(
409
+ file_path="http://example.com/test.csv",
410
+ num_rows=-1,
411
+ output_profile=True,
412
+ save_results=False,
413
+ )