csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,9 @@
1
- csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
1
+ csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
2
2
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
3
- csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
4
- csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
3
+ csv_detective/explore_csv.py,sha256=kuLkORQarelG13swoi0dH4cERu8BoRtRvyQ2SsYYhCY,5653
4
+ csv_detective/load_tests.py,sha256=VzHJq1Q22C666nad17ciPRtcQEonP40YmSERn9zylvQ,2399
5
5
  csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
6
- csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
6
+ csv_detective/validate.py,sha256=CNTYu_rOiv-Z8iWqCI_Ac_LXvbneRSukiu7NxB9Rcuo,5187
7
7
  csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
8
8
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -130,37 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
130
130
  csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
131
131
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
132
132
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
133
- csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
134
- csv_detective/detection/formats.py,sha256=VxLHyQNUb7SrBkS1uV6cTK7cSrCVgrpAd3nd_74s2B0,7775
133
+ csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
134
+ csv_detective/detection/formats.py,sha256=92ZZafoJFZHDSxSUR6rDCFjLGpD4f4IZorqqVgxwFY8,5595
135
135
  csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
136
136
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
137
137
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
138
- csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
139
- csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
140
- csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
141
- csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
142
- csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
138
+ csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
139
+ csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
140
+ csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
141
+ csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
142
+ csv_detective/output/profile.py,sha256=oWIuHchiZ72VzGLB9q3mW-hhWq1VxiU1Z09VWjAU-JM,4696
143
143
  csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
144
144
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
145
145
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
- csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
146
+ csv_detective/parsing/columns.py,sha256=X5v3_1zgZXadnxjUG3__xLjOIvNU4n9LOiWZbix4I30,9838
147
147
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
148
- csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
148
+ csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,1742
149
149
  csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
150
- csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
150
+ csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
151
151
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
152
- csv_detective-0.9.3.dev2215.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
152
+ csv_detective-0.9.3.dev2232.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
153
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
154
154
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
155
- tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
156
- tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
155
+ tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
156
+ tests/test_file.py,sha256=bYP-NzPoGEXPwNZLD1EjJlviT9a_27IY6cb0shdiR4U,12329
157
157
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
158
- tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
159
- tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
158
+ tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
159
+ tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
160
160
  venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
161
161
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
162
- csv_detective-0.9.3.dev2215.dist-info/METADATA,sha256=9F6hwttFFsBbi0eMv_UChawcvho2C9wHug4H_QEUIsQ,9931
163
- csv_detective-0.9.3.dev2215.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
- csv_detective-0.9.3.dev2215.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
- csv_detective-0.9.3.dev2215.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
166
- csv_detective-0.9.3.dev2215.dist-info/RECORD,,
162
+ csv_detective-0.9.3.dev2232.dist-info/METADATA,sha256=q8o2SRFri-iFmUgOp3tL5jGlIsuXB-TDyUj7BOaCPhg,10845
163
+ csv_detective-0.9.3.dev2232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
+ csv_detective-0.9.3.dev2232.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
+ csv_detective-0.9.3.dev2232.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
166
+ csv_detective-0.9.3.dev2232.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -84,13 +84,13 @@ from csv_detective.parsing.columns import test_col as col_test # to prevent pyt
84
84
 
85
85
  def test_all_tests_return_bool():
86
86
  all_tests = return_all_tests("ALL", "detect_fields")
87
- for test in all_tests:
87
+ for attr in all_tests.values():
88
88
  for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
89
- assert isinstance(test._is(tmp), bool)
89
+ assert isinstance(attr["func"](tmp), bool)
90
90
 
91
91
 
92
92
  # categorical
93
- def test_detetect_categorical_variable():
93
+ def test_detect_categorical_variable():
94
94
  categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
95
95
  categorical_col2 = [str(k // 20) for k in range(100)]
96
96
  not_categorical_col = [i for i in range(100)]
@@ -103,7 +103,7 @@ def test_detetect_categorical_variable():
103
103
  df = pd.DataFrame(df_dict, dtype=str)
104
104
 
105
105
  res, _ = detect_categorical_variable(df)
106
- assert len(res.values) and all(k in res.values for k in ["cat", "cat2"])
106
+ assert len(res) and all(k in res for k in ["cat", "cat2"])
107
107
 
108
108
 
109
109
  # continuous
@@ -394,8 +394,8 @@ fields = {
394
394
 
395
395
  def test_all_fields_have_tests():
396
396
  all_tests = return_all_tests("ALL", "detect_fields")
397
- for test in all_tests:
398
- assert fields.get(test)
397
+ for attr in all_tests.values():
398
+ assert fields.get(attr["module"])
399
399
 
400
400
 
401
401
  @pytest.mark.parametrize(
@@ -475,13 +475,9 @@ def test_early_detection(args):
475
475
  def test_all_proportion_1():
476
476
  all_tests = return_all_tests("ALL", "detect_fields")
477
477
  prop_1 = {
478
- t.__name__.split(".")[-1]: eval(
479
- t.__name__.split(".")[-1]
480
- if t.__name__.split(".")[-1] not in ["int", "float"]
481
- else "test_" + t.__name__.split(".")[-1]
482
- )
483
- for t in all_tests
484
- if t.PROPORTION == 1
478
+ name: eval(name if name not in ["int", "float"] else "test_" + name)
479
+ for name, attr in all_tests.items()
480
+ if attr["prop"] == 1
485
481
  }
486
482
  # building a table that uses only correct values for these formats, except on one row
487
483
  table = pd.DataFrame(
tests/test_file.py CHANGED
@@ -1,4 +1,4 @@
1
- from unittest.mock import patch
1
+ from unittest.mock import MagicMock, patch
2
2
 
3
3
  import pandas as pd
4
4
  import pytest
@@ -6,15 +6,19 @@ import responses
6
6
 
7
7
  from csv_detective import routine
8
8
  from csv_detective.output.profile import create_profile
9
- from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
9
+ from csv_detective.parsing.csv import CHUNK_SIZE
10
10
 
11
11
 
12
12
  @pytest.mark.parametrize(
13
- "max_rows_analysis",
14
- (100, int(1e5)),
13
+ "chunk_size",
14
+ (100, 404, int(1e5)),
15
15
  )
16
- def test_columns_output_on_file(max_rows_analysis):
17
- with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", max_rows_analysis):
16
+ def test_columns_output_on_file(chunk_size):
17
+ with (
18
+ # maybe we should refactor later to avoid having to patch everywhere
19
+ patch("csv_detective.parsing.csv.CHUNK_SIZE", chunk_size),
20
+ patch("csv_detective.parsing.columns.CHUNK_SIZE", chunk_size),
21
+ ):
18
22
  output = routine(
19
23
  file_path="tests/data/a_test_file.csv",
20
24
  num_rows=-1,
@@ -248,17 +252,23 @@ def mocked_responses():
248
252
  def test_urls(mocked_responses, params):
249
253
  file_name, checks = params
250
254
  url = f"http://example.com/{file_name}"
255
+ expected_content = open(f"tests/data/{file_name}", "rb").read()
251
256
  mocked_responses.get(
252
257
  url,
253
- body=open(f"tests/data/{file_name}", "rb").read(),
258
+ body=expected_content,
254
259
  status=200,
255
260
  )
256
- _ = routine(
257
- file_path=url,
258
- num_rows=-1,
259
- output_profile=False,
260
- save_results=False,
261
- )
261
+ with patch("urllib.request.urlopen") as mock_urlopen:
262
+ mock_response = MagicMock()
263
+ mock_response.read.return_value = expected_content
264
+ mock_response.__enter__.return_value = mock_response
265
+ mock_urlopen.return_value = mock_response
266
+ _ = routine(
267
+ file_path=url,
268
+ num_rows=-1,
269
+ output_profile=False,
270
+ save_results=False,
271
+ )
262
272
  for k, v in checks.items():
263
273
  if v is None:
264
274
  assert not _.get(k)
@@ -289,13 +299,14 @@ def test_nan_values(expected_type):
289
299
 
290
300
 
291
301
  def test_output_df():
292
- output, df = routine(
302
+ output, df_chunks = routine(
293
303
  file_path="tests/data/b_test_file.csv",
294
304
  num_rows=-1,
295
305
  output_profile=False,
296
306
  save_results=False,
297
307
  output_df=True,
298
308
  )
309
+ df = pd.concat(df_chunks, ignore_index=True)
299
310
  assert isinstance(output, dict)
300
311
  assert isinstance(df, pd.DataFrame)
301
312
  assert len(df) == 6
@@ -317,14 +328,20 @@ def test_cast_json(mocked_responses, cast_json):
317
328
  body=expected_content,
318
329
  status=200,
319
330
  )
320
- analysis, df = routine(
321
- file_path="http://example.com/test.csv",
322
- num_rows=-1,
323
- output_profile=False,
324
- save_results=False,
325
- output_df=True,
326
- cast_json=cast_json,
327
- )
331
+ with patch("urllib.request.urlopen") as mock_urlopen:
332
+ mock_response = MagicMock()
333
+ mock_response.read.return_value = expected_content.encode("utf-8")
334
+ mock_response.__enter__.return_value = mock_response
335
+ mock_urlopen.return_value = mock_response
336
+ analysis, df_chunks = routine(
337
+ file_path="http://example.com/test.csv",
338
+ num_rows=-1,
339
+ output_profile=False,
340
+ save_results=False,
341
+ output_df=True,
342
+ cast_json=cast_json,
343
+ )
344
+ df = pd.concat(df_chunks, ignore_index=True)
328
345
  assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
329
346
  assert isinstance(df["a_simple_dict"][0], expected_type)
330
347
 
@@ -337,27 +354,38 @@ def test_almost_uniform_column(mocked_responses):
337
354
  body=expected_content,
338
355
  status=200,
339
356
  )
340
- analysis = routine(
341
- file_path="http://example.com/test.csv",
342
- num_rows=-1,
343
- output_profile=False,
344
- save_results=False,
345
- )
357
+ with patch("urllib.request.urlopen") as mock_urlopen:
358
+ mock_response = MagicMock()
359
+ mock_response.read.return_value = expected_content.encode("utf-8")
360
+ mock_response.__enter__.return_value = mock_response
361
+ mock_urlopen.return_value = mock_response
362
+ analysis = routine(
363
+ file_path="http://example.com/test.csv",
364
+ num_rows=-1,
365
+ output_profile=False,
366
+ save_results=False,
367
+ )
346
368
  assert analysis["columns"][col_name]["format"] == "int"
347
369
 
348
370
 
349
371
  def test_full_nan_column(mocked_responses):
350
372
  # we want a file that needs sampling
351
- expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
373
+ expected_content = "only_nan,second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
352
374
  mocked_responses.get(
353
375
  "http://example.com/test.csv",
354
376
  body=expected_content,
355
377
  status=200,
356
378
  )
357
- # just testing it doesn't fail
358
- routine(
359
- file_path="http://example.com/test.csv",
360
- num_rows=-1,
361
- output_profile=False,
362
- save_results=False,
363
- )
379
+ with patch("urllib.request.urlopen") as mock_urlopen:
380
+ # Create a mock HTTP response object
381
+ mock_response = MagicMock()
382
+ mock_response.read.return_value = expected_content.encode("utf-8")
383
+ mock_response.__enter__.return_value = mock_response
384
+ mock_urlopen.return_value = mock_response
385
+ # just testing it doesn't fail
386
+ routine(
387
+ file_path="http://example.com/test.csv",
388
+ num_rows=-1,
389
+ output_profile=False,
390
+ save_results=False,
391
+ )
tests/test_structure.py CHANGED
@@ -34,5 +34,8 @@ def tests_conformity():
34
34
 
35
35
 
36
36
  def test_all_tests_have_unique_name():
37
- names = [t.__name__.split(".")[-1] for t in return_all_tests("ALL", "detect_fields")]
37
+ names = [
38
+ attr["module"].__name__.split(".")[-1]
39
+ for attr in return_all_tests("ALL", "detect_fields").values()
40
+ ]
38
41
  assert len(names) == len(set(names))
tests/test_validation.py CHANGED
@@ -49,12 +49,9 @@ def test_validation(_params):
49
49
  for dotkey in modif_previous_analysis:
50
50
  keys = dotkey.split(".")
51
51
  set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
52
- is_valid, table, analysis = validate(
52
+ is_valid, table, analysis, col_values = validate(
53
53
  "tests/data/a_test_file.csv",
54
54
  previous_analysis=previous_analysis,
55
- num_rows=-1,
56
- sep=previous_analysis.get("separator"),
57
- encoding=previous_analysis.get("encoding"),
58
55
  )
59
56
  assert is_valid == should_be_valid
60
57
  if table_type is None:
@@ -65,6 +62,14 @@ def test_validation(_params):
65
62
  assert analysis is None
66
63
  else:
67
64
  assert isinstance(analysis, analysis_type)
65
+ if should_be_valid:
66
+ assert isinstance(col_values, dict)
67
+ assert all(
68
+ col in table.columns and isinstance(values, pd.Series)
69
+ for col, values in col_values.items()
70
+ )
71
+ else:
72
+ assert col_values is None
68
73
 
69
74
 
70
75
  @pytest.mark.parametrize(