csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.12674__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. csv_detective/detection/__init__.py +0 -0
  2. csv_detective/detection/columns.py +0 -0
  3. csv_detective/detection/encoding.py +0 -0
  4. csv_detective/detection/engine.py +0 -0
  5. csv_detective/detection/formats.py +0 -2
  6. csv_detective/detection/headers.py +14 -12
  7. csv_detective/detection/rows.py +1 -1
  8. csv_detective/detection/separator.py +0 -0
  9. csv_detective/detection/variables.py +0 -0
  10. csv_detective/explore_csv.py +4 -15
  11. csv_detective/format.py +1 -1
  12. csv_detective/formats/__init__.py +0 -0
  13. csv_detective/formats/adresse.py +0 -0
  14. csv_detective/formats/binary.py +0 -0
  15. csv_detective/formats/booleen.py +0 -0
  16. csv_detective/formats/code_commune_insee.py +0 -0
  17. csv_detective/formats/code_csp_insee.py +0 -0
  18. csv_detective/formats/code_departement.py +0 -0
  19. csv_detective/formats/code_fantoir.py +0 -0
  20. csv_detective/formats/code_import.py +0 -0
  21. csv_detective/formats/code_postal.py +0 -0
  22. csv_detective/formats/code_region.py +0 -0
  23. csv_detective/formats/code_rna.py +0 -0
  24. csv_detective/formats/code_waldec.py +0 -0
  25. csv_detective/formats/commune.py +0 -0
  26. csv_detective/formats/csp_insee.py +0 -0
  27. csv_detective/formats/date.py +1 -10
  28. csv_detective/formats/date_fr.py +0 -0
  29. csv_detective/formats/datetime_aware.py +0 -0
  30. csv_detective/formats/datetime_naive.py +0 -0
  31. csv_detective/formats/datetime_rfc822.py +0 -0
  32. csv_detective/formats/departement.py +0 -0
  33. csv_detective/formats/email.py +0 -0
  34. csv_detective/formats/float.py +0 -0
  35. csv_detective/formats/geojson.py +0 -0
  36. csv_detective/formats/insee_ape700.py +0 -0
  37. csv_detective/formats/insee_canton.py +0 -0
  38. csv_detective/formats/int.py +0 -0
  39. csv_detective/formats/iso_country_code_alpha2.py +0 -0
  40. csv_detective/formats/iso_country_code_alpha3.py +0 -0
  41. csv_detective/formats/iso_country_code_numeric.py +0 -0
  42. csv_detective/formats/jour_de_la_semaine.py +0 -0
  43. csv_detective/formats/json.py +0 -0
  44. csv_detective/formats/latitude_l93.py +0 -0
  45. csv_detective/formats/latitude_wgs.py +0 -0
  46. csv_detective/formats/latitude_wgs_fr_metropole.py +0 -0
  47. csv_detective/formats/latlon_wgs.py +0 -0
  48. csv_detective/formats/longitude_l93.py +0 -0
  49. csv_detective/formats/longitude_wgs.py +0 -0
  50. csv_detective/formats/longitude_wgs_fr_metropole.py +0 -0
  51. csv_detective/formats/lonlat_wgs.py +0 -0
  52. csv_detective/formats/mois_de_lannee.py +0 -0
  53. csv_detective/formats/money.py +0 -0
  54. csv_detective/formats/mongo_object_id.py +0 -0
  55. csv_detective/formats/pays.py +0 -0
  56. csv_detective/formats/percent.py +0 -0
  57. csv_detective/formats/region.py +0 -0
  58. csv_detective/formats/sexe.py +0 -0
  59. csv_detective/formats/siren.py +0 -0
  60. csv_detective/formats/siret.py +0 -0
  61. csv_detective/formats/tel_fr.py +0 -0
  62. csv_detective/formats/uai.py +0 -0
  63. csv_detective/formats/url.py +0 -0
  64. csv_detective/formats/username.py +0 -0
  65. csv_detective/formats/uuid.py +0 -0
  66. csv_detective/formats/year.py +0 -0
  67. csv_detective/output/__init__.py +0 -0
  68. csv_detective/output/dataframe.py +2 -2
  69. csv_detective/output/example.py +0 -0
  70. csv_detective/output/profile.py +1 -1
  71. csv_detective/output/schema.py +0 -0
  72. csv_detective/output/utils.py +0 -0
  73. csv_detective/parsing/__init__.py +0 -0
  74. csv_detective/parsing/columns.py +5 -9
  75. csv_detective/parsing/compression.py +0 -0
  76. csv_detective/parsing/csv.py +0 -0
  77. csv_detective/parsing/excel.py +1 -1
  78. csv_detective/parsing/load.py +12 -11
  79. csv_detective/validate.py +36 -71
  80. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/METADATA +18 -15
  81. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/RECORD +22 -41
  82. csv_detective-0.10.12674.dist-info/WHEEL +4 -0
  83. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/entry_points.txt +1 -0
  84. csv_detective-0.10.4.dev1.dist-info/WHEEL +0 -5
  85. csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE +0 -21
  86. csv_detective-0.10.4.dev1.dist-info/top_level.txt +0 -3
  87. tests/__init__.py +0 -0
  88. tests/data/a_test_file.csv +0 -407
  89. tests/data/a_test_file.json +0 -394
  90. tests/data/b_test_file.csv +0 -7
  91. tests/data/c_test_file.csv +0 -2
  92. tests/data/csv_file +0 -7
  93. tests/data/file.csv.gz +0 -0
  94. tests/data/file.ods +0 -0
  95. tests/data/file.xls +0 -0
  96. tests/data/file.xlsx +0 -0
  97. tests/data/xlsx_file +0 -0
  98. tests/test_example.py +0 -67
  99. tests/test_fields.py +0 -175
  100. tests/test_file.py +0 -469
  101. tests/test_labels.py +0 -26
  102. tests/test_structure.py +0 -45
  103. tests/test_validation.py +0 -163
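
Most of the deleted tests below exercise the library through its `routine` entry point. As a reading aid, here is a minimal sketch of such a call, with keyword arguments mirrored from the deleted tests (illustrative only; not verified against 0.10.12674):

from csv_detective import routine

# Analyse a local CSV and get the detection report back as a dict.
# In the deleted tests, num_rows=-1 is used to read the whole file and
# save_results=False keeps the report in memory instead of writing it to disk.
report = routine(
    file_path="tests/data/a_test_file.csv",
    num_rows=-1,
    output_profile=False,
    save_results=False,
)
print(report["separator"], report["header_row_idx"], report["columns"]["NOMCOM"]["format"])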
tests/test_file.py DELETED
@@ -1,469 +0,0 @@
- from unittest.mock import MagicMock, patch
-
- import pandas as pd
- import pytest
- import responses
-
- from csv_detective import routine
- from csv_detective.output.profile import create_profile
- from csv_detective.parsing.csv import CHUNK_SIZE
-
-
- @pytest.fixture
- def mocked_responses():
-     with responses.RequestsMock() as rsps:
-         yield rsps
-
-
- @pytest.mark.parametrize(
-     "chunk_size",
-     (100, 404, int(1e5)),
- )
- def test_columns_output_on_file(chunk_size):
-     with (
-         # maybe we should refactor later to avoid having to patch everywhere
-         patch("csv_detective.parsing.csv.CHUNK_SIZE", chunk_size),
-         patch("csv_detective.parsing.columns.CHUNK_SIZE", chunk_size),
-     ):
-         output = routine(
-             file_path="tests/data/a_test_file.csv",
-             num_rows=-1,
-             output_profile=False,
-             save_results=False,
-         )
-     assert isinstance(output, dict)
-     assert output["separator"] == ";"
-     assert output["header_row_idx"] == 2
-     assert output["header"] == [
-         "NUMCOM",
-         "NOMCOM",
-         "NUMDEP",
-         "NOMDEP",
-         "NUMEPCI",
-         "NOMEPCI",
-         "TXCOUVGLO_COM_2014",
-         "TXCOUVGLO_DEP_2014",
-         "TXCOUVGLO_EPCI_2014",
-         "STRUCTURED_INFO",
-         "GEO_INFO",
-     ]
-     assert output["total_lines"] == 404
-     assert output["nb_duplicates"] == 7
-     assert output["columns"]["NOMCOM"]["format"] == "commune"
-     assert output["columns"]["NOMDEP"]["format"] == "departement"
-     assert output["columns"]["NUMEPCI"]["format"] == "siren"
-     assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
-     assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
-     assert output["columns"]["GEO_INFO"]["python_type"] == "json"
-     assert output["columns"]["GEO_INFO"]["format"] == "geojson"
-
-
- def test_profile_output_on_file():
-     output = routine(
-         file_path="tests/data/a_test_file.csv",
-         num_rows=-1,
-         output_profile=True,
-         save_results=False,
-     )
-     assert all(
-         [
-             c in list(output["profile"]["TXCOUVGLO_COM_2014"].keys())
-             for c in [
-                 "min",
-                 "max",
-                 "mean",
-                 "std",
-                 "tops",
-                 "nb_distinct",
-                 "nb_missing_values",
-             ]
-         ]
-     )
-     assert not any(
-         [
-             c in list(output["profile"]["NUMCOM"].keys())
-             for c in [
-                 "min",
-                 "max",
-                 "mean",
-                 "std",
-             ]
-         ]
-     )
-     assert output["profile"]["TXCOUVGLO_COM_2014"]["min"] == 0.0
-     assert output["profile"]["TXCOUVGLO_COM_2014"]["max"] == 200.2
-     assert round(output["profile"]["TXCOUVGLO_COM_2014"]["mean"]) == 60
-     assert round(output["profile"]["TXCOUVGLO_COM_2014"]["std"]) == 36
-     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 290
-     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_missing_values"] == 3
-     assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1
-
-
- def test_profile_with_num_rows():
-     with pytest.raises(ValueError):
-         routine(
-             file_path="tests/data/a_test_file.csv",
-             num_rows=50,
-             output_profile=True,
-             save_results=False,
-         )
-
-
- @pytest.mark.parametrize(
-     "params",
-     (
-         (
-             True,
-             {
-                 "int_with_nan": {"format": "int", "python_type": "int"},
-                 "date": {"format": "date", "python_type": "date"},
-             },
-         ),
-         (
-             False,
-             {
-                 "int_with_nan": [{"format": "int", "python_type": "int"}],
-                 "date": [{"format": "date", "python_type": "date"}],
-             },
-         ),
-     ),
- )
- def test_profile_specific_cases(params):
-     limited_output, columns = params
-     table = pd.DataFrame(
-         {
-             "int_with_nan": ["1", pd.NA, pd.NA],
-             "date": ["1996-01-02", "1996-01-02", "2024-11-12"],
-         }
-     )
-     profile = create_profile(
-         table=table,
-         columns=columns,
-         limited_output=limited_output,
-         num_rows=-1,
-     )
-     assert profile["int_with_nan"] == {
-         "min": 1,
-         "max": 1,
-         "mean": 1,
-         "std": None,
-         "tops": [{"count": 1, "value": "1"}],
-         "nb_distinct": 1,
-         "nb_missing_values": 2,
-     }
-     assert profile["date"] == {
-         "tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
-         "nb_distinct": 2,
-         "nb_missing_values": 0,
-     }
-
-
- def test_exception_different_number_of_columns():
-     """
-     A ValueError should be raised if the number of columns differs between the first rows
-     """
-     with pytest.raises(ValueError):
-         routine(
-             file_path="tests/data/c_test_file.csv",
-             num_rows=-1,
-             output_profile=True,
-             save_results=False,
-         )
-
-
- def test_exception_malformed_columns(mocked_responses):
-     """
-     A ValueError should be raised if any column is Unnamed
-     """
-     url = f"http://example.com/bad_cols.csv"
-     expected_content = b"col1,col2,\n1,2,\n3,4,"
-     mocked_responses.get(
-         url,
-         body=expected_content,
-         status=200,
-     )
-     with patch("urllib.request.urlopen") as mock_urlopen:
-         mock_response = MagicMock()
-         mock_response.read.return_value = expected_content
-         mock_response.__enter__.return_value = mock_response
-         mock_urlopen.return_value = mock_response
-         with pytest.raises(ValueError):
-             routine(file_path=url)
-
-
- def test_code_dep_reg_on_file():
-     output = routine(
-         file_path="tests/data/b_test_file.csv",
-         num_rows=-1,
-         output_profile=False,
-         save_results=False,
-     )
-     assert isinstance(output, dict)
-     assert output["columns"]["code_departement"]["format"] == "code_departement"
-     assert output["columns"]["code_region"]["format"] == "code_region"
-
-
- def test_schema_on_file():
-     output = routine(
-         file_path="tests/data/b_test_file.csv",
-         num_rows=-1,
-         output_schema=True,
-         save_results=False,
-     )
-     assert isinstance(output, dict)
-     is_column_dep = False
-     is_column_reg = False
-     for item in output["schema"]["fields"]:
-         if item["name"] == "code_departement":
-             is_column_dep = True
-             assert item["description"] == "Le code INSEE du département"
-             assert item["type"] == "string"
-             assert item["formatFR"] == "code_departement"
-             assert item["constraints"]["pattern"] == "^(([013-9]\\d|2[AB1-9])$|9\\d{2}$)"
-         if item["name"] == "code_region":
-             is_column_reg = True
-             assert item["description"] == "Le code INSEE de la région"
-             assert item["type"] == "string"
-             assert item["formatFR"] == "code_region"
-             assert item["constraints"]["pattern"] == "^\\d{2}$"
-     assert is_column_dep
-     assert is_column_reg
-
-
- params_csv = [
-     ("csv_file", {"engine": None, "sheet_name": None}),
-     ("file.csv.gz", {"engine": None, "sheet_name": None, "separator": ",", "columns.len": 3}),
- ]
- params_others = [
-     ("file.ods", {"engine": "odf"}),
-     # this is a "tricked" xls file that is actually read as odf
-     ("file.xls", {"engine": "odf"}),
-     # this file has an empty first row; check if the sheet we consider is the largest
-     ("file.xlsx", {"engine": "openpyxl", "header_row_idx": 1, "sheet_name": "REI_1987"}),
-     ("xlsx_file", {"engine": "openpyxl"}),
- ]
-
-
- @pytest.mark.parametrize("params", params_csv + params_others)
- def test_non_csv_files(params):
-     file_name, checks = params
-     _ = routine(
-         file_path=f"tests/data/{file_name}",
-         num_rows=-1,
-         output_profile=False,
-         save_results=False,
-     )
-     for k, v in checks.items():
-         if v is None:
-             assert not _.get(k)
-         elif "." in k:
-             key, func = k.split(".")
-             assert eval(func)(_[key]) == v
-         else:
-             assert _[k] == v
-
-
- @pytest.mark.parametrize(
-     "params",
-     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
-     # which doesn't support the way we mock the response, TBC
-     params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})],
- )
- def test_urls(mocked_responses, params):
-     file_name, checks = params
-     url = f"http://example.com/{file_name}"
-     expected_content = open(f"tests/data/{file_name}", "rb").read()
-     mocked_responses.get(
-         url,
-         body=expected_content,
-         status=200,
-     )
-     with patch("urllib.request.urlopen") as mock_urlopen:
-         mock_response = MagicMock()
-         mock_response.read.return_value = expected_content
-         mock_response.__enter__.return_value = mock_response
-         mock_urlopen.return_value = mock_response
-         _ = routine(
-             file_path=url,
-             num_rows=-1,
-             output_profile=False,
-             save_results=False,
-         )
-     for k, v in checks.items():
-         if v is None:
-             assert not _.get(k)
-         elif "." in k:
-             key, func = k.split(".")
-             assert eval(func)(_[key]) == v
-         else:
-             assert _[k] == v
-
-
- @pytest.mark.parametrize(
-     "expected_type",
-     (
-         (True, "int"),
-         (False, "string"),
-     ),
- )
- def test_nan_values(expected_type):
-     # if skipping NaN, the column contains only ints
-     skipna, expected_type = expected_type
-     output = routine(
-         file_path="tests/data/b_test_file.csv",
-         num_rows=-1,
-         save_results=False,
-         skipna=skipna,
-     )
-     assert output["columns"]["partly_empty"]["python_type"] == expected_type
-
-
- def test_output_df():
-     output, df_chunks = routine(
-         file_path="tests/data/b_test_file.csv",
-         num_rows=-1,
-         output_profile=False,
-         save_results=False,
-         output_df=True,
-     )
-     df = pd.concat(df_chunks, ignore_index=True)
-     assert isinstance(output, dict)
-     assert isinstance(df, pd.DataFrame)
-     assert len(df) == 6
-     assert df["partly_empty"].dtype == pd.Int64Dtype()
-
-
- @pytest.mark.parametrize(
-     "cast_json",
-     (
-         (True, dict),
-         (False, str),
-     ),
- )
- def test_cast_json(mocked_responses, cast_json):
-     cast_json, expected_type = cast_json
-     expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
-     mocked_responses.get(
-         "http://example.com/test.csv",
-         body=expected_content,
-         status=200,
-     )
-     with patch("urllib.request.urlopen") as mock_urlopen:
-         mock_response = MagicMock()
-         mock_response.read.return_value = expected_content.encode("utf-8")
-         mock_response.__enter__.return_value = mock_response
-         mock_urlopen.return_value = mock_response
-         analysis, df_chunks = routine(
-             file_path="http://example.com/test.csv",
-             num_rows=-1,
-             output_profile=False,
-             save_results=False,
-             output_df=True,
-             cast_json=cast_json,
-         )
-     df = pd.concat(df_chunks, ignore_index=True)
-     assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
-     assert isinstance(df["a_simple_dict"][0], expected_type)
-
-
- def test_almost_uniform_column(mocked_responses):
-     col_name = "int_not_bool"
-     expected_content = f"{col_name}\n" + "9\n" + "1\n" * int(1e7)
-     mocked_responses.get(
-         "http://example.com/test.csv",
-         body=expected_content,
-         status=200,
-     )
-     with patch("urllib.request.urlopen") as mock_urlopen:
-         mock_response = MagicMock()
-         mock_response.read.return_value = expected_content.encode("utf-8")
-         mock_response.__enter__.return_value = mock_response
-         mock_urlopen.return_value = mock_response
-         analysis = routine(
-             file_path="http://example.com/test.csv",
-             num_rows=-1,
-             output_profile=False,
-             save_results=False,
-         )
-     assert analysis["columns"][col_name]["format"] == "int"
-
-
- @pytest.mark.parametrize("nb_rows", (CHUNK_SIZE // 10, CHUNK_SIZE + 1))
- def test_full_nan_column(mocked_responses, nb_rows):
-     # we want a file that needs sampling
-     col_name = "only_nan"
-     expected_content = f"{col_name},second_col\n" + ",1\n" * nb_rows
-     mocked_responses.get(
-         "http://example.com/test.csv",
-         body=expected_content,
-         status=200,
-     )
-     with patch("urllib.request.urlopen") as mock_urlopen:
-         mock_response = MagicMock()
-         mock_response.read.return_value = expected_content.encode("utf-8")
-         mock_response.__enter__.return_value = mock_response
-         mock_urlopen.return_value = mock_response
-         # only NaNs should return "string"
-         analysis = routine(
-             file_path="http://example.com/test.csv",
-             num_rows=-1,
-             output_profile=False,
-             save_results=False,
-         )
-     assert analysis["columns"][col_name]["format"] == "string"
-
-
- @pytest.mark.parametrize(
-     "nb_rows",
-     (100, CHUNK_SIZE + 1),
- )
- def test_count_column(mocked_responses, nb_rows):
-     expected_content = "count,_count\n" + "a,1\n" * nb_rows
-     mocked_responses.get(
-         "http://example.com/test.csv",
-         body=expected_content,
-         status=200,
-     )
-     with patch("urllib.request.urlopen") as mock_urlopen:
-         mock_response = MagicMock()
-         mock_response.read.return_value = expected_content.encode("utf-8")
-         mock_response.__enter__.return_value = mock_response
-         mock_urlopen.return_value = mock_response
-         # only testing it doesn't fail with output_profile=True
-         routine(
-             file_path="http://example.com/test.csv",
-             num_rows=-1,
-             output_profile=True,
-             save_results=False,
-         )
-
-
- def test_multiple_geo_columns(mocked_responses):
-     lat, not_lat = "latitude_obj", "latin"
-     lon, not_lon = "longitude_obj", "longueur"
-     expected_content = f"{lat},{lon},{not_lat},{not_lon}\n" + "1.0,-10.0,1.0,-10.0\n" * 10
-     mocked_responses.get(
-         "http://example.com/test.csv",
-         body=expected_content,
-         status=200,
-     )
-     analysis = routine(
-         file_path="http://example.com/test.csv",
-         num_rows=-1,
-         output_profile=False,
-         save_results=False,
-     )
-     # we want the lat/lon columns to be labelled as such, and either:
-     # - the not lat/lon columns to be labelled as float only
-     # - or the not lat/lon columns to be labelled as lat/lon but with a lower score
-     # both cases are acceptable
-     assert analysis["columns"][lat]["format"] == "latitude_wgs"
-     assert analysis["columns"][lon]["format"] == "longitude_wgs"
-     assert analysis["columns"][not_lat]["format"] == "float" or (
-         analysis["columns"][not_lat]["format"] == "latitude_wgs"
-         and analysis["columns"][not_lat]["score"] < analysis["columns"][lat]["score"]
-     )
-     assert analysis["columns"][not_lon]["format"] == "float" or (
-         analysis["columns"][not_lon]["format"] == "longitude_wgs"
-         and analysis["columns"][not_lon]["score"] < analysis["columns"][lon]["score"]
-     )
tests/test_labels.py DELETED
@@ -1,26 +0,0 @@
- import pytest
-
- from csv_detective.format import FormatsManager
-
- fmtm = FormatsManager()
-
-
- # money labels
- def test_money_labels():
-     header = "Montant total"
-     assert fmtm.formats["money"].is_valid_label(header) == 0.5
-
-
- @pytest.mark.parametrize(
-     "params",
-     [
-         ("latitude", 1.0),
-         ("lat", 0.75),
-         ("coord_lat", 0.375),
-         ("y", 0.5),
-         ("nb_cycles", 0.0),
-     ],
- )
- def test_latitude(params):
-     header, expected = params
-     assert expected == fmtm.formats["latitude_wgs"].is_valid_label(header)
tests/test_structure.py DELETED
@@ -1,45 +0,0 @@
- import os
-
- import pytest
-
- from csv_detective.format import Format, FormatsManager
-
- fmtm = FormatsManager()
-
-
- def test_all_tests_have_unique_name():
-     formats: list[str] = os.listdir("csv_detective/formats")
-     assert "__init__.py" in formats
-     assert len(formats) == len(set(formats))
-
-
- def test_conformity():
-     for name, format in fmtm.formats.items():
-         assert isinstance(name, str)
-         assert isinstance(format, Format)
-         assert all(
-             getattr(format, attr) is not None
-             for attr in [
-                 "name",
-                 "func",
-                 "_test_values",
-                 "labels",
-                 "proportion",
-                 "tags",
-             ]
-         )
-
-
- @pytest.mark.parametrize(
-     "tags",
-     (
-         ["type"],
-         ["temp", "fr"],
-     ),
- )
- def test_get_from_tags(tags):
-     fmts = fmtm.get_formats_from_tags(tags)
-     assert len(fmts)
-     for fmt in fmts.values():
-         for tag in tags:
-             assert tag in fmt.tags
tests/test_validation.py DELETED
@@ -1,163 +0,0 @@
- import json
- from unittest.mock import MagicMock, patch
-
- import pandas as pd
- import pytest
-
- from csv_detective.explore_csv import validate_then_detect
- from csv_detective.validate import validate
-
-
- def set_nested_value(source_dict: dict, key_chain: list[str], value):
-     current_dict = source_dict
-     for key in key_chain[:-1]:
-         if key not in current_dict:
-             current_dict[key] = {}
-         current_dict = current_dict[key]
-     current_dict[key_chain[-1]] = value
-
-
- def get_nested_value(source_dict: dict, key_chain: list[str]):
-     result = source_dict
-     for k in key_chain:
-         result = result[k]
-     return result
-
-
- @pytest.mark.parametrize(
-     "_params",
-     (
-         ((True, dict), {}),
-         ((False, None), {"separator": "|"}),
-         ((False, None), {"encoding": "unknown"}),
-         ((False, None), {"header": ["a", "b"]}),
-         (
-             (False, None),
-             {
-                 "columns.NUMCOM": {
-                     "python_type": "int",
-                     "format": "int",
-                     "score": 1.0,
-                 },
-             },
-         ),
-     ),
- )
- def test_validation(_params):
-     (should_be_valid, analysis_type), modif_previous_analysis = _params
-     with open("tests/data/a_test_file.json", "r") as f:
-         previous_analysis = json.load(f)
-     for dotkey in modif_previous_analysis:
-         keys = dotkey.split(".")
-         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-     is_valid, analysis, col_values = validate(
-         "tests/data/a_test_file.csv",
-         previous_analysis=previous_analysis,
-     )
-     assert is_valid == should_be_valid
-     if analysis_type is None:
-         assert analysis is None
-     else:
-         assert isinstance(analysis, analysis_type)
-     if should_be_valid:
-         assert isinstance(col_values, dict)
-     else:
-         assert col_values is None
-
-
- @pytest.mark.parametrize(
-     "_params",
-     (
-         # int: proportion = 1, should fail (early)
-         ("12", "1.2", {"python_type": "int", "format": "int", "score": 1.5}, False),
-         # siren: proportion = 0.9, should fail (later)
-         (
-             "130025265",
-             "A13794BC",
-             {"python_type": "string", "format": "siren", "score": 1.5},
-             False,
-         ),
-         # siret: proportion = 0.8, should succeed
-         (
-             "13002526500013",
-             "A13794BC",
-             {"python_type": "string", "format": "siret", "score": 1.5},
-             True,
-         ),
-     ),
- )
- def test_validation_with_proportions(_params):
-     # testing the behaviour for a file that has 15% invalid values, but all in a single chunk
-     valid_value, invalid_value, detected, should_be_valid = _params
-     url = f"http://example.com/test.csv"
-     expected_content = "col\n"
-     for _ in range(60):
-         # 60 rows of valid values
-         expected_content += f"{valid_value}\n"
-     for _ in range(15):
-         # 15 rows of invalid values
-         expected_content += f"{invalid_value}\n"
-     for _ in range(25):
-         # 25 rows of valid values
-         expected_content += f"{valid_value}\n"
-     previous_analysis = {
-         "encoding": "utf-8",
-         "separator": ",",
-         "header_row_idx": 0,
-         "header": ["col"],
-         "columns": {"col": detected},
-         # just setting these keys when validation is successful, they're not used for the validation itself
-         "categorical": [],
-         "columns_fields": {},
-         "columns_labels": {},
-         "formats": {},
-     }
-     with (
-         patch("urllib.request.urlopen") as mock_urlopen,
-         patch("csv_detective.validate.VALIDATION_CHUNK_SIZE", 10),
-     ):
-         mock_response = MagicMock()
-         mock_response.read.return_value = expected_content.encode("utf-8")
-         mock_response.__enter__.return_value = mock_response
-         mock_urlopen.return_value = mock_response
-         is_valid, *_ = validate(
-             file_path=url,
-             previous_analysis=previous_analysis,
-         )
-     assert is_valid == should_be_valid
-
-
- @pytest.mark.parametrize(
-     "modif_previous_analysis",
-     (
-         {"separator": "|"},
-         {"encoding": "unknown"},
-         {"header": ["a", "b"]},
-         {"total_lines": 100},
-         {
-             "columns.NUMCOM": {
-                 "python_type": "int",
-                 "format": "int",
-                 "score": 1.0,
-             },
-         },
-     ),
- )
- def test_validate_then_detect(modif_previous_analysis):
-     with open("tests/data/a_test_file.json", "r") as f:
-         previous_analysis = json.load(f)
-     valid_values = {}
-     for dotkey in modif_previous_analysis:
-         keys = dotkey.split(".")
-         valid_values[dotkey] = get_nested_value(previous_analysis, keys)
-         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-     analysis = validate_then_detect(
-         "tests/data/a_test_file.csv",
-         previous_analysis=previous_analysis,
-         num_rows=-1,
-         output_profile=True,
-         save_results=False,
-     )
-     # checking that if not valid, the analysis has managed to retrieve the right values
-     for dotkey in modif_previous_analysis:
-         assert get_nested_value(analysis, dotkey.split(".")) == valid_values[dotkey]
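
For orientation, the validation flow exercised by the deleted tests/test_validation.py can be sketched as follows (a minimal sketch assuming only the call signatures shown in the deleted tests; `previous_analysis` is a prior analysis such as tests/data/a_test_file.json):

import json

from csv_detective.explore_csv import validate_then_detect
from csv_detective.validate import validate

# Replay a previous analysis against the file: per the deleted tests, validate()
# returns whether the file still matches, plus the analysis and column values when
# it does, or None for both when it does not.
with open("tests/data/a_test_file.json") as f:
    previous_analysis = json.load(f)

is_valid, analysis, col_values = validate(
    "tests/data/a_test_file.csv",
    previous_analysis=previous_analysis,
)

# validate_then_detect() re-runs a full detection when validation fails, so the
# returned analysis is expected to contain the corrected values.
analysis = validate_then_detect(
    "tests/data/a_test_file.csv",
    previous_analysis=previous_analysis,
    num_rows=-1,
    output_profile=True,
    save_results=False,
)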