csv-detective 0.10.1.dev2669__py3-none-any.whl → 0.10.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. csv_detective/detection/__init__.py +0 -0
  2. csv_detective/detection/columns.py +0 -0
  3. csv_detective/detection/encoding.py +0 -0
  4. csv_detective/detection/engine.py +0 -0
  5. csv_detective/detection/formats.py +0 -0
  6. csv_detective/detection/headers.py +0 -0
  7. csv_detective/detection/rows.py +0 -0
  8. csv_detective/detection/separator.py +0 -0
  9. csv_detective/detection/variables.py +0 -0
  10. csv_detective/format.py +0 -0
  11. csv_detective/formats/__init__.py +0 -0
  12. csv_detective/formats/adresse.py +0 -0
  13. csv_detective/formats/binary.py +0 -0
  14. csv_detective/formats/booleen.py +0 -0
  15. csv_detective/formats/code_commune_insee.py +0 -0
  16. csv_detective/formats/code_csp_insee.py +0 -0
  17. csv_detective/formats/code_departement.py +0 -0
  18. csv_detective/formats/code_fantoir.py +0 -0
  19. csv_detective/formats/code_import.py +0 -0
  20. csv_detective/formats/code_postal.py +0 -0
  21. csv_detective/formats/code_region.py +0 -0
  22. csv_detective/formats/code_rna.py +0 -0
  23. csv_detective/formats/code_waldec.py +0 -0
  24. csv_detective/formats/commune.py +0 -0
  25. csv_detective/formats/csp_insee.py +0 -0
  26. csv_detective/formats/date.py +0 -0
  27. csv_detective/formats/date_fr.py +0 -0
  28. csv_detective/formats/datetime_aware.py +0 -0
  29. csv_detective/formats/datetime_naive.py +0 -0
  30. csv_detective/formats/datetime_rfc822.py +0 -0
  31. csv_detective/formats/departement.py +0 -0
  32. csv_detective/formats/email.py +0 -0
  33. csv_detective/formats/float.py +0 -0
  34. csv_detective/formats/geojson.py +0 -0
  35. csv_detective/formats/insee_ape700.py +0 -0
  36. csv_detective/formats/insee_canton.py +0 -0
  37. csv_detective/formats/int.py +0 -0
  38. csv_detective/formats/iso_country_code_alpha2.py +0 -0
  39. csv_detective/formats/iso_country_code_alpha3.py +0 -0
  40. csv_detective/formats/iso_country_code_numeric.py +0 -0
  41. csv_detective/formats/jour_de_la_semaine.py +0 -0
  42. csv_detective/formats/json.py +0 -0
  43. csv_detective/formats/latitude_l93.py +0 -0
  44. csv_detective/formats/latitude_wgs.py +0 -0
  45. csv_detective/formats/latitude_wgs_fr_metropole.py +0 -0
  46. csv_detective/formats/latlon_wgs.py +0 -0
  47. csv_detective/formats/longitude_l93.py +0 -0
  48. csv_detective/formats/longitude_wgs.py +0 -0
  49. csv_detective/formats/longitude_wgs_fr_metropole.py +0 -0
  50. csv_detective/formats/lonlat_wgs.py +0 -0
  51. csv_detective/formats/mois_de_lannee.py +0 -0
  52. csv_detective/formats/money.py +0 -0
  53. csv_detective/formats/mongo_object_id.py +0 -0
  54. csv_detective/formats/pays.py +0 -0
  55. csv_detective/formats/percent.py +0 -0
  56. csv_detective/formats/region.py +0 -0
  57. csv_detective/formats/sexe.py +0 -0
  58. csv_detective/formats/siren.py +0 -0
  59. csv_detective/formats/siret.py +0 -0
  60. csv_detective/formats/tel_fr.py +0 -0
  61. csv_detective/formats/uai.py +0 -0
  62. csv_detective/formats/url.py +0 -0
  63. csv_detective/formats/username.py +0 -0
  64. csv_detective/formats/uuid.py +0 -0
  65. csv_detective/formats/year.py +0 -0
  66. csv_detective/output/__init__.py +0 -0
  67. csv_detective/output/dataframe.py +0 -0
  68. csv_detective/output/example.py +0 -0
  69. csv_detective/output/profile.py +0 -0
  70. csv_detective/output/schema.py +0 -0
  71. csv_detective/output/utils.py +0 -0
  72. csv_detective/parsing/__init__.py +0 -0
  73. csv_detective/parsing/columns.py +0 -0
  74. csv_detective/parsing/compression.py +0 -0
  75. csv_detective/parsing/csv.py +0 -0
  76. csv_detective/parsing/excel.py +0 -0
  77. csv_detective/parsing/load.py +0 -0
  78. csv_detective/validate.py +0 -0
  79. {csv_detective-0.10.1.dev2669.dist-info → csv_detective-0.10.2.dev1.dist-info}/METADATA +18 -17
  80. {csv_detective-0.10.1.dev2669.dist-info → csv_detective-0.10.2.dev1.dist-info}/RECORD +32 -13
  81. csv_detective-0.10.2.dev1.dist-info/WHEEL +5 -0
  82. {csv_detective-0.10.1.dev2669.dist-info → csv_detective-0.10.2.dev1.dist-info}/entry_points.txt +0 -1
  83. csv_detective-0.10.2.dev1.dist-info/licenses/LICENSE +21 -0
  84. csv_detective-0.10.2.dev1.dist-info/top_level.txt +3 -0
  85. tests/__init__.py +0 -0
  86. tests/data/a_test_file.csv +407 -0
  87. tests/data/a_test_file.json +394 -0
  88. tests/data/b_test_file.csv +7 -0
  89. tests/data/c_test_file.csv +2 -0
  90. tests/data/csv_file +7 -0
  91. tests/data/file.csv.gz +0 -0
  92. tests/data/file.ods +0 -0
  93. tests/data/file.xls +0 -0
  94. tests/data/file.xlsx +0 -0
  95. tests/data/xlsx_file +0 -0
  96. tests/test_example.py +67 -0
  97. tests/test_fields.py +169 -0
  98. tests/test_file.py +448 -0
  99. tests/test_labels.py +26 -0
  100. tests/test_structure.py +45 -0
  101. tests/test_validation.py +108 -0
  102. csv_detective-0.10.1.dev2669.dist-info/WHEEL +0 -4
tests/test_file.py ADDED
@@ -0,0 +1,448 @@
+ from unittest.mock import MagicMock, patch
+
+ import pandas as pd
+ import pytest
+ import responses
+
+ from csv_detective import routine
+ from csv_detective.output.profile import create_profile
+ from csv_detective.parsing.csv import CHUNK_SIZE
+
+
+ @pytest.mark.parametrize(
+     "chunk_size",
+     (100, 404, int(1e5)),
+ )
+ def test_columns_output_on_file(chunk_size):
+     with (
+         # maybe we should refactor later to avoid having to patch everywhere
+         patch("csv_detective.parsing.csv.CHUNK_SIZE", chunk_size),
+         patch("csv_detective.parsing.columns.CHUNK_SIZE", chunk_size),
+     ):
+         output = routine(
+             file_path="tests/data/a_test_file.csv",
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+         )
+         assert isinstance(output, dict)
+         assert output["separator"] == ";"
+         assert output["header_row_idx"] == 2
+         assert output["header"] == [
+             "NUMCOM",
+             "NOMCOM",
+             "NUMDEP",
+             "NOMDEP",
+             "NUMEPCI",
+             "NOMEPCI",
+             "TXCOUVGLO_COM_2014",
+             "TXCOUVGLO_DEP_2014",
+             "TXCOUVGLO_EPCI_2014",
+             "STRUCTURED_INFO",
+             "GEO_INFO",
+         ]
+         assert output["total_lines"] == 404
+         assert output["nb_duplicates"] == 7
+         assert output["columns"]["NOMCOM"]["format"] == "commune"
+         assert output["columns"]["NOMDEP"]["format"] == "departement"
+         assert output["columns"]["NUMEPCI"]["format"] == "siren"
+         assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
+         assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
+         assert output["columns"]["GEO_INFO"]["python_type"] == "json"
+         assert output["columns"]["GEO_INFO"]["format"] == "geojson"
+
+
+ def test_profile_output_on_file():
+     output = routine(
+         file_path="tests/data/a_test_file.csv",
+         num_rows=-1,
+         output_profile=True,
+         save_results=False,
+     )
+     assert all(
+         [
+             c in list(output["profile"]["TXCOUVGLO_COM_2014"].keys())
+             for c in [
+                 "min",
+                 "max",
+                 "mean",
+                 "std",
+                 "tops",
+                 "nb_distinct",
+                 "nb_missing_values",
+             ]
+         ]
+     )
+     assert not any(
+         [
+             c in list(output["profile"]["NUMCOM"].keys())
+             for c in [
+                 "min",
+                 "max",
+                 "mean",
+                 "std",
+             ]
+         ]
+     )
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["min"] == 0.0
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["max"] == 200.2
+     assert round(output["profile"]["TXCOUVGLO_COM_2014"]["mean"]) == 60
+     assert round(output["profile"]["TXCOUVGLO_COM_2014"]["std"]) == 36
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 290
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_missing_values"] == 3
+     assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1
+
+
+ def test_profile_with_num_rows():
+     with pytest.raises(ValueError):
+         routine(
+             file_path="tests/data/a_test_file.csv",
+             num_rows=50,
+             output_profile=True,
+             save_results=False,
+         )
+
+
+ @pytest.mark.parametrize(
+     "params",
+     (
+         (
+             True,
+             {
+                 "int_with_nan": {"format": "int", "python_type": "int"},
+                 "date": {"format": "date", "python_type": "date"},
+             },
+         ),
+         (
+             False,
+             {
+                 "int_with_nan": [{"format": "int", "python_type": "int"}],
+                 "date": [{"format": "date", "python_type": "date"}],
+             },
+         ),
+     ),
+ )
+ def test_profile_specific_cases(params):
+     limited_output, columns = params
+     table = pd.DataFrame(
+         {
+             "int_with_nan": ["1", pd.NA, pd.NA],
+             "date": ["1996-01-02", "1996-01-02", "2024-11-12"],
+         }
+     )
+     profile = create_profile(
+         table=table,
+         columns=columns,
+         limited_output=limited_output,
+         num_rows=-1,
+     )
+     assert profile["int_with_nan"] == {
+         "min": 1,
+         "max": 1,
+         "mean": 1,
+         "std": None,
+         "tops": [{"count": 1, "value": "1"}],
+         "nb_distinct": 1,
+         "nb_missing_values": 2,
+     }
+     assert profile["date"] == {
+         "tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
+         "nb_distinct": 2,
+         "nb_missing_values": 0,
+     }
+
+
+ def test_exception_different_number_of_columns():
+     """
+     A ValueError should be raised if the number of columns differs between the first rows
+     """
+     with pytest.raises(ValueError):
+         routine(
+             file_path="tests/data/c_test_file.csv",
+             num_rows=-1,
+             output_profile=True,
+             save_results=False,
+         )
+
+
+ def test_code_dep_reg_on_file():
+     output = routine(
+         file_path="tests/data/b_test_file.csv",
+         num_rows=-1,
+         output_profile=False,
+         save_results=False,
+     )
+     assert isinstance(output, dict)
+     assert output["columns"]["code_departement"]["format"] == "code_departement"
+     assert output["columns"]["code_region"]["format"] == "code_region"
+
+
+ def test_schema_on_file():
+     output = routine(
+         file_path="tests/data/b_test_file.csv",
+         num_rows=-1,
+         output_schema=True,
+         save_results=False,
+     )
+     assert isinstance(output, dict)
+     is_column_dep = False
+     is_column_reg = False
+     for item in output["schema"]["fields"]:
+         if item["name"] == "code_departement":
+             is_column_dep = True
+             assert item["description"] == "Le code INSEE du département"
+             assert item["type"] == "string"
+             assert item["formatFR"] == "code_departement"
+             assert item["constraints"]["pattern"] == "^(([013-9]\\d|2[AB1-9])$|9\\d{2}$)"
+         if item["name"] == "code_region":
+             is_column_reg = True
+             assert item["description"] == "Le code INSEE de la région"
+             assert item["type"] == "string"
+             assert item["formatFR"] == "code_region"
+             assert item["constraints"]["pattern"] == "^\\d{2}$"
+     assert is_column_dep
+     assert is_column_reg
+
+
+ params_csv = [
+     ("csv_file", {"engine": None, "sheet_name": None}),
+     ("file.csv.gz", {"engine": None, "sheet_name": None, "separator": ",", "columns.len": 3}),
+ ]
+ params_others = [
+     ("file.ods", {"engine": "odf"}),
+     # this is a "tricked" xls file that is actually read as odf
+     ("file.xls", {"engine": "odf"}),
+     # this file has an empty first row; check if the sheet we consider is the largest
+     ("file.xlsx", {"engine": "openpyxl", "header_row_idx": 1, "sheet_name": "REI_1987"}),
+     ("xlsx_file", {"engine": "openpyxl"}),
+ ]
+
+
+ @pytest.mark.parametrize("params", params_csv + params_others)
+ def test_non_csv_files(params):
+     file_name, checks = params
+     _ = routine(
+         file_path=f"tests/data/{file_name}",
+         num_rows=-1,
+         output_profile=False,
+         save_results=False,
+     )
+     for k, v in checks.items():
+         if v is None:
+             assert not _.get(k)
+         elif "." in k:
+             key, func = k.split(".")
+             assert eval(func)(_[key]) == v
+         else:
+             assert _[k] == v
+
+
+ @pytest.fixture
+ def mocked_responses():
+     with responses.RequestsMock() as rsps:
+         yield rsps
+
+
+ @pytest.mark.parametrize(
+     "params",
+     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
+     # which doesn't support the way we mock the response, TBC
+     params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})],
+ )
+ def test_urls(mocked_responses, params):
+     file_name, checks = params
+     url = f"http://example.com/{file_name}"
+     expected_content = open(f"tests/data/{file_name}", "rb").read()
+     mocked_responses.get(
+         url,
+         body=expected_content,
+         status=200,
+     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         _ = routine(
+             file_path=url,
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+         )
+         for k, v in checks.items():
+             if v is None:
+                 assert not _.get(k)
+             elif "." in k:
+                 key, func = k.split(".")
+                 assert eval(func)(_[key]) == v
+             else:
+                 assert _[k] == v
+
+
+ @pytest.mark.parametrize(
+     "expected_type",
+     (
+         (True, "int"),
+         (False, "string"),
+     ),
+ )
+ def test_nan_values(expected_type):
+     # if skipping NaN, the column contains only ints
+     skipna, expected_type = expected_type
+     output = routine(
+         file_path="tests/data/b_test_file.csv",
+         num_rows=-1,
+         save_results=False,
+         skipna=skipna,
+     )
+     assert output["columns"]["partly_empty"]["python_type"] == expected_type
+
+
+ def test_output_df():
+     output, df_chunks = routine(
+         file_path="tests/data/b_test_file.csv",
+         num_rows=-1,
+         output_profile=False,
+         save_results=False,
+         output_df=True,
+     )
+     df = pd.concat(df_chunks, ignore_index=True)
+     assert isinstance(output, dict)
+     assert isinstance(df, pd.DataFrame)
+     assert len(df) == 6
+     assert df["partly_empty"].dtype == pd.Int64Dtype()
+
+
+ @pytest.mark.parametrize(
+     "cast_json",
+     (
+         (True, dict),
+         (False, str),
+     ),
+ )
+ def test_cast_json(mocked_responses, cast_json):
+     cast_json, expected_type = cast_json
+     expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
+     mocked_responses.get(
+         "http://example.com/test.csv",
+         body=expected_content,
+         status=200,
+     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content.encode("utf-8")
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         analysis, df_chunks = routine(
+             file_path="http://example.com/test.csv",
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+             output_df=True,
+             cast_json=cast_json,
+         )
+         df = pd.concat(df_chunks, ignore_index=True)
+         assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
+         assert isinstance(df["a_simple_dict"][0], expected_type)
+
+
+ def test_almost_uniform_column(mocked_responses):
+     col_name = "int_not_bool"
+     expected_content = f"{col_name}\n" + "9\n" + "1\n" * int(1e7)
+     mocked_responses.get(
+         "http://example.com/test.csv",
+         body=expected_content,
+         status=200,
+     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content.encode("utf-8")
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         analysis = routine(
+             file_path="http://example.com/test.csv",
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+         )
+         assert analysis["columns"][col_name]["format"] == "int"
+
+
+ def test_full_nan_column(mocked_responses):
+     # we want a file that needs sampling
+     col_name = "only_nan"
+     expected_content = f"{col_name},second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
+     mocked_responses.get(
+         "http://example.com/test.csv",
+         body=expected_content,
+         status=200,
+     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content.encode("utf-8")
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         # only NaNs should return "string"
+         analysis = routine(
+             file_path="http://example.com/test.csv",
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+         )
+         assert analysis["columns"][col_name]["format"] == "string"
+
+
+ @pytest.mark.parametrize(
+     "nb_rows",
+     (100, CHUNK_SIZE + 1),
+ )
+ def test_count_column(mocked_responses, nb_rows):
+     expected_content = "count,_count\n" + "a,1\n" * nb_rows
+     mocked_responses.get(
+         "http://example.com/test.csv",
+         body=expected_content,
+         status=200,
+     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content.encode("utf-8")
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         # only testing it doesn't fail with output_profile=True
+         routine(
+             file_path="http://example.com/test.csv",
+             num_rows=-1,
+             output_profile=True,
+             save_results=False,
+         )
+
+
+ def test_multiple_geo_columns(mocked_responses):
+     lat, not_lat = "latitude_obj", "latin"
+     lon, not_lon = "longitude_obj", "longueur"
+     expected_content = f"{lat},{lon},{not_lat},{not_lon}\n" + "1.0,-10.0,1.0,-10.0\n" * 10
+     mocked_responses.get(
+         "http://example.com/test.csv",
+         body=expected_content,
+         status=200,
+     )
+     analysis = routine(
+         file_path="http://example.com/test.csv",
+         num_rows=-1,
+         output_profile=False,
+         save_results=False,
+     )
+     # we want the lat/lon columns to be labelled as such, and either:
+     # - the not lat/lon columns to be labelled as float only
+     # - or the not lat/lon columns to be labelled as lat/lon but with a lower score
+     # both cases are acceptable
+     assert analysis["columns"][lat]["format"] == "latitude_wgs"
+     assert analysis["columns"][lon]["format"] == "longitude_wgs"
+     assert analysis["columns"][not_lat]["format"] == "float" or (
+         analysis["columns"][not_lat]["format"] == "latitude_wgs"
+         and analysis["columns"][not_lat]["score"] < analysis["columns"][lat]["score"]
+     )
+     assert analysis["columns"][not_lon]["format"] == "float" or (
+         analysis["columns"][not_lon]["format"] == "longitude_wgs"
+         and analysis["columns"][not_lon]["score"] < analysis["columns"][lon]["score"]
+     )
tests/test_labels.py ADDED
@@ -0,0 +1,26 @@
+ import pytest
+
+ from csv_detective.format import FormatsManager
+
+ fmtm = FormatsManager()
+
+
+ # money labels
+ def test_money_labels():
+     header = "Montant total"
+     assert fmtm.formats["money"].is_valid_label(header) == 0.5
+
+
+ @pytest.mark.parametrize(
+     "params",
+     [
+         ("latitude", 1.0),
+         ("lat", 0.75),
+         ("coord_lat", 0.375),
+         ("y", 0.5),
+         ("nb_cycles", 0.0),
+     ],
+ )
+ def test_latitude(params):
+     header, expected = params
+     assert expected == fmtm.formats["latitude_wgs"].is_valid_label(header)
tests/test_structure.py ADDED
@@ -0,0 +1,45 @@
+ import os
+
+ import pytest
+
+ from csv_detective.format import Format, FormatsManager
+
+ fmtm = FormatsManager()
+
+
+ def test_all_tests_have_unique_name():
+     formats: list[str] = os.listdir("csv_detective/formats")
+     assert "__init__.py" in formats
+     assert len(formats) == len(set(formats))
+
+
+ def test_conformity():
+     for name, format in fmtm.formats.items():
+         assert isinstance(name, str)
+         assert isinstance(format, Format)
+         assert all(
+             getattr(format, attr) is not None
+             for attr in [
+                 "name",
+                 "func",
+                 "_test_values",
+                 "labels",
+                 "proportion",
+                 "tags",
+             ]
+         )
+
+
+ @pytest.mark.parametrize(
+     "tags",
+     (
+         ["type"],
+         ["temp", "fr"],
+     ),
+ )
+ def test_get_from_tags(tags):
+     fmts = fmtm.get_formats_from_tags(tags)
+     assert len(fmts)
+     for fmt in fmts.values():
+         for tag in tags:
+             assert tag in fmt.tags
tests/test_validation.py ADDED
@@ -0,0 +1,108 @@
+ import json
+
+ import pandas as pd
+ import pytest
+
+ from csv_detective.explore_csv import validate_then_detect
+ from csv_detective.validate import validate
+
+
+ def set_nested_value(source_dict: dict, key_chain: list[str], value):
+     current_dict = source_dict
+     for key in key_chain[:-1]:
+         if key not in current_dict:
+             current_dict[key] = {}
+         current_dict = current_dict[key]
+     current_dict[key_chain[-1]] = value
+
+
+ def get_nested_value(source_dict: dict, key_chain: list[str]):
+     result = source_dict
+     for k in key_chain:
+         result = result[k]
+     return result
+
+
+ @pytest.mark.parametrize(
+     "_params",
+     (
+         ((True, pd.DataFrame, dict), {}),
+         ((False, None, None), {"separator": "|"}),
+         ((False, None, None), {"encoding": "unknown"}),
+         ((False, None, None), {"header": ["a", "b"]}),
+         (
+             (False, pd.DataFrame, dict),
+             {
+                 "columns.NUMCOM": {
+                     "python_type": "int",
+                     "format": "int",
+                     "score": 1.0,
+                 },
+             },
+         ),
+     ),
+ )
+ def test_validation(_params):
+     (should_be_valid, table_type, analysis_type), modif_previous_analysis = _params
+     with open("tests/data/a_test_file.json", "r") as f:
+         previous_analysis = json.load(f)
+     for dotkey in modif_previous_analysis:
+         keys = dotkey.split(".")
+         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
+     is_valid, table, analysis, col_values = validate(
+         "tests/data/a_test_file.csv",
+         previous_analysis=previous_analysis,
+     )
+     assert is_valid == should_be_valid
+     if table_type is None:
+         assert table is None
+     else:
+         assert isinstance(table, table_type)
+     if analysis_type is None:
+         assert analysis is None
+     else:
+         assert isinstance(analysis, analysis_type)
+     if should_be_valid:
+         assert isinstance(col_values, dict)
+         assert all(
+             col in table.columns and isinstance(values, pd.Series)
+             for col, values in col_values.items()
+         )
+     else:
+         assert col_values is None
+
+
+ @pytest.mark.parametrize(
+     "modif_previous_analysis",
+     (
+         {"separator": "|"},
+         {"encoding": "unknown"},
+         {"header": ["a", "b"]},
+         {"total_lines": 100},
+         {
+             "columns.NUMCOM": {
+                 "python_type": "int",
+                 "format": "int",
+                 "score": 1.0,
+             },
+         },
+     ),
+ )
+ def test_validate_then_detect(modif_previous_analysis):
+     with open("tests/data/a_test_file.json", "r") as f:
+         previous_analysis = json.load(f)
+     valid_values = {}
+     for dotkey in modif_previous_analysis:
+         keys = dotkey.split(".")
+         valid_values[dotkey] = get_nested_value(previous_analysis, keys)
+         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
+     analysis = validate_then_detect(
+         "tests/data/a_test_file.csv",
+         previous_analysis=previous_analysis,
+         num_rows=-1,
+         output_profile=True,
+         save_results=False,
+     )
+     # checking that if not valid, the analysis has managed to retrieve the right values
+     for dotkey in modif_previous_analysis:
+         assert get_nested_value(analysis, dotkey.split(".")) == valid_values[dotkey]
csv_detective-0.10.1.dev2669.dist-info/WHEEL DELETED
@@ -1,4 +0,0 @@
- Wheel-Version: 1.0
- Generator: uv 0.9.25
- Root-Is-Purelib: true
- Tag: py3-none-any