csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.12674__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/__init__.py +0 -0
- csv_detective/detection/columns.py +0 -0
- csv_detective/detection/encoding.py +0 -0
- csv_detective/detection/engine.py +0 -0
- csv_detective/detection/formats.py +0 -2
- csv_detective/detection/headers.py +14 -12
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/separator.py +0 -0
- csv_detective/detection/variables.py +0 -0
- csv_detective/explore_csv.py +4 -15
- csv_detective/format.py +1 -1
- csv_detective/formats/__init__.py +0 -0
- csv_detective/formats/adresse.py +0 -0
- csv_detective/formats/binary.py +0 -0
- csv_detective/formats/booleen.py +0 -0
- csv_detective/formats/code_commune_insee.py +0 -0
- csv_detective/formats/code_csp_insee.py +0 -0
- csv_detective/formats/code_departement.py +0 -0
- csv_detective/formats/code_fantoir.py +0 -0
- csv_detective/formats/code_import.py +0 -0
- csv_detective/formats/code_postal.py +0 -0
- csv_detective/formats/code_region.py +0 -0
- csv_detective/formats/code_rna.py +0 -0
- csv_detective/formats/code_waldec.py +0 -0
- csv_detective/formats/commune.py +0 -0
- csv_detective/formats/csp_insee.py +0 -0
- csv_detective/formats/date.py +1 -10
- csv_detective/formats/date_fr.py +0 -0
- csv_detective/formats/datetime_aware.py +0 -0
- csv_detective/formats/datetime_naive.py +0 -0
- csv_detective/formats/datetime_rfc822.py +0 -0
- csv_detective/formats/departement.py +0 -0
- csv_detective/formats/email.py +0 -0
- csv_detective/formats/float.py +0 -0
- csv_detective/formats/geojson.py +0 -0
- csv_detective/formats/insee_ape700.py +0 -0
- csv_detective/formats/insee_canton.py +0 -0
- csv_detective/formats/int.py +0 -0
- csv_detective/formats/iso_country_code_alpha2.py +0 -0
- csv_detective/formats/iso_country_code_alpha3.py +0 -0
- csv_detective/formats/iso_country_code_numeric.py +0 -0
- csv_detective/formats/jour_de_la_semaine.py +0 -0
- csv_detective/formats/json.py +0 -0
- csv_detective/formats/latitude_l93.py +0 -0
- csv_detective/formats/latitude_wgs.py +0 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +0 -0
- csv_detective/formats/latlon_wgs.py +0 -0
- csv_detective/formats/longitude_l93.py +0 -0
- csv_detective/formats/longitude_wgs.py +0 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +0 -0
- csv_detective/formats/lonlat_wgs.py +0 -0
- csv_detective/formats/mois_de_lannee.py +0 -0
- csv_detective/formats/money.py +0 -0
- csv_detective/formats/mongo_object_id.py +0 -0
- csv_detective/formats/pays.py +0 -0
- csv_detective/formats/percent.py +0 -0
- csv_detective/formats/region.py +0 -0
- csv_detective/formats/sexe.py +0 -0
- csv_detective/formats/siren.py +0 -0
- csv_detective/formats/siret.py +0 -0
- csv_detective/formats/tel_fr.py +0 -0
- csv_detective/formats/uai.py +0 -0
- csv_detective/formats/url.py +0 -0
- csv_detective/formats/username.py +0 -0
- csv_detective/formats/uuid.py +0 -0
- csv_detective/formats/year.py +0 -0
- csv_detective/output/__init__.py +0 -0
- csv_detective/output/dataframe.py +2 -2
- csv_detective/output/example.py +0 -0
- csv_detective/output/profile.py +1 -1
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/parsing/__init__.py +0 -0
- csv_detective/parsing/columns.py +5 -9
- csv_detective/parsing/compression.py +0 -0
- csv_detective/parsing/csv.py +0 -0
- csv_detective/parsing/excel.py +1 -1
- csv_detective/parsing/load.py +12 -11
- csv_detective/validate.py +36 -71
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/METADATA +18 -15
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/RECORD +22 -41
- csv_detective-0.10.12674.dist-info/WHEEL +4 -0
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/entry_points.txt +1 -0
- csv_detective-0.10.4.dev1.dist-info/WHEEL +0 -5
- csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE +0 -21
- csv_detective-0.10.4.dev1.dist-info/top_level.txt +0 -3
- tests/__init__.py +0 -0
- tests/data/a_test_file.csv +0 -407
- tests/data/a_test_file.json +0 -394
- tests/data/b_test_file.csv +0 -7
- tests/data/c_test_file.csv +0 -2
- tests/data/csv_file +0 -7
- tests/data/file.csv.gz +0 -0
- tests/data/file.ods +0 -0
- tests/data/file.xls +0 -0
- tests/data/file.xlsx +0 -0
- tests/data/xlsx_file +0 -0
- tests/test_example.py +0 -67
- tests/test_fields.py +0 -175
- tests/test_file.py +0 -469
- tests/test_labels.py +0 -26
- tests/test_structure.py +0 -45
- tests/test_validation.py +0 -163
tests/test_file.py
DELETED
|
@@ -1,469 +0,0 @@
|
|
|
1
|
-
from unittest.mock import MagicMock, patch
|
|
2
|
-
|
|
3
|
-
import pandas as pd
|
|
4
|
-
import pytest
|
|
5
|
-
import responses
|
|
6
|
-
|
|
7
|
-
from csv_detective import routine
|
|
8
|
-
from csv_detective.output.profile import create_profile
|
|
9
|
-
from csv_detective.parsing.csv import CHUNK_SIZE
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@pytest.fixture
|
|
13
|
-
def mocked_responses():
|
|
14
|
-
with responses.RequestsMock() as rsps:
|
|
15
|
-
yield rsps
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
@pytest.mark.parametrize(
|
|
19
|
-
"chunk_size",
|
|
20
|
-
(100, 404, int(1e5)),
|
|
21
|
-
)
|
|
22
|
-
def test_columns_output_on_file(chunk_size):
|
|
23
|
-
with (
|
|
24
|
-
# maybe we should refactor later to avoid having to patch everywhere
|
|
25
|
-
patch("csv_detective.parsing.csv.CHUNK_SIZE", chunk_size),
|
|
26
|
-
patch("csv_detective.parsing.columns.CHUNK_SIZE", chunk_size),
|
|
27
|
-
):
|
|
28
|
-
output = routine(
|
|
29
|
-
file_path="tests/data/a_test_file.csv",
|
|
30
|
-
num_rows=-1,
|
|
31
|
-
output_profile=False,
|
|
32
|
-
save_results=False,
|
|
33
|
-
)
|
|
34
|
-
assert isinstance(output, dict)
|
|
35
|
-
assert output["separator"] == ";"
|
|
36
|
-
assert output["header_row_idx"] == 2
|
|
37
|
-
assert output["header"] == [
|
|
38
|
-
"NUMCOM",
|
|
39
|
-
"NOMCOM",
|
|
40
|
-
"NUMDEP",
|
|
41
|
-
"NOMDEP",
|
|
42
|
-
"NUMEPCI",
|
|
43
|
-
"NOMEPCI",
|
|
44
|
-
"TXCOUVGLO_COM_2014",
|
|
45
|
-
"TXCOUVGLO_DEP_2014",
|
|
46
|
-
"TXCOUVGLO_EPCI_2014",
|
|
47
|
-
"STRUCTURED_INFO",
|
|
48
|
-
"GEO_INFO",
|
|
49
|
-
]
|
|
50
|
-
assert output["total_lines"] == 404
|
|
51
|
-
assert output["nb_duplicates"] == 7
|
|
52
|
-
assert output["columns"]["NOMCOM"]["format"] == "commune"
|
|
53
|
-
assert output["columns"]["NOMDEP"]["format"] == "departement"
|
|
54
|
-
assert output["columns"]["NUMEPCI"]["format"] == "siren"
|
|
55
|
-
assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
|
|
56
|
-
assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
|
|
57
|
-
assert output["columns"]["GEO_INFO"]["python_type"] == "json"
|
|
58
|
-
assert output["columns"]["GEO_INFO"]["format"] == "geojson"
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def test_profile_output_on_file():
|
|
62
|
-
output = routine(
|
|
63
|
-
file_path="tests/data/a_test_file.csv",
|
|
64
|
-
num_rows=-1,
|
|
65
|
-
output_profile=True,
|
|
66
|
-
save_results=False,
|
|
67
|
-
)
|
|
68
|
-
assert all(
|
|
69
|
-
[
|
|
70
|
-
c in list(output["profile"]["TXCOUVGLO_COM_2014"].keys())
|
|
71
|
-
for c in [
|
|
72
|
-
"min",
|
|
73
|
-
"max",
|
|
74
|
-
"mean",
|
|
75
|
-
"std",
|
|
76
|
-
"tops",
|
|
77
|
-
"nb_distinct",
|
|
78
|
-
"nb_missing_values",
|
|
79
|
-
]
|
|
80
|
-
]
|
|
81
|
-
)
|
|
82
|
-
assert not any(
|
|
83
|
-
[
|
|
84
|
-
c in list(output["profile"]["NUMCOM"].keys())
|
|
85
|
-
for c in [
|
|
86
|
-
"min",
|
|
87
|
-
"max",
|
|
88
|
-
"mean",
|
|
89
|
-
"std",
|
|
90
|
-
]
|
|
91
|
-
]
|
|
92
|
-
)
|
|
93
|
-
assert output["profile"]["TXCOUVGLO_COM_2014"]["min"] == 0.0
|
|
94
|
-
assert output["profile"]["TXCOUVGLO_COM_2014"]["max"] == 200.2
|
|
95
|
-
assert round(output["profile"]["TXCOUVGLO_COM_2014"]["mean"]) == 60
|
|
96
|
-
assert round(output["profile"]["TXCOUVGLO_COM_2014"]["std"]) == 36
|
|
97
|
-
assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 290
|
|
98
|
-
assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_missing_values"] == 3
|
|
99
|
-
assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def test_profile_with_num_rows():
|
|
103
|
-
with pytest.raises(ValueError):
|
|
104
|
-
routine(
|
|
105
|
-
file_path="tests/data/a_test_file.csv",
|
|
106
|
-
num_rows=50,
|
|
107
|
-
output_profile=True,
|
|
108
|
-
save_results=False,
|
|
109
|
-
)
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
@pytest.mark.parametrize(
|
|
113
|
-
"params",
|
|
114
|
-
(
|
|
115
|
-
(
|
|
116
|
-
True,
|
|
117
|
-
{
|
|
118
|
-
"int_with_nan": {"format": "int", "python_type": "int"},
|
|
119
|
-
"date": {"format": "date", "python_type": "date"},
|
|
120
|
-
},
|
|
121
|
-
),
|
|
122
|
-
(
|
|
123
|
-
False,
|
|
124
|
-
{
|
|
125
|
-
"int_with_nan": [{"format": "int", "python_type": "int"}],
|
|
126
|
-
"date": [{"format": "date", "python_type": "date"}],
|
|
127
|
-
},
|
|
128
|
-
),
|
|
129
|
-
),
|
|
130
|
-
)
|
|
131
|
-
def test_profile_specific_cases(params):
|
|
132
|
-
limited_output, columns = params
|
|
133
|
-
table = pd.DataFrame(
|
|
134
|
-
{
|
|
135
|
-
"int_with_nan": ["1", pd.NA, pd.NA],
|
|
136
|
-
"date": ["1996-01-02", "1996-01-02", "2024-11-12"],
|
|
137
|
-
}
|
|
138
|
-
)
|
|
139
|
-
profile = create_profile(
|
|
140
|
-
table=table,
|
|
141
|
-
columns=columns,
|
|
142
|
-
limited_output=limited_output,
|
|
143
|
-
num_rows=-1,
|
|
144
|
-
)
|
|
145
|
-
assert profile["int_with_nan"] == {
|
|
146
|
-
"min": 1,
|
|
147
|
-
"max": 1,
|
|
148
|
-
"mean": 1,
|
|
149
|
-
"std": None,
|
|
150
|
-
"tops": [{"count": 1, "value": "1"}],
|
|
151
|
-
"nb_distinct": 1,
|
|
152
|
-
"nb_missing_values": 2,
|
|
153
|
-
}
|
|
154
|
-
assert profile["date"] == {
|
|
155
|
-
"tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
|
|
156
|
-
"nb_distinct": 2,
|
|
157
|
-
"nb_missing_values": 0,
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def test_exception_different_number_of_columns():
|
|
162
|
-
"""
|
|
163
|
-
A ValueError should be raised if the number of columns differs between the first rows
|
|
164
|
-
"""
|
|
165
|
-
with pytest.raises(ValueError):
|
|
166
|
-
routine(
|
|
167
|
-
file_path="tests/data/c_test_file.csv",
|
|
168
|
-
num_rows=-1,
|
|
169
|
-
output_profile=True,
|
|
170
|
-
save_results=False,
|
|
171
|
-
)
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
def test_exception_malformed_columns(mocked_responses):
|
|
175
|
-
"""
|
|
176
|
-
A ValueError should be raised if any column is Unnamed
|
|
177
|
-
"""
|
|
178
|
-
url = f"http://example.com/bad_cols.csv"
|
|
179
|
-
expected_content = b"col1,col2,\n1,2,\n3,4,"
|
|
180
|
-
mocked_responses.get(
|
|
181
|
-
url,
|
|
182
|
-
body=expected_content,
|
|
183
|
-
status=200,
|
|
184
|
-
)
|
|
185
|
-
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
186
|
-
mock_response = MagicMock()
|
|
187
|
-
mock_response.read.return_value = expected_content
|
|
188
|
-
mock_response.__enter__.return_value = mock_response
|
|
189
|
-
mock_urlopen.return_value = mock_response
|
|
190
|
-
with pytest.raises(ValueError):
|
|
191
|
-
routine(file_path=url)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
def test_code_dep_reg_on_file():
|
|
195
|
-
output = routine(
|
|
196
|
-
file_path="tests/data/b_test_file.csv",
|
|
197
|
-
num_rows=-1,
|
|
198
|
-
output_profile=False,
|
|
199
|
-
save_results=False,
|
|
200
|
-
)
|
|
201
|
-
assert isinstance(output, dict)
|
|
202
|
-
assert output["columns"]["code_departement"]["format"] == "code_departement"
|
|
203
|
-
assert output["columns"]["code_region"]["format"] == "code_region"
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
def test_schema_on_file():
|
|
207
|
-
output = routine(
|
|
208
|
-
file_path="tests/data/b_test_file.csv",
|
|
209
|
-
num_rows=-1,
|
|
210
|
-
output_schema=True,
|
|
211
|
-
save_results=False,
|
|
212
|
-
)
|
|
213
|
-
assert isinstance(output, dict)
|
|
214
|
-
is_column_dep = False
|
|
215
|
-
is_column_reg = False
|
|
216
|
-
for item in output["schema"]["fields"]:
|
|
217
|
-
if item["name"] == "code_departement":
|
|
218
|
-
is_column_dep = True
|
|
219
|
-
assert item["description"] == "Le code INSEE du département"
|
|
220
|
-
assert item["type"] == "string"
|
|
221
|
-
assert item["formatFR"] == "code_departement"
|
|
222
|
-
assert item["constraints"]["pattern"] == "^(([013-9]\\d|2[AB1-9])$|9\\d{2}$)"
|
|
223
|
-
if item["name"] == "code_region":
|
|
224
|
-
is_column_reg = True
|
|
225
|
-
assert item["description"] == "Le code INSEE de la région"
|
|
226
|
-
assert item["type"] == "string"
|
|
227
|
-
assert item["formatFR"] == "code_region"
|
|
228
|
-
assert item["constraints"]["pattern"] == "^\\d{2}$"
|
|
229
|
-
assert is_column_dep
|
|
230
|
-
assert is_column_reg
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
params_csv = [
|
|
234
|
-
("csv_file", {"engine": None, "sheet_name": None}),
|
|
235
|
-
("file.csv.gz", {"engine": None, "sheet_name": None, "separator": ",", "columns.len": 3}),
|
|
236
|
-
]
|
|
237
|
-
params_others = [
|
|
238
|
-
("file.ods", {"engine": "odf"}),
|
|
239
|
-
# this is a "tricked" xls file that is actually read as odf
|
|
240
|
-
("file.xls", {"engine": "odf"}),
|
|
241
|
-
# this file has an empty first row; check if the sheet we consider is the largest
|
|
242
|
-
("file.xlsx", {"engine": "openpyxl", "header_row_idx": 1, "sheet_name": "REI_1987"}),
|
|
243
|
-
("xlsx_file", {"engine": "openpyxl"}),
|
|
244
|
-
]
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
@pytest.mark.parametrize("params", params_csv + params_others)
|
|
248
|
-
def test_non_csv_files(params):
|
|
249
|
-
file_name, checks = params
|
|
250
|
-
_ = routine(
|
|
251
|
-
file_path=f"tests/data/{file_name}",
|
|
252
|
-
num_rows=-1,
|
|
253
|
-
output_profile=False,
|
|
254
|
-
save_results=False,
|
|
255
|
-
)
|
|
256
|
-
for k, v in checks.items():
|
|
257
|
-
if v is None:
|
|
258
|
-
assert not _.get(k)
|
|
259
|
-
elif "." in k:
|
|
260
|
-
key, func = k.split(".")
|
|
261
|
-
assert eval(func)(_[key]) == v
|
|
262
|
-
else:
|
|
263
|
-
assert _[k] == v
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
@pytest.mark.parametrize(
|
|
267
|
-
"params",
|
|
268
|
-
# ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
|
|
269
|
-
# which doesn't support the way we mock the response, TBC
|
|
270
|
-
params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})],
|
|
271
|
-
)
|
|
272
|
-
def test_urls(mocked_responses, params):
|
|
273
|
-
file_name, checks = params
|
|
274
|
-
url = f"http://example.com/{file_name}"
|
|
275
|
-
expected_content = open(f"tests/data/{file_name}", "rb").read()
|
|
276
|
-
mocked_responses.get(
|
|
277
|
-
url,
|
|
278
|
-
body=expected_content,
|
|
279
|
-
status=200,
|
|
280
|
-
)
|
|
281
|
-
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
282
|
-
mock_response = MagicMock()
|
|
283
|
-
mock_response.read.return_value = expected_content
|
|
284
|
-
mock_response.__enter__.return_value = mock_response
|
|
285
|
-
mock_urlopen.return_value = mock_response
|
|
286
|
-
_ = routine(
|
|
287
|
-
file_path=url,
|
|
288
|
-
num_rows=-1,
|
|
289
|
-
output_profile=False,
|
|
290
|
-
save_results=False,
|
|
291
|
-
)
|
|
292
|
-
for k, v in checks.items():
|
|
293
|
-
if v is None:
|
|
294
|
-
assert not _.get(k)
|
|
295
|
-
elif "." in k:
|
|
296
|
-
key, func = k.split(".")
|
|
297
|
-
assert eval(func)(_[key]) == v
|
|
298
|
-
else:
|
|
299
|
-
assert _[k] == v
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
@pytest.mark.parametrize(
|
|
303
|
-
"expected_type",
|
|
304
|
-
(
|
|
305
|
-
(True, "int"),
|
|
306
|
-
(False, "string"),
|
|
307
|
-
),
|
|
308
|
-
)
|
|
309
|
-
def test_nan_values(expected_type):
|
|
310
|
-
# if skipping NaN, the column contains only ints
|
|
311
|
-
skipna, expected_type = expected_type
|
|
312
|
-
output = routine(
|
|
313
|
-
file_path="tests/data/b_test_file.csv",
|
|
314
|
-
num_rows=-1,
|
|
315
|
-
save_results=False,
|
|
316
|
-
skipna=skipna,
|
|
317
|
-
)
|
|
318
|
-
assert output["columns"]["partly_empty"]["python_type"] == expected_type
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
def test_output_df():
|
|
322
|
-
output, df_chunks = routine(
|
|
323
|
-
file_path="tests/data/b_test_file.csv",
|
|
324
|
-
num_rows=-1,
|
|
325
|
-
output_profile=False,
|
|
326
|
-
save_results=False,
|
|
327
|
-
output_df=True,
|
|
328
|
-
)
|
|
329
|
-
df = pd.concat(df_chunks, ignore_index=True)
|
|
330
|
-
assert isinstance(output, dict)
|
|
331
|
-
assert isinstance(df, pd.DataFrame)
|
|
332
|
-
assert len(df) == 6
|
|
333
|
-
assert df["partly_empty"].dtype == pd.Int64Dtype()
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
@pytest.mark.parametrize(
|
|
337
|
-
"cast_json",
|
|
338
|
-
(
|
|
339
|
-
(True, dict),
|
|
340
|
-
(False, str),
|
|
341
|
-
),
|
|
342
|
-
)
|
|
343
|
-
def test_cast_json(mocked_responses, cast_json):
|
|
344
|
-
cast_json, expected_type = cast_json
|
|
345
|
-
expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
|
|
346
|
-
mocked_responses.get(
|
|
347
|
-
"http://example.com/test.csv",
|
|
348
|
-
body=expected_content,
|
|
349
|
-
status=200,
|
|
350
|
-
)
|
|
351
|
-
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
352
|
-
mock_response = MagicMock()
|
|
353
|
-
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
354
|
-
mock_response.__enter__.return_value = mock_response
|
|
355
|
-
mock_urlopen.return_value = mock_response
|
|
356
|
-
analysis, df_chunks = routine(
|
|
357
|
-
file_path="http://example.com/test.csv",
|
|
358
|
-
num_rows=-1,
|
|
359
|
-
output_profile=False,
|
|
360
|
-
save_results=False,
|
|
361
|
-
output_df=True,
|
|
362
|
-
cast_json=cast_json,
|
|
363
|
-
)
|
|
364
|
-
df = pd.concat(df_chunks, ignore_index=True)
|
|
365
|
-
assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
|
|
366
|
-
assert isinstance(df["a_simple_dict"][0], expected_type)
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
def test_almost_uniform_column(mocked_responses):
|
|
370
|
-
col_name = "int_not_bool"
|
|
371
|
-
expected_content = f"{col_name}\n" + "9\n" + "1\n" * int(1e7)
|
|
372
|
-
mocked_responses.get(
|
|
373
|
-
"http://example.com/test.csv",
|
|
374
|
-
body=expected_content,
|
|
375
|
-
status=200,
|
|
376
|
-
)
|
|
377
|
-
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
378
|
-
mock_response = MagicMock()
|
|
379
|
-
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
380
|
-
mock_response.__enter__.return_value = mock_response
|
|
381
|
-
mock_urlopen.return_value = mock_response
|
|
382
|
-
analysis = routine(
|
|
383
|
-
file_path="http://example.com/test.csv",
|
|
384
|
-
num_rows=-1,
|
|
385
|
-
output_profile=False,
|
|
386
|
-
save_results=False,
|
|
387
|
-
)
|
|
388
|
-
assert analysis["columns"][col_name]["format"] == "int"
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
@pytest.mark.parametrize("nb_rows", (CHUNK_SIZE // 10, CHUNK_SIZE + 1))
|
|
392
|
-
def test_full_nan_column(mocked_responses, nb_rows):
|
|
393
|
-
# we want a file that needs sampling
|
|
394
|
-
col_name = "only_nan"
|
|
395
|
-
expected_content = f"{col_name},second_col\n" + ",1\n" * nb_rows
|
|
396
|
-
mocked_responses.get(
|
|
397
|
-
"http://example.com/test.csv",
|
|
398
|
-
body=expected_content,
|
|
399
|
-
status=200,
|
|
400
|
-
)
|
|
401
|
-
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
402
|
-
mock_response = MagicMock()
|
|
403
|
-
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
404
|
-
mock_response.__enter__.return_value = mock_response
|
|
405
|
-
mock_urlopen.return_value = mock_response
|
|
406
|
-
# only NaNs should return "string"
|
|
407
|
-
analysis = routine(
|
|
408
|
-
file_path="http://example.com/test.csv",
|
|
409
|
-
num_rows=-1,
|
|
410
|
-
output_profile=False,
|
|
411
|
-
save_results=False,
|
|
412
|
-
)
|
|
413
|
-
assert analysis["columns"][col_name]["format"] == "string"
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
@pytest.mark.parametrize(
|
|
417
|
-
"nb_rows",
|
|
418
|
-
(100, CHUNK_SIZE + 1),
|
|
419
|
-
)
|
|
420
|
-
def test_count_column(mocked_responses, nb_rows):
|
|
421
|
-
expected_content = "count,_count\n" + "a,1\n" * nb_rows
|
|
422
|
-
mocked_responses.get(
|
|
423
|
-
"http://example.com/test.csv",
|
|
424
|
-
body=expected_content,
|
|
425
|
-
status=200,
|
|
426
|
-
)
|
|
427
|
-
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
428
|
-
mock_response = MagicMock()
|
|
429
|
-
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
430
|
-
mock_response.__enter__.return_value = mock_response
|
|
431
|
-
mock_urlopen.return_value = mock_response
|
|
432
|
-
# only testing it doesn't fail with output_profile=True
|
|
433
|
-
routine(
|
|
434
|
-
file_path="http://example.com/test.csv",
|
|
435
|
-
num_rows=-1,
|
|
436
|
-
output_profile=True,
|
|
437
|
-
save_results=False,
|
|
438
|
-
)
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
def test_multiple_geo_columns(mocked_responses):
|
|
442
|
-
lat, not_lat = "latitude_obj", "latin"
|
|
443
|
-
lon, not_lon = "longitude_obj", "longueur"
|
|
444
|
-
expected_content = f"{lat},{lon},{not_lat},{not_lon}\n" + "1.0,-10.0,1.0,-10.0\n" * 10
|
|
445
|
-
mocked_responses.get(
|
|
446
|
-
"http://example.com/test.csv",
|
|
447
|
-
body=expected_content,
|
|
448
|
-
status=200,
|
|
449
|
-
)
|
|
450
|
-
analysis = routine(
|
|
451
|
-
file_path="http://example.com/test.csv",
|
|
452
|
-
num_rows=-1,
|
|
453
|
-
output_profile=False,
|
|
454
|
-
save_results=False,
|
|
455
|
-
)
|
|
456
|
-
# we want the lat/lon columns to be labelled as such, and either:
|
|
457
|
-
# - the not lat/lon columns to be labelled as float only
|
|
458
|
-
# - or the not lat/lon columns to be labelled as lat/lon but with a lower score
|
|
459
|
-
# both cases are acceptable
|
|
460
|
-
assert analysis["columns"][lat]["format"] == "latitude_wgs"
|
|
461
|
-
assert analysis["columns"][lon]["format"] == "longitude_wgs"
|
|
462
|
-
assert analysis["columns"][not_lat]["format"] == "float" or (
|
|
463
|
-
analysis["columns"][not_lat]["format"] == "latitude_wgs"
|
|
464
|
-
and analysis["columns"][not_lat]["score"] < analysis["columns"][lat]["score"]
|
|
465
|
-
)
|
|
466
|
-
assert analysis["columns"][not_lon]["format"] == "float" or (
|
|
467
|
-
analysis["columns"][not_lon]["format"] == "longitude_wgs"
|
|
468
|
-
and analysis["columns"][not_lon]["score"] < analysis["columns"][lon]["score"]
|
|
469
|
-
)
|
tests/test_labels.py
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
|
|
3
|
-
from csv_detective.format import FormatsManager
|
|
4
|
-
|
|
5
|
-
fmtm = FormatsManager()
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
# money labels
|
|
9
|
-
def test_money_labels():
|
|
10
|
-
header = "Montant total"
|
|
11
|
-
assert fmtm.formats["money"].is_valid_label(header) == 0.5
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@pytest.mark.parametrize(
|
|
15
|
-
"params",
|
|
16
|
-
[
|
|
17
|
-
("latitude", 1.0),
|
|
18
|
-
("lat", 0.75),
|
|
19
|
-
("coord_lat", 0.375),
|
|
20
|
-
("y", 0.5),
|
|
21
|
-
("nb_cycles", 0.0),
|
|
22
|
-
],
|
|
23
|
-
)
|
|
24
|
-
def test_latitude(params):
|
|
25
|
-
header, expected = params
|
|
26
|
-
assert expected == fmtm.formats["latitude_wgs"].is_valid_label(header)
|
tests/test_structure.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
from csv_detective.format import Format, FormatsManager
|
|
6
|
-
|
|
7
|
-
fmtm = FormatsManager()
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def test_all_tests_have_unique_name():
|
|
11
|
-
formats: list[str] = os.listdir("csv_detective/formats")
|
|
12
|
-
assert "__init__.py" in formats
|
|
13
|
-
assert len(formats) == len(set(formats))
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def test_conformity():
|
|
17
|
-
for name, format in fmtm.formats.items():
|
|
18
|
-
assert isinstance(name, str)
|
|
19
|
-
assert isinstance(format, Format)
|
|
20
|
-
assert all(
|
|
21
|
-
getattr(format, attr) is not None
|
|
22
|
-
for attr in [
|
|
23
|
-
"name",
|
|
24
|
-
"func",
|
|
25
|
-
"_test_values",
|
|
26
|
-
"labels",
|
|
27
|
-
"proportion",
|
|
28
|
-
"tags",
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@pytest.mark.parametrize(
|
|
34
|
-
"tags",
|
|
35
|
-
(
|
|
36
|
-
["type"],
|
|
37
|
-
["temp", "fr"],
|
|
38
|
-
),
|
|
39
|
-
)
|
|
40
|
-
def test_get_from_tags(tags):
|
|
41
|
-
fmts = fmtm.get_formats_from_tags(tags)
|
|
42
|
-
assert len(fmts)
|
|
43
|
-
for fmt in fmts.values():
|
|
44
|
-
for tag in tags:
|
|
45
|
-
assert tag in fmt.tags
|
tests/test_validation.py
DELETED
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from unittest.mock import MagicMock, patch
|
|
3
|
-
|
|
4
|
-
import pandas as pd
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from csv_detective.explore_csv import validate_then_detect
|
|
8
|
-
from csv_detective.validate import validate
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def set_nested_value(source_dict: dict, key_chain: list[str], value):
|
|
12
|
-
current_dict = source_dict
|
|
13
|
-
for key in key_chain[:-1]:
|
|
14
|
-
if key not in current_dict:
|
|
15
|
-
current_dict[key] = {}
|
|
16
|
-
current_dict = current_dict[key]
|
|
17
|
-
current_dict[key_chain[-1]] = value
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def get_nested_value(source_dict: dict, key_chain: list[str]):
|
|
21
|
-
result = source_dict
|
|
22
|
-
for k in key_chain:
|
|
23
|
-
result = result[k]
|
|
24
|
-
return result
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@pytest.mark.parametrize(
|
|
28
|
-
"_params",
|
|
29
|
-
(
|
|
30
|
-
((True, dict), {}),
|
|
31
|
-
((False, None), {"separator": "|"}),
|
|
32
|
-
((False, None), {"encoding": "unknown"}),
|
|
33
|
-
((False, None), {"header": ["a", "b"]}),
|
|
34
|
-
(
|
|
35
|
-
(False, None),
|
|
36
|
-
{
|
|
37
|
-
"columns.NUMCOM": {
|
|
38
|
-
"python_type": "int",
|
|
39
|
-
"format": "int",
|
|
40
|
-
"score": 1.0,
|
|
41
|
-
},
|
|
42
|
-
},
|
|
43
|
-
),
|
|
44
|
-
),
|
|
45
|
-
)
|
|
46
|
-
def test_validation(_params):
|
|
47
|
-
(should_be_valid, analysis_type), modif_previous_analysis = _params
|
|
48
|
-
with open("tests/data/a_test_file.json", "r") as f:
|
|
49
|
-
previous_analysis = json.load(f)
|
|
50
|
-
for dotkey in modif_previous_analysis:
|
|
51
|
-
keys = dotkey.split(".")
|
|
52
|
-
set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
|
|
53
|
-
is_valid, analysis, col_values = validate(
|
|
54
|
-
"tests/data/a_test_file.csv",
|
|
55
|
-
previous_analysis=previous_analysis,
|
|
56
|
-
)
|
|
57
|
-
assert is_valid == should_be_valid
|
|
58
|
-
if analysis_type is None:
|
|
59
|
-
assert analysis is None
|
|
60
|
-
else:
|
|
61
|
-
assert isinstance(analysis, analysis_type)
|
|
62
|
-
if should_be_valid:
|
|
63
|
-
assert isinstance(col_values, dict)
|
|
64
|
-
else:
|
|
65
|
-
assert col_values is None
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
@pytest.mark.parametrize(
|
|
69
|
-
"_params",
|
|
70
|
-
(
|
|
71
|
-
# int: proportion = 1, should fail (early)
|
|
72
|
-
("12", "1.2", {"python_type": "int", "format": "int", "score": 1.5}, False),
|
|
73
|
-
# siren: proportion = 0.9, should fail (later)
|
|
74
|
-
(
|
|
75
|
-
"130025265",
|
|
76
|
-
"A13794BC",
|
|
77
|
-
{"python_type": "string", "format": "siren", "score": 1.5},
|
|
78
|
-
False,
|
|
79
|
-
),
|
|
80
|
-
# siret: proportion = 0.8, should succeed
|
|
81
|
-
(
|
|
82
|
-
"13002526500013",
|
|
83
|
-
"A13794BC",
|
|
84
|
-
{"python_type": "string", "format": "siret", "score": 1.5},
|
|
85
|
-
True,
|
|
86
|
-
),
|
|
87
|
-
),
|
|
88
|
-
)
|
|
89
|
-
def test_validation_with_proportions(_params):
|
|
90
|
-
# testing the behaviour for a file that has 15% invalid values, but all in a single chunk
|
|
91
|
-
valid_value, invalid_value, detected, should_be_valid = _params
|
|
92
|
-
url = f"http://example.com/test.csv"
|
|
93
|
-
expected_content = "col\n"
|
|
94
|
-
for _ in range(60):
|
|
95
|
-
# 60 rows of valid values
|
|
96
|
-
expected_content += f"{valid_value}\n"
|
|
97
|
-
for _ in range(15):
|
|
98
|
-
# 15 rows of invalid values
|
|
99
|
-
expected_content += f"{invalid_value}\n"
|
|
100
|
-
for _ in range(25):
|
|
101
|
-
# 25 rows of valid values
|
|
102
|
-
expected_content += f"{valid_value}\n"
|
|
103
|
-
previous_analysis = {
|
|
104
|
-
"encoding": "utf-8",
|
|
105
|
-
"separator": ",",
|
|
106
|
-
"header_row_idx": 0,
|
|
107
|
-
"header": ["col"],
|
|
108
|
-
"columns": {"col": detected},
|
|
109
|
-
# just setting these keys when validation is successful, they're not used for the validation itself
|
|
110
|
-
"categorical": [],
|
|
111
|
-
"columns_fields": {},
|
|
112
|
-
"columns_labels": {},
|
|
113
|
-
"formats": {},
|
|
114
|
-
}
|
|
115
|
-
with (
|
|
116
|
-
patch("urllib.request.urlopen") as mock_urlopen,
|
|
117
|
-
patch("csv_detective.validate.VALIDATION_CHUNK_SIZE", 10),
|
|
118
|
-
):
|
|
119
|
-
mock_response = MagicMock()
|
|
120
|
-
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
121
|
-
mock_response.__enter__.return_value = mock_response
|
|
122
|
-
mock_urlopen.return_value = mock_response
|
|
123
|
-
is_valid, *_ = validate(
|
|
124
|
-
file_path=url,
|
|
125
|
-
previous_analysis=previous_analysis,
|
|
126
|
-
)
|
|
127
|
-
assert is_valid == should_be_valid
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
@pytest.mark.parametrize(
|
|
131
|
-
"modif_previous_analysis",
|
|
132
|
-
(
|
|
133
|
-
{"separator": "|"},
|
|
134
|
-
{"encoding": "unknown"},
|
|
135
|
-
{"header": ["a", "b"]},
|
|
136
|
-
{"total_lines": 100},
|
|
137
|
-
{
|
|
138
|
-
"columns.NUMCOM": {
|
|
139
|
-
"python_type": "int",
|
|
140
|
-
"format": "int",
|
|
141
|
-
"score": 1.0,
|
|
142
|
-
},
|
|
143
|
-
},
|
|
144
|
-
),
|
|
145
|
-
)
|
|
146
|
-
def test_validate_then_detect(modif_previous_analysis):
|
|
147
|
-
with open("tests/data/a_test_file.json", "r") as f:
|
|
148
|
-
previous_analysis = json.load(f)
|
|
149
|
-
valid_values = {}
|
|
150
|
-
for dotkey in modif_previous_analysis:
|
|
151
|
-
keys = dotkey.split(".")
|
|
152
|
-
valid_values[dotkey] = get_nested_value(previous_analysis, keys)
|
|
153
|
-
set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
|
|
154
|
-
analysis = validate_then_detect(
|
|
155
|
-
"tests/data/a_test_file.csv",
|
|
156
|
-
previous_analysis=previous_analysis,
|
|
157
|
-
num_rows=-1,
|
|
158
|
-
output_profile=True,
|
|
159
|
-
save_results=False,
|
|
160
|
-
)
|
|
161
|
-
# checking that if not valid, the analysis has managed to retrieve the right values
|
|
162
|
-
for dotkey in modif_previous_analysis:
|
|
163
|
-
assert get_nested_value(analysis, dotkey.split(".")) == valid_values[dotkey]
|