csv-detective 0.7.5.dev1180__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_fields/temp/date/__init__.py +5 -1
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
- csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/other/booleen/__init__.py +1 -1
- csv_detective/detect_labels/other/email/__init__.py +1 -1
- csv_detective/detect_labels/other/float/__init__.py +1 -1
- csv_detective/detect_labels/other/int/__init__.py +1 -1
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detect_labels/other/twitter/__init__.py +1 -1
- csv_detective/detect_labels/other/url/__init__.py +1 -1
- csv_detective/detect_labels/other/uuid/__init__.py +1 -1
- csv_detective/detect_labels/temp/date/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
- csv_detective/detect_labels/temp/year/__init__.py +1 -1
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +27 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/headers.py +32 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +98 -0
- csv_detective/explore_csv.py +40 -110
- csv_detective/output/dataframe.py +55 -0
- csv_detective/{create_example.py → output/example.py} +10 -9
- csv_detective/output/profile.py +87 -0
- csv_detective/{schema_generation.py → output/schema.py} +344 -343
- csv_detective/output/utils.py +51 -0
- csv_detective/parsing/columns.py +141 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +55 -0
- csv_detective/parsing/excel.py +169 -0
- csv_detective/parsing/load.py +97 -0
- csv_detective/utils.py +10 -236
- {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +3 -0
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +3 -2
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +85 -71
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +1 -1
- tests/test_fields.py +7 -6
- tests/test_file.py +56 -57
- csv_detective/detection.py +0 -618
- csv_detective/{process_text.py → parsing/text.py} +0 -0
- {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info/licenses}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
tests/test_file.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
|
-
|
|
1
|
+
import pandas as pd
|
|
2
2
|
import pytest
|
|
3
3
|
import responses
|
|
4
|
-
|
|
4
|
+
|
|
5
|
+
from csv_detective import routine
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
def test_columns_output_on_file():
|
|
8
9
|
output = routine(
|
|
9
|
-
|
|
10
|
+
file_path="tests/data/a_test_file.csv",
|
|
10
11
|
num_rows=-1,
|
|
11
12
|
output_profile=False,
|
|
12
13
|
save_results=False,
|
|
@@ -40,7 +41,7 @@ def test_columns_output_on_file():
|
|
|
40
41
|
|
|
41
42
|
def test_profile_output_on_file():
|
|
42
43
|
output = routine(
|
|
43
|
-
|
|
44
|
+
file_path="tests/data/a_test_file.csv",
|
|
44
45
|
num_rows=-1,
|
|
45
46
|
output_profile=True,
|
|
46
47
|
save_results=False,
|
|
@@ -69,10 +70,10 @@ def test_profile_output_on_file():
|
|
|
69
70
|
assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1
|
|
70
71
|
|
|
71
72
|
|
|
72
|
-
def
|
|
73
|
+
def test_profile_with_num_rows():
|
|
73
74
|
with pytest.raises(ValueError):
|
|
74
75
|
routine(
|
|
75
|
-
|
|
76
|
+
file_path="tests/data/a_test_file.csv",
|
|
76
77
|
num_rows=50,
|
|
77
78
|
output_profile=True,
|
|
78
79
|
save_results=False,
|
|
@@ -85,7 +86,7 @@ def test_exception_different_number_of_columns():
|
|
|
85
86
|
"""
|
|
86
87
|
with pytest.raises(ValueError):
|
|
87
88
|
routine(
|
|
88
|
-
|
|
89
|
+
file_path="tests/data/c_test_file.csv",
|
|
89
90
|
num_rows=-1,
|
|
90
91
|
output_profile=True,
|
|
91
92
|
save_results=False,
|
|
@@ -94,7 +95,7 @@ def test_exception_different_number_of_columns():
|
|
|
94
95
|
|
|
95
96
|
def test_code_dep_reg_on_file():
|
|
96
97
|
output = routine(
|
|
97
|
-
|
|
98
|
+
file_path="tests/data/b_test_file.csv",
|
|
98
99
|
num_rows=-1,
|
|
99
100
|
output_profile=False,
|
|
100
101
|
save_results=False,
|
|
@@ -106,7 +107,7 @@ def test_code_dep_reg_on_file():
|
|
|
106
107
|
|
|
107
108
|
def test_schema_on_file():
|
|
108
109
|
output = routine(
|
|
109
|
-
|
|
110
|
+
file_path="tests/data/b_test_file.csv",
|
|
110
111
|
num_rows=-1,
|
|
111
112
|
output_schema=True,
|
|
112
113
|
save_results=False,
|
|
@@ -131,52 +132,37 @@ def test_schema_on_file():
|
|
|
131
132
|
assert is_column_reg
|
|
132
133
|
|
|
133
134
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
)
|
|
141
|
-
assert _['engine'] == 'odf'
|
|
142
|
-
|
|
135
|
+
params_csv = [
|
|
136
|
+
("csv_file", {"engine": None, "sheet_name": None}),
|
|
137
|
+
("file.csv.gz", {"engine": None, "sheet_name": None, "separator": ",", "columns.len": 3}),
|
|
138
|
+
]
|
|
139
|
+
params_others = [
|
|
140
|
+
("file.ods", {"engine": "odf"}),
|
|
143
141
|
# this is a "tricked" xls file that is actually read as odf
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
)
|
|
150
|
-
assert _['engine'] == 'odf'
|
|
142
|
+
("file.xls", {"engine": "odf"}),
|
|
143
|
+
# this file has an empty first row; check if the sheet we consider is the largest
|
|
144
|
+
("file.xlsx", {"engine": "openpyxl", "header_row_idx": 1, "sheet_name": "REI_1987"}),
|
|
145
|
+
("xlsx_file", {"engine": "openpyxl"}),
|
|
146
|
+
]
|
|
151
147
|
|
|
152
|
-
_ = routine(
|
|
153
|
-
csv_file_path="tests/file.xlsx",
|
|
154
|
-
num_rows=-1,
|
|
155
|
-
output_profile=False,
|
|
156
|
-
save_results=False,
|
|
157
|
-
)
|
|
158
|
-
assert _['engine'] == 'openpyxl'
|
|
159
|
-
# this file has an empty first row
|
|
160
|
-
assert _['header_row_idx'] == 1
|
|
161
|
-
# check if the sheet we consider is the largest
|
|
162
|
-
assert _['sheet_name'] == 'REI_1987'
|
|
163
|
-
|
|
164
|
-
_ = routine(
|
|
165
|
-
csv_file_path="tests/csv_file",
|
|
166
|
-
num_rows=-1,
|
|
167
|
-
output_profile=False,
|
|
168
|
-
save_results=False,
|
|
169
|
-
)
|
|
170
|
-
assert not _.get('engine')
|
|
171
|
-
assert not _.get('sheet_name')
|
|
172
148
|
|
|
149
|
+
@pytest.mark.parametrize("params", params_csv + params_others)
|
|
150
|
+
def test_non_csv_files(params):
|
|
151
|
+
file_name, checks = params
|
|
173
152
|
_ = routine(
|
|
174
|
-
|
|
153
|
+
file_path=f"tests/data/{file_name}",
|
|
175
154
|
num_rows=-1,
|
|
176
155
|
output_profile=False,
|
|
177
156
|
save_results=False,
|
|
178
157
|
)
|
|
179
|
-
|
|
158
|
+
for k, v in checks.items():
|
|
159
|
+
if v is None:
|
|
160
|
+
assert not _.get(k)
|
|
161
|
+
elif "." in k:
|
|
162
|
+
key, func = k.split(".")
|
|
163
|
+
assert eval(func)(_[key]) == v
|
|
164
|
+
else:
|
|
165
|
+
assert _[k] == v
|
|
180
166
|
|
|
181
167
|
|
|
182
168
|
@pytest.fixture
|
|
@@ -185,21 +171,34 @@ def mocked_responses():
|
|
|
185
171
|
yield rsps
|
|
186
172
|
|
|
187
173
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
174
|
+
@pytest.mark.parametrize(
|
|
175
|
+
"params",
|
|
176
|
+
# ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
|
|
177
|
+
# which doesn't support the way we mock the response, TBC
|
|
178
|
+
params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 414})]
|
|
179
|
+
)
|
|
180
|
+
def test_urls(mocked_responses, params):
|
|
181
|
+
file_name, checks = params
|
|
182
|
+
url = f"http://example.com/{file_name}"
|
|
191
183
|
mocked_responses.get(
|
|
192
184
|
url,
|
|
193
|
-
body=
|
|
185
|
+
body=open(f"tests/data/{file_name}", "rb").read(),
|
|
194
186
|
status=200,
|
|
195
187
|
)
|
|
196
|
-
|
|
197
|
-
|
|
188
|
+
_ = routine(
|
|
189
|
+
file_path=url,
|
|
198
190
|
num_rows=-1,
|
|
199
191
|
output_profile=False,
|
|
200
192
|
save_results=False,
|
|
201
193
|
)
|
|
202
|
-
|
|
194
|
+
for k, v in checks.items():
|
|
195
|
+
if v is None:
|
|
196
|
+
assert not _.get(k)
|
|
197
|
+
elif "." in k:
|
|
198
|
+
key, func = k.split(".")
|
|
199
|
+
assert eval(func)(_[key]) == v
|
|
200
|
+
else:
|
|
201
|
+
assert _[k] == v
|
|
203
202
|
|
|
204
203
|
|
|
205
204
|
@pytest.mark.parametrize(
|
|
@@ -213,7 +212,7 @@ def test_nan_values(expected_type):
|
|
|
213
212
|
# if skipping NaN, the column contains only ints
|
|
214
213
|
skipna, expected_type = expected_type
|
|
215
214
|
output = routine(
|
|
216
|
-
|
|
215
|
+
file_path="tests/data/b_test_file.csv",
|
|
217
216
|
num_rows=-1,
|
|
218
217
|
save_results=False,
|
|
219
218
|
skipna=skipna,
|
|
@@ -223,7 +222,7 @@ def test_nan_values(expected_type):
|
|
|
223
222
|
|
|
224
223
|
def test_output_df():
|
|
225
224
|
output, df = routine(
|
|
226
|
-
|
|
225
|
+
file_path="tests/data/b_test_file.csv",
|
|
227
226
|
num_rows=-1,
|
|
228
227
|
output_profile=False,
|
|
229
228
|
save_results=False,
|
|
@@ -251,7 +250,7 @@ def test_cast_json(mocked_responses, cast_json):
|
|
|
251
250
|
status=200,
|
|
252
251
|
)
|
|
253
252
|
analysis, df = routine(
|
|
254
|
-
|
|
253
|
+
file_path='http://example.com/test.csv',
|
|
255
254
|
num_rows=-1,
|
|
256
255
|
output_profile=False,
|
|
257
256
|
save_results=False,
|