csv-detective 0.9.1.dev1847__py3-none-any.whl → 0.9.1.dev1860__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +39 -2
- csv_detective/parsing/columns.py +1 -1
- {csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/METADATA +1 -1
- {csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/RECORD +9 -9
- tests/test_file.py +17 -0
- {csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/top_level.txt +0 -0
|
@@ -14,6 +14,9 @@ from csv_detective.output.utils import prepare_output_dict
|
|
|
14
14
|
from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
|
|
15
15
|
from csv_detective.validate import validate
|
|
16
16
|
|
|
17
|
+
# above this threshold, a column is not considered categorical
|
|
18
|
+
MAX_NUMBER_CATEGORICAL_VALUES = 25
|
|
19
|
+
|
|
17
20
|
|
|
18
21
|
def detect_formats(
|
|
19
22
|
table: pd.DataFrame,
|
|
@@ -28,14 +31,18 @@ def detect_formats(
|
|
|
28
31
|
if on_sample:
|
|
29
32
|
if verbose:
|
|
30
33
|
logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
|
|
31
|
-
table = table
|
|
34
|
+
table = build_sample(table)
|
|
32
35
|
|
|
33
36
|
if table.empty:
|
|
34
37
|
res_categorical = []
|
|
35
38
|
# res_continuous = []
|
|
36
39
|
else:
|
|
37
40
|
# Detects columns that are categorical
|
|
38
|
-
res_categorical, categorical_mask = detect_categorical_variable(
|
|
41
|
+
res_categorical, categorical_mask = detect_categorical_variable(
|
|
42
|
+
table,
|
|
43
|
+
max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
|
|
44
|
+
verbose=verbose,
|
|
45
|
+
)
|
|
39
46
|
res_categorical = list(res_categorical)
|
|
40
47
|
# Detect columns that are continuous (we already know the categorical) :
|
|
41
48
|
# we don't need this for now, cuts processing time
|
|
@@ -166,3 +173,33 @@ def detect_formats(
|
|
|
166
173
|
raise ValueError("Could not infer detected formats on the whole file")
|
|
167
174
|
|
|
168
175
|
return analysis
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _first_true_position(mask: pd.Series) -> int:
    """
    Return the 0-based position of the first True value of a boolean mask.

    Missing entries (which nullable dtypes can produce on comparison) count as False.
    The caller must guarantee that at least one entry is True; otherwise 0 is returned.
    """
    return int(mask.fillna(False).to_numpy(dtype=bool).argmax())


def build_sample(table: pd.DataFrame, max_rows: "int | None" = None) -> pd.DataFrame:
    """
    Build a sample of the table that contains at least one representative of the min and
    max values of each column, and one case of NaN for each column that contains any.

    Args:
        table: the full table to sample from.
        max_rows: targeted number of rows of the sample, defaults to MAX_ROWS_ANALYSIS.

    Returns:
        A new dataframe with a fresh 0..n-1 index. Its first rows are the representatives
        (all the minimums in column order, then all the maximums, then all the NaN cases),
        followed by rows drawn at random (fixed seed, so the result is reproducible).
        The random rows are drawn from the whole table, so a representative row can
        appear again among them.

    Notes:
        - A column that only contains NaN has no min/max, it only contributes its NaN row.
        - If there are more representatives than `max_rows`, they are all kept and no
          random row is added, so the sample can then be longer than `max_rows`.
        - The number of random rows never exceeds the number of rows of the table.
    """
    if max_rows is None:
        max_rows = MAX_ROWS_ANALYSIS

    min_positions = []
    max_positions = []
    nan_positions = []
    for col in table.columns:
        column = table[col]
        missing = column.isna()
        present = column.dropna()
        if not present.empty:
            # only locate the first matching row, instead of copying every matching row
            min_positions.append(_first_true_position(column == present.min()))
            max_positions.append(_first_true_position(column == present.max()))
        if missing.any():
            nan_positions.append(_first_true_position(missing))
    positions = min_positions + max_positions + nan_positions

    # DataFrame.sample raises if n is negative or larger than the population
    n_random = min(max(max_rows - len(positions), 0), len(table))

    parts = []
    if positions:
        parts.append(table.iloc[positions])
    if n_random:
        parts.append(table.sample(n=n_random, random_state=1))
    if not parts:
        # nothing to pick from (e.g. empty table): return an empty frame with the same columns
        return table.iloc[[]].reset_index(drop=True)
    return pd.concat(parts, ignore_index=True)
|
csv_detective/parsing/columns.py
CHANGED
|
@@ -132,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
|
|
|
132
132
|
csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
|
|
133
133
|
csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
|
|
134
134
|
csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
|
|
135
|
-
csv_detective/detection/formats.py,sha256=
|
|
135
|
+
csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
|
|
136
136
|
csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
|
|
137
137
|
csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
|
|
138
138
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
@@ -144,25 +144,25 @@ csv_detective/output/profile.py,sha256=Jeh0mrfH_hAVxV2E5I4XzdCm7ZAGAV_Xj3AXOi77l
|
|
|
144
144
|
csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
|
|
145
145
|
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
146
146
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
147
|
-
csv_detective/parsing/columns.py,sha256=
|
|
147
|
+
csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
|
|
148
148
|
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
149
149
|
csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
|
|
150
150
|
csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
|
|
151
151
|
csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
|
|
152
152
|
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
153
|
-
csv_detective-0.9.1.
|
|
153
|
+
csv_detective-0.9.1.dev1860.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
154
154
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
155
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
156
156
|
tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
|
|
157
|
-
tests/test_file.py,sha256=
|
|
157
|
+
tests/test_file.py,sha256=NBLwPCFN2skZHLkckPZ0M0ZvanEdL88KVK1Vi9GhSaU,8925
|
|
158
158
|
tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
|
|
159
159
|
tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
|
|
160
160
|
tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
|
|
161
161
|
venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
|
|
162
162
|
venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
|
|
163
163
|
venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
|
|
164
|
-
csv_detective-0.9.1.
|
|
165
|
-
csv_detective-0.9.1.
|
|
166
|
-
csv_detective-0.9.1.
|
|
167
|
-
csv_detective-0.9.1.
|
|
168
|
-
csv_detective-0.9.1.
|
|
164
|
+
csv_detective-0.9.1.dev1860.dist-info/METADATA,sha256=v8z2NQcMQznhH_35NtggEtjF-H9UGUycexq3Y8dNtp8,9767
|
|
165
|
+
csv_detective-0.9.1.dev1860.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
166
|
+
csv_detective-0.9.1.dev1860.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
167
|
+
csv_detective-0.9.1.dev1860.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
168
|
+
csv_detective-0.9.1.dev1860.dist-info/RECORD,,
|
tests/test_file.py
CHANGED
|
@@ -276,3 +276,20 @@ def test_cast_json(mocked_responses, cast_json):
|
|
|
276
276
|
)
|
|
277
277
|
assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
|
|
278
278
|
assert isinstance(df["a_simple_dict"][0], expected_type)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def test_almost_uniform_column(mocked_responses):
    """
    A column made of a single "9" followed by ten million "1" must be detected as "int".

    NOTE(review): the column name suggests this guards against the column being detected
    as a boolean when the analysis only looks at part of a long file; confirm against
    the detection code.
    """
    url = "http://example.com/test.csv"
    col_name = "int_not_bool"
    n_uniform_rows = int(1e7)
    # header line, then the single outlier, then the uniform values
    expected_content = "".join((f"{col_name}\n", "9\n", "1\n" * n_uniform_rows))
    mocked_responses.get(url, body=expected_content, status=200)

    analysis = routine(
        file_path=url,
        num_rows=-1,
        output_profile=False,
        save_results=False,
    )

    detected_format = analysis["columns"][col_name]["format"]
    assert detected_format == "int"
|
|
File without changes
|
{csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.9.1.dev1847.dist-info → csv_detective-0.9.1.dev1860.dist-info}/top_level.txt
RENAMED
|
File without changes
|