csv-detective 0.9.1.dev1847__py3-none-any.whl → 0.9.1.dev1860__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,9 @@ from csv_detective.output.utils import prepare_output_dict
14
14
  from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
15
15
  from csv_detective.validate import validate
16
16
 
17
+ # above this threshold, a column is not considered categorical
18
+ MAX_NUMBER_CATEGORICAL_VALUES = 25
19
+
17
20
 
18
21
  def detect_formats(
19
22
  table: pd.DataFrame,
@@ -28,14 +31,18 @@ def detect_formats(
28
31
  if on_sample:
29
32
  if verbose:
30
33
  logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
31
- table = table.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
34
+ table = build_sample(table)
32
35
 
33
36
  if table.empty:
34
37
  res_categorical = []
35
38
  # res_continuous = []
36
39
  else:
37
40
  # Detects columns that are categorical
38
- res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
41
+ res_categorical, categorical_mask = detect_categorical_variable(
42
+ table,
43
+ max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
44
+ verbose=verbose,
45
+ )
39
46
  res_categorical = list(res_categorical)
40
47
  # Detect columns that are continuous (we already know the categorical) :
41
48
  # we don't need this for now, cuts processing time
@@ -166,3 +173,33 @@ def detect_formats(
166
173
  raise ValueError("Could not infer detected formats on the whole file")
167
174
 
168
175
  return analysis
176
+
177
+
178
def build_sample(table: pd.DataFrame, max_rows: "int | None" = None) -> pd.DataFrame:
    """
    Build a sample of the table that keeps the extreme values of every column.

    The sample starts with "representative" rows, in this order:
    one row holding the minimum of each column, one row holding the maximum of
    each column, then one row with a NaN for each column that contains any.
    It is then completed with randomly drawn rows (fixed seed, so the result is
    reproducible) until it reaches `max_rows` rows.

    Args:
        table: the full table to sample from.
        max_rows: target size of the sample; defaults to MAX_ROWS_ANALYSIS.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. It can hold fewer than
        `max_rows` random rows when the table is shorter than requested, and it
        holds more than `max_rows` rows when the representatives alone exceed
        that size (they are never dropped). The same row may appear several
        times, e.g. when it is both a representative and randomly drawn.
    """
    if max_rows is None:
        # resolved at call time so the module-level constant remains the default
        max_rows = MAX_ROWS_ANALYSIS

    rows_with_min, rows_with_max, rows_with_nan = [], [], []
    for col in table.columns:
        column = table[col]
        is_missing = column.isna()
        if is_missing.any():
            rows_with_nan.append(table.loc[is_missing].iloc[[0]])
        filled = column[~is_missing]
        if filled.empty:
            # column without any value: its min/max would be NaN, which never
            # compares equal to anything, so there is no row to pick
            continue
        rows_with_min.append(table.loc[column == filled.min()].iloc[[0]])
        rows_with_max.append(table.loc[column == filled.max()].iloc[[0]])

    representatives = rows_with_min + rows_with_max + rows_with_nan
    # DataFrame.sample rejects a negative n and an n larger than the table
    n_random = min(max(max_rows - len(representatives), 0), len(table))
    random_rows = table.sample(n=n_random, random_state=1) if n_random else table.iloc[:0]
    return pd.concat(representatives + [random_rows], ignore_index=True)
@@ -6,7 +6,7 @@ import pandas as pd
6
6
 
7
7
  from csv_detective.utils import display_logs_depending_process_time
8
8
 
9
- MAX_ROWS_ANALYSIS = int(1e5)
9
+ MAX_ROWS_ANALYSIS = int(1e4)
10
10
 
11
11
 
12
12
  def test_col_val(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.1.dev1847
3
+ Version: 0.9.1.dev1860
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -132,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
132
132
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
133
133
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
134
134
  csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
135
- csv_detective/detection/formats.py,sha256=dzJPdi2rP2jTHZBk9UHpJL3c5N-PSohCymHs-OZt45c,6211
135
+ csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
136
136
  csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
137
137
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
138
138
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -144,25 +144,25 @@ csv_detective/output/profile.py,sha256=Jeh0mrfH_hAVxV2E5I4XzdCm7ZAGAV_Xj3AXOi77l
144
144
  csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
145
145
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
146
146
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
147
- csv_detective/parsing/columns.py,sha256=fbvQMu12gAmz4TnNCL7pLnMFB-mWN_O-zEoj8jEGj0A,5696
147
+ csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
148
148
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
149
149
  csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
150
150
  csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
151
151
  csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
152
152
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
153
- csv_detective-0.9.1.dev1847.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
+ csv_detective-0.9.1.dev1860.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
154
154
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
156
156
  tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
157
- tests/test_file.py,sha256=YuVbSfeo_ASPiLT8CyxXqJENcDpj4wAFXzLwu_GzsOA,8437
157
+ tests/test_file.py,sha256=NBLwPCFN2skZHLkckPZ0M0ZvanEdL88KVK1Vi9GhSaU,8925
158
158
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
159
159
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
160
160
  tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
161
161
  venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
162
162
  venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
163
163
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
164
- csv_detective-0.9.1.dev1847.dist-info/METADATA,sha256=4GPrJUwsDAkxwVV9fnFv4pVHmelYX1C1H4QCh_zG8wc,9767
165
- csv_detective-0.9.1.dev1847.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
- csv_detective-0.9.1.dev1847.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
- csv_detective-0.9.1.dev1847.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
- csv_detective-0.9.1.dev1847.dist-info/RECORD,,
164
+ csv_detective-0.9.1.dev1860.dist-info/METADATA,sha256=v8z2NQcMQznhH_35NtggEtjF-H9UGUycexq3Y8dNtp8,9767
165
+ csv_detective-0.9.1.dev1860.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ csv_detective-0.9.1.dev1860.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
+ csv_detective-0.9.1.dev1860.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
+ csv_detective-0.9.1.dev1860.dist-info/RECORD,,
tests/test_file.py CHANGED
@@ -276,3 +276,20 @@ def test_cast_json(mocked_responses, cast_json):
276
276
  )
277
277
  assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
278
278
  assert isinstance(df["a_simple_dict"][0], expected_type)
279
+
280
+
281
def test_almost_uniform_column(mocked_responses):
    """
    A column made of a single 9 followed by ten million 1s must be detected as "int".

    The file is served through the mocked HTTP endpoint and analysed on all rows
    (num_rows=-1), without profile output and without saving results.
    """
    column = "int_not_bool"
    url = "http://example.com/test.csv"
    # header line, the lone outlier, then the uniform bulk of the column
    csv_body = "".join([f"{column}\n", "9\n", "1\n" * int(1e7)])
    mocked_responses.get(url, body=csv_body, status=200)
    analysis = routine(
        file_path=url,
        num_rows=-1,
        output_profile=False,
        save_results=False,
    )
    detected = analysis["columns"][column]
    assert detected["format"] == "int"