csv-detective 0.9.3.dev1915__py3-none-any.whl → 0.9.3.dev1948__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -183,13 +183,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
183
183
  samples = pd.concat(
184
184
  [
185
185
  # one row with the minimum of the column
186
- table.loc[table[col] == table[col].dropna().min()].iloc[[0]]
186
+ table.loc[table[col] == val].iloc[[0]]
187
187
  for col in table.columns
188
+ if not pd.isna(val := table[col].dropna().min())
188
189
  ]
189
190
  + [
190
191
  # one row with the maximum of the column
191
- table.loc[table[col] == table[col].dropna().max()].iloc[[0]]
192
+ table.loc[table[col] == val].iloc[[0]]
192
193
  for col in table.columns
194
+ if not pd.isna(val := table[col].dropna().max())
193
195
  ]
194
196
  + [
195
197
  # one row with a NaN value if the column has any
@@ -199,7 +201,12 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
199
201
  ],
200
202
  ignore_index=True,
201
203
  )
202
- return pd.concat(
203
- [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
204
- ignore_index=True,
204
+ return (
205
+ pd.concat(
206
+ [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
207
+ ignore_index=True,
208
+ )
209
+ # this is very unlikely but we never know
210
+ if len(samples) <= MAX_ROWS_ANALYSIS
211
+ else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
205
212
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.3.dev1915
3
+ Version: 0.9.3.dev1948
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -132,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
132
132
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
133
133
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
134
134
  csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
135
- csv_detective/detection/formats.py,sha256=94vhRl0GQlRsp7Upkt6Sceess5qXQR5eYrWN-C-CVR8,7461
135
+ csv_detective/detection/formats.py,sha256=aP6boV9fz0xH-u_uMAwwo2GKO_jkUBWi8orxRcZQVGE,7734
136
136
  csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
137
137
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
138
138
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -150,19 +150,19 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
150
150
  csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
151
151
  csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
152
152
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
153
- csv_detective-0.9.3.dev1915.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
+ csv_detective-0.9.3.dev1948.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
154
154
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
156
156
  tests/test_fields.py,sha256=-6wwuqNmGUIxpNn4u9_OmgqgS95uKWBtahDGy3iw3NI,12566
157
- tests/test_file.py,sha256=ZL0Jx499RUpmKFvcPQVnAeafSbyc23fqwt93ZrYg9GE,10258
157
+ tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
158
158
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
159
159
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
160
160
  tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
161
161
  venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
162
162
  venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
163
163
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
164
- csv_detective-0.9.3.dev1915.dist-info/METADATA,sha256=9HxUTSbsVUzmqObgTsqSMbjSQkNgxZxdZp6OdImjUN0,9767
165
- csv_detective-0.9.3.dev1915.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
- csv_detective-0.9.3.dev1915.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
- csv_detective-0.9.3.dev1915.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
- csv_detective-0.9.3.dev1915.dist-info/RECORD,,
164
+ csv_detective-0.9.3.dev1948.dist-info/METADATA,sha256=gl7Ss-DfsY0OU7kn0cdoe4PInQ1WpXed4GRru0np4rU,9767
165
+ csv_detective-0.9.3.dev1948.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ csv_detective-0.9.3.dev1948.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
+ csv_detective-0.9.3.dev1948.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
+ csv_detective-0.9.3.dev1948.dist-info/RECORD,,
tests/test_file.py CHANGED
@@ -6,6 +6,7 @@ import responses
6
6
 
7
7
  from csv_detective import routine
8
8
  from csv_detective.output.profile import create_profile
9
+ from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
9
10
 
10
11
 
11
12
  @pytest.mark.parametrize(
@@ -343,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
343
344
  save_results=False,
344
345
  )
345
346
  assert analysis["columns"][col_name]["format"] == "int"
347
+
348
+
349
+ def test_full_nan_column(mocked_responses):
350
+ # we want a file that needs sampling
351
+ expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
352
+ mocked_responses.get(
353
+ "http://example.com/test.csv",
354
+ body=expected_content,
355
+ status=200,
356
+ )
357
+ # just testing it doesn't fail
358
+ routine(
359
+ file_path="http://example.com/test.csv",
360
+ num_rows=-1,
361
+ output_profile=False,
362
+ save_results=False,
363
+ )