csv-detective 0.9.3.dev1901__py3-none-any.whl → 0.9.3.dev1948__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
1
1
  import re
2
2
 
3
- PROPORTION = 1
3
+ PROPORTION = 0.9
4
4
 
5
5
 
6
6
  def _is(val):
7
7
  """Detects e-mails"""
8
8
  return isinstance(val, str) and bool(
9
- re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
9
+ re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
10
10
  )
@@ -30,7 +30,7 @@ def detect_formats(
30
30
  on_sample = len(table) > MAX_ROWS_ANALYSIS
31
31
  if on_sample:
32
32
  if verbose:
33
- logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
33
+ logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
34
34
  table = build_sample(table)
35
35
 
36
36
  if table.empty:
@@ -183,13 +183,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
183
183
  samples = pd.concat(
184
184
  [
185
185
  # one row with the minimum of the column
186
- table.loc[table[col] == table[col].dropna().min()].iloc[[0]]
186
+ table.loc[table[col] == val].iloc[[0]]
187
187
  for col in table.columns
188
+ if not pd.isna(val := table[col].dropna().min())
188
189
  ]
189
190
  + [
190
191
  # one row with the maximum of the column
191
- table.loc[table[col] == table[col].dropna().max()].iloc[[0]]
192
+ table.loc[table[col] == val].iloc[[0]]
192
193
  for col in table.columns
194
+ if not pd.isna(val := table[col].dropna().max())
193
195
  ]
194
196
  + [
195
197
  # one row with a NaN value if the column has any
@@ -199,7 +201,12 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
199
201
  ],
200
202
  ignore_index=True,
201
203
  )
202
- return pd.concat(
203
- [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
204
- ignore_index=True,
204
+ return (
205
+ pd.concat(
206
+ [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
207
+ ignore_index=True,
208
+ )
209
+ # this is very unlikely but we never know
210
+ if len(samples) <= MAX_ROWS_ANALYSIS
211
+ else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
205
212
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.3.dev1901
3
+ Version: 0.9.3.dev1948
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -56,7 +56,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=gPnNTe-L9xjBVE-
56
56
  csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
57
57
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
59
- csv_detective/detect_fields/other/email/__init__.py,sha256=p235wILf0fR9TeSEuyuPgoysAv9zg23a4vzdy3YJlxE,192
59
+ csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
60
60
  csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
61
61
  csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
62
62
  csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
@@ -132,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
132
132
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
133
133
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
134
134
  csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
135
- csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
135
+ csv_detective/detection/formats.py,sha256=aP6boV9fz0xH-u_uMAwwo2GKO_jkUBWi8orxRcZQVGE,7734
136
136
  csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
137
137
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
138
138
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -150,19 +150,19 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
150
150
  csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
151
151
  csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
152
152
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
153
- csv_detective-0.9.3.dev1901.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
+ csv_detective-0.9.3.dev1948.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
154
154
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
156
- tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
157
- tests/test_file.py,sha256=ZL0Jx499RUpmKFvcPQVnAeafSbyc23fqwt93ZrYg9GE,10258
156
+ tests/test_fields.py,sha256=-6wwuqNmGUIxpNn4u9_OmgqgS95uKWBtahDGy3iw3NI,12566
157
+ tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
158
158
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
159
159
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
160
160
  tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
161
161
  venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
162
162
  venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
163
163
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
164
- csv_detective-0.9.3.dev1901.dist-info/METADATA,sha256=zlYwJcrxQIjXmPEUaQuAIsIyl2hQsa_ORGAwO5SKfAw,9767
165
- csv_detective-0.9.3.dev1901.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
- csv_detective-0.9.3.dev1901.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
- csv_detective-0.9.3.dev1901.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
- csv_detective-0.9.3.dev1901.dist-info/RECORD,,
164
+ csv_detective-0.9.3.dev1948.dist-info/METADATA,sha256=gl7Ss-DfsY0OU7kn0cdoe4PInQ1WpXed4GRru0np4rU,9767
165
+ csv_detective-0.9.3.dev1948.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ csv_detective-0.9.3.dev1948.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
+ csv_detective-0.9.3.dev1948.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
+ csv_detective-0.9.3.dev1948.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -284,7 +284,7 @@ fields = {
284
284
  False: ["nein", "ja", "2", "-0"],
285
285
  },
286
286
  email: {
287
- True: ["cdo_intern@data.gouv.fr"],
287
+ True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
288
288
  False: ["cdo@@gouv.sfd"],
289
289
  },
290
290
  json: {
tests/test_file.py CHANGED
@@ -6,6 +6,7 @@ import responses
6
6
 
7
7
  from csv_detective import routine
8
8
  from csv_detective.output.profile import create_profile
9
+ from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
9
10
 
10
11
 
11
12
  @pytest.mark.parametrize(
@@ -343,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
343
344
  save_results=False,
344
345
  )
345
346
  assert analysis["columns"][col_name]["format"] == "int"
347
+
348
+
349
+ def test_full_nan_column(mocked_responses):
350
+ # we want a file that needs sampling
351
+ expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
352
+ mocked_responses.get(
353
+ "http://example.com/test.csv",
354
+ body=expected_content,
355
+ status=200,
356
+ )
357
+ # just testing it doesn't fail
358
+ routine(
359
+ file_path="http://example.com/test.csv",
360
+ num_rows=-1,
361
+ output_profile=False,
362
+ save_results=False,
363
+ )