csv-detective 0.8.1.dev1380__py3-none-any.whl → 0.8.1.dev1416__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,14 @@
1
+ import re
1
2
 
2
3
  PROPORTION = 1
4
+ url_pattern = re.compile(
5
+ r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
6
+ r"(/[A-Za-z0-9._~:/?#[@!$&'()*+,;=%-]*)?$"
7
+ )
3
8
 
4
9
 
5
10
  def _is(val):
6
- '''Detects urls'''
11
+ """Detects urls"""
7
12
  if not isinstance(val, str):
8
13
  return False
9
- a = 'http://' in val
10
- b = 'www.' in val
11
- c = any([x in val for x in ['.fr', '.com', '.org', '.gouv', '.net']])
12
- d = not ('@' in val)
13
- return (a or b or c) and d
14
+ return bool(url_pattern.match(val))
csv_detective/utils.py CHANGED
@@ -25,6 +25,7 @@ def display_logs_depending_process_time(prompt: str, duration: float):
25
25
 
26
26
  def is_url(file_path: str) -> bool:
27
27
  # could be more sophisticated if needed
28
+ # using the URL detection test was considered, but it is too broad (a URL scheme is required to use requests)
28
29
  return file_path.startswith('http')
29
30
 
30
31
 
@@ -3,6 +3,7 @@
3
3
  ## Current (in progress)
4
4
 
5
5
  - Refactor label testing [#119](https://github.com/datagouv/csv-detective/pull/119)
6
+ - Better URL detection [#120](https://github.com/datagouv/csv-detective/pull/120)
6
7
 
7
8
  ## 0.8.0 (2025-05-20)
8
9
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv_detective
3
- Version: 0.8.1.dev1380
3
+ Version: 0.8.1.dev1416
4
4
  Summary: Detect CSV column content
5
5
  Home-page: https://github.com/etalab/csv_detective
6
6
  Author: Etalab
@@ -3,7 +3,7 @@ csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
3
3
  csv_detective/explore_csv.py,sha256=IT1-9TbS78p6oeDpQ5T6DQ93xQbobcscyBQb6nh86H4,9082
4
4
  csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
5
5
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
6
- csv_detective/utils.py,sha256=CfR4XztO9KdBecjAX0MfclcRgtB1siv4tQrbCAXyOls,927
6
+ csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
7
7
  csv_detective/validate.py,sha256=4e7f8bNXPU9GqNx4QXXiaoINyotozbL52JB6psVAjyY,2631
8
8
  csv_detective/detect_fields/__init__.py,sha256=7Tz0Niaz0BboA3YVsp_6WPA6ywciwDN4-lOy_Ie_0Y8,976
9
9
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -63,7 +63,7 @@ csv_detective/detect_fields/other/money/__init__.py,sha256=g_ZwBZXl9LhldwFYQotC5
63
63
  csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
64
64
  csv_detective/detect_fields/other/percent/__init__.py,sha256=vgpekNOPBRuunoVBXMi81rwHv4uSOhe78pbVtQ5SBO8,177
65
65
  csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeEZ5Hkf5Wwi3ZKclLER_V0YO3g,154
66
- csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
66
+ csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
67
67
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
68
68
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
69
  csv_detective/detect_fields/temp/date/__init__.py,sha256=1a_Ra9fmT4wgGMrcknXP7eN7A2QiaMF0Yjy0-BMihtA,987
@@ -147,19 +147,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
147
147
  csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
148
148
  csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
149
149
  csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
150
- csv_detective-0.8.1.dev1380.data/data/share/csv_detective/CHANGELOG.md,sha256=rPCHesCnCZgVSjdXkzEtDCgkkA__aKmvJWko_SvD4gs,8361
151
- csv_detective-0.8.1.dev1380.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
152
- csv_detective-0.8.1.dev1380.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
153
- csv_detective-0.8.1.dev1380.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
150
+ csv_detective-0.8.1.dev1416.data/data/share/csv_detective/CHANGELOG.md,sha256=Ar1X9WX1CVoStDzDEOo5O3P0DgRtUUmo70KAYlWLJyQ,8443
151
+ csv_detective-0.8.1.dev1416.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
152
+ csv_detective-0.8.1.dev1416.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
153
+ csv_detective-0.8.1.dev1416.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
154
154
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
156
- tests/test_fields.py,sha256=E6kEsp6_W56WW6FXWUl7hggsJv-vsKuOaJ9JLoFmrUw,9964
156
+ tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
157
157
  tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
158
158
  tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
159
159
  tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
160
160
  tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
161
- csv_detective-0.8.1.dev1380.dist-info/METADATA,sha256=_892qUzBNdUnGSDIZbXDWVSi-3s4OvgGhxsBkizXWYQ,1386
162
- csv_detective-0.8.1.dev1380.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
163
- csv_detective-0.8.1.dev1380.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
164
- csv_detective-0.8.1.dev1380.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
165
- csv_detective-0.8.1.dev1380.dist-info/RECORD,,
161
+ csv_detective-0.8.1.dev1416.dist-info/METADATA,sha256=aCmQVKUNFvJLzTS8DHELQme0GS9jwrHGod4JLWIGt1o,1386
162
+ csv_detective-0.8.1.dev1416.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
+ csv_detective-0.8.1.dev1416.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
164
+ csv_detective-0.8.1.dev1416.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
165
+ csv_detective-0.8.1.dev1416.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.8.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/test_fields.py CHANGED
@@ -293,8 +293,17 @@ fields = {
293
293
  False: ["adresse@mail"],
294
294
  },
295
295
  url: {
296
- True: ["www.etalab.data.gouv.fr"],
297
- False: ["une phrase avec un @ dedans"],
296
+ True: [
297
+ "www.data.gouv.fr",
298
+ "http://data.gouv.fr",
299
+ "https://www.youtube.com/@data-gouv-fr",
300
+ (
301
+ "https://tabular-api.data.gouv.fr/api/resources/"
302
+ "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
303
+ "?score__greater=0.9&decompte__exact=13"
304
+ ),
305
+ ],
306
+ False: ["tmp@data.gouv.fr"],
298
307
  },
299
308
  uuid: {
300
309
  True: ["884762be-51f3-44c3-b811-1e14c5d89262"],