csv-detective 0.8.1.dev1362__py3-none-any.whl → 0.8.1.dev1416__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. csv_detective/detect_fields/other/url/__init__.py +7 -6
  2. csv_detective/detect_labels/FR/geo/adresse/__init__.py +9 -34
  3. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +11 -36
  4. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +11 -29
  5. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +8 -29
  6. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +10 -35
  7. csv_detective/detect_labels/FR/geo/code_region/__init__.py +10 -29
  8. csv_detective/detect_labels/FR/geo/commune/__init__.py +8 -29
  9. csv_detective/detect_labels/FR/geo/departement/__init__.py +16 -41
  10. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +9 -29
  11. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +24 -48
  12. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +24 -49
  13. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +15 -38
  14. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +14 -38
  15. csv_detective/detect_labels/FR/geo/pays/__init__.py +14 -39
  16. csv_detective/detect_labels/FR/geo/region/__init__.py +14 -39
  17. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +4 -29
  18. csv_detective/detect_labels/FR/other/code_rna/__init__.py +7 -32
  19. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +4 -29
  20. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +6 -30
  21. csv_detective/detect_labels/FR/other/date_fr/__init__.py +5 -29
  22. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +9 -34
  23. csv_detective/detect_labels/FR/other/sexe/__init__.py +4 -29
  24. csv_detective/detect_labels/FR/other/siren/__init__.py +10 -35
  25. csv_detective/detect_labels/FR/other/siret/__init__.py +9 -34
  26. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +14 -38
  27. csv_detective/detect_labels/FR/other/uai/__init__.py +17 -42
  28. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +10 -35
  29. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +4 -29
  30. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +10 -35
  31. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +10 -35
  32. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +10 -35
  33. csv_detective/detect_labels/geo/json_geojson/__init__.py +11 -36
  34. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +24 -49
  35. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +37 -61
  36. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +14 -38
  37. csv_detective/detect_labels/other/booleen/__init__.py +4 -30
  38. csv_detective/detect_labels/other/email/__init__.py +14 -39
  39. csv_detective/detect_labels/other/float/__init__.py +4 -29
  40. csv_detective/detect_labels/other/int/__init__.py +4 -29
  41. csv_detective/detect_labels/other/money/__init__.py +5 -8
  42. csv_detective/detect_labels/other/mongo_object_id/__init__.py +3 -28
  43. csv_detective/detect_labels/other/twitter/__init__.py +4 -29
  44. csv_detective/detect_labels/other/url/__init__.py +17 -42
  45. csv_detective/detect_labels/other/uuid/__init__.py +4 -29
  46. csv_detective/detect_labels/temp/date/__init__.py +22 -47
  47. csv_detective/detect_labels/temp/datetime_iso/__init__.py +14 -39
  48. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +13 -38
  49. csv_detective/detect_labels/temp/year/__init__.py +13 -38
  50. csv_detective/parsing/text.py +42 -20
  51. csv_detective/utils.py +1 -4
  52. {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/CHANGELOG.md +2 -1
  53. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/METADATA +1 -1
  54. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/RECORD +62 -63
  55. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/WHEEL +1 -1
  56. tests/test_fields.py +11 -2
  57. tests/test_labels.py +18 -2
  58. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  59. {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  60. {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/README.md +0 -0
  61. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/entry_points.txt +0 -0
  62. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
  63. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/top_level.txt +0 -0
csv_detective/detect_labels/temp/year/__init__.py CHANGED
@@ -1,44 +1,19 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
6
+ def _is(header: str) -> float:
13
7
  words_combinations_list = [
14
- 'year',
15
- 'annee',
16
- 'annee depot',
17
- 'an nais',
18
- 'exercice',
19
- 'data year',
20
- 'annee de publication',
21
- 'exercice comptable',
22
- 'annee de naissance',
23
- 'annee ouverture'
8
+ "year",
9
+ "annee",
10
+ "annee depot",
11
+ "an nais",
12
+ "exercice",
13
+ "data year",
14
+ "annee de publication",
15
+ "exercice comptable",
16
+ "annee de naissance",
17
+ "annee ouverture",
24
18
  ]
25
- processed_header = _process_text(header)
26
-
27
- header_matches_words_combination = float(
28
- any(
29
- [
30
- words_combination == processed_header for words_combination in words_combinations_list
31
- ]
32
- )
33
- )
34
- words_combination_in_header = 0.5 * float(
35
- any(
36
- [
37
- is_word_in_string(
38
- words_combination, processed_header
39
- ) for words_combination in words_combinations_list
40
- ]
41
- )
42
- )
43
-
44
- return max(header_matches_words_combination, words_combination_in_header)
19
+ return header_score(header, words_combinations_list)
csv_detective/parsing/text.py CHANGED
@@ -8,6 +8,17 @@ def camel_case_split(identifier: str):
8
8
  return " ".join([m.group(0) for m in matches])
9
9
 
10
10
 
11
+ translate_dict = {
12
+ " ": ["-", "_", "'", ",", " "],
13
+ "a": ["à", "â"],
14
+ "c": ["ç"],
15
+ "e": ["é", "è", "ê", "é"],
16
+ "i": ["î", "ï"],
17
+ "o": ["ô", "ö"],
18
+ "u": ["ù", "û", "ü"],
19
+ }
20
+
21
+
11
22
  # Process text
12
23
  def _process_text(val: str):
13
24
  """Traitement des chaînes de caractères pour les standardiser.
@@ -15,25 +26,36 @@ def _process_text(val: str):
15
26
  des méthodes hybrides, mais aucune ne s'est avérée plus performante."""
16
27
  val = camel_case_split(val)
17
28
  val = val.lower()
18
- val = val.replace("-", " ")
19
- val = val.replace("_", " ")
20
- val = val.replace("'", " ")
21
- val = val.replace(",", " ")
22
- val = val.replace(" ", " ")
23
- val = val.replace("à", "a")
24
- val = val.replace("â", "a")
25
- val = val.replace("ç", "c")
26
- val = val.replace("é", "e")
27
- val = val.replace("é", "e")
28
- val = val.replace("è", "e")
29
- val = val.replace("ê", "e")
30
- val = val.replace("î", "i")
31
- val = val.replace("ï", "i")
32
- val = val.replace("ô", "o")
33
- val = val.replace("ö", "o")
34
- val = val.replace("î", "i")
35
- val = val.replace("û", "u")
36
- val = val.replace("ù", "u")
37
- val = val.replace("ü", "u")
29
+ for target in translate_dict:
30
+ for source in translate_dict[target]:
31
+ val = val.replace(source, target)
38
32
  val = val.strip()
39
33
  return val
34
+
35
+
36
+ def is_word_in_string(word: str, string: str):
37
+ # if the substring is too short, the test can become irrelevant
38
+ return len(word) > 2 and word in string
39
+
40
+
41
+ def header_score(header: str, words_combinations_list: list[str]) -> float:
42
+ """Returns:
43
+ - 1 if the header is exactly in the specified list
44
+ - 0.5 if any of the words is within the header
45
+ - 0 otherwise"""
46
+ processed_header = _process_text(header)
47
+
48
+ header_matches_words_combination = float(
49
+ any(
50
+ words_combination == processed_header for words_combination in words_combinations_list
51
+ )
52
+ )
53
+ words_combination_in_header = 0.5 * (
54
+ any(
55
+ is_word_in_string(
56
+ words_combination, processed_header
57
+ ) for words_combination in words_combinations_list
58
+ )
59
+ )
60
+
61
+ return max(header_matches_words_combination, words_combination_in_header)
csv_detective/utils.py CHANGED
@@ -25,6 +25,7 @@ def display_logs_depending_process_time(prompt: str, duration: float):
25
25
 
26
26
  def is_url(file_path: str) -> bool:
27
27
  # could be more sophisticated if needed
28
+ # using the URL detection test was considered but too broad (schema required to use requests)
28
29
  return file_path.startswith('http')
29
30
 
30
31
 
@@ -32,7 +33,3 @@ def prevent_nan(value: float) -> Optional[float]:
32
33
  if math.isnan(value):
33
34
  return None
34
35
  return value
35
-
36
-
37
- def is_word_in_string(word: str, string: str):
38
- return word in string
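{csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/CHANGELOG.md CHANGED
@@ -2,7 +2,8 @@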
2
2
 
3
3
  ## Current (in progress)
4
4
 
5
- - Nothing yet
5
+ - Refactor label testing [#119](https://github.com/datagouv/csv-detective/pull/119)
6
+ - Better URL detection [#120](https://github.com/datagouv/csv-detective/pull/120)
6
7
 
7
8
  ## 0.8.0 (2025-05-20)
8
9
 
{csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv_detective
3
- Version: 0.8.1.dev1362
3
+ Version: 0.8.1.dev1416
4
4
  Summary: Detect CSV column content
5
5
  Home-page: https://github.com/etalab/csv_detective
6
6
  Author: Etalab
{csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/RECORD CHANGED
@@ -3,7 +3,7 @@ csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
3
3
  csv_detective/explore_csv.py,sha256=IT1-9TbS78p6oeDpQ5T6DQ93xQbobcscyBQb6nh86H4,9082
4
4
  csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
5
5
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
6
- csv_detective/utils.py,sha256=8cBKgWifWF7BG_uMfLmxtV45p6PZ4b50NjWXKoAAZ4s,1002
6
+ csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
7
7
  csv_detective/validate.py,sha256=4e7f8bNXPU9GqNx4QXXiaoINyotozbL52JB6psVAjyY,2631
8
8
  csv_detective/detect_fields/__init__.py,sha256=7Tz0Niaz0BboA3YVsp_6WPA6ywciwDN4-lOy_Ie_0Y8,976
9
9
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -63,7 +63,7 @@ csv_detective/detect_fields/other/money/__init__.py,sha256=g_ZwBZXl9LhldwFYQotC5
63
63
  csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
64
64
  csv_detective/detect_fields/other/percent/__init__.py,sha256=vgpekNOPBRuunoVBXMi81rwHv4uSOhe78pbVtQ5SBO8,177
65
65
  csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeEZ5Hkf5Wwi3ZKclLER_V0YO3g,154
66
- csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
66
+ csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
67
67
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
68
68
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
69
  csv_detective/detect_fields/temp/date/__init__.py,sha256=1a_Ra9fmT4wgGMrcknXP7eN7A2QiaMF0Yjy0-BMihtA,987
@@ -74,60 +74,59 @@ csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRh
74
74
  csv_detective/detect_labels/__init__.py,sha256=BJjWlwTnnDe9nomABDUreu9EMu6IFG3T47d7YCJZbRc,878
75
75
  csv_detective/detect_labels/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
76
  csv_detective/detect_labels/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
- csv_detective/detect_labels/FR/geo/adresse/__init__.py,sha256=ISgpkhy6KwOmKqCt6w_RpxZ7zm5gx2D3mp2UE9D6Pjw,1033
78
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py,sha256=_QKJX7Og8cL1AYBLjIbvULsy-XJ017G0ZXk7H_GOqdI,1067
79
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py,sha256=_lU5bXG8hODduVxVyXegZjRR_mxWM3SXfwb6stJbOrU,995
80
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py,sha256=qIFLhkj3vr0lfBHtDwYNhGqLgdzN0w7LRFJByt0pEts,919
81
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py,sha256=TUquZFf6cuTIvjvox8ReIiOqzJnepCZcLX21KNtWwyo,1040
82
- csv_detective/detect_labels/FR/geo/code_region/__init__.py,sha256=6I9DpXNMBYJ1bTqAiheFhnMo2vbrz51PdZttrbinGVA,982
83
- csv_detective/detect_labels/FR/geo/commune/__init__.py,sha256=WQl7z3h0428A-4H5ytry0XseAjE7hKLVh2YvCFvqfuM,918
84
- csv_detective/detect_labels/FR/geo/departement/__init__.py,sha256=qnCjAkBGwsKsfLtvW_EgG-9eK_SBgyFrBKE9Q0A7wxI,1199
85
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py,sha256=dLJPbSuOQETbl1IBeme5H4KXtDlfPBe5lIfczR4ek48,927
86
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py,sha256=fJrd8pIewZqAkNNfERWD39kK3oxzYy-Paxce66c3UnY,1356
87
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=xRrXcUUlk7XqHuHbTXUToM3n90_kLXQxdSzMkcc9jIc,1351
88
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py,sha256=1uFQv436tkosABNVU_htAJcggJ6QRlF70-aBgHJHc8A,1109
89
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=dOhUJy_vukt9xFnY2CG4wg1q9vHBUa00mbsu4YSN6xY,1114
90
- csv_detective/detect_labels/FR/geo/pays/__init__.py,sha256=-k5shWSQnLpDvRWKuGFqt5ScbNyBO__vL-4UrL_hRjQ,1139
91
- csv_detective/detect_labels/FR/geo/region/__init__.py,sha256=uQKqMZvG4bs0eafvRHV2RwtbwFJ9vCFQNE2Ep23eHq0,1134
77
+ csv_detective/detect_labels/FR/geo/adresse/__init__.py,sha256=fNWFW-Wo3n6azDBfmi0J0qnzP-p2StLxCc9eNiE9NNE,346
78
+ csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py,sha256=Cr9eyNnP1bLcOx0BlF9ZGZkQDTVuSFjPxvkoZJGs-Eg,379
79
+ csv_detective/detect_labels/FR/geo/code_departement/__init__.py,sha256=Uzufy44ERqIX8wol6tEZg1SrNUcYAWl4AMsWVnL4SLM,355
80
+ csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py,sha256=TUUj3XNlMEK7fl_R5BWBSXYGr_2xzMqIhRTb_GDcnqY,262
81
+ csv_detective/detect_labels/FR/geo/code_postal/__init__.py,sha256=qGGujM5sDqkNZcoVLRRZCh9H9cid9dx2T8jcJsbo3cs,353
82
+ csv_detective/detect_labels/FR/geo/code_region/__init__.py,sha256=gAy0TxV6qL7_SfthSSulouvYJn3C70xMYuqABP61euA,334
83
+ csv_detective/detect_labels/FR/geo/commune/__init__.py,sha256=eTyTtKe1NHTvgaB4jMywIqYRATU2A-E-Tq3m0KDMr6w,261
84
+ csv_detective/detect_labels/FR/geo/departement/__init__.py,sha256=IJy_aKEocrTN39dxK2fE_PoDM4OR9W2rHsR4cULHw9g,512
85
+ csv_detective/detect_labels/FR/geo/insee_canton/__init__.py,sha256=H8iuLwn_x3ctxOL5pi8REqKO5Z3wL4rSDohbSdnnpIM,278
86
+ csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py,sha256=_KjSU6XFeX3Tll5Nb2nnTEhXJXA4-WxqoTov926TGlU,666
87
+ csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
88
+ csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py,sha256=jnbtGriHroGKoOmsmCVGJf6sJXzsVkKH21Qf0aamgkk,428
89
+ csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=jnbtGriHroGKoOmsmCVGJf6sJXzsVkKH21Qf0aamgkk,428
90
+ csv_detective/detect_labels/FR/geo/pays/__init__.py,sha256=GW5wEO0g-YXKXerdtyt4VOVg8kKXUsMb7EPf8nKEbH0,452
91
+ csv_detective/detect_labels/FR/geo/region/__init__.py,sha256=P0eVE46w5GAbTgeGnvJgZynQt7EY_Bi_NZ1gmYxP6io,447
92
92
  csv_detective/detect_labels/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
93
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py,sha256=OIOih96ohL50BXgkopAV6NTXQsp5hP78YC46g_r-hKs,909
94
- csv_detective/detect_labels/FR/other/code_rna/__init__.py,sha256=Nih32b26tuJs2f_x-XZ-cjD4nobgBhXsMALsQDlz2NM,994
95
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py,sha256=JcvDvLHlxddehJHEJNAAu3ZmjcJ__6qa4t440CFtKq0,904
96
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py,sha256=XgcgdjcLA1OdPktRPSPzlXePaK8GYR6SF1DCKSoZ6RA,1013
97
- csv_detective/detect_labels/FR/other/date_fr/__init__.py,sha256=FoWbiIxDHIcoQmyWMayqmnRedd0I_RuC_0SIhWIXzww,945
98
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py,sha256=TnZocEWxNwqcX5Y-c45dW9BCEWUMbwFlqM2p0XRTNWU,1094
99
- csv_detective/detect_labels/FR/other/sexe/__init__.py,sha256=rn035P9h8PsZ-Fu-v71DxcA_6HH9vmJ8lH-hSPmsflg,926
100
- csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=utC1MWILaja5dkNfg3T2-0gXgHxOpIi74L2SaS5Z2PE,1073
101
- csv_detective/detect_labels/FR/other/siret/__init__.py,sha256=zl45o9AtUgAjsH9WZsdU9nDbEXUEOxuRcAX2JOxUe4U,1010
102
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py,sha256=BeHgQwLDrFABECzDYfuAKmXhAFGqTK9mrjk2w3aecNY,1113
103
- csv_detective/detect_labels/FR/other/uai/__init__.py,sha256=XYs7d5CipvJcvL1OEIvqKNg1Ubb9nI2x54KG_jW8Sx8,1286
93
+ csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py,sha256=8nl4UCONRg_x5FtdmTGvPnXS8J1ASWCUGr0Ziv32Ngw,221
94
+ csv_detective/detect_labels/FR/other/code_rna/__init__.py,sha256=I7CliSnzWJzAxNlVmbUjMsXThNQe336RzNuBWOXINkc,307
95
+ csv_detective/detect_labels/FR/other/code_waldec/__init__.py,sha256=soWkoyVsSn2E26Sem8Y7u6gyZc7tqzjMJ9VO3aXfLzQ,216
96
+ csv_detective/detect_labels/FR/other/csp_insee/__init__.py,sha256=AI9nqj3zm6_vycAXsXZdsBD7ceNzMzGQL7xZnDZ8nhw,327
97
+ csv_detective/detect_labels/FR/other/date_fr/__init__.py,sha256=4Crk045ZD_tVovI7C-IqjKFz23Ej5-hrFkhZK4OilqA,258
98
+ csv_detective/detect_labels/FR/other/insee_ape700/__init__.py,sha256=N7LzmtNwZERgrwMy3EFHaVBpdiwkt2_9Tt7XVJLff6U,406
99
+ csv_detective/detect_labels/FR/other/sexe/__init__.py,sha256=ZWhc8S9L1X2fFh2g5Ja-LuhsfHg_lALKrur6yDnGDPk,238
100
+ csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=g7Y7IvW9VKO528z1MSPxfFtRB7kQXSiG7QQ-VZRfFEk,386
101
+ csv_detective/detect_labels/FR/other/siret/__init__.py,sha256=-gvdxUnv3LRfje60ljC4F3B2c1LBcWfV3zZbV3VJZ08,323
102
+ csv_detective/detect_labels/FR/other/tel_fr/__init__.py,sha256=pg2nwqw2lphUMUeuuh_8NPi54TPmQFP3c8Dl9yGOxbI,427
103
+ csv_detective/detect_labels/FR/other/uai/__init__.py,sha256=5L6JowK9y6y9uZNg6hWzknMSzh0SurkwQeTINNKTdYY,599
104
104
  csv_detective/detect_labels/FR/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py,sha256=Ezpf-7lsk389VKdKMZvZ00rMqq070uSVVb8oko06KGw,1044
106
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py,sha256=5GytrQmPCmr-vndjcAS5cQWOO4RPvrfQh8KqH9qhrCc,904
105
+ csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py,sha256=Vmv7Hp6LxR-bh3aXOBCHYzJVyCHtGoiWzJ40xnfTvdA,357
106
+ csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py,sha256=M4ANAy40vq328DRdB6LudjO9G9duSh7e-RqFr6axXO0,225
107
107
  csv_detective/detect_labels/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py,sha256=CUtYIsh08LjNoa-BJkxrYvHuwJBG--u1AK5BN4RDpL4,1035
109
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=CUtYIsh08LjNoa-BJkxrYvHuwJBG--u1AK5BN4RDpL4,1035
110
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=CUtYIsh08LjNoa-BJkxrYvHuwJBG--u1AK5BN4RDpL4,1035
111
- csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=3scv7fZ5cxu5MR8RR-AF4KmGhkZT--CYcFg22IibhkY,1042
112
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=xRrXcUUlk7XqHuHbTXUToM3n90_kLXQxdSzMkcc9jIc,1351
113
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=yL8Fp4DcwOm0f5_5CbSZwbvGD1p3LOkRS7hxz778O7g,1675
114
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=yHhVPefvqgl8Q1fEdstoxDeGyJNkJ-2b1S5cwdF4HTI,1115
108
+ csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
109
+ csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
110
+ csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
111
+ csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=On8VOCDD0EspZra6fTQCXH4MYao2xmRu-o7xWcab7Jg,355
112
+ csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
113
+ csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=dbWX1LKpoev7zwWthw9vlwGQp6CSlgYrTBnPpvyNC-A,989
114
+ csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=_8IV2FLtrOjzhQNsk-fsgc9-jbAgzKDVMr4tXu2P-s4,429
115
115
  csv_detective/detect_labels/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
- csv_detective/detect_labels/other/booleen/__init__.py,sha256=0AvbuPVr7corJLDOu-wNS9BOy6J8XzOPIouS9MyFKHA,957
117
- csv_detective/detect_labels/other/email/__init__.py,sha256=0VXS8hWILdGRWugx9hEz5yEAnlaoJ6jYX3znkzjlDYE,1118
118
- csv_detective/detect_labels/other/float/__init__.py,sha256=FD8NlVSZ0TARGKKKCkWYRT9vYwDXpQe7X4V7VPJNUrw,896
119
- csv_detective/detect_labels/other/int/__init__.py,sha256=I8ff6zX1tsk4JtNWs0V0Vam-BtdiKiGyUkUvIysfbUY,903
120
- csv_detective/detect_labels/other/money/__init__.py,sha256=kBEGuUy6kYkOI3vC_a7waBciG2ipyV9bhC330U8WaoI,279
121
- csv_detective/detect_labels/other/money/check_col_name.py,sha256=zgp5eUnf3XRQuxgdEGfxPfUnniO8Pzw19uK0ICr2pf8,414
122
- csv_detective/detect_labels/other/mongo_object_id/__init__.py,sha256=gyuizUcsQwdwKVmnaGJbauc01SkqhgaXtsq_vWlwsXs,897
123
- csv_detective/detect_labels/other/twitter/__init__.py,sha256=MGuWhcmZFDcBz16v-g8By_k-RF3UimU7qb8QTAAs8PA,929
124
- csv_detective/detect_labels/other/url/__init__.py,sha256=NSMvRhtNJgyVr2AQpkI1O-UWdBiovq62WHEmMb3WlOM,1172
125
- csv_detective/detect_labels/other/uuid/__init__.py,sha256=ePXGCdVfKus67jvdeq5MZA1CA2j47PKjHhWnrsyCAi8,901
116
+ csv_detective/detect_labels/other/booleen/__init__.py,sha256=BZwnfR-Zcv8dqscLrBKhttgwm4Dqq16M0PaGirxYWio,214
117
+ csv_detective/detect_labels/other/email/__init__.py,sha256=Poagn45-eC2a_Wdk5Qs6d2BgYdncCQKZp2yEB50IuNw,431
118
+ csv_detective/detect_labels/other/float/__init__.py,sha256=X0axZN2GAfC_y01zRfIyvOfRsOy2KNQcQ-mlQAKxqT4,216
119
+ csv_detective/detect_labels/other/int/__init__.py,sha256=_1AY7thEBCcgSBQQ2YbY4YaPaxGRQ71BtmaFaX088ig,215
120
+ csv_detective/detect_labels/other/money/__init__.py,sha256=1JRArDZ5r6gtyuKijH_fuuVFVc0f3MN5gPyAf4GPqzs,249
121
+ csv_detective/detect_labels/other/mongo_object_id/__init__.py,sha256=1eoJpaK0mP8Jjh9ljwvG7yG_05fxmAyYoZDdbOVbfw4,209
122
+ csv_detective/detect_labels/other/twitter/__init__.py,sha256=96WhOB6nOutzSFOC5ZJYFSlhHDJRn2SkT4nYNj8E6ww,241
123
+ csv_detective/detect_labels/other/url/__init__.py,sha256=4Ajpdp8W0jS9aHZAAMyUlgefjSgpB7Y6ci29KNkwAoI,485
124
+ csv_detective/detect_labels/other/uuid/__init__.py,sha256=kXVb4oMy-Zv-OYmAIEoNFrBA20l9hbUTdvTfjeMmhjk,213
126
125
  csv_detective/detect_labels/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
127
- csv_detective/detect_labels/temp/date/__init__.py,sha256=oI77XxATeJLk27r8Cdg1DmSNYtLl5Se4zay3eG12eJ0,1292
128
- csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=C8ZgzfZWVw6nebMuySpED2HRUho8W4rLxv6qDNpJvas,1127
129
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=WPSWNPgDPAHBMT-Nv0X-6u3yTQfcsaab2NNiG2-8qgk,1145
130
- csv_detective/detect_labels/temp/year/__init__.py,sha256=AGkHXXvo_oG9di9p9Glae-c8TIPJ0319isnNKOzBCjk,1120
126
+ csv_detective/detect_labels/temp/date/__init__.py,sha256=w0eeZIseAmPwL4OvCWzZXbxGOIXYRKiZUhEtgHiBXd0,604
127
+ csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=d0laZNzHx-kSARs9Re8TZ11GNs99aMz6gXc72CJ6ul4,440
128
+ csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=53ysj7QgsxXwG1le3zfSJd1oaTTf-Er3jBeYi_A4F9g,458
129
+ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWmcu8K-9jPED-pOlMlErfo,433
131
130
  csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
132
131
  csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
133
132
  csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
@@ -147,20 +146,20 @@ csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm
147
146
  csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,1626
148
147
  csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
149
148
  csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
150
- csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
151
- csv_detective-0.8.1.dev1362.data/data/share/csv_detective/CHANGELOG.md,sha256=798lqRkhkIMWZgj8UWholpCJ719DOF_bB0hD8y6HM-g,8291
152
- csv_detective-0.8.1.dev1362.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
153
- csv_detective-0.8.1.dev1362.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
154
- csv_detective-0.8.1.dev1362.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
149
+ csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
150
+ csv_detective-0.8.1.dev1416.data/data/share/csv_detective/CHANGELOG.md,sha256=Ar1X9WX1CVoStDzDEOo5O3P0DgRtUUmo70KAYlWLJyQ,8443
151
+ csv_detective-0.8.1.dev1416.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
152
+ csv_detective-0.8.1.dev1416.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
153
+ csv_detective-0.8.1.dev1416.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
155
154
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
156
155
  tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
157
- tests/test_fields.py,sha256=E6kEsp6_W56WW6FXWUl7hggsJv-vsKuOaJ9JLoFmrUw,9964
156
+ tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
158
157
  tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
159
- tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
158
+ tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
160
159
  tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
161
160
  tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
162
- csv_detective-0.8.1.dev1362.dist-info/METADATA,sha256=Vk1SkLg14wAGFzxtutlVEFt5q9G__rpclVVR7rvtJOo,1386
163
- csv_detective-0.8.1.dev1362.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
164
- csv_detective-0.8.1.dev1362.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
- csv_detective-0.8.1.dev1362.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
166
- csv_detective-0.8.1.dev1362.dist-info/RECORD,,
161
+ csv_detective-0.8.1.dev1416.dist-info/METADATA,sha256=aCmQVKUNFvJLzTS8DHELQme0GS9jwrHGod4JLWIGt1o,1386
162
+ csv_detective-0.8.1.dev1416.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
+ csv_detective-0.8.1.dev1416.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
164
+ csv_detective-0.8.1.dev1416.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
165
+ csv_detective-0.8.1.dev1416.dist-info/RECORD,,
{csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.7.1)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/test_fields.py CHANGED
@@ -293,8 +293,17 @@ fields = {
293
293
  False: ["adresse@mail"],
294
294
  },
295
295
  url: {
296
- True: ["www.etalab.data.gouv.fr"],
297
- False: ["une phrase avec un @ dedans"],
296
+ True: [
297
+ "www.data.gouv.fr",
298
+ "http://data.gouv.fr",
299
+ "https://www.youtube.com/@data-gouv-fr",
300
+ (
301
+ "https://tabular-api.data.gouv.fr/api/resources/"
302
+ "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
303
+ "?score__greater=0.9&decompte__exact=13"
304
+ ),
305
+ ],
306
+ False: ["tmp@data.gouv.fr"],
298
307
  },
299
308
  uuid: {
300
309
  True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
tests/test_labels.py CHANGED
@@ -1,7 +1,23 @@
1
- from csv_detective.detect_labels.other import money
1
+ import pytest
2
+
3
+ from csv_detective.detect_labels import latitude_wgs, money
2
4
 
3
5
 
4
6
  # money labels
5
7
  def test_money_labels():
6
8
  header = "Montant total"
7
- assert money._is(header) == 1.0
9
+ assert money._is(header) == 0.5
10
+
11
+
12
+ @pytest.mark.parametrize(
13
+ "params", [
14
+ ("latitude", 1.0),
15
+ ("lat", 1.0),
16
+ ("coord_lat", 0.5),
17
+ ("y", 1.0),
18
+ ("nb_cycles", 0.0),
19
+ ]
20
+ )
21
+ def test_latitude(params):
22
+ header, expected = params
23
+ assert expected == latitude_wgs._is(header)
csv_detective/detect_labels/other/money/check_col_name.py DELETED
@@ -1,8 +0,0 @@
1
- def is_col_name_related_to_money(name):
2
- # TODO : make this a little bit more clever (spacy ?)
3
- col_name_related_to_money = False
4
- money_themes = ['budget', 'salaire', 'euro', 'euros', 'prêt', 'montant']
5
- # TODO attention 'européeen' est détecté OK
6
- for theme in money_themes:
7
- col_name_related_to_money = col_name_related_to_money or (theme in name)
8
- return col_name_related_to_money