csv-detective 0.8.1.dev1362__py3-none-any.whl → 0.8.1.dev1416__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/other/url/__init__.py +7 -6
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +9 -34
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +11 -36
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +11 -29
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +10 -35
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +10 -29
- csv_detective/detect_labels/FR/geo/commune/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/departement/__init__.py +16 -41
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +9 -29
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +24 -48
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +24 -49
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +15 -38
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +14 -38
- csv_detective/detect_labels/FR/geo/pays/__init__.py +14 -39
- csv_detective/detect_labels/FR/geo/region/__init__.py +14 -39
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +7 -32
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +6 -30
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +5 -29
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/sexe/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/siren/__init__.py +10 -35
- csv_detective/detect_labels/FR/other/siret/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +14 -38
- csv_detective/detect_labels/FR/other/uai/__init__.py +17 -42
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +10 -35
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +4 -29
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +10 -35
- csv_detective/detect_labels/geo/json_geojson/__init__.py +11 -36
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +24 -49
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +37 -61
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +14 -38
- csv_detective/detect_labels/other/booleen/__init__.py +4 -30
- csv_detective/detect_labels/other/email/__init__.py +14 -39
- csv_detective/detect_labels/other/float/__init__.py +4 -29
- csv_detective/detect_labels/other/int/__init__.py +4 -29
- csv_detective/detect_labels/other/money/__init__.py +5 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +3 -28
- csv_detective/detect_labels/other/twitter/__init__.py +4 -29
- csv_detective/detect_labels/other/url/__init__.py +17 -42
- csv_detective/detect_labels/other/uuid/__init__.py +4 -29
- csv_detective/detect_labels/temp/date/__init__.py +22 -47
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +14 -39
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +13 -38
- csv_detective/detect_labels/temp/year/__init__.py +13 -38
- csv_detective/parsing/text.py +42 -20
- csv_detective/utils.py +1 -4
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/CHANGELOG.md +2 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/RECORD +62 -63
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/WHEEL +1 -1
- tests/test_fields.py +11 -2
- tests/test_labels.py +18 -2
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/top_level.txt +0 -0
|
@@ -1,44 +1,19 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
8
|
+
"year",
|
|
9
|
+
"annee",
|
|
10
|
+
"annee depot",
|
|
11
|
+
"an nais",
|
|
12
|
+
"exercice",
|
|
13
|
+
"data year",
|
|
14
|
+
"annee de publication",
|
|
15
|
+
"exercice comptable",
|
|
16
|
+
"annee de naissance",
|
|
17
|
+
"annee ouverture",
|
|
24
18
|
]
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
header_matches_words_combination = float(
|
|
28
|
-
any(
|
|
29
|
-
[
|
|
30
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
31
|
-
]
|
|
32
|
-
)
|
|
33
|
-
)
|
|
34
|
-
words_combination_in_header = 0.5 * float(
|
|
35
|
-
any(
|
|
36
|
-
[
|
|
37
|
-
is_word_in_string(
|
|
38
|
-
words_combination, processed_header
|
|
39
|
-
) for words_combination in words_combinations_list
|
|
40
|
-
]
|
|
41
|
-
)
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
19
|
+
return header_score(header, words_combinations_list)
|
csv_detective/parsing/text.py
CHANGED
|
@@ -8,6 +8,17 @@ def camel_case_split(identifier: str):
|
|
|
8
8
|
return " ".join([m.group(0) for m in matches])
|
|
9
9
|
|
|
10
10
|
|
|
11
|
+
translate_dict = {
|
|
12
|
+
" ": ["-", "_", "'", ",", " "],
|
|
13
|
+
"a": ["à", "â"],
|
|
14
|
+
"c": ["ç"],
|
|
15
|
+
"e": ["é", "è", "ê", "é"],
|
|
16
|
+
"i": ["î", "ï"],
|
|
17
|
+
"o": ["ô", "ö"],
|
|
18
|
+
"u": ["ù", "û", "ü"],
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
|
|
11
22
|
# Process text
|
|
12
23
|
def _process_text(val: str):
|
|
13
24
|
"""Traitement des chaînes de caractères pour les standardiser.
|
|
@@ -15,25 +26,36 @@ def _process_text(val: str):
|
|
|
15
26
|
des méthodes hybrides, mais aucune ne s'est avérée plus performante."""
|
|
16
27
|
val = camel_case_split(val)
|
|
17
28
|
val = val.lower()
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
val = val.replace(",", " ")
|
|
22
|
-
val = val.replace(" ", " ")
|
|
23
|
-
val = val.replace("à", "a")
|
|
24
|
-
val = val.replace("â", "a")
|
|
25
|
-
val = val.replace("ç", "c")
|
|
26
|
-
val = val.replace("é", "e")
|
|
27
|
-
val = val.replace("é", "e")
|
|
28
|
-
val = val.replace("è", "e")
|
|
29
|
-
val = val.replace("ê", "e")
|
|
30
|
-
val = val.replace("î", "i")
|
|
31
|
-
val = val.replace("ï", "i")
|
|
32
|
-
val = val.replace("ô", "o")
|
|
33
|
-
val = val.replace("ö", "o")
|
|
34
|
-
val = val.replace("î", "i")
|
|
35
|
-
val = val.replace("û", "u")
|
|
36
|
-
val = val.replace("ù", "u")
|
|
37
|
-
val = val.replace("ü", "u")
|
|
29
|
+
for target in translate_dict:
|
|
30
|
+
for source in translate_dict[target]:
|
|
31
|
+
val = val.replace(source, target)
|
|
38
32
|
val = val.strip()
|
|
39
33
|
return val
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def is_word_in_string(word: str, string: str):
|
|
37
|
+
# if the substring is too short, the test can become irrelevant
|
|
38
|
+
return len(word) > 2 and word in string
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def header_score(header: str, words_combinations_list: list[str]) -> float:
|
|
42
|
+
"""Returns:
|
|
43
|
+
- 1 if the header is exactly in the specified list
|
|
44
|
+
- 0.5 if any of the words is within the header
|
|
45
|
+
- 0 otherwise"""
|
|
46
|
+
processed_header = _process_text(header)
|
|
47
|
+
|
|
48
|
+
header_matches_words_combination = float(
|
|
49
|
+
any(
|
|
50
|
+
words_combination == processed_header for words_combination in words_combinations_list
|
|
51
|
+
)
|
|
52
|
+
)
|
|
53
|
+
words_combination_in_header = 0.5 * (
|
|
54
|
+
any(
|
|
55
|
+
is_word_in_string(
|
|
56
|
+
words_combination, processed_header
|
|
57
|
+
) for words_combination in words_combinations_list
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
return max(header_matches_words_combination, words_combination_in_header)
|
csv_detective/utils.py
CHANGED
|
@@ -25,6 +25,7 @@ def display_logs_depending_process_time(prompt: str, duration: float):
|
|
|
25
25
|
|
|
26
26
|
def is_url(file_path: str) -> bool:
|
|
27
27
|
# could be more sophisticated if needed
|
|
28
|
+
# using the URL detection test was considered but too broad (schema required to use requests)
|
|
28
29
|
return file_path.startswith('http')
|
|
29
30
|
|
|
30
31
|
|
|
@@ -32,7 +33,3 @@ def prevent_nan(value: float) -> Optional[float]:
|
|
|
32
33
|
if math.isnan(value):
|
|
33
34
|
return None
|
|
34
35
|
return value
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def is_word_in_string(word: str, string: str):
|
|
38
|
-
return word in string
|
|
@@ -3,7 +3,7 @@ csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
|
|
|
3
3
|
csv_detective/explore_csv.py,sha256=IT1-9TbS78p6oeDpQ5T6DQ93xQbobcscyBQb6nh86H4,9082
|
|
4
4
|
csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
|
|
5
5
|
csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
|
|
6
|
-
csv_detective/utils.py,sha256
|
|
6
|
+
csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
|
|
7
7
|
csv_detective/validate.py,sha256=4e7f8bNXPU9GqNx4QXXiaoINyotozbL52JB6psVAjyY,2631
|
|
8
8
|
csv_detective/detect_fields/__init__.py,sha256=7Tz0Niaz0BboA3YVsp_6WPA6ywciwDN4-lOy_Ie_0Y8,976
|
|
9
9
|
csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -63,7 +63,7 @@ csv_detective/detect_fields/other/money/__init__.py,sha256=g_ZwBZXl9LhldwFYQotC5
|
|
|
63
63
|
csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
|
|
64
64
|
csv_detective/detect_fields/other/percent/__init__.py,sha256=vgpekNOPBRuunoVBXMi81rwHv4uSOhe78pbVtQ5SBO8,177
|
|
65
65
|
csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeEZ5Hkf5Wwi3ZKclLER_V0YO3g,154
|
|
66
|
-
csv_detective/detect_fields/other/url/__init__.py,sha256=
|
|
66
|
+
csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
|
|
67
67
|
csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
|
|
68
68
|
csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
69
69
|
csv_detective/detect_fields/temp/date/__init__.py,sha256=1a_Ra9fmT4wgGMrcknXP7eN7A2QiaMF0Yjy0-BMihtA,987
|
|
@@ -74,60 +74,59 @@ csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRh
|
|
|
74
74
|
csv_detective/detect_labels/__init__.py,sha256=BJjWlwTnnDe9nomABDUreu9EMu6IFG3T47d7YCJZbRc,878
|
|
75
75
|
csv_detective/detect_labels/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
76
76
|
csv_detective/detect_labels/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
|
-
csv_detective/detect_labels/FR/geo/adresse/__init__.py,sha256=
|
|
78
|
-
csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py,sha256=
|
|
79
|
-
csv_detective/detect_labels/FR/geo/code_departement/__init__.py,sha256=
|
|
80
|
-
csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py,sha256=
|
|
81
|
-
csv_detective/detect_labels/FR/geo/code_postal/__init__.py,sha256=
|
|
82
|
-
csv_detective/detect_labels/FR/geo/code_region/__init__.py,sha256=
|
|
83
|
-
csv_detective/detect_labels/FR/geo/commune/__init__.py,sha256=
|
|
84
|
-
csv_detective/detect_labels/FR/geo/departement/__init__.py,sha256=
|
|
85
|
-
csv_detective/detect_labels/FR/geo/insee_canton/__init__.py,sha256=
|
|
86
|
-
csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py,sha256=
|
|
87
|
-
csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=
|
|
88
|
-
csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py,sha256=
|
|
89
|
-
csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=
|
|
90
|
-
csv_detective/detect_labels/FR/geo/pays/__init__.py,sha256
|
|
91
|
-
csv_detective/detect_labels/FR/geo/region/__init__.py,sha256=
|
|
77
|
+
csv_detective/detect_labels/FR/geo/adresse/__init__.py,sha256=fNWFW-Wo3n6azDBfmi0J0qnzP-p2StLxCc9eNiE9NNE,346
|
|
78
|
+
csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py,sha256=Cr9eyNnP1bLcOx0BlF9ZGZkQDTVuSFjPxvkoZJGs-Eg,379
|
|
79
|
+
csv_detective/detect_labels/FR/geo/code_departement/__init__.py,sha256=Uzufy44ERqIX8wol6tEZg1SrNUcYAWl4AMsWVnL4SLM,355
|
|
80
|
+
csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py,sha256=TUUj3XNlMEK7fl_R5BWBSXYGr_2xzMqIhRTb_GDcnqY,262
|
|
81
|
+
csv_detective/detect_labels/FR/geo/code_postal/__init__.py,sha256=qGGujM5sDqkNZcoVLRRZCh9H9cid9dx2T8jcJsbo3cs,353
|
|
82
|
+
csv_detective/detect_labels/FR/geo/code_region/__init__.py,sha256=gAy0TxV6qL7_SfthSSulouvYJn3C70xMYuqABP61euA,334
|
|
83
|
+
csv_detective/detect_labels/FR/geo/commune/__init__.py,sha256=eTyTtKe1NHTvgaB4jMywIqYRATU2A-E-Tq3m0KDMr6w,261
|
|
84
|
+
csv_detective/detect_labels/FR/geo/departement/__init__.py,sha256=IJy_aKEocrTN39dxK2fE_PoDM4OR9W2rHsR4cULHw9g,512
|
|
85
|
+
csv_detective/detect_labels/FR/geo/insee_canton/__init__.py,sha256=H8iuLwn_x3ctxOL5pi8REqKO5Z3wL4rSDohbSdnnpIM,278
|
|
86
|
+
csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py,sha256=_KjSU6XFeX3Tll5Nb2nnTEhXJXA4-WxqoTov926TGlU,666
|
|
87
|
+
csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
|
|
88
|
+
csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py,sha256=jnbtGriHroGKoOmsmCVGJf6sJXzsVkKH21Qf0aamgkk,428
|
|
89
|
+
csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=jnbtGriHroGKoOmsmCVGJf6sJXzsVkKH21Qf0aamgkk,428
|
|
90
|
+
csv_detective/detect_labels/FR/geo/pays/__init__.py,sha256=GW5wEO0g-YXKXerdtyt4VOVg8kKXUsMb7EPf8nKEbH0,452
|
|
91
|
+
csv_detective/detect_labels/FR/geo/region/__init__.py,sha256=P0eVE46w5GAbTgeGnvJgZynQt7EY_Bi_NZ1gmYxP6io,447
|
|
92
92
|
csv_detective/detect_labels/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
93
|
-
csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py,sha256=
|
|
94
|
-
csv_detective/detect_labels/FR/other/code_rna/__init__.py,sha256=
|
|
95
|
-
csv_detective/detect_labels/FR/other/code_waldec/__init__.py,sha256=
|
|
96
|
-
csv_detective/detect_labels/FR/other/csp_insee/__init__.py,sha256=
|
|
97
|
-
csv_detective/detect_labels/FR/other/date_fr/__init__.py,sha256=
|
|
98
|
-
csv_detective/detect_labels/FR/other/insee_ape700/__init__.py,sha256=
|
|
99
|
-
csv_detective/detect_labels/FR/other/sexe/__init__.py,sha256=
|
|
100
|
-
csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=
|
|
101
|
-
csv_detective/detect_labels/FR/other/siret/__init__.py,sha256
|
|
102
|
-
csv_detective/detect_labels/FR/other/tel_fr/__init__.py,sha256=
|
|
103
|
-
csv_detective/detect_labels/FR/other/uai/__init__.py,sha256=
|
|
93
|
+
csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py,sha256=8nl4UCONRg_x5FtdmTGvPnXS8J1ASWCUGr0Ziv32Ngw,221
|
|
94
|
+
csv_detective/detect_labels/FR/other/code_rna/__init__.py,sha256=I7CliSnzWJzAxNlVmbUjMsXThNQe336RzNuBWOXINkc,307
|
|
95
|
+
csv_detective/detect_labels/FR/other/code_waldec/__init__.py,sha256=soWkoyVsSn2E26Sem8Y7u6gyZc7tqzjMJ9VO3aXfLzQ,216
|
|
96
|
+
csv_detective/detect_labels/FR/other/csp_insee/__init__.py,sha256=AI9nqj3zm6_vycAXsXZdsBD7ceNzMzGQL7xZnDZ8nhw,327
|
|
97
|
+
csv_detective/detect_labels/FR/other/date_fr/__init__.py,sha256=4Crk045ZD_tVovI7C-IqjKFz23Ej5-hrFkhZK4OilqA,258
|
|
98
|
+
csv_detective/detect_labels/FR/other/insee_ape700/__init__.py,sha256=N7LzmtNwZERgrwMy3EFHaVBpdiwkt2_9Tt7XVJLff6U,406
|
|
99
|
+
csv_detective/detect_labels/FR/other/sexe/__init__.py,sha256=ZWhc8S9L1X2fFh2g5Ja-LuhsfHg_lALKrur6yDnGDPk,238
|
|
100
|
+
csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=g7Y7IvW9VKO528z1MSPxfFtRB7kQXSiG7QQ-VZRfFEk,386
|
|
101
|
+
csv_detective/detect_labels/FR/other/siret/__init__.py,sha256=-gvdxUnv3LRfje60ljC4F3B2c1LBcWfV3zZbV3VJZ08,323
|
|
102
|
+
csv_detective/detect_labels/FR/other/tel_fr/__init__.py,sha256=pg2nwqw2lphUMUeuuh_8NPi54TPmQFP3c8Dl9yGOxbI,427
|
|
103
|
+
csv_detective/detect_labels/FR/other/uai/__init__.py,sha256=5L6JowK9y6y9uZNg6hWzknMSzh0SurkwQeTINNKTdYY,599
|
|
104
104
|
csv_detective/detect_labels/FR/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
105
|
-
csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py,sha256=
|
|
106
|
-
csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py,sha256=
|
|
105
|
+
csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py,sha256=Vmv7Hp6LxR-bh3aXOBCHYzJVyCHtGoiWzJ40xnfTvdA,357
|
|
106
|
+
csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py,sha256=M4ANAy40vq328DRdB6LudjO9G9duSh7e-RqFr6axXO0,225
|
|
107
107
|
csv_detective/detect_labels/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
108
|
-
csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py,sha256=
|
|
109
|
-
csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=
|
|
110
|
-
csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=
|
|
111
|
-
csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=
|
|
112
|
-
csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=
|
|
113
|
-
csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=
|
|
114
|
-
csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=
|
|
108
|
+
csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
|
|
109
|
+
csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
|
|
110
|
+
csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
|
|
111
|
+
csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=On8VOCDD0EspZra6fTQCXH4MYao2xmRu-o7xWcab7Jg,355
|
|
112
|
+
csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
|
|
113
|
+
csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=dbWX1LKpoev7zwWthw9vlwGQp6CSlgYrTBnPpvyNC-A,989
|
|
114
|
+
csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=_8IV2FLtrOjzhQNsk-fsgc9-jbAgzKDVMr4tXu2P-s4,429
|
|
115
115
|
csv_detective/detect_labels/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
116
|
-
csv_detective/detect_labels/other/booleen/__init__.py,sha256=
|
|
117
|
-
csv_detective/detect_labels/other/email/__init__.py,sha256=
|
|
118
|
-
csv_detective/detect_labels/other/float/__init__.py,sha256=
|
|
119
|
-
csv_detective/detect_labels/other/int/__init__.py,sha256=
|
|
120
|
-
csv_detective/detect_labels/other/money/__init__.py,sha256=
|
|
121
|
-
csv_detective/detect_labels/other/
|
|
122
|
-
csv_detective/detect_labels/other/
|
|
123
|
-
csv_detective/detect_labels/other/
|
|
124
|
-
csv_detective/detect_labels/other/
|
|
125
|
-
csv_detective/detect_labels/other/uuid/__init__.py,sha256=ePXGCdVfKus67jvdeq5MZA1CA2j47PKjHhWnrsyCAi8,901
|
|
116
|
+
csv_detective/detect_labels/other/booleen/__init__.py,sha256=BZwnfR-Zcv8dqscLrBKhttgwm4Dqq16M0PaGirxYWio,214
|
|
117
|
+
csv_detective/detect_labels/other/email/__init__.py,sha256=Poagn45-eC2a_Wdk5Qs6d2BgYdncCQKZp2yEB50IuNw,431
|
|
118
|
+
csv_detective/detect_labels/other/float/__init__.py,sha256=X0axZN2GAfC_y01zRfIyvOfRsOy2KNQcQ-mlQAKxqT4,216
|
|
119
|
+
csv_detective/detect_labels/other/int/__init__.py,sha256=_1AY7thEBCcgSBQQ2YbY4YaPaxGRQ71BtmaFaX088ig,215
|
|
120
|
+
csv_detective/detect_labels/other/money/__init__.py,sha256=1JRArDZ5r6gtyuKijH_fuuVFVc0f3MN5gPyAf4GPqzs,249
|
|
121
|
+
csv_detective/detect_labels/other/mongo_object_id/__init__.py,sha256=1eoJpaK0mP8Jjh9ljwvG7yG_05fxmAyYoZDdbOVbfw4,209
|
|
122
|
+
csv_detective/detect_labels/other/twitter/__init__.py,sha256=96WhOB6nOutzSFOC5ZJYFSlhHDJRn2SkT4nYNj8E6ww,241
|
|
123
|
+
csv_detective/detect_labels/other/url/__init__.py,sha256=4Ajpdp8W0jS9aHZAAMyUlgefjSgpB7Y6ci29KNkwAoI,485
|
|
124
|
+
csv_detective/detect_labels/other/uuid/__init__.py,sha256=kXVb4oMy-Zv-OYmAIEoNFrBA20l9hbUTdvTfjeMmhjk,213
|
|
126
125
|
csv_detective/detect_labels/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
127
|
-
csv_detective/detect_labels/temp/date/__init__.py,sha256=
|
|
128
|
-
csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=
|
|
129
|
-
csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=
|
|
130
|
-
csv_detective/detect_labels/temp/year/__init__.py,sha256=
|
|
126
|
+
csv_detective/detect_labels/temp/date/__init__.py,sha256=w0eeZIseAmPwL4OvCWzZXbxGOIXYRKiZUhEtgHiBXd0,604
|
|
127
|
+
csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=d0laZNzHx-kSARs9Re8TZ11GNs99aMz6gXc72CJ6ul4,440
|
|
128
|
+
csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=53ysj7QgsxXwG1le3zfSJd1oaTTf-Er3jBeYi_A4F9g,458
|
|
129
|
+
csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWmcu8K-9jPED-pOlMlErfo,433
|
|
131
130
|
csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
|
|
132
131
|
csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
|
|
133
132
|
csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
|
|
@@ -147,20 +146,20 @@ csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm
|
|
|
147
146
|
csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,1626
|
|
148
147
|
csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
|
|
149
148
|
csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
|
|
150
|
-
csv_detective/parsing/text.py,sha256=
|
|
151
|
-
csv_detective-0.8.1.
|
|
152
|
-
csv_detective-0.8.1.
|
|
153
|
-
csv_detective-0.8.1.
|
|
154
|
-
csv_detective-0.8.1.
|
|
149
|
+
csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
|
|
150
|
+
csv_detective-0.8.1.dev1416.data/data/share/csv_detective/CHANGELOG.md,sha256=Ar1X9WX1CVoStDzDEOo5O3P0DgRtUUmo70KAYlWLJyQ,8443
|
|
151
|
+
csv_detective-0.8.1.dev1416.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
152
|
+
csv_detective-0.8.1.dev1416.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
|
|
153
|
+
csv_detective-0.8.1.dev1416.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
155
154
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
156
155
|
tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
|
|
157
|
-
tests/test_fields.py,sha256=
|
|
156
|
+
tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
|
|
158
157
|
tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
|
|
159
|
-
tests/test_labels.py,sha256=
|
|
158
|
+
tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
|
|
160
159
|
tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
|
|
161
160
|
tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
|
|
162
|
-
csv_detective-0.8.1.
|
|
163
|
-
csv_detective-0.8.1.
|
|
164
|
-
csv_detective-0.8.1.
|
|
165
|
-
csv_detective-0.8.1.
|
|
166
|
-
csv_detective-0.8.1.
|
|
161
|
+
csv_detective-0.8.1.dev1416.dist-info/METADATA,sha256=aCmQVKUNFvJLzTS8DHELQme0GS9jwrHGod4JLWIGt1o,1386
|
|
162
|
+
csv_detective-0.8.1.dev1416.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
163
|
+
csv_detective-0.8.1.dev1416.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
164
|
+
csv_detective-0.8.1.dev1416.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
165
|
+
csv_detective-0.8.1.dev1416.dist-info/RECORD,,
|
tests/test_fields.py
CHANGED
|
@@ -293,8 +293,17 @@ fields = {
|
|
|
293
293
|
False: ["adresse@mail"],
|
|
294
294
|
},
|
|
295
295
|
url: {
|
|
296
|
-
True: [
|
|
297
|
-
|
|
296
|
+
True: [
|
|
297
|
+
"www.data.gouv.fr",
|
|
298
|
+
"http://data.gouv.fr",
|
|
299
|
+
"https://www.youtube.com/@data-gouv-fr",
|
|
300
|
+
(
|
|
301
|
+
"https://tabular-api.data.gouv.fr/api/resources/"
|
|
302
|
+
"aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
|
|
303
|
+
"?score__greater=0.9&decompte__exact=13"
|
|
304
|
+
),
|
|
305
|
+
],
|
|
306
|
+
False: ["tmp@data.gouv.fr"],
|
|
298
307
|
},
|
|
299
308
|
uuid: {
|
|
300
309
|
True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
|
tests/test_labels.py
CHANGED
|
@@ -1,7 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from csv_detective.detect_labels import latitude_wgs, money
|
|
2
4
|
|
|
3
5
|
|
|
4
6
|
# money labels
|
|
5
7
|
def test_money_labels():
|
|
6
8
|
header = "Montant total"
|
|
7
|
-
assert money._is(header) ==
|
|
9
|
+
assert money._is(header) == 0.5
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.mark.parametrize(
|
|
13
|
+
"params", [
|
|
14
|
+
("latitude", 1.0),
|
|
15
|
+
("lat", 1.0),
|
|
16
|
+
("coord_lat", 0.5),
|
|
17
|
+
("y", 1.0),
|
|
18
|
+
("nb_cycles", 0.0),
|
|
19
|
+
]
|
|
20
|
+
)
|
|
21
|
+
def test_latitude(params):
|
|
22
|
+
header, expected = params
|
|
23
|
+
assert expected == latitude_wgs._is(header)
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
def is_col_name_related_to_money(name):
|
|
2
|
-
# TODO : make this a little bit more clever (spacy ?)
|
|
3
|
-
col_name_related_to_money = False
|
|
4
|
-
money_themes = ['budget', 'salaire', 'euro', 'euros', 'prêt', 'montant']
|
|
5
|
-
# TODO attention 'européeen' est détecté OK
|
|
6
|
-
for theme in money_themes:
|
|
7
|
-
col_name_related_to_money = col_name_related_to_money or (theme in name)
|
|
8
|
-
return col_name_related_to_money
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/top_level.txt
RENAMED
|
File without changes
|