csv-detective 0.8.1.dev1362__py3-none-any.whl → 0.8.1.dev1416__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/other/url/__init__.py +7 -6
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +9 -34
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +11 -36
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +11 -29
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +10 -35
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +10 -29
- csv_detective/detect_labels/FR/geo/commune/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/departement/__init__.py +16 -41
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +9 -29
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +24 -48
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +24 -49
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +15 -38
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +14 -38
- csv_detective/detect_labels/FR/geo/pays/__init__.py +14 -39
- csv_detective/detect_labels/FR/geo/region/__init__.py +14 -39
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +7 -32
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +6 -30
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +5 -29
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/sexe/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/siren/__init__.py +10 -35
- csv_detective/detect_labels/FR/other/siret/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +14 -38
- csv_detective/detect_labels/FR/other/uai/__init__.py +17 -42
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +10 -35
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +4 -29
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +10 -35
- csv_detective/detect_labels/geo/json_geojson/__init__.py +11 -36
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +24 -49
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +37 -61
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +14 -38
- csv_detective/detect_labels/other/booleen/__init__.py +4 -30
- csv_detective/detect_labels/other/email/__init__.py +14 -39
- csv_detective/detect_labels/other/float/__init__.py +4 -29
- csv_detective/detect_labels/other/int/__init__.py +4 -29
- csv_detective/detect_labels/other/money/__init__.py +5 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +3 -28
- csv_detective/detect_labels/other/twitter/__init__.py +4 -29
- csv_detective/detect_labels/other/url/__init__.py +17 -42
- csv_detective/detect_labels/other/uuid/__init__.py +4 -29
- csv_detective/detect_labels/temp/date/__init__.py +22 -47
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +14 -39
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +13 -38
- csv_detective/detect_labels/temp/year/__init__.py +13 -38
- csv_detective/parsing/text.py +42 -20
- csv_detective/utils.py +1 -4
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/CHANGELOG.md +2 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/RECORD +62 -63
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/WHEEL +1 -1
- tests/test_fields.py +11 -2
- tests/test_labels.py +18 -2
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/top_level.txt +0 -0
|
@@ -1,45 +1,21 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
6
|
+
def _is(header: str) -> float:
|
|
12
7
|
# Does not detect CRS
|
|
13
8
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
9
|
+
"longitude",
|
|
10
|
+
"lon",
|
|
11
|
+
"long",
|
|
12
|
+
"geocodage x gps",
|
|
13
|
+
"location longitude",
|
|
14
|
+
"xlongitude",
|
|
15
|
+
"lng",
|
|
16
|
+
"xlong",
|
|
17
|
+
"x",
|
|
18
|
+
"xf",
|
|
19
|
+
"xd",
|
|
25
20
|
]
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
is_word_in_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
21
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,34 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
# Not relevant to make it match with specific words (find other rules)
|
|
14
|
-
words_combinations_list = []
|
|
15
|
-
processed_header = _process_text(header)
|
|
16
|
-
|
|
17
|
-
header_matches_words_combination = float(
|
|
18
|
-
any(
|
|
19
|
-
[
|
|
20
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
21
|
-
]
|
|
22
|
-
)
|
|
23
|
-
)
|
|
24
|
-
words_combination_in_header = 0.5 * float(
|
|
25
|
-
any(
|
|
26
|
-
[
|
|
27
|
-
is_word_in_string(
|
|
28
|
-
words_combination, processed_header
|
|
29
|
-
) for words_combination in words_combinations_list
|
|
30
|
-
]
|
|
31
|
-
)
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["is_", "has_", "est_"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,45 +1,20 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
8
|
+
"email",
|
|
9
|
+
"mail",
|
|
10
|
+
"courriel",
|
|
11
|
+
"contact",
|
|
12
|
+
"mel",
|
|
13
|
+
"lieucourriel",
|
|
14
|
+
"coordinates.emailcontact",
|
|
15
|
+
"e mail",
|
|
16
|
+
"mo mail",
|
|
17
|
+
"adresse mail",
|
|
18
|
+
"adresse email",
|
|
25
19
|
]
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
is_word_in_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
20
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['part', 'ratio']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["part", "ratio", "taux"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['nb', 'nombre', 'nbre']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["nb", "nombre", "nbre"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,11 +1,8 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
'''
|
|
6
|
-
Returns 1 if at least one of the mentioned words is in the label, else 0
|
|
7
|
-
'''
|
|
3
|
+
PROPORTION = 0.5
|
|
8
4
|
|
|
9
|
-
words_list = ['budget', 'salaire', 'euro', 'euros', 'prêt', 'montant']
|
|
10
5
|
|
|
11
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = ['id', 'objectid']
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['twitter', 'twitter account', 'twitter username']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["twitter", "twitter account", "twitter username"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,48 +1,23 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
8
|
+
"url",
|
|
9
|
+
"url source",
|
|
10
|
+
"site web",
|
|
11
|
+
"source url",
|
|
12
|
+
"site internet",
|
|
13
|
+
"remote url",
|
|
14
|
+
"web",
|
|
15
|
+
"site",
|
|
16
|
+
"lien",
|
|
17
|
+
"site data",
|
|
18
|
+
"lien url",
|
|
19
|
+
"lien vers le fichier",
|
|
20
|
+
"sitweb",
|
|
21
|
+
"interneturl",
|
|
28
22
|
]
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
header_matches_words_combination = float(
|
|
32
|
-
any(
|
|
33
|
-
[
|
|
34
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
35
|
-
]
|
|
36
|
-
)
|
|
37
|
-
)
|
|
38
|
-
words_combination_in_header = 0.5 * float(
|
|
39
|
-
any(
|
|
40
|
-
[
|
|
41
|
-
is_word_in_string(
|
|
42
|
-
words_combination, processed_header
|
|
43
|
-
) for words_combination in words_combinations_list
|
|
44
|
-
]
|
|
45
|
-
)
|
|
46
|
-
)
|
|
47
|
-
|
|
48
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
23
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['id', 'uuid', 'guid']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["id", "uuid", "guid"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,53 +1,28 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
8
|
+
"date",
|
|
9
|
+
"jour",
|
|
10
|
+
"date de mise a jour",
|
|
11
|
+
"sns date",
|
|
12
|
+
"date maj",
|
|
13
|
+
"rem date",
|
|
14
|
+
"periode",
|
|
15
|
+
"date de publication",
|
|
16
|
+
"dpc",
|
|
17
|
+
"extract date",
|
|
18
|
+
"date immatriculation",
|
|
19
|
+
"date jeu donnees",
|
|
20
|
+
"datemaj",
|
|
21
|
+
"dateouv",
|
|
22
|
+
"date der maj",
|
|
23
|
+
"dmaj",
|
|
24
|
+
"jour",
|
|
25
|
+
"yyyymmdd",
|
|
26
|
+
"aaaammjj",
|
|
33
27
|
]
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
header_matches_words_combination = float(
|
|
37
|
-
any(
|
|
38
|
-
[
|
|
39
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
40
|
-
]
|
|
41
|
-
)
|
|
42
|
-
)
|
|
43
|
-
words_combination_in_header = 0.5 * float(
|
|
44
|
-
any(
|
|
45
|
-
[
|
|
46
|
-
is_word_in_string(
|
|
47
|
-
words_combination, processed_header
|
|
48
|
-
) for words_combination in words_combinations_list
|
|
49
|
-
]
|
|
50
|
-
)
|
|
51
|
-
)
|
|
52
|
-
|
|
53
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
28
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,45 +1,20 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
8
|
+
"datetime iso",
|
|
9
|
+
"datetime",
|
|
10
|
+
"timestamp",
|
|
11
|
+
"osm_timestamp",
|
|
12
|
+
"date",
|
|
13
|
+
"created at",
|
|
14
|
+
"last update",
|
|
15
|
+
"date maj",
|
|
16
|
+
"createdat",
|
|
17
|
+
"date naissance",
|
|
18
|
+
"date donnees",
|
|
25
19
|
]
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
is_word_in_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
20
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,44 +1,19 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
8
|
+
"datetime",
|
|
9
|
+
"timestamp",
|
|
10
|
+
"osm_timestamp",
|
|
11
|
+
"date",
|
|
12
|
+
"created at",
|
|
13
|
+
"last update",
|
|
14
|
+
"date maj",
|
|
15
|
+
"createdat",
|
|
16
|
+
"date naissance",
|
|
17
|
+
"date donnees",
|
|
24
18
|
] # Almost same as ISO, no example in data
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
header_matches_words_combination = float(
|
|
28
|
-
any(
|
|
29
|
-
[
|
|
30
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
31
|
-
]
|
|
32
|
-
)
|
|
33
|
-
)
|
|
34
|
-
words_combination_in_header = 0.5 * float(
|
|
35
|
-
any(
|
|
36
|
-
[
|
|
37
|
-
is_word_in_string(
|
|
38
|
-
words_combination, processed_header
|
|
39
|
-
) for words_combination in words_combinations_list
|
|
40
|
-
]
|
|
41
|
-
)
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
19
|
+
return header_score(header, words_combinations_list)
|