csv-detective 0.8.1.dev1362__py3-none-any.whl → 0.8.1.dev1380__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +9 -34
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +11 -36
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +11 -29
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +10 -35
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +10 -29
- csv_detective/detect_labels/FR/geo/commune/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/departement/__init__.py +16 -41
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +9 -29
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +24 -48
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +24 -49
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +15 -38
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +14 -38
- csv_detective/detect_labels/FR/geo/pays/__init__.py +14 -39
- csv_detective/detect_labels/FR/geo/region/__init__.py +14 -39
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +7 -32
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +6 -30
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +5 -29
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/sexe/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/siren/__init__.py +10 -35
- csv_detective/detect_labels/FR/other/siret/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +14 -38
- csv_detective/detect_labels/FR/other/uai/__init__.py +17 -42
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +10 -35
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +4 -29
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +10 -35
- csv_detective/detect_labels/geo/json_geojson/__init__.py +11 -36
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +24 -49
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +37 -61
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +14 -38
- csv_detective/detect_labels/other/booleen/__init__.py +4 -30
- csv_detective/detect_labels/other/email/__init__.py +14 -39
- csv_detective/detect_labels/other/float/__init__.py +4 -29
- csv_detective/detect_labels/other/int/__init__.py +4 -29
- csv_detective/detect_labels/other/money/__init__.py +5 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +3 -28
- csv_detective/detect_labels/other/twitter/__init__.py +4 -29
- csv_detective/detect_labels/other/url/__init__.py +17 -42
- csv_detective/detect_labels/other/uuid/__init__.py +4 -29
- csv_detective/detect_labels/temp/date/__init__.py +22 -47
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +14 -39
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +13 -38
- csv_detective/detect_labels/temp/year/__init__.py +13 -38
- csv_detective/parsing/text.py +42 -20
- csv_detective/utils.py +0 -4
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/CHANGELOG.md +1 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/RECORD +60 -61
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/WHEEL +1 -1
- tests/test_labels.py +18 -2
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/top_level.txt +0 -0
|
@@ -1,50 +1,25 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
8
|
+
"uai",
|
|
9
|
+
"code etablissement",
|
|
10
|
+
"code uai",
|
|
11
|
+
"uai - identifiant",
|
|
12
|
+
"numero uai",
|
|
13
|
+
"rne",
|
|
20
14
|
"numero de l'etablissement",
|
|
21
|
-
|
|
22
|
-
|
|
15
|
+
"code rne",
|
|
16
|
+
"codeetab",
|
|
23
17
|
"code uai de l'etablissement",
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
18
|
+
"ref uai",
|
|
19
|
+
"cd rne",
|
|
20
|
+
"numerouai",
|
|
21
|
+
"numero d etablissement",
|
|
22
|
+
"code etablissement",
|
|
23
|
+
"numero etablissement",
|
|
30
24
|
]
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
header_matches_words_combination = float(
|
|
34
|
-
any(
|
|
35
|
-
[
|
|
36
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
37
|
-
]
|
|
38
|
-
)
|
|
39
|
-
)
|
|
40
|
-
words_combination_in_header = 0.5 * float(
|
|
41
|
-
any(
|
|
42
|
-
[
|
|
43
|
-
is_word_in_string(
|
|
44
|
-
words_combination, processed_header
|
|
45
|
-
) for words_combination in words_combinations_list
|
|
46
|
-
]
|
|
47
|
-
)
|
|
48
|
-
)
|
|
49
|
-
|
|
50
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
25
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,41 +1,16 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
8
|
+
"jour semaine",
|
|
9
|
+
"type jour",
|
|
10
|
+
"jour de la semaine",
|
|
11
|
+
"saufjour",
|
|
12
|
+
"nomjour",
|
|
13
|
+
"jour",
|
|
14
|
+
"jour de fermeture",
|
|
21
15
|
]
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
header_matches_words_combination = float(
|
|
25
|
-
any(
|
|
26
|
-
[
|
|
27
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
28
|
-
]
|
|
29
|
-
)
|
|
30
|
-
)
|
|
31
|
-
words_combination_in_header = 0.5 * float(
|
|
32
|
-
any(
|
|
33
|
-
[
|
|
34
|
-
is_word_in_string(
|
|
35
|
-
words_combination, processed_header
|
|
36
|
-
) for words_combination in words_combinations_list
|
|
37
|
-
]
|
|
38
|
-
)
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
16
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['mois de annee', 'mois']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["mois de annee", "mois", "month"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,41 +1,16 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
8
|
+
"iso country code",
|
|
9
|
+
"code pays",
|
|
10
|
+
"pays",
|
|
11
|
+
"country",
|
|
12
|
+
"nation",
|
|
13
|
+
"pays code",
|
|
14
|
+
"code pays (iso)",
|
|
21
15
|
]
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
header_matches_words_combination = float(
|
|
25
|
-
any(
|
|
26
|
-
[
|
|
27
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
28
|
-
]
|
|
29
|
-
)
|
|
30
|
-
)
|
|
31
|
-
words_combination_in_header = 0.5 * float(
|
|
32
|
-
any(
|
|
33
|
-
[
|
|
34
|
-
is_word_in_string(
|
|
35
|
-
words_combination, processed_header
|
|
36
|
-
) for words_combination in words_combinations_list
|
|
37
|
-
]
|
|
38
|
-
)
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
16
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,41 +1,16 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
8
|
+
"iso country code",
|
|
9
|
+
"code pays",
|
|
10
|
+
"pays",
|
|
11
|
+
"country",
|
|
12
|
+
"nation",
|
|
13
|
+
"pays code",
|
|
14
|
+
"code pays (iso)",
|
|
21
15
|
]
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
header_matches_words_combination = float(
|
|
25
|
-
any(
|
|
26
|
-
[
|
|
27
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
28
|
-
]
|
|
29
|
-
)
|
|
30
|
-
)
|
|
31
|
-
words_combination_in_header = 0.5 * float(
|
|
32
|
-
any(
|
|
33
|
-
[
|
|
34
|
-
is_word_in_string(
|
|
35
|
-
words_combination, processed_header
|
|
36
|
-
) for words_combination in words_combinations_list
|
|
37
|
-
]
|
|
38
|
-
)
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
16
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,41 +1,16 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
8
|
+
"iso country code",
|
|
9
|
+
"code pays",
|
|
10
|
+
"pays",
|
|
11
|
+
"country",
|
|
12
|
+
"nation",
|
|
13
|
+
"pays code",
|
|
14
|
+
"code pays (iso)",
|
|
21
15
|
]
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
header_matches_words_combination = float(
|
|
25
|
-
any(
|
|
26
|
-
[
|
|
27
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
28
|
-
]
|
|
29
|
-
)
|
|
30
|
-
)
|
|
31
|
-
words_combination_in_header = 0.5 * float(
|
|
32
|
-
any(
|
|
33
|
-
[
|
|
34
|
-
is_word_in_string(
|
|
35
|
-
words_combination, processed_header
|
|
36
|
-
) for words_combination in words_combinations_list
|
|
37
|
-
]
|
|
38
|
-
)
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
16
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,42 +1,17 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
8
|
+
"json geojson",
|
|
9
|
+
"json",
|
|
10
|
+
"geojson",
|
|
11
|
+
"geo shape",
|
|
12
|
+
"geom",
|
|
13
|
+
"geometry",
|
|
14
|
+
"geo shape",
|
|
15
|
+
"geoshape",
|
|
22
16
|
]
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
header_matches_words_combination = float(
|
|
26
|
-
any(
|
|
27
|
-
[
|
|
28
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
words_combination_in_header = 0.5 * float(
|
|
33
|
-
any(
|
|
34
|
-
[
|
|
35
|
-
is_word_in_string(
|
|
36
|
-
words_combination, processed_header
|
|
37
|
-
) for words_combination in words_combinations_list
|
|
38
|
-
]
|
|
39
|
-
)
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
17
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,55 +1,30 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
8
|
+
"latitude",
|
|
9
|
+
"lat",
|
|
10
|
+
"y",
|
|
11
|
+
"yf",
|
|
12
|
+
"yd",
|
|
13
|
+
"coordonnee y",
|
|
14
|
+
"coord y",
|
|
15
|
+
"ycoord",
|
|
16
|
+
"geocodage y gps",
|
|
17
|
+
"location latitude",
|
|
18
|
+
"ylatitude",
|
|
19
|
+
"ylat",
|
|
20
|
+
"latitude (y)",
|
|
21
|
+
"latitudeorg",
|
|
22
|
+
"coordinates.latitude",
|
|
23
|
+
"googlemap latitude",
|
|
24
|
+
"latitudelieu",
|
|
25
|
+
"latitude googlemap",
|
|
26
|
+
"latitude wgs84",
|
|
27
|
+
"y wgs84",
|
|
28
|
+
"latitude (wgs84)",
|
|
35
29
|
]
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
header_matches_words_combination = float(
|
|
39
|
-
any(
|
|
40
|
-
[
|
|
41
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
42
|
-
]
|
|
43
|
-
)
|
|
44
|
-
)
|
|
45
|
-
words_combination_in_header = 0.5 * float(
|
|
46
|
-
any(
|
|
47
|
-
[
|
|
48
|
-
is_word_in_string(
|
|
49
|
-
words_combination, processed_header
|
|
50
|
-
) for words_combination in words_combinations_list
|
|
51
|
-
]
|
|
52
|
-
)
|
|
53
|
-
)
|
|
54
|
-
|
|
55
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
30
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,67 +1,43 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
8
|
+
"latlon wgs",
|
|
9
|
+
"latlon",
|
|
10
|
+
"geo point",
|
|
11
|
+
"geo point 2d",
|
|
12
|
+
"wgs84",
|
|
13
|
+
"geolocalisation",
|
|
14
|
+
"geo",
|
|
15
|
+
"coordonnees finales",
|
|
16
|
+
"coordonnees",
|
|
17
|
+
"coordonnees ban",
|
|
18
|
+
"xy",
|
|
19
|
+
"geometry x y",
|
|
20
|
+
"coordonnees insee",
|
|
21
|
+
"coordonnees geographiques",
|
|
22
|
+
"position",
|
|
23
|
+
"coordonnes gps",
|
|
24
|
+
"geopoint",
|
|
25
|
+
"geom x y",
|
|
26
|
+
"coord gps",
|
|
27
|
+
"latlong",
|
|
28
|
+
"position geographique",
|
|
29
|
+
"c geo",
|
|
30
|
+
"coordonnes geoloc",
|
|
31
|
+
"lat lon",
|
|
32
|
+
"code geo",
|
|
33
|
+
"geo localisation",
|
|
34
|
+
"coordonnes geo",
|
|
35
|
+
"geo cp",
|
|
36
|
+
"x y",
|
|
37
|
+
"geo coordinates",
|
|
38
|
+
"point geo",
|
|
39
|
+
"point geo insee",
|
|
40
|
+
"coordonnees geoloc",
|
|
41
|
+
"coordonnees xy",
|
|
48
42
|
]
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
header_matches_words_combination = float(
|
|
52
|
-
any(
|
|
53
|
-
[
|
|
54
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
55
|
-
]
|
|
56
|
-
)
|
|
57
|
-
)
|
|
58
|
-
words_combination_in_header = 0.5 * float(
|
|
59
|
-
any(
|
|
60
|
-
[
|
|
61
|
-
is_word_in_string(
|
|
62
|
-
words_combination, processed_header
|
|
63
|
-
) for words_combination in words_combinations_list
|
|
64
|
-
]
|
|
65
|
-
)
|
|
66
|
-
)
|
|
67
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
43
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,45 +1,21 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
6
|
+
def _is(header: str) -> float:
|
|
12
7
|
# Does not detect CRS
|
|
13
8
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
9
|
+
"longitude",
|
|
10
|
+
"lon",
|
|
11
|
+
"long",
|
|
12
|
+
"geocodage x gps",
|
|
13
|
+
"location longitude",
|
|
14
|
+
"xlongitude",
|
|
15
|
+
"lng",
|
|
16
|
+
"xlong",
|
|
17
|
+
"x",
|
|
18
|
+
"xf",
|
|
19
|
+
"xd",
|
|
25
20
|
]
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
is_word_in_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
21
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,34 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
# Not relevant to make it match with specific words (find other rules)
|
|
14
|
-
words_combinations_list = []
|
|
15
|
-
processed_header = _process_text(header)
|
|
16
|
-
|
|
17
|
-
header_matches_words_combination = float(
|
|
18
|
-
any(
|
|
19
|
-
[
|
|
20
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
21
|
-
]
|
|
22
|
-
)
|
|
23
|
-
)
|
|
24
|
-
words_combination_in_header = 0.5 * float(
|
|
25
|
-
any(
|
|
26
|
-
[
|
|
27
|
-
is_word_in_string(
|
|
28
|
-
words_combination, processed_header
|
|
29
|
-
) for words_combination in words_combinations_list
|
|
30
|
-
]
|
|
31
|
-
)
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["is_", "has_", "est_"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|