csv-detective 0.8.1.dev1362__py3-none-any.whl → 0.8.1.dev1380__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +9 -34
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +11 -36
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +11 -29
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +10 -35
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +10 -29
- csv_detective/detect_labels/FR/geo/commune/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/departement/__init__.py +16 -41
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +9 -29
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +24 -48
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +24 -49
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +15 -38
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +14 -38
- csv_detective/detect_labels/FR/geo/pays/__init__.py +14 -39
- csv_detective/detect_labels/FR/geo/region/__init__.py +14 -39
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +7 -32
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +6 -30
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +5 -29
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/sexe/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/siren/__init__.py +10 -35
- csv_detective/detect_labels/FR/other/siret/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +14 -38
- csv_detective/detect_labels/FR/other/uai/__init__.py +17 -42
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +10 -35
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +4 -29
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +10 -35
- csv_detective/detect_labels/geo/json_geojson/__init__.py +11 -36
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +24 -49
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +37 -61
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +14 -38
- csv_detective/detect_labels/other/booleen/__init__.py +4 -30
- csv_detective/detect_labels/other/email/__init__.py +14 -39
- csv_detective/detect_labels/other/float/__init__.py +4 -29
- csv_detective/detect_labels/other/int/__init__.py +4 -29
- csv_detective/detect_labels/other/money/__init__.py +5 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +3 -28
- csv_detective/detect_labels/other/twitter/__init__.py +4 -29
- csv_detective/detect_labels/other/url/__init__.py +17 -42
- csv_detective/detect_labels/other/uuid/__init__.py +4 -29
- csv_detective/detect_labels/temp/date/__init__.py +22 -47
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +14 -39
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +13 -38
- csv_detective/detect_labels/temp/year/__init__.py +13 -38
- csv_detective/parsing/text.py +42 -20
- csv_detective/utils.py +0 -4
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/CHANGELOG.md +1 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/RECORD +60 -61
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/WHEEL +1 -1
- tests/test_labels.py +18 -2
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/top_level.txt +0 -0
|
@@ -1,40 +1,15 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
8
|
+
"adresse",
|
|
9
|
+
"adresse postale",
|
|
10
|
+
"adresse geographique",
|
|
11
|
+
"adr",
|
|
12
|
+
"adresse complete",
|
|
13
|
+
"adresse station",
|
|
20
14
|
]
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
header_matches_words_combination = float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
27
|
-
]
|
|
28
|
-
)
|
|
29
|
-
)
|
|
30
|
-
words_combination_in_header = 0.5 * float(
|
|
31
|
-
any(
|
|
32
|
-
[
|
|
33
|
-
is_word_in_string(
|
|
34
|
-
words_combination, processed_header
|
|
35
|
-
) for words_combination in words_combinations_list
|
|
36
|
-
]
|
|
37
|
-
)
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
15
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,42 +1,17 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
8
|
+
"code commune insee",
|
|
9
|
+
"code insee",
|
|
10
|
+
"codes insee",
|
|
11
|
+
"code commune",
|
|
12
|
+
"code insee commune",
|
|
13
|
+
"insee",
|
|
14
|
+
"code com",
|
|
15
|
+
"com",
|
|
22
16
|
]
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
header_matches_words_combination = float(
|
|
26
|
-
any(
|
|
27
|
-
[
|
|
28
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
words_combination_in_header = 0.5 * float(
|
|
33
|
-
any(
|
|
34
|
-
[
|
|
35
|
-
is_word_in_string(
|
|
36
|
-
words_combination, processed_header
|
|
37
|
-
) for words_combination in words_combinations_list
|
|
38
|
-
]
|
|
39
|
-
)
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
17
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,15 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
# "dep": Possible confusion with dep name?
|
|
8
|
+
words_combinations_list = [
|
|
9
|
+
"code departement",
|
|
10
|
+
"code_departement",
|
|
11
|
+
"dep",
|
|
12
|
+
"departement",
|
|
13
|
+
"dept",
|
|
14
|
+
]
|
|
15
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,12 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = [
|
|
8
|
+
"cadastre1",
|
|
9
|
+
"code fantoir",
|
|
10
|
+
"fantoir",
|
|
11
|
+
]
|
|
12
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,41 +1,16 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
8
|
+
"code postal",
|
|
9
|
+
"postal code",
|
|
10
|
+
"postcode",
|
|
11
|
+
"post code",
|
|
12
|
+
"cp",
|
|
13
|
+
"codes postaux",
|
|
14
|
+
"location postcode",
|
|
21
15
|
]
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
header_matches_words_combination = float(
|
|
25
|
-
any(
|
|
26
|
-
[
|
|
27
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
28
|
-
]
|
|
29
|
-
)
|
|
30
|
-
)
|
|
31
|
-
words_combination_in_header = 0.5 * float(
|
|
32
|
-
any(
|
|
33
|
-
[
|
|
34
|
-
is_word_in_string(
|
|
35
|
-
words_combination, processed_header
|
|
36
|
-
) for words_combination in words_combinations_list
|
|
37
|
-
]
|
|
38
|
-
)
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
16
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,14 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
# "reg" : possible confusion with region name?
|
|
8
|
+
words_combinations_list = [
|
|
9
|
+
"code region",
|
|
10
|
+
"reg",
|
|
11
|
+
"code insee region",
|
|
12
|
+
"region",
|
|
13
|
+
]
|
|
14
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,12 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = [
|
|
8
|
+
"commune",
|
|
9
|
+
"ville",
|
|
10
|
+
"libelle commune",
|
|
11
|
+
]
|
|
12
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,47 +1,22 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
8
|
+
"departement",
|
|
9
|
+
"libelle du departement",
|
|
10
|
+
"deplib",
|
|
11
|
+
"nom dept",
|
|
12
|
+
"dept",
|
|
13
|
+
"libdepartement",
|
|
14
|
+
"nom departement",
|
|
15
|
+
"libelle dep",
|
|
16
|
+
"libelle departement",
|
|
17
|
+
"lb departements",
|
|
18
|
+
"dep libusage",
|
|
19
|
+
"lb departement",
|
|
20
|
+
"nom dep",
|
|
27
21
|
]
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
header_matches_words_combination = float(
|
|
31
|
-
any(
|
|
32
|
-
[
|
|
33
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
34
|
-
]
|
|
35
|
-
)
|
|
36
|
-
)
|
|
37
|
-
words_combination_in_header = 0.5 * float(
|
|
38
|
-
any(
|
|
39
|
-
[
|
|
40
|
-
is_word_in_string(
|
|
41
|
-
words_combination, processed_header
|
|
42
|
-
) for words_combination in words_combinations_list
|
|
43
|
-
]
|
|
44
|
-
)
|
|
45
|
-
)
|
|
46
|
-
|
|
47
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
22
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,13 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = [
|
|
8
|
+
"insee canton",
|
|
9
|
+
"canton",
|
|
10
|
+
"cant",
|
|
11
|
+
"nom canton",
|
|
12
|
+
]
|
|
13
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,54 +1,30 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
6
|
+
def _is(header: str) -> float:
|
|
12
7
|
# Does not always detect CRS
|
|
13
8
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
header_matches_words_combination = float(
|
|
38
|
-
any(
|
|
39
|
-
[
|
|
40
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
words_combination_in_header = 0.5 * float(
|
|
45
|
-
any(
|
|
46
|
-
[
|
|
47
|
-
is_word_in_string(
|
|
48
|
-
words_combination, processed_header
|
|
49
|
-
) for words_combination in words_combinations_list
|
|
50
|
-
]
|
|
51
|
-
)
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
9
|
+
"latitude",
|
|
10
|
+
"lat",
|
|
11
|
+
"y",
|
|
12
|
+
"yf",
|
|
13
|
+
"yd",
|
|
14
|
+
"y l93",
|
|
15
|
+
"coordonnee y",
|
|
16
|
+
"latitude lb93",
|
|
17
|
+
"coord y",
|
|
18
|
+
"ycoord",
|
|
19
|
+
"geocodage y gps",
|
|
20
|
+
"location latitude",
|
|
21
|
+
"ylatitude",
|
|
22
|
+
"ylat",
|
|
23
|
+
"latitude (y)",
|
|
24
|
+
"latitudeorg",
|
|
25
|
+
"coordinates.latitude",
|
|
26
|
+
"googlemap latitude",
|
|
27
|
+
"latitudelieu",
|
|
28
|
+
"latitude googlemap",
|
|
29
|
+
]
|
|
30
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,55 +1,30 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
8
|
+
"latitude",
|
|
9
|
+
"lat",
|
|
10
|
+
"y",
|
|
11
|
+
"yf",
|
|
12
|
+
"yd",
|
|
13
|
+
"coordonnee y",
|
|
14
|
+
"coord y",
|
|
15
|
+
"ycoord",
|
|
16
|
+
"geocodage y gps",
|
|
17
|
+
"location latitude",
|
|
18
|
+
"ylatitude",
|
|
19
|
+
"ylat",
|
|
20
|
+
"latitude (y)",
|
|
21
|
+
"latitudeorg",
|
|
22
|
+
"coordinates.latitude",
|
|
23
|
+
"googlemap latitude",
|
|
24
|
+
"latitudelieu",
|
|
25
|
+
"latitude googlemap",
|
|
26
|
+
"latitude wgs84",
|
|
27
|
+
"y wgs84",
|
|
28
|
+
"latitude (wgs84)",
|
|
35
29
|
]
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
header_matches_words_combination = float(
|
|
39
|
-
any(
|
|
40
|
-
[
|
|
41
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
42
|
-
]
|
|
43
|
-
)
|
|
44
|
-
)
|
|
45
|
-
words_combination_in_header = 0.5 * float(
|
|
46
|
-
any(
|
|
47
|
-
[
|
|
48
|
-
is_word_in_string(
|
|
49
|
-
words_combination, processed_header
|
|
50
|
-
) for words_combination in words_combinations_list
|
|
51
|
-
]
|
|
52
|
-
)
|
|
53
|
-
)
|
|
54
|
-
|
|
55
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
30
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,44 +1,21 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
6
|
+
def _is(header: str) -> float:
|
|
12
7
|
# Does not detect CRS
|
|
13
8
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
header_matches_words_combination = float(
|
|
28
|
-
any(
|
|
29
|
-
[
|
|
30
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
31
|
-
]
|
|
32
|
-
)
|
|
33
|
-
)
|
|
34
|
-
words_combination_in_header = 0.5 * float(
|
|
35
|
-
any(
|
|
36
|
-
[
|
|
37
|
-
is_word_in_string(
|
|
38
|
-
words_combination, processed_header
|
|
39
|
-
) for words_combination in words_combinations_list
|
|
40
|
-
]
|
|
41
|
-
)
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
9
|
+
"longitude",
|
|
10
|
+
"lon",
|
|
11
|
+
"long",
|
|
12
|
+
"geocodage x gps",
|
|
13
|
+
"location longitude",
|
|
14
|
+
"xlongitude",
|
|
15
|
+
"lng",
|
|
16
|
+
"xlong",
|
|
17
|
+
"x",
|
|
18
|
+
"xf",
|
|
19
|
+
"xd",
|
|
20
|
+
]
|
|
21
|
+
return header_score(header, words_combinations_list)
|