csv-detective 0.8.1.dev1362__py3-none-any.whl → 0.8.1.dev1416__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/other/url/__init__.py +7 -6
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +9 -34
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +11 -36
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +11 -29
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +10 -35
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +10 -29
- csv_detective/detect_labels/FR/geo/commune/__init__.py +8 -29
- csv_detective/detect_labels/FR/geo/departement/__init__.py +16 -41
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +9 -29
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +24 -48
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +24 -49
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +15 -38
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +14 -38
- csv_detective/detect_labels/FR/geo/pays/__init__.py +14 -39
- csv_detective/detect_labels/FR/geo/region/__init__.py +14 -39
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +7 -32
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +6 -30
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +5 -29
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/sexe/__init__.py +4 -29
- csv_detective/detect_labels/FR/other/siren/__init__.py +10 -35
- csv_detective/detect_labels/FR/other/siret/__init__.py +9 -34
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +14 -38
- csv_detective/detect_labels/FR/other/uai/__init__.py +17 -42
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +10 -35
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +4 -29
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +10 -35
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +10 -35
- csv_detective/detect_labels/geo/json_geojson/__init__.py +11 -36
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +24 -49
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +37 -61
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +14 -38
- csv_detective/detect_labels/other/booleen/__init__.py +4 -30
- csv_detective/detect_labels/other/email/__init__.py +14 -39
- csv_detective/detect_labels/other/float/__init__.py +4 -29
- csv_detective/detect_labels/other/int/__init__.py +4 -29
- csv_detective/detect_labels/other/money/__init__.py +5 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +3 -28
- csv_detective/detect_labels/other/twitter/__init__.py +4 -29
- csv_detective/detect_labels/other/url/__init__.py +17 -42
- csv_detective/detect_labels/other/uuid/__init__.py +4 -29
- csv_detective/detect_labels/temp/date/__init__.py +22 -47
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +14 -39
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +13 -38
- csv_detective/detect_labels/temp/year/__init__.py +13 -38
- csv_detective/parsing/text.py +42 -20
- csv_detective/utils.py +1 -4
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/CHANGELOG.md +2 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/RECORD +62 -63
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/WHEEL +1 -1
- tests/test_fields.py +11 -2
- tests/test_labels.py +18 -2
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1416.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1416.dist-info}/top_level.txt +0 -0
|
@@ -1,44 +1,21 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
6
|
+
def _is(header: str) -> float:
|
|
12
7
|
# Does not detect CRS
|
|
13
8
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
header_matches_words_combination = float(
|
|
28
|
-
any(
|
|
29
|
-
[
|
|
30
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
31
|
-
]
|
|
32
|
-
)
|
|
33
|
-
)
|
|
34
|
-
words_combination_in_header = 0.5 * float(
|
|
35
|
-
any(
|
|
36
|
-
[
|
|
37
|
-
is_word_in_string(
|
|
38
|
-
words_combination, processed_header
|
|
39
|
-
) for words_combination in words_combinations_list
|
|
40
|
-
]
|
|
41
|
-
)
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
9
|
+
"longitude",
|
|
10
|
+
"lon",
|
|
11
|
+
"long",
|
|
12
|
+
"geocodage x gps",
|
|
13
|
+
"location longitude",
|
|
14
|
+
"xlongitude",
|
|
15
|
+
"lng",
|
|
16
|
+
"xlong",
|
|
17
|
+
"x",
|
|
18
|
+
"xf",
|
|
19
|
+
"xd",
|
|
20
|
+
]
|
|
21
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,45 +1,21 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
6
|
+
def _is(header: str) -> float:
|
|
12
7
|
# Does not detect CRS
|
|
13
8
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
9
|
+
"longitude",
|
|
10
|
+
"lon",
|
|
11
|
+
"long",
|
|
12
|
+
"geocodage x gps",
|
|
13
|
+
"location longitude",
|
|
14
|
+
"xlongitude",
|
|
15
|
+
"lng",
|
|
16
|
+
"xlong",
|
|
17
|
+
"x",
|
|
18
|
+
"xf",
|
|
19
|
+
"xd",
|
|
25
20
|
]
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
is_word_in_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
21
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,45 +1,20 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
8
|
+
"pays",
|
|
9
|
+
"payslieu",
|
|
10
|
+
"paysorg",
|
|
11
|
+
"country",
|
|
12
|
+
"pays lib",
|
|
13
|
+
"lieupays",
|
|
14
|
+
"pays beneficiaire",
|
|
15
|
+
"nom du pays",
|
|
16
|
+
"journey start country",
|
|
17
|
+
"libelle pays",
|
|
18
|
+
"journey end country",
|
|
25
19
|
]
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
is_word_in_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
20
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,45 +1,20 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
8
|
+
"region",
|
|
9
|
+
"libelle region",
|
|
10
|
+
"nom region",
|
|
11
|
+
"libelle reg",
|
|
12
|
+
"nom reg",
|
|
13
|
+
"reg libusage",
|
|
14
|
+
"nom de la region",
|
|
15
|
+
"regionorg",
|
|
16
|
+
"regionlieu",
|
|
17
|
+
"reg",
|
|
18
|
+
"nom officiel region",
|
|
25
19
|
]
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
is_word_in_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
20
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['code csp insee', 'code csp']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["code csp insee", "code csp"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,38 +1,13 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
8
|
+
"code rna",
|
|
9
|
+
"rna",
|
|
10
|
+
"n° inscription association",
|
|
11
|
+
"identifiant association",
|
|
18
12
|
]
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
header_matches_words_combination = float(
|
|
22
|
-
any(
|
|
23
|
-
[
|
|
24
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
25
|
-
]
|
|
26
|
-
)
|
|
27
|
-
)
|
|
28
|
-
words_combination_in_header = 0.5 * float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
is_word_in_string(
|
|
32
|
-
words_combination, processed_header
|
|
33
|
-
) for words_combination in words_combinations_list
|
|
34
|
-
]
|
|
35
|
-
)
|
|
36
|
-
)
|
|
37
|
-
|
|
38
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
13
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['code waldec', 'waldec']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["code waldec", "waldec"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,37 +1,13 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
6
|
+
def _is(header: str) -> float:
|
|
12
7
|
# To improve? No specific header found in data
|
|
13
8
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
9
|
+
"csp insee",
|
|
10
|
+
"csp",
|
|
11
|
+
"categorie socioprofessionnelle",
|
|
17
12
|
]
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
header_matches_words_combination = float(
|
|
21
|
-
any(
|
|
22
|
-
[
|
|
23
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
24
|
-
]
|
|
25
|
-
)
|
|
26
|
-
)
|
|
27
|
-
words_combination_in_header = 0.5 * float(
|
|
28
|
-
any(
|
|
29
|
-
[
|
|
30
|
-
is_word_in_string(
|
|
31
|
-
words_combination, processed_header
|
|
32
|
-
) for words_combination in words_combinations_list
|
|
33
|
-
]
|
|
34
|
-
)
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
13
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,9 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
'''
|
|
12
|
-
# To improve: no header specific to 'fr' found in data
|
|
13
|
-
words_combinations_list = ['date']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
# To improve: no header specific to "fr" found in data
|
|
8
|
+
words_combinations_list = ["date"]
|
|
9
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,40 +1,15 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
8
|
+
"code ape",
|
|
9
|
+
"code activite (ape)",
|
|
10
|
+
"code naf",
|
|
11
|
+
"code naf organisme designe",
|
|
12
|
+
"code naf organisme designant",
|
|
13
|
+
"base sirene : code ape de l'etablissement siege",
|
|
20
14
|
]
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
header_matches_words_combination = float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
27
|
-
]
|
|
28
|
-
)
|
|
29
|
-
)
|
|
30
|
-
words_combination_in_header = 0.5 * float(
|
|
31
|
-
any(
|
|
32
|
-
[
|
|
33
|
-
is_word_in_string(
|
|
34
|
-
words_combination, processed_header
|
|
35
|
-
) for words_combination in words_combinations_list
|
|
36
|
-
]
|
|
37
|
-
)
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
15
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,33 +1,8 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['sexe', 'sex', 'civilite', 'genre', 'id sexe']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
is_word_in_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
6
|
+
def _is(header: str) -> float:
|
|
7
|
+
words_combinations_list = ["sexe", "sex", "civilite", "genre", "id sexe"]
|
|
8
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,41 +1,16 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
8
|
+
"siren",
|
|
9
|
+
"siren organisme designe",
|
|
10
|
+
"siren organisme designant",
|
|
11
|
+
"n° siren",
|
|
12
|
+
"siren organisme",
|
|
13
|
+
"siren titulaire",
|
|
14
|
+
"numero siren",
|
|
21
15
|
]
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
header_matches_words_combination = float(
|
|
25
|
-
any(
|
|
26
|
-
[
|
|
27
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
28
|
-
]
|
|
29
|
-
)
|
|
30
|
-
)
|
|
31
|
-
words_combination_in_header = 0.5 * float(
|
|
32
|
-
any(
|
|
33
|
-
[
|
|
34
|
-
is_word_in_string(
|
|
35
|
-
words_combination, processed_header
|
|
36
|
-
) for words_combination in words_combinations_list
|
|
37
|
-
]
|
|
38
|
-
)
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
16
|
+
return header_score(header, words_combinations_list)
|
|
@@ -1,40 +1,15 @@
|
|
|
1
|
-
from csv_detective.
|
|
2
|
-
from csv_detective.parsing.text import _process_text
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.5
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
6
|
+
def _is(header: str) -> float:
|
|
13
7
|
words_combinations_list = [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
8
|
+
"siret",
|
|
9
|
+
"siret d",
|
|
10
|
+
"num siret",
|
|
11
|
+
"siretacheteur",
|
|
12
|
+
"n° siret",
|
|
13
|
+
"coll siret",
|
|
20
14
|
]
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
header_matches_words_combination = float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
27
|
-
]
|
|
28
|
-
)
|
|
29
|
-
)
|
|
30
|
-
words_combination_in_header = 0.5 * float(
|
|
31
|
-
any(
|
|
32
|
-
[
|
|
33
|
-
is_word_in_string(
|
|
34
|
-
words_combination, processed_header
|
|
35
|
-
) for words_combination in words_combinations_list
|
|
36
|
-
]
|
|
37
|
-
)
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
15
|
+
return header_score(header, words_combinations_list)
|