csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['commune', 'ville', 'libelle commune']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
full_word_strictly_inside_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'departement',
|
|
15
|
-
'libelle du departement',
|
|
16
|
-
'deplib',
|
|
17
|
-
'nom dept',
|
|
18
|
-
'dept',
|
|
19
|
-
'libdepartement',
|
|
20
|
-
'nom departement',
|
|
21
|
-
'libelle dep',
|
|
22
|
-
'libelle departement',
|
|
23
|
-
'lb departements',
|
|
24
|
-
'dep libusage',
|
|
25
|
-
'lb departement',
|
|
26
|
-
'nom dep'
|
|
27
|
-
]
|
|
28
|
-
processed_header = _process_text(header)
|
|
29
|
-
|
|
30
|
-
header_matches_words_combination = float(
|
|
31
|
-
any(
|
|
32
|
-
[
|
|
33
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
34
|
-
]
|
|
35
|
-
)
|
|
36
|
-
)
|
|
37
|
-
words_combination_in_header = 0.5 * float(
|
|
38
|
-
any(
|
|
39
|
-
[
|
|
40
|
-
full_word_strictly_inside_string(
|
|
41
|
-
words_combination, processed_header
|
|
42
|
-
) for words_combination in words_combinations_list
|
|
43
|
-
]
|
|
44
|
-
)
|
|
45
|
-
)
|
|
46
|
-
|
|
47
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['insee canton', 'canton', 'cant', 'nom canton']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
full_word_strictly_inside_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
# Does not always detect CRS
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'latitude',
|
|
15
|
-
'lat',
|
|
16
|
-
'y',
|
|
17
|
-
'yf',
|
|
18
|
-
'yd',
|
|
19
|
-
'y l93',
|
|
20
|
-
'coordonnee y',
|
|
21
|
-
'latitude lb93',
|
|
22
|
-
'coord y',
|
|
23
|
-
'ycoord',
|
|
24
|
-
'geocodage y gps',
|
|
25
|
-
'location latitude',
|
|
26
|
-
'ylatitude',
|
|
27
|
-
'ylat',
|
|
28
|
-
'latitude (y)',
|
|
29
|
-
'latitudeorg',
|
|
30
|
-
'coordinates.latitude',
|
|
31
|
-
'googlemap latitude',
|
|
32
|
-
'latitudelieu',
|
|
33
|
-
'latitude googlemap'
|
|
34
|
-
]
|
|
35
|
-
processed_header = _process_text(header)
|
|
36
|
-
|
|
37
|
-
header_matches_words_combination = float(
|
|
38
|
-
any(
|
|
39
|
-
[
|
|
40
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
words_combination_in_header = 0.5 * float(
|
|
45
|
-
any(
|
|
46
|
-
[
|
|
47
|
-
full_word_strictly_inside_string(
|
|
48
|
-
words_combination, processed_header
|
|
49
|
-
) for words_combination in words_combinations_list
|
|
50
|
-
]
|
|
51
|
-
)
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'latitude',
|
|
15
|
-
'lat',
|
|
16
|
-
'y',
|
|
17
|
-
'yf',
|
|
18
|
-
'yd',
|
|
19
|
-
'coordonnee y',
|
|
20
|
-
'coord y',
|
|
21
|
-
'ycoord',
|
|
22
|
-
'geocodage y gps',
|
|
23
|
-
'location latitude',
|
|
24
|
-
'ylatitude',
|
|
25
|
-
'ylat',
|
|
26
|
-
'latitude (y)',
|
|
27
|
-
'latitudeorg',
|
|
28
|
-
'coordinates.latitude',
|
|
29
|
-
'googlemap latitude',
|
|
30
|
-
'latitudelieu',
|
|
31
|
-
'latitude googlemap',
|
|
32
|
-
'latitude wgs84',
|
|
33
|
-
'y wgs84',
|
|
34
|
-
'latitude (wgs84)'
|
|
35
|
-
]
|
|
36
|
-
processed_header = _process_text(header)
|
|
37
|
-
|
|
38
|
-
header_matches_words_combination = float(
|
|
39
|
-
any(
|
|
40
|
-
[
|
|
41
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
42
|
-
]
|
|
43
|
-
)
|
|
44
|
-
)
|
|
45
|
-
words_combination_in_header = 0.5 * float(
|
|
46
|
-
any(
|
|
47
|
-
[
|
|
48
|
-
full_word_strictly_inside_string(
|
|
49
|
-
words_combination, processed_header
|
|
50
|
-
) for words_combination in words_combinations_list
|
|
51
|
-
]
|
|
52
|
-
)
|
|
53
|
-
)
|
|
54
|
-
|
|
55
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
# Does not detect CRS
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'longitude',
|
|
15
|
-
'lon',
|
|
16
|
-
'long',
|
|
17
|
-
'geocodage x gps',
|
|
18
|
-
'location longitude',
|
|
19
|
-
'xlongitude',
|
|
20
|
-
'lng',
|
|
21
|
-
'xlong',
|
|
22
|
-
'x',
|
|
23
|
-
'xf',
|
|
24
|
-
'xd']
|
|
25
|
-
processed_header = _process_text(header)
|
|
26
|
-
|
|
27
|
-
header_matches_words_combination = float(
|
|
28
|
-
any(
|
|
29
|
-
[
|
|
30
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
31
|
-
]
|
|
32
|
-
)
|
|
33
|
-
)
|
|
34
|
-
words_combination_in_header = 0.5 * float(
|
|
35
|
-
any(
|
|
36
|
-
[
|
|
37
|
-
full_word_strictly_inside_string(
|
|
38
|
-
words_combination, processed_header
|
|
39
|
-
) for words_combination in words_combinations_list
|
|
40
|
-
]
|
|
41
|
-
)
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
# Does not detect CRS
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'longitude',
|
|
15
|
-
'lon',
|
|
16
|
-
'long',
|
|
17
|
-
'geocodage x gps',
|
|
18
|
-
'location longitude',
|
|
19
|
-
'xlongitude',
|
|
20
|
-
'lng',
|
|
21
|
-
'xlong',
|
|
22
|
-
'x',
|
|
23
|
-
'xf',
|
|
24
|
-
'xd'
|
|
25
|
-
]
|
|
26
|
-
processed_header = _process_text(header)
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
full_word_strictly_inside_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'pays',
|
|
15
|
-
'payslieu',
|
|
16
|
-
'paysorg',
|
|
17
|
-
'country',
|
|
18
|
-
'pays lib',
|
|
19
|
-
'lieupays',
|
|
20
|
-
'pays beneficiaire',
|
|
21
|
-
'nom du pays',
|
|
22
|
-
'journey start country',
|
|
23
|
-
'libelle pays',
|
|
24
|
-
'journey end country'
|
|
25
|
-
]
|
|
26
|
-
processed_header = _process_text(header)
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
full_word_strictly_inside_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'region',
|
|
15
|
-
'libelle region',
|
|
16
|
-
'nom region',
|
|
17
|
-
'libelle reg',
|
|
18
|
-
'nom reg',
|
|
19
|
-
'reg libusage',
|
|
20
|
-
'nom de la region',
|
|
21
|
-
'regionorg',
|
|
22
|
-
'regionlieu',
|
|
23
|
-
'reg',
|
|
24
|
-
'nom officiel region'
|
|
25
|
-
]
|
|
26
|
-
processed_header = _process_text(header)
|
|
27
|
-
|
|
28
|
-
header_matches_words_combination = float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
32
|
-
]
|
|
33
|
-
)
|
|
34
|
-
)
|
|
35
|
-
words_combination_in_header = 0.5 * float(
|
|
36
|
-
any(
|
|
37
|
-
[
|
|
38
|
-
full_word_strictly_inside_string(
|
|
39
|
-
words_combination, processed_header
|
|
40
|
-
) for words_combination in words_combinations_list
|
|
41
|
-
]
|
|
42
|
-
)
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
File without changes
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['code csp insee', 'code csp']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
full_word_strictly_inside_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'code rna',
|
|
15
|
-
'rna',
|
|
16
|
-
'n° inscription association',
|
|
17
|
-
'identifiant association'
|
|
18
|
-
]
|
|
19
|
-
processed_header = _process_text(header)
|
|
20
|
-
|
|
21
|
-
header_matches_words_combination = float(
|
|
22
|
-
any(
|
|
23
|
-
[
|
|
24
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
25
|
-
]
|
|
26
|
-
)
|
|
27
|
-
)
|
|
28
|
-
words_combination_in_header = 0.5 * float(
|
|
29
|
-
any(
|
|
30
|
-
[
|
|
31
|
-
full_word_strictly_inside_string(
|
|
32
|
-
words_combination, processed_header
|
|
33
|
-
) for words_combination in words_combinations_list
|
|
34
|
-
]
|
|
35
|
-
)
|
|
36
|
-
)
|
|
37
|
-
|
|
38
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['code waldec', 'waldec']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
full_word_strictly_inside_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
# To improve? No specific header found in data
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'csp insee',
|
|
15
|
-
'csp',
|
|
16
|
-
'categorie socioprofessionnelle'
|
|
17
|
-
]
|
|
18
|
-
processed_header = _process_text(header)
|
|
19
|
-
|
|
20
|
-
header_matches_words_combination = float(
|
|
21
|
-
any(
|
|
22
|
-
[
|
|
23
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
24
|
-
]
|
|
25
|
-
)
|
|
26
|
-
)
|
|
27
|
-
words_combination_in_header = 0.5 * float(
|
|
28
|
-
any(
|
|
29
|
-
[
|
|
30
|
-
full_word_strictly_inside_string(
|
|
31
|
-
words_combination, processed_header
|
|
32
|
-
) for words_combination in words_combinations_list
|
|
33
|
-
]
|
|
34
|
-
)
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
# To improve: no header specific to 'fr' found in data
|
|
13
|
-
words_combinations_list = ['date']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
full_word_strictly_inside_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = [
|
|
14
|
-
'code ape',
|
|
15
|
-
'code activite (ape)',
|
|
16
|
-
'code naf',
|
|
17
|
-
'code naf organisme designe',
|
|
18
|
-
'code naf organisme designant',
|
|
19
|
-
'base sirene : code ape de l\'etablissement siege'
|
|
20
|
-
]
|
|
21
|
-
processed_header = _process_text(header)
|
|
22
|
-
|
|
23
|
-
header_matches_words_combination = float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
27
|
-
]
|
|
28
|
-
)
|
|
29
|
-
)
|
|
30
|
-
words_combination_in_header = 0.5 * float(
|
|
31
|
-
any(
|
|
32
|
-
[
|
|
33
|
-
full_word_strictly_inside_string(
|
|
34
|
-
words_combination, processed_header
|
|
35
|
-
) for words_combination in words_combinations_list
|
|
36
|
-
]
|
|
37
|
-
)
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
return max(header_matches_words_combination, words_combination_in_header)
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(header):
|
|
8
|
-
'''
|
|
9
|
-
Returns 1 if the (processed) header matches one of the expected words combination,
|
|
10
|
-
else 0
|
|
11
|
-
'''
|
|
12
|
-
|
|
13
|
-
words_combinations_list = ['sexe', 'sex', 'civilite', 'genre', 'id sexe']
|
|
14
|
-
processed_header = _process_text(header)
|
|
15
|
-
|
|
16
|
-
header_matches_words_combination = float(
|
|
17
|
-
any(
|
|
18
|
-
[
|
|
19
|
-
words_combination == processed_header for words_combination in words_combinations_list
|
|
20
|
-
]
|
|
21
|
-
)
|
|
22
|
-
)
|
|
23
|
-
words_combination_in_header = 0.5 * float(
|
|
24
|
-
any(
|
|
25
|
-
[
|
|
26
|
-
full_word_strictly_inside_string(
|
|
27
|
-
words_combination, processed_header
|
|
28
|
-
) for words_combination in words_combinations_list
|
|
29
|
-
]
|
|
30
|
-
)
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
return max(header_matches_words_combination, words_combination_in_header)
|