csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2319__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +12 -15
- csv_detective/explore_csv.py +28 -9
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
- csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
- csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
- csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
- csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
- csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
- csv_detective/formats/geojson.py +36 -0
- csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
- csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
- csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
- csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
- csv_detective/formats/sexe.py +17 -0
- csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +45 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +3 -4
- csv_detective/output/dataframe.py +3 -3
- csv_detective/output/profile.py +2 -3
- csv_detective/output/schema.py +2 -2
- csv_detective/parsing/columns.py +35 -50
- csv_detective/parsing/csv.py +2 -2
- csv_detective/parsing/load.py +4 -5
- csv_detective/validate.py +9 -4
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/METADATA +6 -5
- csv_detective-0.9.3.dev2319.dist-info/RECORD +102 -0
- tests/test_fields.py +39 -364
- tests/test_file.py +1 -1
- tests/test_labels.py +5 -3
- tests/test_structure.py +40 -36
- csv_detective/detect_fields/FR/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/__init__.py +0 -112
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/email/__init__.py +0 -10
- csv_detective/detect_fields/other/money/__init__.py +0 -11
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/percent/__init__.py +0 -9
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -14
- csv_detective/detect_fields/other/uuid/__init__.py +0 -10
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
- csv_detective/detect_labels/__init__.py +0 -94
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -8
- csv_detective/detect_labels/other/email/__init__.py +0 -20
- csv_detective/detect_labels/other/float/__init__.py +0 -8
- csv_detective/detect_labels/other/int/__init__.py +0 -8
- csv_detective/detect_labels/other/money/__init__.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_labels/other/twitter/__init__.py +0 -8
- csv_detective/detect_labels/other/url/__init__.py +0 -23
- csv_detective/detect_labels/other/uuid/__init__.py +0 -8
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -28
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
- csv_detective/detect_labels/temp/year/__init__.py +0 -19
- csv_detective/load_tests.py +0 -59
- csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/top_level.txt +0 -0
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"uai",
|
|
9
|
-
"code etablissement",
|
|
10
|
-
"code uai",
|
|
11
|
-
"uai - identifiant",
|
|
12
|
-
"numero uai",
|
|
13
|
-
"rne",
|
|
14
|
-
"numero de l'etablissement",
|
|
15
|
-
"code rne",
|
|
16
|
-
"codeetab",
|
|
17
|
-
"code uai de l'etablissement",
|
|
18
|
-
"ref uai",
|
|
19
|
-
"cd rne",
|
|
20
|
-
"numerouai",
|
|
21
|
-
"numero d etablissement",
|
|
22
|
-
"code etablissement",
|
|
23
|
-
"numero etablissement",
|
|
24
|
-
]
|
|
25
|
-
return header_score(header, words_combinations_list)
|
|
File without changes
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"jour semaine",
|
|
9
|
-
"type jour",
|
|
10
|
-
"jour de la semaine",
|
|
11
|
-
"saufjour",
|
|
12
|
-
"nomjour",
|
|
13
|
-
"jour",
|
|
14
|
-
"jour de fermeture",
|
|
15
|
-
]
|
|
16
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
from .FR.geo import (
|
|
2
|
-
adresse,
|
|
3
|
-
code_commune_insee,
|
|
4
|
-
code_departement,
|
|
5
|
-
code_fantoir,
|
|
6
|
-
code_postal,
|
|
7
|
-
code_region,
|
|
8
|
-
commune,
|
|
9
|
-
departement,
|
|
10
|
-
insee_canton,
|
|
11
|
-
latitude_l93,
|
|
12
|
-
latitude_wgs_fr_metropole,
|
|
13
|
-
longitude_l93,
|
|
14
|
-
longitude_wgs_fr_metropole,
|
|
15
|
-
pays,
|
|
16
|
-
region,
|
|
17
|
-
)
|
|
18
|
-
from .FR.other import (
|
|
19
|
-
code_csp_insee,
|
|
20
|
-
code_rna,
|
|
21
|
-
code_waldec,
|
|
22
|
-
csp_insee,
|
|
23
|
-
date_fr,
|
|
24
|
-
insee_ape700,
|
|
25
|
-
sexe,
|
|
26
|
-
siren,
|
|
27
|
-
siret,
|
|
28
|
-
tel_fr,
|
|
29
|
-
uai,
|
|
30
|
-
)
|
|
31
|
-
from .FR.temp import jour_de_la_semaine, mois_de_annee
|
|
32
|
-
from .geo import (
|
|
33
|
-
iso_country_code_alpha2,
|
|
34
|
-
iso_country_code_alpha3,
|
|
35
|
-
iso_country_code_numeric,
|
|
36
|
-
json_geojson,
|
|
37
|
-
latitude_wgs,
|
|
38
|
-
latlon_wgs,
|
|
39
|
-
longitude_wgs,
|
|
40
|
-
lonlat_wgs,
|
|
41
|
-
)
|
|
42
|
-
from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid
|
|
43
|
-
from .temp import date, datetime_rfc822, year
|
|
44
|
-
|
|
45
|
-
__all__ = [
|
|
46
|
-
"adresse",
|
|
47
|
-
"code_commune_insee",
|
|
48
|
-
"code_departement",
|
|
49
|
-
"code_fantoir",
|
|
50
|
-
"code_postal",
|
|
51
|
-
"code_region",
|
|
52
|
-
"commune",
|
|
53
|
-
"departement",
|
|
54
|
-
"insee_canton",
|
|
55
|
-
"latitude_l93",
|
|
56
|
-
"latitude_wgs_fr_metropole",
|
|
57
|
-
"longitude_l93",
|
|
58
|
-
"longitude_wgs_fr_metropole",
|
|
59
|
-
"pays",
|
|
60
|
-
"region",
|
|
61
|
-
"code_csp_insee",
|
|
62
|
-
"code_rna",
|
|
63
|
-
"code_waldec",
|
|
64
|
-
"csp_insee",
|
|
65
|
-
"date_fr",
|
|
66
|
-
"insee_ape700",
|
|
67
|
-
"sexe",
|
|
68
|
-
"siren",
|
|
69
|
-
"siret",
|
|
70
|
-
"tel_fr",
|
|
71
|
-
"uai",
|
|
72
|
-
"iso_country_code_alpha2",
|
|
73
|
-
"iso_country_code_alpha3",
|
|
74
|
-
"iso_country_code_numeric",
|
|
75
|
-
"json_geojson",
|
|
76
|
-
"latitude_wgs",
|
|
77
|
-
"latlon_wgs",
|
|
78
|
-
"longitude_wgs",
|
|
79
|
-
"lonlat_wgs",
|
|
80
|
-
"jour_de_la_semaine",
|
|
81
|
-
"mois_de_annee",
|
|
82
|
-
"booleen",
|
|
83
|
-
"email",
|
|
84
|
-
"float",
|
|
85
|
-
"int",
|
|
86
|
-
"money",
|
|
87
|
-
"mongo_object_id",
|
|
88
|
-
"twitter",
|
|
89
|
-
"url",
|
|
90
|
-
"uuid",
|
|
91
|
-
"date",
|
|
92
|
-
"datetime_rfc822",
|
|
93
|
-
"year",
|
|
94
|
-
]
|
|
File without changes
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"iso country code",
|
|
9
|
-
"code pays",
|
|
10
|
-
"pays",
|
|
11
|
-
"country",
|
|
12
|
-
"nation",
|
|
13
|
-
"pays code",
|
|
14
|
-
"code pays (iso)",
|
|
15
|
-
]
|
|
16
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"iso country code",
|
|
9
|
-
"code pays",
|
|
10
|
-
"pays",
|
|
11
|
-
"country",
|
|
12
|
-
"nation",
|
|
13
|
-
"pays code",
|
|
14
|
-
"code pays (iso)",
|
|
15
|
-
]
|
|
16
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"iso country code",
|
|
9
|
-
"code pays",
|
|
10
|
-
"pays",
|
|
11
|
-
"country",
|
|
12
|
-
"nation",
|
|
13
|
-
"pays code",
|
|
14
|
-
"code pays (iso)",
|
|
15
|
-
]
|
|
16
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"json geojson",
|
|
9
|
-
"json",
|
|
10
|
-
"geojson",
|
|
11
|
-
"geo shape",
|
|
12
|
-
"geom",
|
|
13
|
-
"geometry",
|
|
14
|
-
"geo shape",
|
|
15
|
-
"geoshape",
|
|
16
|
-
]
|
|
17
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"latitude",
|
|
9
|
-
"lat",
|
|
10
|
-
"y",
|
|
11
|
-
"yf",
|
|
12
|
-
"yd",
|
|
13
|
-
"coordonnee y",
|
|
14
|
-
"coord y",
|
|
15
|
-
"ycoord",
|
|
16
|
-
"geocodage y gps",
|
|
17
|
-
"location latitude",
|
|
18
|
-
"ylatitude",
|
|
19
|
-
"ylat",
|
|
20
|
-
"latitude (y)",
|
|
21
|
-
"latitudeorg",
|
|
22
|
-
"coordinates.latitude",
|
|
23
|
-
"googlemap latitude",
|
|
24
|
-
"latitudelieu",
|
|
25
|
-
"latitude googlemap",
|
|
26
|
-
"latitude wgs84",
|
|
27
|
-
"y wgs84",
|
|
28
|
-
"latitude (wgs84)",
|
|
29
|
-
]
|
|
30
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
COMMON_COORDS_LABELS = [
|
|
6
|
-
"ban",
|
|
7
|
-
"coordinates",
|
|
8
|
-
"coordonnees",
|
|
9
|
-
"coordonnees insee",
|
|
10
|
-
"geo",
|
|
11
|
-
"geopoint",
|
|
12
|
-
"geoloc",
|
|
13
|
-
"geolocalisation",
|
|
14
|
-
"geom",
|
|
15
|
-
"geometry",
|
|
16
|
-
"gps",
|
|
17
|
-
"localisation",
|
|
18
|
-
"point",
|
|
19
|
-
"position",
|
|
20
|
-
"wgs84",
|
|
21
|
-
]
|
|
22
|
-
|
|
23
|
-
specific = [
|
|
24
|
-
"latlon",
|
|
25
|
-
"lat lon",
|
|
26
|
-
"x y",
|
|
27
|
-
"xy",
|
|
28
|
-
]
|
|
29
|
-
|
|
30
|
-
# we aim wide to catch exact matches if possible for the highest possible score
|
|
31
|
-
words = (
|
|
32
|
-
COMMON_COORDS_LABELS
|
|
33
|
-
+ specific
|
|
34
|
-
+ [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]]
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def _is(header: str) -> float:
|
|
39
|
-
return header_score(header, words)
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
# Does not detect CRS
|
|
8
|
-
words_combinations_list = [
|
|
9
|
-
"longitude",
|
|
10
|
-
"lon",
|
|
11
|
-
"long",
|
|
12
|
-
"geocodage x gps",
|
|
13
|
-
"location longitude",
|
|
14
|
-
"xlongitude",
|
|
15
|
-
"lng",
|
|
16
|
-
"xlong",
|
|
17
|
-
"x",
|
|
18
|
-
"xf",
|
|
19
|
-
"xd",
|
|
20
|
-
]
|
|
21
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
from ..latlon_wgs import COMMON_COORDS_LABELS
|
|
4
|
-
|
|
5
|
-
PROPORTION = 0.5
|
|
6
|
-
|
|
7
|
-
specific = [
|
|
8
|
-
"lonlat",
|
|
9
|
-
"lon lat",
|
|
10
|
-
"y x",
|
|
11
|
-
"yx",
|
|
12
|
-
]
|
|
13
|
-
|
|
14
|
-
# we aim wide to catch exact matches if possible for the highest possible score
|
|
15
|
-
words = (
|
|
16
|
-
COMMON_COORDS_LABELS
|
|
17
|
-
+ specific
|
|
18
|
-
+ [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]]
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _is(header: str) -> float:
|
|
23
|
-
return header_score(header, words)
|
|
File without changes
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"email",
|
|
9
|
-
"mail",
|
|
10
|
-
"courriel",
|
|
11
|
-
"contact",
|
|
12
|
-
"mel",
|
|
13
|
-
"lieucourriel",
|
|
14
|
-
"coordinates.emailcontact",
|
|
15
|
-
"e mail",
|
|
16
|
-
"mo mail",
|
|
17
|
-
"adresse mail",
|
|
18
|
-
"adresse email",
|
|
19
|
-
]
|
|
20
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"url",
|
|
9
|
-
"url source",
|
|
10
|
-
"site web",
|
|
11
|
-
"source url",
|
|
12
|
-
"site internet",
|
|
13
|
-
"remote url",
|
|
14
|
-
"web",
|
|
15
|
-
"site",
|
|
16
|
-
"lien",
|
|
17
|
-
"site data",
|
|
18
|
-
"lien url",
|
|
19
|
-
"lien vers le fichier",
|
|
20
|
-
"sitweb",
|
|
21
|
-
"interneturl",
|
|
22
|
-
]
|
|
23
|
-
return header_score(header, words_combinations_list)
|
|
File without changes
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"date",
|
|
9
|
-
"jour",
|
|
10
|
-
"date de mise a jour",
|
|
11
|
-
"sns date",
|
|
12
|
-
"date maj",
|
|
13
|
-
"rem date",
|
|
14
|
-
"periode",
|
|
15
|
-
"date de publication",
|
|
16
|
-
"dpc",
|
|
17
|
-
"extract date",
|
|
18
|
-
"date immatriculation",
|
|
19
|
-
"date jeu donnees",
|
|
20
|
-
"datemaj",
|
|
21
|
-
"dateouv",
|
|
22
|
-
"date der maj",
|
|
23
|
-
"dmaj",
|
|
24
|
-
"jour",
|
|
25
|
-
"yyyymmdd",
|
|
26
|
-
"aaaammjj",
|
|
27
|
-
]
|
|
28
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"datetime",
|
|
9
|
-
"timestamp",
|
|
10
|
-
"osm_timestamp",
|
|
11
|
-
"date",
|
|
12
|
-
"created at",
|
|
13
|
-
"last update",
|
|
14
|
-
"date maj",
|
|
15
|
-
"createdat",
|
|
16
|
-
"date naissance",
|
|
17
|
-
"date donnees",
|
|
18
|
-
] # Almost same as IS0, no example in data
|
|
19
|
-
return header_score(header, words_combinations_list)
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from csv_detective.parsing.text import header_score
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.5
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(header: str) -> float:
|
|
7
|
-
words_combinations_list = [
|
|
8
|
-
"year",
|
|
9
|
-
"annee",
|
|
10
|
-
"annee depot",
|
|
11
|
-
"an nais",
|
|
12
|
-
"exercice",
|
|
13
|
-
"data year",
|
|
14
|
-
"annee de publication",
|
|
15
|
-
"exercice comptable",
|
|
16
|
-
"annee de naissance",
|
|
17
|
-
"annee ouverture",
|
|
18
|
-
]
|
|
19
|
-
return header_score(header, words_combinations_list)
|
csv_detective/load_tests.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
from csv_detective import detect_fields, detect_labels # noqa
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def get_all_packages(detect_type) -> list:
|
|
7
|
-
root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
|
|
8
|
-
modules = []
|
|
9
|
-
for dirpath, _, filenames in os.walk(root_dir):
|
|
10
|
-
for filename in filenames:
|
|
11
|
-
file = os.path.join(dirpath, filename).replace(root_dir, "")
|
|
12
|
-
if file.endswith("__init__.py"):
|
|
13
|
-
module = file.replace("__init__.py", "").replace("/", ".").replace("\\", ".")[:-1]
|
|
14
|
-
if module:
|
|
15
|
-
modules.append(detect_type + module)
|
|
16
|
-
return modules
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def return_all_tests(
|
|
20
|
-
user_input_tests: str | list,
|
|
21
|
-
detect_type: str,
|
|
22
|
-
) -> dict[str, dict]:
|
|
23
|
-
"""
|
|
24
|
-
returns all tests that have a method _is and are listed in the user_input_tests
|
|
25
|
-
the function can select a sub_package from csv_detective
|
|
26
|
-
user_input_tests may look like this:
|
|
27
|
-
- "ALL": all possible tests are made
|
|
28
|
-
- "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
|
|
29
|
-
this specifc (group of) test(s) only
|
|
30
|
-
- ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
|
|
31
|
-
specific (groups of) tests by add "-" at the start (e.g "-temp.date")
|
|
32
|
-
"""
|
|
33
|
-
assert detect_type in ["detect_fields", "detect_labels"]
|
|
34
|
-
all_packages = get_all_packages(detect_type=detect_type)
|
|
35
|
-
|
|
36
|
-
if isinstance(user_input_tests, str):
|
|
37
|
-
user_input_tests = [user_input_tests]
|
|
38
|
-
if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
|
|
39
|
-
tests_to_do = [detect_type]
|
|
40
|
-
else:
|
|
41
|
-
tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
|
|
42
|
-
tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
|
|
43
|
-
# removing specified (groups of) tests
|
|
44
|
-
all_tests = [
|
|
45
|
-
# this is why we need to import detect_fields/labels
|
|
46
|
-
eval(x)
|
|
47
|
-
for x in all_packages
|
|
48
|
-
if any([y == x[: len(y)] for y in tests_to_do])
|
|
49
|
-
and all([y != x[: len(y)] for y in tests_skipped])
|
|
50
|
-
]
|
|
51
|
-
return {
|
|
52
|
-
test.__name__.split(".")[-1]: {
|
|
53
|
-
"func": test._is,
|
|
54
|
-
"prop": test.PROPORTION,
|
|
55
|
-
"module": test,
|
|
56
|
-
}
|
|
57
|
-
for test in all_tests
|
|
58
|
-
if "_is" in dir(test)
|
|
59
|
-
}
|