csv-detective 0.8.1.dev1674__py3-none-any.whl → 0.8.1.dev1720__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +0 -2
- csv_detective/cli.py +6 -9
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
- csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
- csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +4 -2
- csv_detective/detect_fields/other/int/__init__.py +3 -3
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
- csv_detective/detect_fields/other/twitter/__init__.py +2 -2
- csv_detective/detect_fields/other/uuid/__init__.py +4 -5
- csv_detective/detect_fields/temp/date/__init__.py +3 -2
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
- csv_detective/detect_fields/temp/year/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detection/columns.py +9 -9
- csv_detective/detection/encoding.py +6 -4
- csv_detective/detection/engine.py +6 -5
- csv_detective/detection/formats.py +19 -19
- csv_detective/detection/headers.py +3 -5
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/variables.py +4 -4
- csv_detective/explore_csv.py +7 -8
- csv_detective/load_tests.py +6 -14
- csv_detective/output/__init__.py +3 -7
- csv_detective/output/dataframe.py +9 -5
- csv_detective/output/example.py +13 -13
- csv_detective/output/profile.py +30 -23
- csv_detective/output/schema.py +20 -23
- csv_detective/output/utils.py +15 -15
- csv_detective/parsing/columns.py +23 -12
- csv_detective/parsing/csv.py +1 -1
- csv_detective/parsing/excel.py +10 -11
- csv_detective/parsing/load.py +11 -8
- csv_detective/parsing/text.py +4 -9
- csv_detective/s3_utils.py +3 -7
- csv_detective/utils.py +4 -2
- csv_detective/validate.py +18 -13
- csv_detective-0.8.1.dev1674.data/data/share/csv_detective/README.md → csv_detective-0.8.1.dev1720.dist-info/METADATA +32 -0
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/RECORD +81 -81
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/top_level.txt +2 -0
- tests/test_example.py +2 -6
- tests/test_fields.py +16 -10
- tests/test_file.py +10 -9
- tests/test_labels.py +3 -2
- tests/test_structure.py +3 -1
- tests/test_validation.py +9 -6
- venv/bin/activate_this.py +38 -0
- venv/bin/jp.py +54 -0
- venv/bin/runxlrd.py +410 -0
- csv_detective-0.8.1.dev1674.data/data/share/csv_detective/CHANGELOG.md +0 -186
- csv_detective-0.8.1.dev1674.dist-info/METADATA +0 -268
- csv_detective-0.8.1.dev1674.dist-info/licenses/LICENSE +0 -21
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1674.data/data/share/csv_detective → csv_detective-0.8.1.dev1720.dist-info/licenses}/LICENSE +0 -0
csv_detective/__init__.py
CHANGED
csv_detective/cli.py
CHANGED
|
@@ -4,23 +4,20 @@ Command line client for csv_detective
|
|
|
4
4
|
|
|
5
5
|
import argparse
|
|
6
6
|
import json
|
|
7
|
+
|
|
7
8
|
from csv_detective.explore_csv import routine
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
def run():
|
|
11
12
|
explorer = argparse.ArgumentParser(description="Analyse a tabular file")
|
|
12
|
-
explorer.add_argument(
|
|
13
|
-
"file_path",
|
|
14
|
-
type=str,
|
|
15
|
-
help="Enter path of tabular file to explore"
|
|
16
|
-
)
|
|
13
|
+
explorer.add_argument("file_path", type=str, help="Enter path of tabular file to explore")
|
|
17
14
|
explorer.add_argument(
|
|
18
15
|
"-n",
|
|
19
16
|
"--num_rows",
|
|
20
17
|
dest="num_rows",
|
|
21
18
|
type=int,
|
|
22
19
|
nargs="?",
|
|
23
|
-
help="Number of rows to use for detection (default 500)"
|
|
20
|
+
help="Number of rows to use for detection (default 500)",
|
|
24
21
|
)
|
|
25
22
|
explorer.add_argument(
|
|
26
23
|
"-s",
|
|
@@ -28,14 +25,14 @@ def run():
|
|
|
28
25
|
dest="sep",
|
|
29
26
|
type=str,
|
|
30
27
|
nargs="?",
|
|
31
|
-
help="Columns separator (detected if not specified)"
|
|
28
|
+
help="Columns separator (detected if not specified)",
|
|
32
29
|
)
|
|
33
30
|
explorer.add_argument(
|
|
34
31
|
"--save",
|
|
35
32
|
dest="save_results",
|
|
36
33
|
type=int,
|
|
37
34
|
nargs="?",
|
|
38
|
-
help="Whether to save the resulting analysis to json (1 = save, 0 = don't)"
|
|
35
|
+
help="Whether to save the resulting analysis to json (1 = save, 0 = don't)",
|
|
39
36
|
)
|
|
40
37
|
explorer.add_argument(
|
|
41
38
|
"-v",
|
|
@@ -43,7 +40,7 @@ def run():
|
|
|
43
40
|
dest="verbose",
|
|
44
41
|
type=int,
|
|
45
42
|
nargs="?",
|
|
46
|
-
help="Verbose (0 = quiet, 1 = details)"
|
|
43
|
+
help="Verbose (0 = quiet, 1 = details)",
|
|
47
44
|
)
|
|
48
45
|
|
|
49
46
|
opts = explorer.parse_args()
|
|
@@ -3,97 +3,97 @@ from csv_detective.parsing.text import _process_text
|
|
|
3
3
|
PROPORTION = 0.55
|
|
4
4
|
# ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long
|
|
5
5
|
voies = {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
6
|
+
"aire ",
|
|
7
|
+
"allee ",
|
|
8
|
+
"avenue ",
|
|
9
|
+
"base ",
|
|
10
|
+
"boulevard ",
|
|
11
|
+
"cami ",
|
|
12
|
+
"carrefour ",
|
|
13
|
+
"chemin ",
|
|
14
|
+
"cheminement ",
|
|
15
|
+
"chaussee ",
|
|
16
|
+
"cite ",
|
|
17
|
+
"clos ",
|
|
18
|
+
"coin ",
|
|
19
|
+
"corniche ",
|
|
20
|
+
"cote ",
|
|
21
|
+
"cour ",
|
|
22
|
+
"cours ",
|
|
23
|
+
"domaine ",
|
|
24
|
+
"descente ",
|
|
25
|
+
"ecart ",
|
|
26
|
+
"esplanade ",
|
|
27
|
+
"faubourg ",
|
|
28
|
+
"gare ",
|
|
29
|
+
"grande rue",
|
|
30
|
+
"hameau ",
|
|
31
|
+
"halle ",
|
|
32
|
+
"ilot ",
|
|
33
|
+
"impasse ",
|
|
34
|
+
"lieu dit",
|
|
35
|
+
"lotissement ",
|
|
36
|
+
"marche ",
|
|
37
|
+
"montee ",
|
|
38
|
+
"parc ",
|
|
39
|
+
"passage ",
|
|
40
|
+
"place ",
|
|
41
|
+
"plan ",
|
|
42
|
+
"plaine ",
|
|
43
|
+
"plateau ",
|
|
44
|
+
"pont ",
|
|
45
|
+
"port ",
|
|
46
|
+
"promenade ",
|
|
47
|
+
"parvis ",
|
|
48
|
+
"quartier ",
|
|
49
|
+
"quai ",
|
|
50
|
+
"residence ",
|
|
51
|
+
"ruelle ",
|
|
52
|
+
"rocade ",
|
|
53
|
+
"rond point",
|
|
54
|
+
"route ",
|
|
55
|
+
"rue ",
|
|
56
56
|
# 'sente - sentier',
|
|
57
|
-
|
|
58
|
-
|
|
57
|
+
"square ",
|
|
58
|
+
"tour ",
|
|
59
59
|
# 'terre-plein',
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
60
|
+
"traverse ",
|
|
61
|
+
"villa ",
|
|
62
|
+
"village ",
|
|
63
|
+
"voie ",
|
|
64
|
+
"zone artisanale",
|
|
65
|
+
"zone d’amenagement concerte",
|
|
66
|
+
"zone d’amenagement differe",
|
|
67
|
+
"zone industrielle",
|
|
68
|
+
"zone ",
|
|
69
69
|
# 'r',
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
70
|
+
"av ",
|
|
71
|
+
"pl ",
|
|
72
|
+
"bd ",
|
|
73
|
+
"cami ",
|
|
74
74
|
# 'che',
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
75
|
+
"chs ",
|
|
76
|
+
"dom ",
|
|
77
|
+
"ham ",
|
|
78
|
+
"ld ",
|
|
79
79
|
# 'pro',
|
|
80
80
|
# 'rte',
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
"vlge ",
|
|
82
|
+
"za ",
|
|
83
|
+
"zac ",
|
|
84
|
+
"zad ",
|
|
85
|
+
"zi ",
|
|
86
86
|
# 'car',
|
|
87
|
-
|
|
87
|
+
"fg ",
|
|
88
88
|
# 'lot',
|
|
89
|
-
|
|
89
|
+
"imp ",
|
|
90
90
|
# 'qu',
|
|
91
|
-
|
|
91
|
+
"mte",
|
|
92
92
|
}
|
|
93
93
|
|
|
94
94
|
|
|
95
95
|
def _is(val):
|
|
96
|
-
|
|
96
|
+
"""Repere des adresses"""
|
|
97
97
|
if not isinstance(val, str) or len(val) > 150:
|
|
98
98
|
return False
|
|
99
99
|
val = _process_text(val)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from frformat import NumeroDepartement, Options
|
|
1
|
+
from frformat import Millesime, NumeroDepartement, Options
|
|
2
2
|
|
|
3
3
|
PROPORTION = 1
|
|
4
4
|
|
|
@@ -6,7 +6,7 @@ _options = Options(
|
|
|
6
6
|
ignore_case=True,
|
|
7
7
|
ignore_accents=True,
|
|
8
8
|
replace_non_alphanumeric_with_space=True,
|
|
9
|
-
ignore_extra_whitespace=True
|
|
9
|
+
ignore_extra_whitespace=True,
|
|
10
10
|
)
|
|
11
11
|
_numero_departement = NumeroDepartement(Millesime.LATEST, _options)
|
|
12
12
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from frformat import Commune,
|
|
1
|
+
from frformat import Commune, Millesime, Options
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.9
|
|
4
4
|
|
|
@@ -6,7 +6,7 @@ _options = Options(
|
|
|
6
6
|
ignore_case=True,
|
|
7
7
|
ignore_accents=True,
|
|
8
8
|
replace_non_alphanumeric_with_space=True,
|
|
9
|
-
ignore_extra_whitespace=True
|
|
9
|
+
ignore_extra_whitespace=True,
|
|
10
10
|
)
|
|
11
11
|
_commune = Commune(Millesime.LATEST, _options)
|
|
12
12
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from frformat import Departement,
|
|
1
|
+
from frformat import Departement, Millesime, Options
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.9
|
|
4
4
|
|
|
@@ -6,7 +6,7 @@ _options = Options(
|
|
|
6
6
|
ignore_case=True,
|
|
7
7
|
ignore_accents=True,
|
|
8
8
|
replace_non_alphanumeric_with_space=True,
|
|
9
|
-
ignore_extra_whitespace=True
|
|
9
|
+
ignore_extra_whitespace=True,
|
|
10
10
|
)
|
|
11
11
|
_departement = Departement(Millesime.LATEST, _options)
|
|
12
12
|
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
from frformat import Canton,
|
|
1
|
+
from frformat import Canton, Millesime, Options
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.9
|
|
4
4
|
_options = Options(
|
|
5
5
|
ignore_case=True,
|
|
6
6
|
ignore_accents=True,
|
|
7
7
|
replace_non_alphanumeric_with_space=True,
|
|
8
|
-
ignore_extra_whitespace=True
|
|
8
|
+
ignore_extra_whitespace=True,
|
|
9
9
|
)
|
|
10
10
|
_canton = Canton(Millesime.LATEST, _options)
|
|
11
11
|
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
from frformat import LatitudeL93
|
|
2
|
-
from csv_detective.detect_fields.other.float import _is as is_float
|
|
3
2
|
|
|
3
|
+
from csv_detective.detect_fields.other.float import _is as is_float
|
|
4
4
|
from csv_detective.detect_fields.other.float import float_casting
|
|
5
5
|
|
|
6
|
-
|
|
7
6
|
PROPORTION = 0.9
|
|
8
7
|
|
|
9
8
|
_latitudel93 = LatitudeL93()
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
from frformat import LongitudeL93
|
|
2
|
-
from csv_detective.detect_fields.other.float import _is as is_float
|
|
3
2
|
|
|
3
|
+
from csv_detective.detect_fields.other.float import _is as is_float
|
|
4
4
|
from csv_detective.detect_fields.other.float import float_casting
|
|
5
5
|
|
|
6
|
-
|
|
7
6
|
PROPORTION = 0.9
|
|
8
7
|
|
|
9
8
|
_longitudel93 = LongitudeL93()
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
from frformat import
|
|
1
|
+
from frformat import Millesime, Options, Pays
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.6
|
|
4
4
|
|
|
5
5
|
_options = Options(
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
6
|
+
ignore_case=True,
|
|
7
|
+
ignore_accents=True,
|
|
8
|
+
replace_non_alphanumeric_with_space=True,
|
|
9
|
+
ignore_extra_whitespace=True,
|
|
10
|
+
)
|
|
11
11
|
_pays = Pays(Millesime.LATEST, _options)
|
|
12
12
|
|
|
13
13
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
from frformat import
|
|
1
|
+
from frformat import Millesime, Options, Region
|
|
2
2
|
|
|
3
3
|
PROPORTION = 1
|
|
4
4
|
|
|
5
|
-
_extra_valid_values_set = frozenset(
|
|
5
|
+
_extra_valid_values_set = frozenset(
|
|
6
|
+
{
|
|
6
7
|
"alsace",
|
|
7
8
|
"aquitaine",
|
|
8
9
|
"ara",
|
|
@@ -30,7 +31,8 @@ _extra_valid_values_set = frozenset({
|
|
|
30
31
|
"poitou charentes",
|
|
31
32
|
"reunion",
|
|
32
33
|
"rhone alpes",
|
|
33
|
-
|
|
34
|
+
}
|
|
35
|
+
)
|
|
34
36
|
|
|
35
37
|
|
|
36
38
|
_options = Options(
|
|
@@ -38,7 +40,7 @@ _options = Options(
|
|
|
38
40
|
ignore_accents=True,
|
|
39
41
|
replace_non_alphanumeric_with_space=True,
|
|
40
42
|
ignore_extra_whitespace=True,
|
|
41
|
-
extra_valid_values=_extra_valid_values_set
|
|
43
|
+
extra_valid_values=_extra_valid_values_set,
|
|
42
44
|
)
|
|
43
45
|
_region = Region(Millesime.LATEST, _options)
|
|
44
46
|
|
|
@@ -1,28 +1,29 @@
|
|
|
1
|
-
from csv_detective.parsing.text import _process_text
|
|
2
1
|
import re
|
|
3
2
|
|
|
3
|
+
from csv_detective.parsing.text import _process_text
|
|
4
|
+
|
|
4
5
|
PROPORTION = 1
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
def _is(val):
|
|
8
|
-
|
|
9
|
+
"""Repère les code csp telles que définies par l'INSEE"""
|
|
9
10
|
if not isinstance(val, str):
|
|
10
11
|
return False
|
|
11
12
|
val = _process_text(val)
|
|
12
13
|
if len(val) != 4:
|
|
13
14
|
return False
|
|
14
|
-
a = bool(re.match(r
|
|
15
|
+
a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val))
|
|
15
16
|
b = val in {
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
17
|
+
"7100",
|
|
18
|
+
"7200",
|
|
19
|
+
"7400",
|
|
20
|
+
"7500",
|
|
21
|
+
"7700",
|
|
22
|
+
"7800",
|
|
23
|
+
"8100",
|
|
24
|
+
"8300",
|
|
25
|
+
"8400",
|
|
26
|
+
"8500",
|
|
27
|
+
"8600",
|
|
27
28
|
}
|
|
28
29
|
return a or b
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from os.path import dirname, join
|
|
2
|
+
|
|
2
3
|
from csv_detective.parsing.text import _process_text
|
|
3
4
|
|
|
4
5
|
PROPORTION = 1
|
|
5
|
-
f = open(join(dirname(__file__),
|
|
6
|
-
codes_insee = f.read().split(
|
|
6
|
+
f = open(join(dirname(__file__), "csp_insee.txt"), "r")
|
|
7
|
+
codes_insee = f.read().split("\n")
|
|
7
8
|
# removing empty str due to additionnal line in file
|
|
8
9
|
del codes_insee[-1]
|
|
9
10
|
codes_insee = set(codes_insee)
|
|
@@ -11,7 +12,7 @@ f.close()
|
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
def _is(val):
|
|
14
|
-
|
|
15
|
+
"""Repère les csp telles que définies par l'INSEE"""
|
|
15
16
|
if not isinstance(val, str):
|
|
16
17
|
return False
|
|
17
18
|
val = _process_text(val)
|
|
@@ -2,11 +2,11 @@ import re
|
|
|
2
2
|
|
|
3
3
|
PROPORTION = 1
|
|
4
4
|
regex = (
|
|
5
|
-
r
|
|
6
|
-
r
|
|
5
|
+
r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"
|
|
6
|
+
r"|octobre|novembre|decembre)[ \-]\d{4}$"
|
|
7
7
|
)
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def _is(val):
|
|
11
|
-
|
|
11
|
+
"""Repere les dates textuelles FR"""
|
|
12
12
|
return isinstance(val, str) and bool(re.match(regex, val))
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from os.path import dirname, join
|
|
2
|
+
|
|
2
3
|
from csv_detective.parsing.text import _process_text
|
|
3
4
|
|
|
4
5
|
PROPORTION = 1
|
|
5
|
-
f = open(join(dirname(__file__),
|
|
6
|
-
condes_insee_ape = f.read().split(
|
|
6
|
+
f = open(join(dirname(__file__), "insee_ape700.txt"), "r")
|
|
7
|
+
condes_insee_ape = f.read().split("\n")
|
|
7
8
|
# removing empty str due to additionnal line in file
|
|
8
9
|
del condes_insee_ape[-1]
|
|
9
10
|
condes_insee_ape = set(condes_insee_ape)
|
|
@@ -11,7 +12,7 @@ f.close()
|
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
def _is(val):
|
|
14
|
-
|
|
15
|
+
"""Repère les codes APE700 de l'INSEE"""
|
|
15
16
|
if not isinstance(val, str):
|
|
16
17
|
return False
|
|
17
18
|
val = _process_text(val).upper()
|
|
@@ -4,8 +4,8 @@ PROPORTION = 1
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
|
-
|
|
7
|
+
"""Repère le sexe"""
|
|
8
8
|
if not isinstance(val, str):
|
|
9
9
|
return False
|
|
10
10
|
val = _process_text(val)
|
|
11
|
-
return val in {
|
|
11
|
+
return val in {"homme", "femme", "h", "f", "m", "masculin", "feminin"}
|
|
@@ -4,11 +4,11 @@ PROPORTION = 0.9
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
|
-
|
|
7
|
+
"""Repere les codes SIREN"""
|
|
8
8
|
if not isinstance(val, str):
|
|
9
9
|
return False
|
|
10
|
-
val = val.replace(
|
|
11
|
-
if not bool(re.match(r
|
|
10
|
+
val = val.replace(" ", "")
|
|
11
|
+
if not bool(re.match(r"^[0-9]{9}$", val)):
|
|
12
12
|
return False
|
|
13
13
|
# Vérification par clé propre aux codes siren
|
|
14
14
|
cle = 0
|
|
@@ -4,11 +4,11 @@ PROPORTION = 0.8
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
|
-
|
|
7
|
+
"""Détection des identifiants SIRET (SIRENE)"""
|
|
8
8
|
if not isinstance(val, str):
|
|
9
9
|
return False
|
|
10
|
-
val = val.replace(
|
|
11
|
-
if not bool(re.match(r
|
|
10
|
+
val = val.replace(" ", "")
|
|
11
|
+
if not bool(re.match(r"^[0-9]{14}$", val)):
|
|
12
12
|
return False
|
|
13
13
|
|
|
14
14
|
# Vérification par clé de luhn du SIREN
|
|
@@ -4,14 +4,14 @@ PROPORTION = 0.7
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
|
-
|
|
7
|
+
"""Repère les numeros de telephone francais"""
|
|
8
8
|
if not isinstance(val, str):
|
|
9
9
|
return False
|
|
10
10
|
|
|
11
11
|
if len(val) < 10:
|
|
12
12
|
return False
|
|
13
13
|
|
|
14
|
-
val = val.replace(
|
|
14
|
+
val = val.replace(".", "").replace("-", "").replace(" ", "")
|
|
15
15
|
|
|
16
|
-
match_1 = bool(re.match(r
|
|
16
|
+
match_1 = bool(re.match(r"^(0|\+33|0033)?[0-9]{9}$", val))
|
|
17
17
|
return match_1
|
|
@@ -4,12 +4,12 @@ PROPORTION = 1
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
|
-
|
|
7
|
+
"""Repere les codes UAI de l'éducation nationale"""
|
|
8
8
|
|
|
9
9
|
# test sur la longueur
|
|
10
10
|
if not isinstance(val, str) or len(val) != 8:
|
|
11
11
|
return False
|
|
12
12
|
|
|
13
|
-
if not bool(re.match(r
|
|
13
|
+
if not bool(re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val)):
|
|
14
14
|
return False
|
|
15
15
|
return True
|
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
PROPORTION = 1
|
|
2
2
|
jours = {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
3
|
+
"lundi",
|
|
4
|
+
"mardi",
|
|
5
|
+
"mercredi",
|
|
6
|
+
"jeudi",
|
|
7
|
+
"vendredi",
|
|
8
|
+
"samedi",
|
|
9
|
+
"dimanche",
|
|
10
|
+
"lun",
|
|
11
|
+
"mar",
|
|
12
|
+
"mer",
|
|
13
|
+
"jeu",
|
|
14
|
+
"ven",
|
|
15
|
+
"sam",
|
|
16
|
+
"dim",
|
|
17
17
|
}
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def _is(val):
|
|
21
|
-
|
|
21
|
+
"""Renvoie True si les champs peuvent être des jours de la semaine"""
|
|
22
22
|
if not isinstance(val, str):
|
|
23
23
|
return False
|
|
24
24
|
val = val.lower()
|