csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
# flake8: noqa
|
|
2
|
-
from .FR.other import (
|
|
3
|
-
code_csp_insee,
|
|
4
|
-
csp_insee,
|
|
5
|
-
sexe,
|
|
6
|
-
siren,
|
|
7
|
-
tel_fr,
|
|
8
|
-
uai,
|
|
9
|
-
siret,
|
|
10
|
-
insee_ape700,
|
|
11
|
-
date_fr,
|
|
12
|
-
code_waldec,
|
|
13
|
-
code_rna
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
from .other import (
|
|
17
|
-
email,
|
|
18
|
-
url,
|
|
19
|
-
booleen,
|
|
20
|
-
mongo_object_id,
|
|
21
|
-
twitter,
|
|
22
|
-
float,
|
|
23
|
-
int,
|
|
24
|
-
uuid,
|
|
25
|
-
json
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
from .FR.geo import (
|
|
29
|
-
adresse,
|
|
30
|
-
code_commune_insee,
|
|
31
|
-
code_postal,
|
|
32
|
-
commune,
|
|
33
|
-
departement,
|
|
34
|
-
pays,
|
|
35
|
-
region,
|
|
36
|
-
code_departement,
|
|
37
|
-
code_fantoir,
|
|
38
|
-
longitude_wgs_fr_metropole,
|
|
39
|
-
latitude_wgs_fr_metropole,
|
|
40
|
-
code_region,
|
|
41
|
-
latitude_l93,
|
|
42
|
-
longitude_l93,
|
|
43
|
-
insee_canton
|
|
44
|
-
)
|
|
45
|
-
|
|
46
|
-
from .geo import (
|
|
47
|
-
iso_country_code_alpha2,
|
|
48
|
-
iso_country_code_alpha3,
|
|
49
|
-
iso_country_code_numeric,
|
|
50
|
-
latitude_wgs,
|
|
51
|
-
longitude_wgs,
|
|
52
|
-
latlon_wgs,
|
|
53
|
-
json_geojson
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
from .FR.temp import jour_de_la_semaine, mois_de_annee
|
|
57
|
-
from .temp import year, date, datetime_iso, datetime_rfc822
|
|
File without changes
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
from os.path import dirname, join
|
|
2
|
-
import re
|
|
3
|
-
|
|
4
|
-
PROPORTION = 1

# Reference codes are read once, at import time, from the text file that sits
# next to this module (one code per line).
with open(join(dirname(__file__), 'iso_country_code_alpha2.txt'), 'r') as iofile:
    liste_pays = set(iofile.read().split('\n'))


def _is(val):
    '''Return True if val can be an ISO alpha-2 country code, False otherwise.

    val must be exactly two uppercase ASCII letters and must be present in
    the reference list loaded above.
    '''
    has_expected_shape = re.match(r'[A-Z]{2}$', val) is not None
    return has_expected_shape and val in liste_pays
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from os.path import dirname, join
|
|
2
|
-
import re
|
|
3
|
-
|
|
4
|
-
PROPORTION = 1

# Reference codes are read once, at import time, from the text file that sits
# next to this module (one code per line).
with open(join(dirname(__file__), 'iso_country_code_alpha3.txt'), 'r') as iofile:
    liste_pays = iofile.read().split('\n')

# Lookup set built once at import time instead of on every _is() call.
# liste_pays itself is left as a list so existing importers are unaffected.
_codes_pays = frozenset(liste_pays)


def _is(val):
    '''Return True if val can be an ISO alpha-3 country code, False otherwise.

    val must be exactly three uppercase ASCII letters and must be present in
    the reference list loaded above.
    '''
    if not bool(re.match(r'[A-Z]{3}$', val)):
        return False
    return val in _codes_pays
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
from os.path import dirname, join
|
|
2
|
-
import re
|
|
3
|
-
|
|
4
|
-
PROPORTION = 1

# Reference codes are read once, at import time, from the text file that sits
# next to this module (one code per line).
with open(join(dirname(__file__), 'iso_country_code_numeric.txt'), 'r') as iofile:
    liste_pays = set(iofile.read().split('\n'))


def _is(val):
    '''Return True if val can be an ISO numeric country code, False otherwise.

    val must be exactly three digits and must be present in the reference
    list loaded above.
    '''
    has_expected_shape = re.match(r'[0-9]{3}$', val) is not None
    return has_expected_shape and val in liste_pays
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from json import JSONDecodeError
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.9
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(val):
|
|
8
|
-
'''Renvoie True si val peut etre geojson'''
|
|
9
|
-
|
|
10
|
-
try:
|
|
11
|
-
j = json.loads(val)
|
|
12
|
-
if 'type' in j and 'coordinates' in j:
|
|
13
|
-
return True
|
|
14
|
-
if 'geometry' in j:
|
|
15
|
-
if 'coordinates' in j['geometry']:
|
|
16
|
-
return True
|
|
17
|
-
except JSONDecodeError:
|
|
18
|
-
pass
|
|
19
|
-
except TypeError:
|
|
20
|
-
pass
|
|
21
|
-
|
|
22
|
-
return False
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from csv_detective.detect_fields.other.float import _is as is_float
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.9


def _is(val):
    '''Return True if val can be a latitude, False otherwise.

    A value qualifies when is_float accepts it and float(val) lies within
    [-90, 90]. A ValueError or OverflowError raised along the way yields
    False.
    '''
    try:
        return is_float(val) and -90 <= float(val) <= 90
    except (ValueError, OverflowError):
        return False
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from csv_detective.detect_fields.other.float import _is as is_float
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.9


def _is(val):
    '''Return True if val can be a longitude, False otherwise.

    A value qualifies when is_float accepts it and float(val) lies within
    [-180, 180]. A ValueError or OverflowError raised along the way yields
    False.
    '''
    try:
        # The range check must be returned: without `return` the expression
        # is evaluated and discarded, and the function yields None (falsy)
        # for every value that does not raise.
        return is_float(val) and -180 <= float(val) <= 180
    except (ValueError, OverflowError):
        return False
|
|
File without changes
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
PROPORTION = 1
|
|
2
|
-
liste_bool = {
|
|
3
|
-
'0',
|
|
4
|
-
'1',
|
|
5
|
-
'vrai',
|
|
6
|
-
'faux',
|
|
7
|
-
'true',
|
|
8
|
-
'false',
|
|
9
|
-
'oui',
|
|
10
|
-
'non',
|
|
11
|
-
'yes',
|
|
12
|
-
'no',
|
|
13
|
-
'y',
|
|
14
|
-
'n',
|
|
15
|
-
'o'
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def _is(val):
|
|
20
|
-
'''Détection les booléens'''
|
|
21
|
-
return val.lower() in liste_bool
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
PROPORTION = 1
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def float_casting(str2cast):
|
|
5
|
-
return float(str2cast.replace(',', '.'))
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def _is(val):
|
|
9
|
-
'''Detects floats, assuming that tables will not have scientific
|
|
10
|
-
notations (3e6) or "+" in the string. "-" is still accepted.'''
|
|
11
|
-
try:
|
|
12
|
-
if any([k in val for k in ['_', '+', 'e', 'E']]):
|
|
13
|
-
return False
|
|
14
|
-
float_casting(val)
|
|
15
|
-
return True
|
|
16
|
-
except ValueError:
|
|
17
|
-
return False
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from json import JSONDecodeError
|
|
3
|
-
|
|
4
|
-
PROPORTION = 1
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def _is(val):
|
|
8
|
-
'''Detects json'''
|
|
9
|
-
try:
|
|
10
|
-
loaded = json.loads(val)
|
|
11
|
-
if isinstance(loaded, list) or (
|
|
12
|
-
isinstance(loaded, dict) and not (
|
|
13
|
-
any(
|
|
14
|
-
[
|
|
15
|
-
geo in loaded for geo in ['coordinates', 'geometry']
|
|
16
|
-
]
|
|
17
|
-
)
|
|
18
|
-
)
|
|
19
|
-
):
|
|
20
|
-
return True
|
|
21
|
-
else:
|
|
22
|
-
return False
|
|
23
|
-
except JSONDecodeError:
|
|
24
|
-
return False
|
|
File without changes
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from dateutil.parser import parse, ParserError
|
|
3
|
-
from csv_detective.detect_fields.other.float import _is as is_float
|
|
4
|
-
from unidecode import unidecode
|
|
5
|
-
|
|
6
|
-
PROPORTION = 1
# /!\ this is only for dates, not datetimes which are handled by other utils


def is_dateutil_date(val: str) -> bool:
    '''Return True if dateutil parses val as a date with no time component.

    Strings longer than 17 characters are rejected up front so that
    datetimes are not picked up here; the longest date string expected is
    DD-septembre-YYYY (17 characters).
    '''
    if len(val) > 17:
        return False
    try:
        parsed = parse(val, fuzzy=False)
    except (ParserError, ValueError, TypeError, OverflowError):
        return False
    # Any non-zero hour, minute or second means this is a datetime.
    return not (parsed.hour or parsed.minute or parsed.second)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def _is(val):
    '''Return True if val can be a date, False otherwise.

    dateutil handles most formats (see is_dateutil_date); the regexes below
    are only kept for the cases where parse() is not suitable.
    '''

    # matches 02/12 03 and 02_12 2003
    # The whole year alternation is anchored with "$" so that trailing text
    # (e.g. the time part of a datetime) no longer matches.
    # NOTE(review): inside the character class, " -/" is a range (space
    # through "/"), so more separators than the four listed are accepted.
    a = bool(
        re.match(
            r'^(0[1-9]|[12][0-9]|3[01])[ -/_](0[1-9]|1[012])[ -/_]'
            r'([0-9]{2}|(19|20)[0-9]{2})$',
            val
        )
    )

    # matches 02052003
    # Four-digit years are (19|20) followed by two digits, and the whole
    # alternation is anchored.
    b = bool(
        re.match(
            r'^(0[1-9]|[12][0-9]|3[01])(0[1-9]|1[012])([0-9]{2}|'
            r'(19|20)[0-9]{2})$',
            val
        )
    )

    # matches DD*MM*YYYY
    c = bool(
        re.match(
            r'^(0[1-9]|[12][0-9]|3[01]).?(0[1-9]|1[012]).?(19|20)?\d\d$', val))

    # matches DD-mmm-YYYY and DD-mmm...mm-YYYY
    # Accents are stripped with unidecode before matching.
    d = bool(
        re.match(
            r'^(0[1-9]|[12][0-9]|3[01])[ -/_;.:,](jan|fev|feb|mar|avr|apr'
            r'|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|'
            r'mai|juin|juillet|aout|septembre|octobre|novembre|decembre)[ -/_;.:,]'
            r'([0-9]{2}$|(19|20)[0-9]{2}$)',
            unidecode(val)
        )
    )

    # Plain numbers that dateutil would parse are excluded via is_float.
    return (is_dateutil_date(val) and not is_float(val)) or a or b or c or d
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
PROPORTION = 1
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(val):
|
|
7
|
-
'''Renvoie True si val peut être une date au format iso, False sinon
|
|
8
|
-
Exemple: 2023-01-15T12:30:45.123456Z'''
|
|
9
|
-
a = bool(
|
|
10
|
-
re.match(
|
|
11
|
-
r'^\d{4}-(0[1-9]|1[012])\-(0[1-9]|[12][0-9]|3[01])[Tt]'
|
|
12
|
-
r'([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9])'
|
|
13
|
-
r'(\.\d+)?([Zz]|[-+](0[0-9]|1[0-2]):[0-5][0-9])?$',
|
|
14
|
-
val
|
|
15
|
-
)
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
return a
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
PROPORTION = 1
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(val):
|
|
7
|
-
'''Renvoie True si val peut être une date au format rfc822, False sinon
|
|
8
|
-
Exemple: Tue, 19 Dec 2023 15:30:45 +0000'''
|
|
9
|
-
|
|
10
|
-
val = val.lower()
|
|
11
|
-
a = bool(
|
|
12
|
-
re.match(
|
|
13
|
-
r'^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} '
|
|
14
|
-
r'([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) '
|
|
15
|
-
r'(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$',
|
|
16
|
-
val,
|
|
17
|
-
re.IGNORECASE
|
|
18
|
-
)
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
return a
|
|
File without changes
|
|
File without changes
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5


def _is(header):
    '''
    Score how well the (processed) header matches the expected words combinations.

    Returns 1.0 when the processed header equals one of them, 0.5 when
    full_word_strictly_inside_string finds one of them in it, else 0.0.
    '''
    candidates = [
        'adresse',
        'adresse postale',
        'adresse geographique',
        'adr',
        'adresse complete',
        'adresse station'
    ]
    cleaned = _process_text(header)

    is_exact = any([candidate == cleaned for candidate in candidates])
    is_inside = any(
        [full_word_strictly_inside_string(candidate, cleaned) for candidate in candidates]
    )

    # An exact match scores twice as much as a partial one.
    return max(float(is_exact), 0.5 * float(is_inside))
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5


def _is(header):
    '''
    Score how well the (processed) header matches the expected words combinations.

    Returns 1.0 when the processed header equals one of them, 0.5 when
    full_word_strictly_inside_string finds one of them in it, else 0.0.
    '''
    candidates = [
        'code commune insee',
        'code insee',
        'codes insee',
        'code commune',
        'code insee commune',
        'insee',
        'code com',
        'com'
    ]
    cleaned = _process_text(header)

    is_exact = any([candidate == cleaned for candidate in candidates])
    is_inside = any(
        [full_word_strictly_inside_string(candidate, cleaned) for candidate in candidates]
    )

    # An exact match scores twice as much as a partial one.
    return max(float(is_exact), 0.5 * float(is_inside))
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5


def _is(header):
    '''
    Score how well the (processed) header matches the expected words combinations.

    Returns 1.0 when the processed header equals one of them, 0.5 when
    full_word_strictly_inside_string finds one of them in it, else 0.0.
    '''
    # 'dep': Possible confusion with dep name?
    candidates = ['code departement', 'code_departement', 'dep', 'departement', 'dept']
    cleaned = _process_text(header)

    is_exact = any([candidate == cleaned for candidate in candidates])
    is_inside = any(
        [full_word_strictly_inside_string(candidate, cleaned) for candidate in candidates]
    )

    # An exact match scores twice as much as a partial one.
    return max(float(is_exact), 0.5 * float(is_inside))
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5


def _is(header):
    '''
    Score how well the (processed) header matches the expected words combinations.

    Returns 1.0 when the processed header equals one of them, 0.5 when
    full_word_strictly_inside_string finds one of them in it, else 0.0.
    '''
    candidates = ['cadastre1', 'code fantoir', 'fantoir']
    cleaned = _process_text(header)

    is_exact = any([candidate == cleaned for candidate in candidates])
    is_inside = any(
        [full_word_strictly_inside_string(candidate, cleaned) for candidate in candidates]
    )

    # An exact match scores twice as much as a partial one.
    return max(float(is_exact), 0.5 * float(is_inside))
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5


def _is(header):
    '''
    Score how well the (processed) header matches the expected words combinations.

    Returns 1.0 when the processed header equals one of them, 0.5 when
    full_word_strictly_inside_string finds one of them in it, else 0.0.
    '''
    candidates = [
        'code postal',
        'postal code',
        'postcode',
        'post code',
        'cp',
        'codes postaux',
        'location postcode'
    ]
    cleaned = _process_text(header)

    is_exact = any([candidate == cleaned for candidate in candidates])
    is_inside = any(
        [full_word_strictly_inside_string(candidate, cleaned) for candidate in candidates]
    )

    # An exact match scores twice as much as a partial one.
    return max(float(is_exact), 0.5 * float(is_inside))
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from csv_detective.utils import full_word_strictly_inside_string
|
|
2
|
-
from csv_detective.process_text import _process_text
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.5


def _is(header):
    '''
    Score how well the (processed) header matches the expected words combinations.

    Returns 1.0 when the processed header equals one of them, 0.5 when
    full_word_strictly_inside_string finds one of them in it, else 0.0.
    '''
    # 'reg' : possible confusion with region name?
    candidates = ['code region', 'reg', 'code insee region', 'region']
    cleaned = _process_text(header)

    is_exact = any([candidate == cleaned for candidate in candidates])
    is_inside = any(
        [full_word_strictly_inside_string(candidate, cleaned) for candidate in candidates]
    )

    # An exact match scores twice as much as a partial one.
    return max(float(is_exact), 0.5 * float(is_inside))
|