csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/cli.py +6 -9
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
- csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
- csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
- csv_detective/detect_fields/__init__.py +94 -43
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +4 -2
- csv_detective/detect_fields/other/int/__init__.py +3 -3
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
- csv_detective/detect_fields/other/twitter/__init__.py +2 -2
- csv_detective/detect_fields/other/uuid/__init__.py +4 -5
- csv_detective/detect_fields/temp/date/__init__.py +3 -2
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
- csv_detective/detect_fields/temp/year/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
- csv_detective/detect_labels/__init__.py +51 -1
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detection/columns.py +9 -9
- csv_detective/detection/encoding.py +6 -4
- csv_detective/detection/engine.py +6 -5
- csv_detective/detection/formats.py +19 -19
- csv_detective/detection/headers.py +3 -5
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/variables.py +6 -7
- csv_detective/explore_csv.py +7 -8
- csv_detective/load_tests.py +7 -16
- csv_detective/output/__init__.py +3 -7
- csv_detective/output/dataframe.py +9 -5
- csv_detective/output/example.py +13 -13
- csv_detective/output/profile.py +30 -23
- csv_detective/output/schema.py +20 -23
- csv_detective/output/utils.py +15 -15
- csv_detective/parsing/columns.py +23 -12
- csv_detective/parsing/csv.py +1 -1
- csv_detective/parsing/excel.py +10 -11
- csv_detective/parsing/load.py +11 -8
- csv_detective/parsing/text.py +4 -9
- csv_detective/s3_utils.py +3 -7
- csv_detective/utils.py +4 -2
- csv_detective/validate.py +18 -13
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/METADATA +12 -2
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/RECORD +79 -79
- tests/test_example.py +2 -6
- tests/test_fields.py +16 -10
- tests/test_file.py +10 -9
- tests/test_labels.py +3 -2
- tests/test_structure.py +4 -3
- tests/test_validation.py +9 -6
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/top_level.txt +0 -0
|
@@ -2,37 +2,37 @@ from unidecode import unidecode
|
|
|
2
2
|
|
|
3
3
|
PROPORTION = 1
|
|
4
4
|
mois = {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
5
|
+
"janvier",
|
|
6
|
+
"fevrier",
|
|
7
|
+
"mars",
|
|
8
|
+
"avril",
|
|
9
|
+
"mai",
|
|
10
|
+
"juin",
|
|
11
|
+
"juillet",
|
|
12
|
+
"aout",
|
|
13
|
+
"septembre",
|
|
14
|
+
"octobre",
|
|
15
|
+
"novembre",
|
|
16
|
+
"decembre",
|
|
17
|
+
"jan",
|
|
18
|
+
"fev",
|
|
19
|
+
"mar",
|
|
20
|
+
"avr",
|
|
21
|
+
"mai",
|
|
22
|
+
"jun",
|
|
23
|
+
"jui",
|
|
24
|
+
"juil",
|
|
25
|
+
"aou",
|
|
26
|
+
"sep",
|
|
27
|
+
"sept",
|
|
28
|
+
"oct",
|
|
29
|
+
"nov",
|
|
30
|
+
"dec",
|
|
31
31
|
}
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
def _is(val):
|
|
35
|
-
|
|
35
|
+
"""Renvoie True si les champs peuvent être des mois de l'année"""
|
|
36
36
|
if not isinstance(val, str):
|
|
37
37
|
return False
|
|
38
38
|
val = unidecode(val.lower())
|
|
@@ -1,61 +1,112 @@
|
|
|
1
|
-
# flake8: noqa
|
|
2
|
-
from .FR.other import (
|
|
3
|
-
code_csp_insee,
|
|
4
|
-
csp_insee,
|
|
5
|
-
sexe,
|
|
6
|
-
siren,
|
|
7
|
-
tel_fr,
|
|
8
|
-
uai,
|
|
9
|
-
siret,
|
|
10
|
-
insee_ape700,
|
|
11
|
-
date_fr,
|
|
12
|
-
code_import,
|
|
13
|
-
code_waldec,
|
|
14
|
-
code_rna,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
from .other import (
|
|
18
|
-
email,
|
|
19
|
-
url,
|
|
20
|
-
booleen,
|
|
21
|
-
money,
|
|
22
|
-
mongo_object_id,
|
|
23
|
-
percent,
|
|
24
|
-
twitter,
|
|
25
|
-
float,
|
|
26
|
-
int,
|
|
27
|
-
uuid,
|
|
28
|
-
json,
|
|
29
|
-
)
|
|
30
|
-
|
|
31
1
|
from .FR.geo import (
|
|
32
2
|
adresse,
|
|
33
3
|
code_commune_insee,
|
|
34
|
-
code_postal,
|
|
35
|
-
commune,
|
|
36
|
-
departement,
|
|
37
|
-
pays,
|
|
38
|
-
region,
|
|
39
4
|
code_departement,
|
|
40
5
|
code_fantoir,
|
|
41
|
-
|
|
42
|
-
latitude_wgs_fr_metropole,
|
|
6
|
+
code_postal,
|
|
43
7
|
code_region,
|
|
8
|
+
commune,
|
|
9
|
+
departement,
|
|
10
|
+
insee_canton,
|
|
44
11
|
latitude_l93,
|
|
12
|
+
latitude_wgs_fr_metropole,
|
|
45
13
|
longitude_l93,
|
|
46
|
-
|
|
14
|
+
longitude_wgs_fr_metropole,
|
|
15
|
+
pays,
|
|
16
|
+
region,
|
|
47
17
|
)
|
|
48
|
-
|
|
18
|
+
from .FR.other import (
|
|
19
|
+
code_csp_insee,
|
|
20
|
+
code_import,
|
|
21
|
+
code_rna,
|
|
22
|
+
code_waldec,
|
|
23
|
+
csp_insee,
|
|
24
|
+
date_fr,
|
|
25
|
+
insee_ape700,
|
|
26
|
+
sexe,
|
|
27
|
+
siren,
|
|
28
|
+
siret,
|
|
29
|
+
tel_fr,
|
|
30
|
+
uai,
|
|
31
|
+
)
|
|
32
|
+
from .FR.temp import jour_de_la_semaine, mois_de_annee
|
|
49
33
|
from .geo import (
|
|
50
34
|
iso_country_code_alpha2,
|
|
51
35
|
iso_country_code_alpha3,
|
|
52
36
|
iso_country_code_numeric,
|
|
37
|
+
json_geojson,
|
|
53
38
|
latitude_wgs,
|
|
54
|
-
longitude_wgs,
|
|
55
39
|
latlon_wgs,
|
|
40
|
+
longitude_wgs,
|
|
56
41
|
lonlat_wgs,
|
|
57
|
-
json_geojson,
|
|
58
42
|
)
|
|
43
|
+
from .other import (
|
|
44
|
+
booleen,
|
|
45
|
+
email,
|
|
46
|
+
float,
|
|
47
|
+
int,
|
|
48
|
+
json,
|
|
49
|
+
money,
|
|
50
|
+
mongo_object_id,
|
|
51
|
+
percent,
|
|
52
|
+
twitter,
|
|
53
|
+
url,
|
|
54
|
+
uuid,
|
|
55
|
+
)
|
|
56
|
+
from .temp import date, datetime_aware, datetime_naive, datetime_rfc822, year
|
|
59
57
|
|
|
60
|
-
|
|
61
|
-
|
|
58
|
+
__all__ = [
|
|
59
|
+
"adresse",
|
|
60
|
+
"code_commune_insee",
|
|
61
|
+
"code_departement",
|
|
62
|
+
"code_fantoir",
|
|
63
|
+
"code_postal",
|
|
64
|
+
"code_region",
|
|
65
|
+
"commune",
|
|
66
|
+
"departement",
|
|
67
|
+
"insee_canton",
|
|
68
|
+
"latitude_l93",
|
|
69
|
+
"latitude_wgs_fr_metropole",
|
|
70
|
+
"longitude_l93",
|
|
71
|
+
"longitude_wgs_fr_metropole",
|
|
72
|
+
"pays",
|
|
73
|
+
"region",
|
|
74
|
+
"code_csp_insee",
|
|
75
|
+
"code_import",
|
|
76
|
+
"code_rna",
|
|
77
|
+
"code_waldec",
|
|
78
|
+
"csp_insee",
|
|
79
|
+
"date_fr",
|
|
80
|
+
"insee_ape700",
|
|
81
|
+
"sexe",
|
|
82
|
+
"siren",
|
|
83
|
+
"siret",
|
|
84
|
+
"tel_fr",
|
|
85
|
+
"uai",
|
|
86
|
+
"jour_de_la_semaine",
|
|
87
|
+
"mois_de_annee",
|
|
88
|
+
"iso_country_code_alpha2",
|
|
89
|
+
"iso_country_code_alpha3",
|
|
90
|
+
"iso_country_code_numeric",
|
|
91
|
+
"json_geojson",
|
|
92
|
+
"latitude_wgs",
|
|
93
|
+
"latlon_wgs",
|
|
94
|
+
"longitude_wgs",
|
|
95
|
+
"lonlat_wgs",
|
|
96
|
+
"booleen",
|
|
97
|
+
"email",
|
|
98
|
+
"float",
|
|
99
|
+
"int",
|
|
100
|
+
"json",
|
|
101
|
+
"money",
|
|
102
|
+
"mongo_object_id",
|
|
103
|
+
"percent",
|
|
104
|
+
"twitter",
|
|
105
|
+
"url",
|
|
106
|
+
"uuid",
|
|
107
|
+
"date",
|
|
108
|
+
"datetime_aware",
|
|
109
|
+
"datetime_naive",
|
|
110
|
+
"datetime_rfc822",
|
|
111
|
+
"year",
|
|
112
|
+
]
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
from os.path import dirname, join
|
|
2
1
|
import re
|
|
2
|
+
from os.path import dirname, join
|
|
3
3
|
|
|
4
4
|
PROPORTION = 1
|
|
5
5
|
|
|
6
|
-
with open(join(dirname(__file__),
|
|
7
|
-
liste_pays = iofile.read().split(
|
|
6
|
+
with open(join(dirname(__file__), "iso_country_code_alpha2.txt"), "r") as iofile:
|
|
7
|
+
liste_pays = iofile.read().split("\n")
|
|
8
8
|
liste_pays = set(liste_pays)
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def _is(val):
|
|
12
|
-
|
|
13
|
-
if not isinstance(val, str) or not bool(re.match(r
|
|
12
|
+
"""Renvoie True si val peut etre un code iso pays alpha-2, False sinon"""
|
|
13
|
+
if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)):
|
|
14
14
|
return False
|
|
15
15
|
return val in liste_pays
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
from os.path import dirname, join
|
|
2
1
|
import re
|
|
2
|
+
from os.path import dirname, join
|
|
3
3
|
|
|
4
4
|
PROPORTION = 1
|
|
5
5
|
|
|
6
|
-
with open(join(dirname(__file__),
|
|
7
|
-
liste_pays = iofile.read().split(
|
|
6
|
+
with open(join(dirname(__file__), "iso_country_code_alpha3.txt"), "r") as iofile:
|
|
7
|
+
liste_pays = iofile.read().split("\n")
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def _is(val):
|
|
11
|
-
|
|
12
|
-
if not isinstance(val, str) or not bool(re.match(r
|
|
11
|
+
"""Renvoie True si val peut etre un code iso pays alpha-3, False sinon"""
|
|
12
|
+
if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)):
|
|
13
13
|
return False
|
|
14
14
|
return val in set(liste_pays)
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
from os.path import dirname, join
|
|
2
1
|
import re
|
|
2
|
+
from os.path import dirname, join
|
|
3
3
|
|
|
4
4
|
PROPORTION = 1
|
|
5
5
|
|
|
6
|
-
with open(join(dirname(__file__),
|
|
7
|
-
liste_pays = iofile.read().split(
|
|
6
|
+
with open(join(dirname(__file__), "iso_country_code_numeric.txt"), "r") as iofile:
|
|
7
|
+
liste_pays = iofile.read().split("\n")
|
|
8
8
|
liste_pays = set(liste_pays)
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def _is(val):
|
|
12
|
-
|
|
13
|
-
if not isinstance(val, str) or not bool(re.match(r
|
|
12
|
+
"""Renvoie True si val peut etre un code iso pays numerique, False sinon"""
|
|
13
|
+
if not isinstance(val, str) or not bool(re.match(r"[0-9]{3}$", val)):
|
|
14
14
|
return False
|
|
15
15
|
return val in liste_pays
|
|
@@ -4,5 +4,7 @@ PROPORTION = 1
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
|
-
|
|
8
|
-
return isinstance(val, str) and bool(
|
|
7
|
+
"""Detects e-mails"""
|
|
8
|
+
return isinstance(val, str) and bool(
|
|
9
|
+
re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
|
|
10
|
+
)
|
|
@@ -2,11 +2,11 @@ PROPORTION = 1
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
def _is(val):
|
|
5
|
-
|
|
5
|
+
"""Detects integers"""
|
|
6
6
|
if (
|
|
7
7
|
not isinstance(val, str)
|
|
8
|
-
or any([v in val for v in [
|
|
9
|
-
or (val.startswith(
|
|
8
|
+
or any([v in val for v in [".", "_", "+"]])
|
|
9
|
+
or (val.startswith("0") and len(val) > 1)
|
|
10
10
|
):
|
|
11
11
|
return False
|
|
12
12
|
try:
|
|
@@ -4,8 +4,7 @@ PROPORTION = 0.8
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
|
-
|
|
8
|
-
return isinstance(val, str) and bool(
|
|
9
|
-
r
|
|
10
|
-
|
|
11
|
-
))
|
|
7
|
+
"""Detects UUIDs"""
|
|
8
|
+
return isinstance(val, str) and bool(
|
|
9
|
+
re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val)
|
|
10
|
+
)
|
|
@@ -2,7 +2,8 @@ from datetime import datetime
|
|
|
2
2
|
from typing import Optional
|
|
3
3
|
|
|
4
4
|
from dateparser import parse as date_parser
|
|
5
|
-
from dateutil.parser import
|
|
5
|
+
from dateutil.parser import ParserError
|
|
6
|
+
from dateutil.parser import parse as dateutil_parser
|
|
6
7
|
|
|
7
8
|
PROPORTION = 1
|
|
8
9
|
# /!\ this is only for dates, not datetimes which are handled by other utils
|
|
@@ -22,7 +23,7 @@ threshold = 0.3
|
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
def _is(val):
|
|
25
|
-
|
|
26
|
+
"""Renvoie True si val peut être une date, False sinon"""
|
|
26
27
|
# early stops, to cut processing time
|
|
27
28
|
if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
|
|
28
29
|
return False
|
|
@@ -4,15 +4,15 @@ PROPORTION = 1
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
|
-
|
|
8
|
-
Exemple: Tue, 19 Dec 2023 15:30:45 +0000
|
|
7
|
+
"""Renvoie True si val peut être une date au format rfc822, False sinon
|
|
8
|
+
Exemple: Tue, 19 Dec 2023 15:30:45 +0000"""
|
|
9
9
|
|
|
10
10
|
return isinstance(val, str) and bool(
|
|
11
11
|
re.match(
|
|
12
|
-
r
|
|
13
|
-
r
|
|
14
|
-
r
|
|
12
|
+
r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
|
|
13
|
+
r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
|
|
14
|
+
r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
|
|
15
15
|
val.lower(),
|
|
16
|
-
re.IGNORECASE
|
|
16
|
+
re.IGNORECASE,
|
|
17
17
|
)
|
|
18
18
|
)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# flake8: noqa
|
|
2
1
|
from .FR.geo import (
|
|
3
2
|
adresse,
|
|
4
3
|
code_commune_insee,
|
|
@@ -42,3 +41,54 @@ from .geo import (
|
|
|
42
41
|
)
|
|
43
42
|
from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid
|
|
44
43
|
from .temp import date, datetime_rfc822, year
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"adresse",
|
|
47
|
+
"code_commune_insee",
|
|
48
|
+
"code_departement",
|
|
49
|
+
"code_fantoir",
|
|
50
|
+
"code_postal",
|
|
51
|
+
"code_region",
|
|
52
|
+
"commune",
|
|
53
|
+
"departement",
|
|
54
|
+
"insee_canton",
|
|
55
|
+
"latitude_l93",
|
|
56
|
+
"latitude_wgs_fr_metropole",
|
|
57
|
+
"longitude_l93",
|
|
58
|
+
"longitude_wgs_fr_metropole",
|
|
59
|
+
"pays",
|
|
60
|
+
"region",
|
|
61
|
+
"code_csp_insee",
|
|
62
|
+
"code_rna",
|
|
63
|
+
"code_waldec",
|
|
64
|
+
"csp_insee",
|
|
65
|
+
"date_fr",
|
|
66
|
+
"insee_ape700",
|
|
67
|
+
"sexe",
|
|
68
|
+
"siren",
|
|
69
|
+
"siret",
|
|
70
|
+
"tel_fr",
|
|
71
|
+
"uai",
|
|
72
|
+
"iso_country_code_alpha2",
|
|
73
|
+
"iso_country_code_alpha3",
|
|
74
|
+
"iso_country_code_numeric",
|
|
75
|
+
"json_geojson",
|
|
76
|
+
"latitude_wgs",
|
|
77
|
+
"latlon_wgs",
|
|
78
|
+
"longitude_wgs",
|
|
79
|
+
"lonlat_wgs",
|
|
80
|
+
"jour_de_la_semaine",
|
|
81
|
+
"mois_de_annee",
|
|
82
|
+
"booleen",
|
|
83
|
+
"email",
|
|
84
|
+
"float",
|
|
85
|
+
"int",
|
|
86
|
+
"money",
|
|
87
|
+
"mongo_object_id",
|
|
88
|
+
"twitter",
|
|
89
|
+
"url",
|
|
90
|
+
"uuid",
|
|
91
|
+
"date",
|
|
92
|
+
"datetime_rfc822",
|
|
93
|
+
"year",
|
|
94
|
+
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import TextIO
|
|
3
2
|
from time import time
|
|
3
|
+
from typing import TextIO
|
|
4
4
|
|
|
5
5
|
from csv_detective.utils import display_logs_depending_process_time
|
|
6
6
|
|
|
@@ -47,19 +47,21 @@ def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int
|
|
|
47
47
|
if return_int == 0:
|
|
48
48
|
if verbose:
|
|
49
49
|
display_logs_depending_process_time(
|
|
50
|
-
f
|
|
50
|
+
f"No heading column detected in {round(time() - start, 3)}s",
|
|
51
51
|
time() - start,
|
|
52
52
|
)
|
|
53
53
|
return 0
|
|
54
54
|
if verbose:
|
|
55
55
|
display_logs_depending_process_time(
|
|
56
|
-
f
|
|
56
|
+
f"{return_int} heading columns detected in {round(time() - start, 3)}s",
|
|
57
57
|
time() - start,
|
|
58
58
|
)
|
|
59
59
|
return return_int
|
|
60
60
|
|
|
61
61
|
|
|
62
|
-
def detect_trailing_columns(
|
|
62
|
+
def detect_trailing_columns(
|
|
63
|
+
file: TextIO, sep: str, heading_columns: int, verbose: bool = False
|
|
64
|
+
) -> int:
|
|
63
65
|
"""Tests first 10 lines to see if there are empty trailing columns"""
|
|
64
66
|
if verbose:
|
|
65
67
|
start = time()
|
|
@@ -70,20 +72,18 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
|
|
|
70
72
|
line = file.readline()
|
|
71
73
|
return_int = min(
|
|
72
74
|
return_int,
|
|
73
|
-
len(line.replace("\n", ""))
|
|
74
|
-
- len(line.replace("\n", "").strip(sep))
|
|
75
|
-
- heading_columns,
|
|
75
|
+
len(line.replace("\n", "")) - len(line.replace("\n", "").strip(sep)) - heading_columns,
|
|
76
76
|
)
|
|
77
77
|
if return_int == 0:
|
|
78
78
|
if verbose:
|
|
79
79
|
display_logs_depending_process_time(
|
|
80
|
-
f
|
|
80
|
+
f"No trailing column detected in {round(time() - start, 3)}s",
|
|
81
81
|
time() - start,
|
|
82
82
|
)
|
|
83
83
|
return 0
|
|
84
84
|
if verbose:
|
|
85
85
|
display_logs_depending_process_time(
|
|
86
|
-
f
|
|
86
|
+
f"{return_int} trailing columns detected in {round(time() - start, 3)}s",
|
|
87
87
|
time() - start,
|
|
88
88
|
)
|
|
89
89
|
return return_int
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from time import time
|
|
3
2
|
from io import BytesIO
|
|
3
|
+
from time import time
|
|
4
4
|
|
|
5
5
|
from cchardet import detect
|
|
6
6
|
|
|
@@ -16,12 +16,14 @@ def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
|
|
|
16
16
|
logging.info("Detecting encoding")
|
|
17
17
|
encoding_dict = detect(binary_file.read())
|
|
18
18
|
if not encoding_dict["encoding"]:
|
|
19
|
-
raise ValueError(
|
|
19
|
+
raise ValueError(
|
|
20
|
+
"Could not detect the file's encoding. Consider specifying it in the routine call."
|
|
21
|
+
)
|
|
20
22
|
if verbose:
|
|
21
23
|
message = f'Detected encoding: "{encoding_dict["encoding"]}"'
|
|
22
|
-
message += f
|
|
24
|
+
message += f" in {round(time() - start, 3)}s (confidence: {round(encoding_dict['confidence'] * 100)}%)"
|
|
23
25
|
display_logs_depending_process_time(
|
|
24
26
|
message,
|
|
25
27
|
time() - start,
|
|
26
28
|
)
|
|
27
|
-
return encoding_dict[
|
|
29
|
+
return encoding_dict["encoding"]
|
|
@@ -22,11 +22,11 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
|
|
|
22
22
|
mapping = {
|
|
23
23
|
"application/gzip": "gzip",
|
|
24
24
|
"application/x-gzip": "gzip",
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
25
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
|
|
26
|
+
"application/vnd.ms-excel": "xlrd",
|
|
27
|
+
"application/vnd.oasis.opendocument.spreadsheet": "odf",
|
|
28
28
|
# all these files could be recognized as zip, may need to check all cases then
|
|
29
|
-
|
|
29
|
+
"application/zip": "openpyxl",
|
|
30
30
|
}
|
|
31
31
|
# if none of the above, we move forwards with the csv process
|
|
32
32
|
if is_url(file_path):
|
|
@@ -37,7 +37,8 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
|
|
|
37
37
|
if verbose:
|
|
38
38
|
message = (
|
|
39
39
|
f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
|
|
40
|
-
if engine
|
|
40
|
+
if engine
|
|
41
|
+
else "Processing the file as a csv"
|
|
41
42
|
)
|
|
42
43
|
display_logs_depending_process_time(
|
|
43
44
|
message,
|