csv-detective 0.8.1.dev1674__py3-none-any.whl → 0.8.1.dev1720__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +0 -2
- csv_detective/cli.py +6 -9
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
- csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
- csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +4 -2
- csv_detective/detect_fields/other/int/__init__.py +3 -3
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
- csv_detective/detect_fields/other/twitter/__init__.py +2 -2
- csv_detective/detect_fields/other/uuid/__init__.py +4 -5
- csv_detective/detect_fields/temp/date/__init__.py +3 -2
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
- csv_detective/detect_fields/temp/year/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detection/columns.py +9 -9
- csv_detective/detection/encoding.py +6 -4
- csv_detective/detection/engine.py +6 -5
- csv_detective/detection/formats.py +19 -19
- csv_detective/detection/headers.py +3 -5
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/variables.py +4 -4
- csv_detective/explore_csv.py +7 -8
- csv_detective/load_tests.py +6 -14
- csv_detective/output/__init__.py +3 -7
- csv_detective/output/dataframe.py +9 -5
- csv_detective/output/example.py +13 -13
- csv_detective/output/profile.py +30 -23
- csv_detective/output/schema.py +20 -23
- csv_detective/output/utils.py +15 -15
- csv_detective/parsing/columns.py +23 -12
- csv_detective/parsing/csv.py +1 -1
- csv_detective/parsing/excel.py +10 -11
- csv_detective/parsing/load.py +11 -8
- csv_detective/parsing/text.py +4 -9
- csv_detective/s3_utils.py +3 -7
- csv_detective/utils.py +4 -2
- csv_detective/validate.py +18 -13
- csv_detective-0.8.1.dev1674.data/data/share/csv_detective/README.md → csv_detective-0.8.1.dev1720.dist-info/METADATA +32 -0
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/RECORD +81 -81
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/top_level.txt +2 -0
- tests/test_example.py +2 -6
- tests/test_fields.py +16 -10
- tests/test_file.py +10 -9
- tests/test_labels.py +3 -2
- tests/test_structure.py +3 -1
- tests/test_validation.py +9 -6
- venv/bin/activate_this.py +38 -0
- venv/bin/jp.py +54 -0
- venv/bin/runxlrd.py +410 -0
- csv_detective-0.8.1.dev1674.data/data/share/csv_detective/CHANGELOG.md +0 -186
- csv_detective-0.8.1.dev1674.dist-info/METADATA +0 -268
- csv_detective-0.8.1.dev1674.dist-info/licenses/LICENSE +0 -21
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1674.data/data/share/csv_detective → csv_detective-0.8.1.dev1720.dist-info/licenses}/LICENSE +0 -0
csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py
CHANGED
@@ -2,37 +2,37 @@ from unidecode import unidecode
 
 PROPORTION = 1
 
 mois = {
-    …
+    "janvier",
+    "fevrier",
+    "mars",
+    "avril",
+    "mai",
+    "juin",
+    "juillet",
+    "aout",
+    "septembre",
+    "octobre",
+    "novembre",
+    "decembre",
+    "jan",
+    "fev",
+    "mar",
+    "avr",
+    "mai",
+    "jun",
+    "jui",
+    "juil",
+    "aou",
+    "sep",
+    "sept",
+    "oct",
+    "nov",
+    "dec",
 }
 
 
 def _is(val):
-    …
+    """Renvoie True si les champs peuvent être des mois de l'année"""
     if not isinstance(val, str):
         return False
     val = unidecode(val.lower())
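The change looks like a formatter pass (quote style and wrapping) plus a new docstring, but it also surfaces the matcher's design: values are lower-cased and stripped of accents with unidecode before lookup, so "Août" and "aout" both hit. A minimal runnable sketch, assuming the part of `_is` below this hunk ends with a membership test against `mois` (that tail is not shown here):

    from unidecode import unidecode

    mois = {"janvier", "fevrier", "aout", "dec"}  # abridged from the diff

    def _is(val):
        """Renvoie True si les champs peuvent être des mois de l'année"""
        if not isinstance(val, str):
            return False
        val = unidecode(val.lower())
        return val in mois  # assumed final step, not visible in the hunk

    print(_is("Août"))  # True: lower-casing plus unidecode normalises to "aout"
    print(_is(8))       # False: non-strings are rejected up front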
csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py
CHANGED
@@ -1,15 +1,15 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 
 PROPORTION = 1
 
-with open(join(dirname(__file__), …
-    liste_pays = iofile.read().split(…
+with open(join(dirname(__file__), "iso_country_code_alpha2.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 liste_pays = set(liste_pays)
 
 
 def _is(val):
-    …
-    if not isinstance(val, str) or not bool(re.match(r…
+    """Renvoie True si val peut etre un code iso pays alpha-2, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)):
         return False
     return val in liste_pays
csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py
CHANGED
@@ -1,14 +1,14 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 
 PROPORTION = 1
 
-with open(join(dirname(__file__), …
-    liste_pays = iofile.read().split(…
+with open(join(dirname(__file__), "iso_country_code_alpha3.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 
 
 def _is(val):
-    …
-    if not isinstance(val, str) or not bool(re.match(r…
+    """Renvoie True si val peut etre un code iso pays alpha-3, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)):
         return False
     return val in set(liste_pays)
csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py
CHANGED
@@ -1,15 +1,15 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 
 PROPORTION = 1
 
-with open(join(dirname(__file__), …
-    liste_pays = iofile.read().split(…
+with open(join(dirname(__file__), "iso_country_code_numeric.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 liste_pays = set(liste_pays)
 
 
 def _is(val):
-    …
-    if not isinstance(val, str) or not bool(re.match(r…
+    """Renvoie True si val peut etre un code iso pays numerique, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[0-9]{3}$", val)):
         return False
     return val in liste_pays
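All three iso_country_code modules share one pattern: a cheap anchored regex as a pre-filter, then membership in a reference list loaded once at import from a bundled text file. A minimal sketch with a stand-in list (the real one is read from the packaged .txt file):

    import re

    liste_pays = {"FR", "DE", "IT"}  # stand-in for the bundled reference file

    def _is(val):
        """Renvoie True si val peut etre un code iso pays alpha-2, False sinon"""
        if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)):
            return False
        return val in liste_pays

    print(_is("FR"))  # True
    print(_is("fr"))  # False: the regex only accepts upper case
    print(_is("ZZ"))  # False: well-formed, but not in the reference list

The pattern is anchored on both sides (re.match pins the start, $ the end), so a longer string such as "FRA" cannot slip through the alpha-2 test.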
csv_detective/detect_fields/other/email/__init__.py
CHANGED
@@ -4,5 +4,7 @@ PROPORTION = 1
 
 
 def _is(val):
-    …
-    return isinstance(val, str) and bool(…
+    """Detects e-mails"""
+    return isinstance(val, str) and bool(
+        re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
+    )
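With the pattern now fully visible, it can be spot-checked directly; note that it requires a dot-separated TLD of two to six letters:

    import re

    EMAIL_RE = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$"

    print(bool(re.match(EMAIL_RE, "jane.doe+tag@data.gouv.fr")))  # True
    print(bool(re.match(EMAIL_RE, "jane.doe@localhost")))         # False: no dotted TLD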
csv_detective/detect_fields/other/int/__init__.py
CHANGED
@@ -2,11 +2,11 @@ PROPORTION = 1
 
 
 def _is(val):
-    …
+    """Detects integers"""
     if (
         not isinstance(val, str)
-        or any([v in val for v in […
-        or (val.startswith(…
+        or any([v in val for v in [".", "_", "+"]])
+        or (val.startswith("0") and len(val) > 1)
     ):
         return False
     try:
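The restored exclusion list explains itself: "." rules out floats, "_" rules out separators that int() would otherwise accept, "+" rules out signed notation, and a leading zero marks a code rather than a quantity. A runnable sketch, assuming the truncated try block simply attempts int(val):

    def _is(val):
        """Detects integers"""
        if (
            not isinstance(val, str)
            or any([v in val for v in [".", "_", "+"]])
            or (val.startswith("0") and len(val) > 1)
        ):
            return False
        try:
            int(val)  # assumed body of the try block, which this hunk cuts off
            return True
        except ValueError:
            return False

    print(_is("42"))     # True
    print(_is("1_000"))  # False: int("1_000") would parse, hence the explicit "_" ban
    print(_is("007"))    # False: leading zeros indicate a code, not a number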
csv_detective/detect_fields/other/uuid/__init__.py
CHANGED
@@ -4,8 +4,7 @@ PROPORTION = 0.8
 
 
 def _is(val):
-    …
-    return isinstance(val, str) and bool(
-        r…
-        …
-    ))
+    """Detects UUIDs"""
+    return isinstance(val, str) and bool(
+        re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val)
+    )
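The concatenated pattern accepts dashed, dashless, and brace-wrapped spellings, since every dash is optional and so are the braces:

    import re

    UUID_RE = r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$"

    print(bool(re.match(UUID_RE, "123e4567-e89b-12d3-a456-426614174000")))  # True
    print(bool(re.match(UUID_RE, "{123e4567e89b12d3a456426614174000}")))    # True
    print(bool(re.match(UUID_RE, "123e4567-e89b")))                         # False: too short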
csv_detective/detect_fields/temp/date/__init__.py
CHANGED
@@ -2,7 +2,8 @@ from datetime import datetime
 from typing import Optional
 
 from dateparser import parse as date_parser
-from dateutil.parser import …
+from dateutil.parser import ParserError
+from dateutil.parser import parse as dateutil_parser
 
 PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils
@@ -22,7 +23,7 @@ threshold = 0.3
 
 
 def _is(val):
-    …
+    """Renvoie True si val peut être une date, False sinon"""
     # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
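Splitting the truncated import into two explicit lines shows the module's two needs: the parse function itself and the ParserError it raises on garbage. A hedged sketch of that idiom (the actual call site lies outside this hunk):

    from dateutil.parser import ParserError
    from dateutil.parser import parse as dateutil_parser

    def try_parse(val):
        # Assumed usage: wrap dateutil parsing and treat ParserError as "not a date".
        try:
            return dateutil_parser(val)
        except ParserError:
            return None

    print(try_parse("2023-12-19"))  # 2023-12-19 00:00:00
    print(try_parse("not a date"))  # None

The length bounds in `_is` (8 to 20 characters) are a cheap guard that avoids running the parsers on strings that cannot plausibly be a date.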
csv_detective/detect_fields/temp/datetime_rfc822/__init__.py
CHANGED
@@ -4,15 +4,15 @@ PROPORTION = 1
 
 
 def _is(val):
-    …
-    Exemple: Tue, 19 Dec 2023 15:30:45 +0000…
+    """Renvoie True si val peut être une date au format rfc822, False sinon
+    Exemple: Tue, 19 Dec 2023 15:30:45 +0000"""
 
     return isinstance(val, str) and bool(
         re.match(
-            r…
-            r…
-            r…
+            r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
+            r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
+            r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
             val.lower(),
-            re.IGNORECASE
+            re.IGNORECASE,
         )
     )
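The three raw-string fragments concatenate into a single pattern (adjacent string literals in Python), and it checks out against the docstring's own example. Lower-casing the value and also passing re.IGNORECASE is belt and braces, but harmless:

    import re

    RFC822_RE = (
        r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
        r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
        r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$"
    )

    val = "Tue, 19 Dec 2023 15:30:45 +0000"
    print(bool(re.match(RFC822_RE, val.lower(), re.IGNORECASE)))  # True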
csv_detective/detection/columns.py
CHANGED
@@ -1,6 +1,6 @@
 import logging
-from typing import TextIO
 from time import time
+from typing import TextIO
 
 from csv_detective.utils import display_logs_depending_process_time
 
@@ -47,19 +47,21 @@ def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int
     if return_int == 0:
         if verbose:
             display_logs_depending_process_time(
-                f…
+                f"No heading column detected in {round(time() - start, 3)}s",
                 time() - start,
             )
         return 0
     if verbose:
         display_logs_depending_process_time(
-            f…
+            f"{return_int} heading columns detected in {round(time() - start, 3)}s",
             time() - start,
         )
     return return_int
 
 
-def detect_trailing_columns(…
+def detect_trailing_columns(
+    file: TextIO, sep: str, heading_columns: int, verbose: bool = False
+) -> int:
     """Tests first 10 lines to see if there are empty trailing columns"""
     if verbose:
         start = time()
@@ -70,20 +72,18 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
         line = file.readline()
         return_int = min(
             return_int,
-            len(line.replace("\n", ""))
-            - len(line.replace("\n", "").strip(sep))
-            - heading_columns,
+            len(line.replace("\n", "")) - len(line.replace("\n", "").strip(sep)) - heading_columns,
         )
     if return_int == 0:
         if verbose:
             display_logs_depending_process_time(
-                f…
+                f"No trailing column detected in {round(time() - start, 3)}s",
                 time() - start,
             )
         return 0
     if verbose:
         display_logs_depending_process_time(
-            f…
+            f"{return_int} trailing columns detected in {round(time() - start, 3)}s",
             time() - start,
         )
     return return_int
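With the wrapped signature in place, a call against an in-memory file might look like this; the expected value assumes the function inspects the first ten lines, per its docstring, so the sample provides more than ten:

    from io import StringIO

    from csv_detective.detection.columns import detect_trailing_columns

    # Every line carries two empty trailing columns after the ";" separator.
    sample = StringIO("\n".join(["a;b;;"] + ["1;2;;"] * 10) + "\n")
    print(detect_trailing_columns(sample, sep=";", heading_columns=0))  # expected: 2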
csv_detective/detection/encoding.py
CHANGED
@@ -1,6 +1,6 @@
 import logging
-from time import time
 from io import BytesIO
+from time import time
 
 from cchardet import detect
 
@@ -16,12 +16,14 @@ def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
     logging.info("Detecting encoding")
     encoding_dict = detect(binary_file.read())
     if not encoding_dict["encoding"]:
-        raise ValueError(…
+        raise ValueError(
+            "Could not detect the file's encoding. Consider specifying it in the routine call."
+        )
     if verbose:
         message = f'Detected encoding: "{encoding_dict["encoding"]}"'
-        message += f…
+        message += f" in {round(time() - start, 3)}s (confidence: {round(encoding_dict['confidence'] * 100)}%)"
         display_logs_depending_process_time(
             message,
             time() - start,
         )
-    return encoding_dict[…
+    return encoding_dict["encoding"]
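A usage sketch of detect_encoding after the change; the label cchardet reports depends on the bytes it sees, and the new explicit ValueError fires when it cannot commit to any encoding:

    from io import BytesIO

    from csv_detective.detection.encoding import detect_encoding

    # An accented character makes the stream unambiguously non-ascii.
    blob = BytesIO("col;été\n1;2\n".encode("utf-8"))
    print(detect_encoding(blob))  # likely "UTF-8" for this input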
csv_detective/detection/engine.py
CHANGED
@@ -22,11 +22,11 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
     mapping = {
         "application/gzip": "gzip",
         "application/x-gzip": "gzip",
-        …
-        …
-        …
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
+        "application/vnd.ms-excel": "xlrd",
+        "application/vnd.oasis.opendocument.spreadsheet": "odf",
         # all these files could be recognized as zip, may need to check all cases then
-        …
+        "application/zip": "openpyxl",
     }
     # if none of the above, we move forwards with the csv process
     if is_url(file_path):
@@ -37,7 +37,8 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
     if verbose:
         message = (
             f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
-            if engine…
+            if engine
+            else "Processing the file as a csv"
         )
         display_logs_depending_process_time(
             message,
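The restored mapping ties detected MIME types to the engine names pandas' spreadsheet readers expect; application/zip falls back to openpyxl because xlsx files are zip archives underneath, as the in-code comment warns:

    mapping = {
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
        "application/vnd.ms-excel": "xlrd",
        "application/vnd.oasis.opendocument.spreadsheet": "odf",
        "application/zip": "openpyxl",  # xlsx containers are zip archives
    }

    # e.g. pd.read_excel(path, engine=mapping.get(mime_type))
    print(mapping.get("application/vnd.ms-excel"))  # "xlrd"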
csv_detective/detection/formats.py
CHANGED
@@ -1,16 +1,17 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from typing import Union
 
 import numpy as np
 import pandas as pd
+
 from csv_detective.detection.variables import (
     detect_categorical_variable,
     # detect_continuous_variable,
 )
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.utils import prepare_output_dict
-from csv_detective.parsing.columns import test_col, test_label
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
 from csv_detective.validate import validate
 
 
@@ -42,10 +43,12 @@ def detect_formats(
     #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
     # )
 
-    analysis.update(…
-        …
-        …
-        …
+    analysis.update(
+        {
+            "categorical": res_categorical,
+            # "continuous": res_continuous,
+        }
+    )
 
     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -60,7 +63,9 @@ def detect_formats(
         return analysis
 
     # Perform testing on fields
-    scores_table_fields = test_col(…
+    scores_table_fields = test_col(
+        table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose
+    )
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
@@ -71,16 +76,14 @@ def detect_formats(
     # This is because the fields are more important than the labels and yields a max
     # of 1.5 for the final score.
     scores_table = scores_table_fields * (
-        1
-        + scores_table_labels.reindex(
-            index=scores_table_fields.index, fill_value=0
-        ).values / 2
+        1 + scores_table_labels.reindex(index=scores_table_fields.index, fill_value=0).values / 2
     )
 
     # To reduce false positives: ensure these formats are detected only if the label yields
     # a detection (skipping the ones that have been excluded by the users).
     formats_with_mandatory_label = [
-        f…
+        f
+        for f in [
             "code_departement",
             "code_commune_insee",
             "code_postal",
@@ -90,7 +93,8 @@ def detect_formats(
             "longitude_wgs_fr_metropole",
             "latitude_l93",
             "longitude_l93",
-        ]…
+        ]
+        if f in scores_table.index
     ]
     scores_table.loc[formats_with_mandatory_label, :] = np.where(
         scores_table_labels.loc[formats_with_mandatory_label, :],
@@ -123,9 +127,7 @@ def detect_formats(
         analysis[detection_method] = {
             col_name: [
                 {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
+                    "python_type": metier_to_python_type.get(detection["format"], "string"),
                     **detection,
                 }
                 for detection in detections
@@ -136,9 +138,7 @@ def detect_formats(
     for detection_method in ["columns_fields", "columns_labels", "columns"]:
         analysis[detection_method] = {
             col_name: {
-                "python_type": metier_to_python_type.get(
-                    detection["format"], "string"
-                ),
+                "python_type": metier_to_python_type.get(detection["format"], "string"),
                 **detection,
             }
             for col_name, detection in analysis[detection_method].items()
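The collapsed expression spells out the scoring rule: every field score is multiplied by (1 + label score / 2), so a column whose field and label tests both return 1 tops out at 1.5, exactly as the comment says. A worked example with illustrative numbers:

    import pandas as pd

    scores_table_fields = pd.DataFrame({"col": [1.0, 0.6]}, index=["siren", "code_postal"])
    scores_table_labels = pd.DataFrame({"col": [1.0, 0.0]}, index=["siren", "code_postal"])

    scores_table = scores_table_fields * (
        1 + scores_table_labels.reindex(index=scores_table_fields.index, fill_value=0).values / 2
    )
    print(scores_table)
    # siren        1.0 * (1 + 1.0 / 2) = 1.5   field and label agree
    # code_postal  0.6 * (1 + 0.0 / 2) = 0.6   label adds nothing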
csv_detective/detection/headers.py
CHANGED
@@ -15,18 +15,16 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
     header = file.readline()
     position = file.tell()
     chaine = [c for c in header.replace("\n", "").split(sep) if c]
-    if chaine[-1] not in ["", "\n"] and all(
-        [mot not in ["", "\n"] for mot in chaine[1:-1]]
-    ):
+    if chaine[-1] not in ["", "\n"] and all([mot not in ["", "\n"] for mot in chaine[1:-1]]):
         next_row = file.readline()
         file.seek(position)
         if header != next_row:
             if verbose:
                 display_logs_depending_process_time(
-                    f…
+                    f"Detected headers in {round(time() - start, 3)}s",
                     time() - start,
                 )
             return i, chaine
     if verbose:
-        logging.info(…
+        logging.info("No header detected")
     return 0, None
|
csv_detective/detection/rows.py
CHANGED
|
@@ -5,7 +5,7 @@ def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
|
|
|
5
5
|
"""Analog process to detect_headers for csv files, determines how many rows to skip
|
|
6
6
|
to end up with the header at the right place"""
|
|
7
7
|
idx = 0
|
|
8
|
-
if all([str(c).startswith(
|
|
8
|
+
if all([str(c).startswith("Unnamed:") for c in table.columns]):
|
|
9
9
|
# there is on offset between the index in the file (idx here)
|
|
10
10
|
# and the index in the dataframe, because of the header
|
|
11
11
|
idx = 1
|
|
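The restored condition keys on pandas naming columns "Unnamed: N" when the header row it read was empty, which is exactly what happens when the real headers sit further down the file:

    from io import StringIO

    import pandas as pd

    # First row is empty, so pandas invents the column names.
    table = pd.read_csv(StringIO(",,\na,b,c\n1,2,3\n"))
    print(list(table.columns))  # ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2']
    print(all([str(c).startswith("Unnamed:") for c in table.columns]))  # True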
csv_detective/detection/variables.py
CHANGED
@@ -7,7 +7,9 @@ import pandas as pd
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def detect_continuous_variable(…
+def detect_continuous_variable(
+    table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False
+):
     """
     Detects whether a column contains continuous variables. We consider a continuous column
     one that contains a considerable amount of float values.
@@ -41,9 +43,7 @@ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9,
     if verbose:
         start = time()
         logging.info("Detecting continuous columns")
-    res = table.apply(
-        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
-    )
+    res = table.apply(lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th))
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
csv_detective/explore_csv.py
CHANGED
@@ -55,7 +55,10 @@ def routine(
         dict: a dict with information about the csv and possible types for each column
     """
 
-    if not (…
+    if not (
+        isinstance(save_results, bool)
+        or (isinstance(save_results, str) and save_results.endswith(".json"))
+    ):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")
 
     if verbose:
@@ -100,8 +103,7 @@ def routine(
     finally:
         if verbose:
             display_logs_depending_process_time(
-                f"Routine completed in {round(time() - start_routine, 3)}s",
-                time() - start_routine
+                f"Routine completed in {round(time() - start_routine, 3)}s", time() - start_routine
             )
 
 
@@ -119,7 +121,6 @@ def validate_then_detect(
     cast_json: bool = True,
     verbose: bool = False,
 ):
-
     if verbose:
         start_routine = time()
     if is_url(file_path):
@@ -170,8 +171,7 @@ def validate_then_detect(
     finally:
         if verbose:
             display_logs_depending_process_time(
-                f"Process completed in {round(time() - start_routine, 3)}s",
-                time() - start_routine
+                f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
             )
 
 
@@ -226,8 +226,7 @@ def routine_minio(
     if location_dict is not None:
         if any(
             [
-                (location_key not in location_dict)
-                or (location_dict[location_key] is None)
+                (location_key not in location_dict) or (location_dict[location_key] is None)
                 for location_key in ["netloc", "bucket", "key"]
             ]
         ):
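The expanded guard accepts exactly two shapes for save_results, a bool or a string ending in .json; the same check, extracted as a stand-alone function for illustration:

    def check_save_results(save_results):
        # Mirrors the guard at the top of routine().
        if not (
            isinstance(save_results, bool)
            or (isinstance(save_results, str) and save_results.endswith(".json"))
        ):
            raise ValueError("`save_results` must be a bool or a valid path to a json file.")

    check_save_results(True)             # accepted
    check_save_results("analysis.json")  # accepted
    try:
        check_save_results("analysis.yaml")
    except ValueError as err:
        print(err)  # rejected: neither a bool nor a .json path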
csv_detective/load_tests.py
CHANGED
@@ -12,10 +12,7 @@ def get_all_packages(detect_type) -> list:
         for filename in filenames:
             file = os.path.join(dirpath, filename).replace(root_dir, "")
             if file.endswith("__init__.py"):
-                module = (
-                    file.replace("__init__.py", "")
-                    .replace("/", ".").replace("\\", ".")[:-1]
-                )
+                module = file.replace("__init__.py", "").replace("/", ".").replace("\\", ".")[:-1]
                 if module:
                     modules.append(detect_type + module)
     return modules
@@ -43,20 +40,15 @@ def return_all_tests(
     if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
         tests_to_do = [detect_type]
     else:
-        tests_to_do = [
-            …
-        ]
-        tests_skipped = [
-            f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-        ]
+        tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
+        tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
     all_tests = [
         # this is why we need to import detect_fields/labels
-        eval(x)…
+        eval(x)
+        for x in all_packages
         if any([y == x[: len(y)] for y in tests_to_do])
         and all([y != x[: len(y)] for y in tests_skipped])
     ]
     # to remove groups of tests
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]
+    all_tests = [test for test in all_tests if "_is" in dir(test)]
     return all_tests
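The rewritten comprehensions make the filtering rule easy to read: arguments without a leading "-" become include-prefixes, arguments with one become exclude-prefixes, and a test module survives when it starts with an included prefix and with no excluded one. With illustrative inputs:

    detect_type = "detect_fields"
    user_input_tests = ["FR.geo", "-FR.geo.adresse"]

    tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
    tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]

    print(tests_to_do)    # ['detect_fields.FR.geo']
    print(tests_skipped)  # ['detect_fields.FR.geo.adresse']

    x = "detect_fields.FR.geo.commune"
    keep = any([y == x[: len(y)] for y in tests_to_do]) and all(
        [y != x[: len(y)] for y in tests_skipped]
    )
    print(keep)  # True: included by FR.geo, not excluded by FR.geo.adresse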