csv-detective 0.8.1.dev1460__py3-none-any.whl → 0.8.1.dev1482__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +27 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +170 -0
- csv_detective/detection/headers.py +32 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +98 -0
- csv_detective/parsing/__init__.py +0 -0
- csv_detective/parsing/columns.py +141 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +55 -0
- csv_detective/parsing/excel.py +169 -0
- csv_detective/parsing/load.py +97 -0
- csv_detective/parsing/text.py +61 -0
- {csv_detective-0.8.1.dev1460.data → csv_detective-0.8.1.dev1482.data}/data/share/csv_detective/CHANGELOG.md +1 -0
- {csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/RECORD +25 -9
- {csv_detective-0.8.1.dev1460.data → csv_detective-0.8.1.dev1482.data}/data/share/csv_detective/LICENSE +0 -0
- {csv_detective-0.8.1.dev1460.data → csv_detective-0.8.1.dev1482.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/top_level.txt +0 -0
csv_detective/detection/__init__.py
File without changes

csv_detective/detection/columns.py
@@ -0,0 +1,89 @@
+import logging
+from typing import TextIO
+from time import time
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_extra_columns(file: TextIO, sep: str):
+    """regarde s'il y a des colonnes en trop
+    Attention, file ne doit pas avoir de ligne vide"""
+    file.seek(0)
+    retour = False
+    nb_useless_col = 99999
+
+    for i in range(10):
+        line = file.readline()
+        # regarde si on a un retour
+        if retour:
+            assert line[-1] == "\n"
+        if line[-1] == "\n":
+            retour = True
+
+        # regarde le nombre de derniere colonne inutile
+        deb = 0 + retour
+        line = line[::-1][deb:]
+        k = 0
+        for sign in line:
+            if sign != sep:
+                break
+            k += 1
+        if k == 0:
+            return 0, retour
+        nb_useless_col = min(k, nb_useless_col)
+    return nb_useless_col, retour
+
+
+def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int:
+    """Tests first 10 lines to see if there are empty heading columns"""
+    if verbose:
+        start = time()
+        logging.info("Detecting heading columns")
+    file.seek(0)
+    return_int = float("Inf")
+    for i in range(10):
+        line = file.readline()
+        return_int = min(return_int, len(line) - len(line.strip(sep)))
+        if return_int == 0:
+            if verbose:
+                display_logs_depending_process_time(
+                    f'No heading column detected in {round(time() - start, 3)}s',
+                    time() - start,
+                )
+            return 0
+    if verbose:
+        display_logs_depending_process_time(
+            f'{return_int} heading columns detected in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return return_int
+
+
+def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
+    """Tests first 10 lines to see if there are empty trailing columns"""
+    if verbose:
+        start = time()
+        logging.info("Detecting trailing columns")
+    file.seek(0)
+    return_int = float("Inf")
+    for i in range(10):
+        line = file.readline()
+        return_int = min(
+            return_int,
+            len(line.replace("\n", ""))
+            - len(line.replace("\n", "").strip(sep))
+            - heading_columns,
+        )
+        if return_int == 0:
+            if verbose:
+                display_logs_depending_process_time(
+                    f'No trailing column detected in {round(time() - start, 3)}s',
+                    time() - start,
+                )
+            return 0
+    if verbose:
+        display_logs_depending_process_time(
+            f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return return_int
csv_detective/detection/encoding.py
@@ -0,0 +1,27 @@
+import logging
+from time import time
+from io import BytesIO
+
+from cchardet import detect
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
+    """
+    Detects file encoding using faust-cchardet (forked from the original cchardet)
+    """
+    if verbose:
+        start = time()
+        logging.info("Detecting encoding")
+    encoding_dict = detect(binary_file.read())
+    if not encoding_dict["encoding"]:
+        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
+    if verbose:
+        message = f'Detected encoding: "{encoding_dict["encoding"]}"'
+        message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
+        display_logs_depending_process_time(
+            message,
+            time() - start,
+        )
+    return encoding_dict['encoding']
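For context, a minimal sketch of the cchardet-style detection that detect_encoding relies on; the sample bytes and the printed result are illustrative, not taken from the package:

from io import BytesIO
from cchardet import detect  # provided by faust-cchardet

# Hypothetical Latin-1 sample; real inputs are whole file contents.
sample = BytesIO("col1;col2\ncafé;1\n".encode("ISO-8859-1"))
result = detect(sample.read())
# detect() returns a dict such as {"encoding": ..., "confidence": ...},
# which is exactly what the function above reads from.
print(result["encoding"], result["confidence"])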
csv_detective/detection/engine.py
@@ -0,0 +1,46 @@
+from time import time
+from typing import Optional
+
+import magic
+import requests
+
+from csv_detective.utils import display_logs_depending_process_time, is_url
+
+COMPRESSION_ENGINES = ["gzip"]
+EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
+engine_to_file = {
+    "openpyxl": "Excel",
+    "xlrd": "old Excel",
+    "odf": "OpenOffice",
+    "gzip": "csv.gz",
+}
+
+
+def detect_engine(file_path: str, verbose=False) -> Optional[str]:
+    if verbose:
+        start = time()
+    mapping = {
+        "application/gzip": "gzip",
+        "application/x-gzip": "gzip",
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
+        'application/vnd.ms-excel': 'xlrd',
+        'application/vnd.oasis.opendocument.spreadsheet': 'odf',
+        # all these files could be recognized as zip, may need to check all cases then
+        'application/zip': 'openpyxl',
+    }
+    # if none of the above, we move forwards with the csv process
+    if is_url(file_path):
+        remote_content = requests.get(file_path).content
+        engine = mapping.get(magic.from_buffer(remote_content, mime=True))
+    else:
+        engine = mapping.get(magic.from_file(file_path, mime=True))
+    if verbose:
+        message = (
+            f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
+            if engine else "Processing the file as a csv"
+        )
+        display_logs_depending_process_time(
+            message,
+            time() - start,
+        )
+    return engine
csv_detective/detection/formats.py
@@ -0,0 +1,170 @@
+from collections import defaultdict
+import logging
+from typing import Union
+
+import numpy as np
+import pandas as pd
+from csv_detective.detection.variables import (
+    detect_categorical_variable,
+    # detect_continuous_variable,
+)
+from csv_detective.load_tests import return_all_tests
+from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col, test_label, MAX_ROWS_ANALYSIS
+from csv_detective.validate import validate
+
+
+def detect_formats(
+    table: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    skipna: bool = True,
+    verbose: bool = False,
+):
+    on_sample = len(table) > MAX_ROWS_ANALYSIS
+    if on_sample:
+        if verbose:
+            logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
+        table = table.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
+
+    if table.empty:
+        res_categorical = []
+        # res_continuous = []
+    else:
+        # Detects columns that are categorical
+        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+        res_categorical = list(res_categorical)
+        # Detect columns that are continuous (we already know the categorical) :
+        # we don't need this for now, cuts processing time
+        # res_continuous = list(
+        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
+        # )
+
+    analysis.update({
+        "categorical": res_categorical,
+        # "continuous": res_continuous,
+    })
+
+    # list testing to be performed
+    all_tests_fields = return_all_tests(
+        user_input_tests, detect_type="detect_fields"
+    )  # list all tests for the fields
+    all_tests_labels = return_all_tests(
+        user_input_tests, detect_type="detect_labels"
+    )  # list all tests for the labels
+
+    # if no testing then return
+    if not all_tests_fields and not all_tests_labels:
+        return analysis
+
+    # Perform testing on fields
+    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
+
+    # Perform testing on labels
+    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
+
+    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
+    # This is because the fields are more important than the labels and yields a max
+    # of 1.5 for the final score.
+    scores_table = scores_table_fields * (
+        1
+        + scores_table_labels.reindex(
+            index=scores_table_fields.index, fill_value=0
+        ).values / 2
+    )
+
+    # To reduce false positives: ensure these formats are detected only if the label yields
+    # a detection (skipping the ones that have been excluded by the users).
+    formats_with_mandatory_label = [
+        f for f in [
+            "code_departement",
+            "code_commune_insee",
+            "code_postal",
+            "latitude_wgs",
+            "longitude_wgs",
+            "latitude_wgs_fr_metropole",
+            "longitude_wgs_fr_metropole",
+            "latitude_l93",
+            "longitude_l93",
+        ] if f in scores_table.index
+    ]
+    scores_table.loc[formats_with_mandatory_label, :] = np.where(
+        scores_table_labels.loc[formats_with_mandatory_label, :],
+        scores_table.loc[formats_with_mandatory_label, :],
+        0,
+    )
+    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
+
+    metier_to_python_type = {
+        "booleen": "bool",
+        "int": "int",
+        "float": "float",
+        "string": "string",
+        "json": "json",
+        "json_geojson": "json",
+        "datetime": "datetime",
+        "datetime_iso": "datetime",
+        "datetime_rfc822": "datetime",
+        "date": "date",
+        "latitude": "float",
+        "latitude_l93": "float",
+        "latitude_wgs": "float",
+        "latitude_wgs_fr_metropole": "float",
+        "longitude": "float",
+        "longitude_l93": "float",
+        "longitude_wgs": "float",
+        "longitude_wgs_fr_metropole": "float",
+    }
+
+    if not limited_output:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: [
+                    {
+                        "python_type": metier_to_python_type.get(
+                            detection["format"], "string"
+                        ),
+                        **detection,
+                    }
+                    for detection in detections
+                ]
+                for col_name, detections in analysis[detection_method].items()
+            }
+    else:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: {
+                    "python_type": metier_to_python_type.get(
+                        detection["format"], "string"
+                    ),
+                    **detection,
+                }
+                for col_name, detection in analysis[detection_method].items()
+            }
+
+    # Add detection with formats as keys
+    analysis["formats"] = defaultdict(list)
+    for header, col_metadata in analysis["columns"].items():
+        analysis["formats"][col_metadata["format"]].append(header)
+
+    if on_sample:
+        if verbose:
+            logging.warning("Validating that analysis on the sample works on the whole file")
+        is_valid, _, _ = validate(
+            file_path=file_path,
+            previous_analysis=analysis,
+            num_rows=-1,
+            encoding=analysis.get("encoding"),
+            sep=analysis.get("separator"),
+            sheet_name=analysis.get("sheet_name"),
+            verbose=verbose,
+            skipna=skipna,
+        )
+        if not is_valid:
+            raise ValueError("Could not infer detected formats on the whole file")
+
+    return analysis
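To make the scoring comment above concrete, here is a small, made-up numeric example of the field/label combination (real scores come from test_col and test_label):

import pandas as pd

# Invented scores: rows are candidate formats, columns are dataset columns.
scores_fields = pd.DataFrame({"col_a": [0.95, 0.0]}, index=["code_postal", "int"])
scores_labels = pd.DataFrame({"col_a": [1.0]}, index=["code_postal"])

# Same combination as in detect_formats: field scores weighted by 1 + labels / 2,
# so a perfect field plus label match tops out at 1.5 times the field score.
combined = scores_fields * (
    1 + scores_labels.reindex(index=scores_fields.index, fill_value=0).values / 2
)
print(combined)  # code_postal -> 0.95 * 1.5 = 1.425, int -> 0.0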
csv_detective/detection/headers.py
@@ -0,0 +1,32 @@
+import logging
+from time import time
+from typing import Optional, TextIO
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
+    """Tests 10 first rows for possible header (in case header is not 1st row)"""
+    if verbose:
+        start = time()
+        logging.info("Detecting headers")
+    file.seek(0)
+    for i in range(10):
+        header = file.readline()
+        position = file.tell()
+        chaine = [c for c in header.replace("\n", "").split(sep) if c]
+        if chaine[-1] not in ["", "\n"] and all(
+            [mot not in ["", "\n"] for mot in chaine[1:-1]]
+        ):
+            next_row = file.readline()
+            file.seek(position)
+            if header != next_row:
+                if verbose:
+                    display_logs_depending_process_time(
+                        f'Detected headers in {round(time() - start, 3)}s',
+                        time() - start,
+                    )
+                return i, chaine
+    if verbose:
+        logging.info('No header detected')
+    return 0, None
csv_detective/detection/rows.py
@@ -0,0 +1,18 @@
+import pandas as pd
+
+
+def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
+    """Analog process to detect_headers for csv files, determines how many rows to skip
+    to end up with the header at the right place"""
+    idx = 0
+    if all([str(c).startswith('Unnamed:') for c in table.columns]):
+        # there is on offset between the index in the file (idx here)
+        # and the index in the dataframe, because of the header
+        idx = 1
+        while table.iloc[idx - 1].isna().all():
+            idx += 1
+        cols = table.iloc[idx - 1]
+        table = table.iloc[idx:]
+        table.columns = cols.to_list()
+    # +1 here because the headers should count as a row
+    return table, idx
csv_detective/detection/separator.py
@@ -0,0 +1,44 @@
+import csv
+import logging
+from time import time
+from typing import TextIO
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_separator(file: TextIO, verbose: bool = False) -> str:
+    """Detects csv separator"""
+    # TODO: add a robust detection:
+    # si on a un point virgule comme texte et \t comme séparateur, on renvoie
+    # pour l'instant un point virgule
+    if verbose:
+        start = time()
+        logging.info("Detecting separator")
+    file.seek(0)
+    header = file.readline()
+    possible_separators = [";", ",", "|", "\t"]
+    sep_count = dict()
+    for sep in possible_separators:
+        sep_count[sep] = header.count(sep)
+    sep = max(sep_count, key=sep_count.get)
+    # testing that the first 10 (arbitrary) rows all have the same number of fields
+    # as the header. Prevents downstream unwanted behaviour where pandas can load
+    # the file (in a weird way) but the process is irrelevant.
+    file.seek(0)
+    reader = csv.reader(file, delimiter=sep)
+    rows_lengths = set()
+    for idx, row in enumerate(reader):
+        if idx > 10:
+            break
+        rows_lengths.add(len(row))
+    if len(rows_lengths) > 1:
+        raise ValueError(
+            f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
+        )
+
+    if verbose:
+        display_logs_depending_process_time(
+            f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return sep
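A short usage sketch of the separator detection on an in-memory file (the sample rows are invented):

from io import StringIO

from csv_detective.detection.separator import detect_separator

sample = StringIO("id;name;city\n1;Ada;Paris\n2;Alan;London\n")
# ";" is the most frequent candidate in the header and the first rows
# all have the same number of fields, so it is returned.
print(detect_separator(sample))  # ";"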
csv_detective/detection/variables.py
@@ -0,0 +1,98 @@
+from ast import literal_eval
+import logging
+from time import time
+
+import pandas as pd
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
+    """
+    Detects whether a column contains continuous variables. We consider a continuous column
+    one that contains a considerable amount of float values.
+    We removed the integers as we then end up with postal codes, insee codes, and all sort
+    of codes and types.
+    This is not optimal but it will do for now.
+    """
+    # if we need this again in the future, could be first based on columns detected as int/float to cut time
+
+    def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
+        count = serie.value_counts().to_dict()
+        total_nb = len(serie)
+        if float in count:
+            nb_floats = count[float]
+        else:
+            return False
+        if nb_floats / total_nb >= continuous_th:
+            return True
+        else:
+            return False
+
+    def parses_to_integer(value: str):
+        try:
+            value = value.replace(",", ".")
+            value = literal_eval(value)
+            return type(value)
+        # flake8: noqa
+        except:
+            return False
+
+    if verbose:
+        start = time()
+        logging.info("Detecting continuous columns")
+    res = table.apply(
+        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
+    )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return res.index[res]
+
+
+def detect_categorical_variable(
+    table: pd.DataFrame,
+    threshold_pct_categorical: float = 0.05,
+    max_number_categorical_values: int = 25,
+    verbose: bool = False,
+):
+    """
+    Heuristically detects whether a table (df) contains categorical values according to
+    the number of unique values contained.
+    As the idea of detecting categorical values is to then try to learn models to predict
+    them, we limit categorical values to at most 25 different modes or at most 5% disparity.
+    Postal code, insee code, code region and so on, may be thus not considered categorical values.
+    :param table:
+    :param threshold_pct_categorical:
+    :param max_number_categorical_values:
+    :return:
+    """
+
+    def abs_number_different_values(column_values: pd.Series):
+        return column_values.nunique()
+
+    def rel_number_different_values(column_values: pd.Series):
+        return column_values.nunique() / len(column_values)
+
+    def detect_categorical(column_values: pd.Series):
+        abs_unique_values = abs_number_different_values(column_values)
+        rel_unique_values = rel_number_different_values(column_values)
+        if (
+            abs_unique_values <= max_number_categorical_values
+            or rel_unique_values <= threshold_pct_categorical
+        ):
+            return True
+        return False
+
+    if verbose:
+        start = time()
+        logging.info("Detecting categorical columns")
+    res = table.apply(lambda serie: detect_categorical(serie))
+    if verbose:
+        display_logs_depending_process_time(
+            f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return res.index[res], res
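A quick sketch of the categorical heuristic above on an invented frame: a column is flagged when it has at most 25 distinct values, or when distinct values make up at most 5% of the rows:

import pandas as pd

from csv_detective.detection.variables import detect_categorical_variable

df = pd.DataFrame({
    "status": ["open", "closed"] * 50,        # 2 distinct values -> categorical
    "uuid": [f"id-{i}" for i in range(100)],  # all distinct -> not categorical
})
names, mask = detect_categorical_variable(df)
print(list(names))  # ["status"]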
csv_detective/parsing/__init__.py
File without changes
csv_detective/parsing/columns.py
@@ -0,0 +1,141 @@
+import logging
+from time import time
+from typing import Callable
+
+import pandas as pd
+
+from csv_detective.utils import display_logs_depending_process_time
+
+MAX_ROWS_ANALYSIS = 1e5
+
+
+def test_col_val(
+    serie: pd.Series,
+    test_func: Callable,
+    proportion: float = 0.9,
+    skipna: bool = True,
+    limited_output: bool = False,
+    verbose: bool = False,
+):
+    """Tests values of the serie using test_func.
+    - skipna : if True indicates that NaNs are not counted as False
+    - proportion : indicates the proportion of values that have to pass the test
+    for the serie to be detected as a certain format
+    """
+    if verbose:
+        start = time()
+
+    # TODO : change for a cleaner method and only test columns in modules labels
+    def apply_test_func(serie: pd.Series, test_func: Callable, _range: int):
+        return serie.sample(n=_range).apply(test_func)
+    try:
+        if skipna:
+            serie = serie[serie.notnull()]
+        ser_len = len(serie)
+        if ser_len == 0:
+            return 0.0
+        if not limited_output:
+            result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
+            return result if result >= proportion else 0.0
+        else:
+            if proportion == 1:  # Then try first 1 value, then 5, then all
+                for _range in [
+                    min(1, ser_len),
+                    min(5, ser_len),
+                    ser_len,
+                ]:  # Pour ne pas faire d'opérations inutiles, on commence par 1,
+                    # puis 5 valeurs puis la serie complète
+                    if all(apply_test_func(serie, test_func, _range)):
+                        # print(serie.name, ': check OK')
+                        pass
+                    else:
+                        return 0.0
+                return 1.0
+            else:
+                # if we have a proportion, statistically it's OK to analyse up to 10k rows
+                # (arbitrary number) and get a significant result
+                to_analyse = min(ser_len, MAX_ROWS_ANALYSIS)
+                result = apply_test_func(serie, test_func, to_analyse).sum() / to_analyse
+                return result if result >= proportion else 0.0
+    finally:
+        if verbose and time() - start > 3:
+            display_logs_depending_process_time(
+                f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
+                time() - start
+            )
+
+
+def test_col_label(label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False):
+    """Tests label (from header) using test_func.
+    - proportion : indicates the minimum score to pass the test for the serie
+    to be detected as a certain format
+    """
+    if not limited_output:
+        return test_func(label)
+    else:
+        result = test_func(label)
+        return result if result >= proportion else 0
+
+
+def test_col(table: pd.DataFrame, all_tests: list, limited_output: bool, skipna: bool = True, verbose: bool = False):
+    if verbose:
+        start = time()
+        logging.info("Testing columns to get types")
+    test_funcs = dict()
+    for test in all_tests:
+        name = test.__name__.split(".")[-1]
+        test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
+    return_table = pd.DataFrame(columns=table.columns)
+    for idx, (key, value) in enumerate(test_funcs.items()):
+        if verbose:
+            start_type = time()
+            logging.info(f"\t- Starting with type '{key}'")
+        # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
+        # => the following needs to change, "apply" means all columns are tested for one type at once
+        return_table.loc[key] = table.apply(
+            lambda serie: test_col_val(
+                serie,
+                value["func"],
+                value["prop"],
+                skipna=skipna,
+                limited_output=limited_output,
+                verbose=verbose,
+            )
+        )
+        if verbose:
+            display_logs_depending_process_time(
+                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
+                time() - start_type
+            )
+    if verbose:
+        display_logs_depending_process_time(f"Done testing columns in {round(time() - start, 3)}s", time() - start)
+    return return_table
+
+
+def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbose: bool = False):
+    if verbose:
+        start = time()
+        logging.info("Testing labels to get types")
+    test_funcs = dict()
+    for test in all_tests:
+        name = test.__name__.split(".")[-1]
+        test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
+
+    return_table = pd.DataFrame(columns=table.columns)
+    for idx, (key, value) in enumerate(test_funcs.items()):
+        if verbose:
+            start_type = time()
+        return_table.loc[key] = [
+            test_col_label(
+                col_name, value["func"], value["prop"], limited_output=limited_output
+            )
+            for col_name in table.columns
+        ]
+        if verbose:
+            display_logs_depending_process_time(
+                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
+                time() - start_type
+            )
+    if verbose:
+        display_logs_depending_process_time(f"Done testing labels in {round(time() - start, 3)}s", time() - start)
+    return return_table
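An illustrative call of test_col_val with a toy test function (the real test functions come from the detect_fields modules and expose _is and PROPORTION):

import pandas as pd

from csv_detective.parsing.columns import test_col_val

serie = pd.Series(["1", "2", "x", "4", None])
# The NaN is skipped (skipna=True), 3 of the 4 remaining values pass the digit
# test, and 0.75 >= the requested proportion, so the score is returned.
score = test_col_val(serie, test_func=lambda v: v.isdigit(), proportion=0.5)
print(score)  # 0.75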
csv_detective/parsing/compression.py
@@ -0,0 +1,11 @@
+import gzip
+from io import BytesIO
+
+
+def unzip(binary_file: BytesIO, engine: str) -> BytesIO:
+    if engine == "gzip":
+        with gzip.open(binary_file, mode="rb") as binary_file:
+            file_content = binary_file.read()
+    else:
+        raise NotImplementedError(f"{engine} is not yet supported")
+    return BytesIO(file_content)
csv_detective/parsing/csv.py
@@ -0,0 +1,55 @@
+import logging
+from time import time
+from typing import TextIO
+
+import pandas as pd
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def parse_csv(
+    the_file: TextIO,
+    encoding: str,
+    sep: str,
+    num_rows: int,
+    skiprows: int,
+    random_state: int = 42,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, int, int]:
+    if verbose:
+        start = time()
+        logging.info("Parsing table")
+    table = None
+
+    if not isinstance(the_file, str):
+        the_file.seek(0)
+
+    total_lines = None
+    for encoding in [encoding, "ISO-8859-1", "utf-8"]:
+        if encoding is None:
+            continue
+
+        if "ISO-8859" in encoding:
+            encoding = "ISO-8859-1"
+        try:
+            table = pd.read_csv(
+                the_file, sep=sep, dtype="unicode", encoding=encoding, skiprows=skiprows
+            )
+            total_lines = len(table)
+            nb_duplicates = len(table.loc[table.duplicated()])
+            if num_rows > 0:
+                num_rows = min(num_rows - 1, total_lines)
+                table = table.sample(num_rows, random_state=random_state)
+            # else : table is unchanged
+            break
+        except TypeError:
+            print("Trying encoding : {encoding}".format(encoding=encoding))
+
+    if table is None:
+        raise ValueError("Could not load file")
+    if verbose:
+        display_logs_depending_process_time(
+            f'Table parsed successfully in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return table, total_lines, nb_duplicates
csv_detective/parsing/excel.py
@@ -0,0 +1,169 @@
+from io import BytesIO
+from time import time
+from typing import Optional
+
+import openpyxl
+import pandas as pd
+import requests
+import xlrd
+
+from csv_detective.detection.engine import engine_to_file
+from csv_detective.detection.rows import remove_empty_first_rows
+from csv_detective.utils import (
+    display_logs_depending_process_time,
+    is_url,
+)
+
+NEW_EXCEL_EXT = [".xlsx", ".xlsm", ".xltx", ".xltm"]
+OLD_EXCEL_EXT = [".xls"]
+OPEN_OFFICE_EXT = [".odf", ".ods", ".odt"]
+XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
+
+
+def parse_excel(
+    file_path: str,
+    num_rows: int = -1,
+    engine: Optional[str] = None,
+    sheet_name: Optional[str] = None,
+    random_state: int = 42,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, int, int, str, str, int]:
+    """"Excel-like parsing is really slow, could be a good improvement for future development"""
+    if verbose:
+        start = time()
+    no_sheet_specified = sheet_name is None
+
+    if (
+        engine in ['openpyxl', 'xlrd'] or
+        any([file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT])
+    ):
+        remote_content = None
+        if is_url(file_path):
+            r = requests.get(file_path)
+            r.raise_for_status()
+            remote_content = BytesIO(r.content)
+        if not engine:
+            if any([file_path.endswith(k) for k in NEW_EXCEL_EXT]):
+                engine = "openpyxl"
+            else:
+                engine = "xlrd"
+        if sheet_name is None:
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
+                    time() - start,
+                )
+            try:
+                if engine == "openpyxl":
+                    # openpyxl doesn't want to open files that don't have a valid extension
+                    # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
+                    # if the file is remote, we have a remote content anyway so it's fine
+                    if not remote_content and '.' not in file_path.split('/')[-1]:
+                        with open(file_path, 'rb') as f:
+                            remote_content = BytesIO(f.read())
+                    # faster than loading all sheets
+                    wb = openpyxl.load_workbook(remote_content or file_path, read_only=True)
+                    try:
+                        sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
+                    except TypeError:
+                        # sometimes read_only can't get the info, so we have to open the file for real
+                        # this takes more time but it's for a limited number of files
+                        # and it's this or nothing
+                        wb = openpyxl.load_workbook(remote_content or file_path)
+                        sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
+                else:
+                    if remote_content:
+                        wb = xlrd.open_workbook(file_contents=remote_content.read())
+                    else:
+                        wb = xlrd.open_workbook(file_path)
+                    sizes = {s.name: s.nrows * s.ncols for s in wb.sheets()}
+                sheet_name = max(sizes, key=sizes.get)
+            except xlrd.biffh.XLRDError:
+                # sometimes a xls file is recognized as ods
+                if verbose:
+                    display_logs_depending_process_time(
+                        'Could not read file with classic xls reader, trying with ODS',
+                        time() - start,
+                    )
+                engine = "odf"
+
+    if engine == "odf" or any([file_path.endswith(k) for k in OPEN_OFFICE_EXT]):
+        # for ODS files, no way to get sheets' sizes without
+        # loading the file one way or another (pandas or pure odfpy)
+        # so all in one
+        engine = "odf"
+        if sheet_name is None:
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
+                    time() - start,
+                )
+            tables = pd.read_excel(
+                file_path,
+                engine="odf",
+                sheet_name=None,
+                dtype="unicode",
+            )
+            sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
+            sheet_name = max(sizes, key=sizes.get)
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Going forwards with sheet "{sheet_name}"',
+                    time() - start,
+                )
+            table = tables[sheet_name]
+        else:
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
+                    time() - start,
+                )
+            table = pd.read_excel(
+                file_path,
+                engine="odf",
+                sheet_name=sheet_name,
+                dtype="unicode",
+            )
+        table, header_row_idx = remove_empty_first_rows(table)
+        total_lines = len(table)
+        nb_duplicates = len(table.loc[table.duplicated()])
+        if num_rows > 0:
+            num_rows = min(num_rows - 1, total_lines)
+            table = table.sample(num_rows, random_state=random_state)
+        if verbose:
+            display_logs_depending_process_time(
+                f'Table parsed successfully in {round(time() - start, 3)}s',
+                time() - start,
+            )
+        return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
+
+    # so here we end up with (old and new) excel files only
+    if verbose:
+        if no_sheet_specified:
+            display_logs_depending_process_time(
+                f'Going forwards with sheet "{sheet_name}"',
+                time() - start,
+            )
+        else:
+            display_logs_depending_process_time(
+                f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
+                time() - start,
+            )
+    table = pd.read_excel(
+        file_path,
+        engine=engine,
+        sheet_name=sheet_name,
+        dtype="unicode",
+    )
+    table, header_row_idx = remove_empty_first_rows(table)
+    total_lines = len(table)
+    nb_duplicates = len(table.loc[table.duplicated()])
+    if num_rows > 0:
+        num_rows = min(num_rows - 1, total_lines)
+        table = table.sample(num_rows, random_state=random_state)
+    if verbose:
+        display_logs_depending_process_time(
+            f'Table parsed successfully in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
csv_detective/parsing/load.py
@@ -0,0 +1,97 @@
+from io import BytesIO, StringIO
+from typing import Optional, Union
+
+import pandas as pd
+import requests
+
+from csv_detective.detection.columns import detect_heading_columns, detect_trailing_columns
+from csv_detective.detection.encoding import detect_encoding
+from csv_detective.detection.engine import (
+    COMPRESSION_ENGINES,
+    EXCEL_ENGINES,
+    detect_engine,
+)
+from csv_detective.detection.headers import detect_headers
+from csv_detective.detection.separator import detect_separator
+from csv_detective.utils import is_url
+from .compression import unzip
+from .csv import parse_csv
+from .excel import (
+    XLS_LIKE_EXT,
+    parse_excel,
+)
+
+
+def load_file(
+    file_path: str,
+    num_rows: int = 500,
+    encoding: Optional[str] = None,
+    sep: Optional[str] = None,
+    verbose: bool = False,
+    sheet_name: Optional[Union[str, int]] = None,
+) -> tuple[pd.DataFrame, dict]:
+    file_name = file_path.split('/')[-1]
+    engine = None
+    if '.' not in file_name or not file_name.endswith("csv"):
+        # file has no extension, we'll investigate how to read it
+        engine = detect_engine(file_path, verbose=verbose)
+
+    if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
+        table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
+            file_path=file_path,
+            num_rows=num_rows,
+            engine=engine,
+            sheet_name=sheet_name,
+            verbose=verbose,
+        )
+        header = table.columns.to_list()
+        analysis = {
+            "engine": engine,
+            "sheet_name": sheet_name,
+        }
+    else:
+        # fetching or reading file as binary
+        if is_url(file_path):
+            r = requests.get(file_path, allow_redirects=True)
+            r.raise_for_status()
+            binary_file = BytesIO(r.content)
+        else:
+            binary_file = open(file_path, "rb")
+        # handling compression
+        if engine in COMPRESSION_ENGINES:
+            binary_file: BytesIO = unzip(binary_file=binary_file, engine=engine)
+        # detecting encoding if not specified
+        if encoding is None:
+            encoding: str = detect_encoding(binary_file, verbose=verbose)
+        binary_file.seek(0)
+        # decoding and reading file
+        if is_url(file_path) or engine in COMPRESSION_ENGINES:
+            str_file = StringIO(binary_file.read().decode(encoding=encoding))
+        else:
+            str_file = open(file_path, "r", encoding=encoding)
+        if sep is None:
+            sep = detect_separator(str_file, verbose=verbose)
+        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
+        if header is None:
+            return {"error": True}
+        elif isinstance(header, list):
+            if any([x is None for x in header]):
+                return {"error": True}
+        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
+        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
+        table, total_lines, nb_duplicates = parse_csv(
+            str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
+        )
+        analysis = {
+            "encoding": encoding,
+            "separator": sep,
+            "heading_columns": heading_columns,
+            "trailing_columns": trailing_columns,
+        }
+    analysis.update({
+        "header_row_idx": header_row_idx,
+        "header": header,
+        "total_lines": total_lines,
+        "nb_duplicates": nb_duplicates,
+    })
+    return table, analysis
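A hypothetical end-to-end call of the new loader on a local csv ("data/example.csv" is a made-up path); for csv inputs the returned analysis carries the detected encoding, separator and header information used downstream:

from csv_detective.parsing.load import load_file

table, analysis = load_file("data/example.csv", num_rows=500, verbose=True)
print(analysis["encoding"], analysis["separator"], analysis["total_lines"])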
csv_detective/parsing/text.py
@@ -0,0 +1,61 @@
+from re import finditer
+
+
+def camel_case_split(identifier: str):
+    matches = finditer(
+        ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier
+    )
+    return " ".join([m.group(0) for m in matches])
+
+
+translate_dict = {
+    " ": ["-", "_", "'", ",", " "],
+    "a": ["à", "â"],
+    "c": ["ç"],
+    "e": ["é", "è", "ê", "é"],
+    "i": ["î", "ï"],
+    "o": ["ô", "ö"],
+    "u": ["ù", "û", "ü"],
+}
+
+
+# Process text
+def _process_text(val: str):
+    """Traitement des chaînes de caractères pour les standardiser.
+    Plusieurs alternatives ont été testées : .translate, unidecode.unidecode,
+    des méthodes hybrides, mais aucune ne s'est avérée plus performante."""
+    val = camel_case_split(val)
+    val = val.lower()
+    for target in translate_dict:
+        for source in translate_dict[target]:
+            val = val.replace(source, target)
+    val = val.strip()
+    return val
+
+
+def is_word_in_string(word: str, string: str):
+    # if the substring is too short, the test can become irrelevant
+    return len(word) > 2 and word in string
+
+
+def header_score(header: str, words_combinations_list: list[str]) -> float:
+    """Returns:
+    - 1 if the header is exactly in the specified list
+    - 0.5 if any of the words is within the header
+    - 0 otherwise"""
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(
+        any(
+            words_combination == processed_header for words_combination in words_combinations_list
+        )
+    )
+    words_combination_in_header = 0.5 * (
+        any(
+            is_word_in_string(
+                words_combination, processed_header
+            ) for words_combination in words_combinations_list
+        )
+    )
+
+    return max(header_matches_words_combination, words_combination_in_header)
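A small sketch of the header normalisation and scoring helpers above (inputs are invented):

from csv_detective.parsing.text import _process_text, header_score

print(_process_text("CodePostal"))                                 # "code postal"
print(header_score("CodePostal", ["code postal"]))                 # 1.0, exact match after processing
print(header_score("code postal de livraison", ["code postal"]))   # 0.5, substring match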
{csv_detective-0.8.1.dev1460.data → csv_detective-0.8.1.dev1482.data}/data/share/csv_detective/CHANGELOG.md
@@ -6,6 +6,7 @@
 - Refactor repo metadata and requirements [#120](https://github.com/datagouv/csv-detective/pull/120) [#122](https://github.com/datagouv/csv-detective/pull/122)
 - Better URL detection [#121](https://github.com/datagouv/csv-detective/pull/121)
 - For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124)
+- Fix imports [#125](https://github.com/datagouv/csv-detective/pull/125) [#126](https://github.com/datagouv/csv-detective/pull/126)
 
 ## 0.8.0 (2025-05-20)
 
{csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/RECORD
@@ -121,16 +121,32 @@ csv_detective/detect_labels/temp/date/__init__.py,sha256=w0eeZIseAmPwL4OvCWzZXbx
 csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=d0laZNzHx-kSARs9Re8TZ11GNs99aMz6gXc72CJ6ul4,440
 csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=53ysj7QgsxXwG1le3zfSJd1oaTTf-Er3jBeYi_A4F9g,458
 csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWmcu8K-9jPED-pOlMlErfo,433
+csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
+csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
+csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
+csv_detective/detection/formats.py,sha256=5ZW7gmhyQt6BB7xLcVVhui17oGn1udAWI9w22EAOHy4,6337
+csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
+csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
+csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
+csv_detective/detection/variables.py,sha256=3qEMtjZ_zyIFXvTnFgK7ZMDx8C12uQXKfFjEj2moyJc,3558
 csv_detective/output/__init__.py,sha256=5KTevPfp_4MRxByJyOntQjToNfeG7dPQn-_13wSq7EU,1910
 csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
 csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5fA,8649
 csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
 csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
 csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
-csv_detective
-csv_detective
-csv_detective
-csv_detective
+csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+csv_detective/parsing/columns.py,sha256=e0xVmeXNvSC3su5HTFSNClgkz8PlFkoHmNwRYdS57mk,5670
+csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
+csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,1626
+csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
+csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
+csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
+csv_detective-0.8.1.dev1482.data/data/share/csv_detective/CHANGELOG.md,sha256=0kXmJeSwiHMY4mSOOtaZJ739S_Aj4ojWAzsJXYR_T98,8857
+csv_detective-0.8.1.dev1482.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.8.1.dev1482.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
+csv_detective-0.8.1.dev1482.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
 tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
@@ -138,8 +154,8 @@ tests/test_file.py,sha256=FWVtYHlD5uU7tPeYsqlQg6O4lpU8Ct35vddkbzhvvjA,8508
 tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
 tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
 tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
-csv_detective-0.8.1.
-csv_detective-0.8.1.
-csv_detective-0.8.1.
-csv_detective-0.8.1.
-csv_detective-0.8.1.
+csv_detective-0.8.1.dev1482.dist-info/METADATA,sha256=xigKiWIsWZoH2rUZ0JGfvKW7eGm72SFQ1d7uFzWQcKs,10443
+csv_detective-0.8.1.dev1482.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.8.1.dev1482.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.8.1.dev1482.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.8.1.dev1482.dist-info/RECORD,,
{csv_detective-0.8.1.dev1460.data → csv_detective-0.8.1.dev1482.data}/data/share/csv_detective/LICENSE
File without changes

{csv_detective-0.8.1.dev1460.data → csv_detective-0.8.1.dev1482.data}/data/share/csv_detective/README.md
File without changes

{csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/WHEEL
File without changes

{csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/entry_points.txt
RENAMED
File without changes

{csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/licenses/LICENSE
RENAMED
File without changes

{csv_detective-0.8.1.dev1460.dist-info → csv_detective-0.8.1.dev1482.dist-info}/top_level.txt
RENAMED
File without changes