csv-detective 0.8.1.dev1674__py3-none-any.whl → 0.8.1.dev1720__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +0 -2
- csv_detective/cli.py +6 -9
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
- csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
- csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +4 -2
- csv_detective/detect_fields/other/int/__init__.py +3 -3
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
- csv_detective/detect_fields/other/twitter/__init__.py +2 -2
- csv_detective/detect_fields/other/uuid/__init__.py +4 -5
- csv_detective/detect_fields/temp/date/__init__.py +3 -2
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
- csv_detective/detect_fields/temp/year/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detection/columns.py +9 -9
- csv_detective/detection/encoding.py +6 -4
- csv_detective/detection/engine.py +6 -5
- csv_detective/detection/formats.py +19 -19
- csv_detective/detection/headers.py +3 -5
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/variables.py +4 -4
- csv_detective/explore_csv.py +7 -8
- csv_detective/load_tests.py +6 -14
- csv_detective/output/__init__.py +3 -7
- csv_detective/output/dataframe.py +9 -5
- csv_detective/output/example.py +13 -13
- csv_detective/output/profile.py +30 -23
- csv_detective/output/schema.py +20 -23
- csv_detective/output/utils.py +15 -15
- csv_detective/parsing/columns.py +23 -12
- csv_detective/parsing/csv.py +1 -1
- csv_detective/parsing/excel.py +10 -11
- csv_detective/parsing/load.py +11 -8
- csv_detective/parsing/text.py +4 -9
- csv_detective/s3_utils.py +3 -7
- csv_detective/utils.py +4 -2
- csv_detective/validate.py +18 -13
- csv_detective-0.8.1.dev1674.data/data/share/csv_detective/README.md → csv_detective-0.8.1.dev1720.dist-info/METADATA +32 -0
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/RECORD +81 -81
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/top_level.txt +2 -0
- tests/test_example.py +2 -6
- tests/test_fields.py +16 -10
- tests/test_file.py +10 -9
- tests/test_labels.py +3 -2
- tests/test_structure.py +3 -1
- tests/test_validation.py +9 -6
- venv/bin/activate_this.py +38 -0
- venv/bin/jp.py +54 -0
- venv/bin/runxlrd.py +410 -0
- csv_detective-0.8.1.dev1674.data/data/share/csv_detective/CHANGELOG.md +0 -186
- csv_detective-0.8.1.dev1674.dist-info/METADATA +0 -268
- csv_detective-0.8.1.dev1674.dist-info/licenses/LICENSE +0 -21
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1674.data/data/share/csv_detective → csv_detective-0.8.1.dev1720.dist-info/licenses}/LICENSE +0 -0
csv_detective/output/__init__.py
CHANGED

@@ -5,6 +5,7 @@ from typing import Optional, Union
 import pandas as pd

 from csv_detective.utils import is_url
+
 from .dataframe import cast_df
 from .profile import create_profile
 from .schema import generate_table_schema
@@ -24,7 +25,6 @@ def generate_output(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -40,7 +40,7 @@ def generate_output(
     else:
         output_path = os.path.splitext(file_path)[0]
         if is_url(output_path):
-            output_path = output_path.split(
+            output_path = output_path.split("/")[-1]
         if analysis.get("sheet_name"):
             output_path += "_sheet-" + str(sheet_name)
         output_path += ".json"
@@ -48,11 +48,7 @@ def generate_output(
         json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

     if output_schema:
-        analysis["schema"] = generate_table_schema(
-            analysis,
-            save_file=False,
-            verbose=verbose
-        )
+        analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)

     if output_df:
         return analysis, cast_df(
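
Note (illustration, not part of the diff): the lines above derive the JSON report path from the input file path. A minimal standalone sketch of that naming logic, with is_url reproduced from csv_detective.utils as it appears later in this diff and the sheet-name handling simplified:

    import os

    def is_url(file_path: str) -> bool:
        # same heuristic as csv_detective.utils.is_url in this diff
        return file_path.startswith("http")

    def build_output_path(file_path: str, sheet_name=None) -> str:
        # strip the extension, keep only the basename for remote files,
        # then suffix with the sheet name and ".json"
        output_path = os.path.splitext(file_path)[0]
        if is_url(output_path):
            output_path = output_path.split("/")[-1]
        if sheet_name is not None:
            output_path += "_sheet-" + str(sheet_name)
        return output_path + ".json"

    print(build_output_path("https://example.org/data/file.xlsx", sheet_name="2024"))
    # -> file_sheet-2024.json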

csv_detective/output/dataframe.py
CHANGED

@@ -1,7 +1,7 @@
-from datetime import date, datetime
 import json
-from
+from datetime import date, datetime
 from time import time
+from typing import Optional, Union

 import pandas as pd

@@ -30,12 +30,16 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datet
     raise ValueError(f"Unknown type `{_type}`")


-def cast_df(
+def cast_df(
+    df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
+) -> pd.DataFrame:
     if verbose:
         start = time()
     output_df = pd.DataFrame()
     for col_name, detection in columns.items():
-        if detection["python_type"] == "string" or (
+        if detection["python_type"] == "string" or (
+            detection["python_type"] == "json" and not cast_json
+        ):
             # no change if detected type is string
             output_df[col_name] = df[col_name].copy()
         elif detection["python_type"] == "int":
@@ -49,7 +53,7 @@ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bo
         del df[col_name]
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Casting columns completed in {round(time() - start, 3)}s",
             time() - start,
         )
     return output_df
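
Note (illustration, not part of the diff): the reflowed cast_df signature above takes the detected column types as a dict. A minimal sketch of a call, where the shape of the columns payload is assumed from the python_type lookups visible in this hunk:

    import pandas as pd
    # from csv_detective.output.dataframe import cast_df

    df = pd.DataFrame({"code": ["01", "02"], "population": ["1234", "5678"]})

    # hypothetical detection output: only the "python_type" key is read in this hunk,
    # the real per-column dict may carry more fields (format, score, ...)
    columns = {
        "code": {"python_type": "string"},
        "population": {"python_type": "int"},
    }

    # typed = cast_df(df, columns, cast_json=True, verbose=False)
    # "code" would be copied as-is, "population" cast to an integer dtype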

csv_detective/output/example.py
CHANGED

@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import random
 import string
-from typing import Union, Optional, Any, Type
 import uuid
+from datetime import datetime
+from typing import Any, Optional, Type, Union

-from faker import Faker
 import pandas as pd
 import requests
 import rstr
+from faker import Faker

 fake = Faker()

@@ -135,7 +135,7 @@ def create_example_csv_file(
         return random.choice(enum)
     if num_range is None:
         num_range = [0, 1000]
-    if num_type
+    if num_type is int:
         return random.randint(num_range[0], num_range[1])
     else:
         return round(random.uniform(num_range[0], num_range[1]), 1)
@@ -179,7 +179,7 @@ def create_example_csv_file(
         "yearmonth": "date",
         "time": "time",
         "datetime": "datetime",
-        "array": "array"
+        "array": "array",
     }

     if schema_path:
@@ -188,7 +188,7 @@ def create_example_csv_file(
     else:
         with open(schema_path, encoding=encoding) as jsonfile:
             schema = json.load(jsonfile)
-    if
+    if "fields" not in schema.keys():
         raise ValueError("The schema must have a 'fields' key.")
     else:
         fields = [
@@ -198,12 +198,14 @@ def create_example_csv_file(
                 # when frformat is supported in TableSchema, we can build args for French standards
                 # linked to https://github.com/datagouv/fr-format/issues/26
                 "args": (
-                    build_args_from_constraints(f["constraints"])
+                    build_args_from_constraints(f["constraints"])
+                    if "constraints" in f.keys()
                     else build_args_from_constraints(f["arrayItem"]["constraints"])
                     if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
                     else {}
-                )
-            }
+                ),
+            }
+            for f in schema["fields"]
         ]

         for k in range(len(fields)):
@@ -234,10 +236,8 @@ def create_example_csv_file(
     # would it be better to create by column or by row (as for now)?
     output = pd.DataFrame(
         [
-            [
-
-            for f in fields
-            ] for _ in range(file_length)
+            [types_to_func.get(f["type"], "str")(**f["args"]) for f in fields]
+            for _ in range(file_length)
         ],
         columns=[f["name"] for f in fields],
     )
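
Note (not part of the diff): the "if num_type is int:" fix above relies on class identity, i.e. num_type is expected to hold the type object itself. A small sketch mirroring that branch:

    import random

    def random_number(num_type: type, num_range=None):
        # identity check against the int class, as in the hunk above
        if num_range is None:
            num_range = [0, 1000]
        if num_type is int:
            return random.randint(num_range[0], num_range[1])
        return round(random.uniform(num_range[0], num_range[1]), 1)

    print(random_number(int))    # e.g. 417
    print(random_number(float))  # e.g. 233.7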

csv_detective/output/profile.py
CHANGED

@@ -1,5 +1,5 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from time import time

 import pandas as pd
@@ -29,15 +29,12 @@ def create_profile(
     safe_table = table.copy()
     if not limited_output:
         dict_cols_fields = {
-            k: v[0] if v else {
+            k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
             for k, v in dict_cols_fields.items()
         }
-    dtypes = {
-        k: map_python_types.get(v["python_type"], str)
-        for k, v in dict_cols_fields.items()
-    }
+    dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
     for c in safe_table.columns:
-        if dtypes[c]
+        if dtypes[c] is float:
             safe_table[c] = safe_table[c].apply(
                 lambda s: float_casting(s) if isinstance(s, str) else s
             )
@@ -48,18 +45,26 @@ def create_profile(
                 int,
             ]:
                 profile[c].update(
-                    min=prevent_nan(
-
-
-
-
-
-
-
-
-
-
-
+                    min=prevent_nan(
+                        map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                            safe_table[c].min()
+                        )
+                    ),
+                    max=prevent_nan(
+                        map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                            safe_table[c].max()
+                        )
+                    ),
+                    mean=prevent_nan(
+                        map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                            safe_table[c].mean()
+                        )
+                    ),
+                    std=prevent_nan(
+                        map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                            safe_table[c].std()
+                        )
+                    ),
                 )
             tops_bruts = (
                 safe_table[safe_table[c].notna()][c]
@@ -70,10 +75,12 @@
             )
             tops = []
             for tb in tops_bruts:
-                tops.append(
-
-
-
+                tops.append(
+                    {
+                        "count": tb["count"],
+                        "value": tb[c],
+                    }
+                )
             profile[c].update(
                 tops=tops,
                 nb_distinct=safe_table[c].nunique(),
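
Note (illustrative sketch, not part of the diff): the reflowed block above computes per-column numeric stats and guards them with prevent_nan. Roughly equivalent standalone logic; the behaviour of prevent_nan (mapping NaN to None) is an assumption, not taken from the package:

    import math
    import pandas as pd

    def prevent_nan(value):
        # assumed behaviour: replace NaN by None so the profile serialises cleanly
        return None if isinstance(value, float) and math.isnan(value) else value

    col = pd.Series([3.0, 7.5, float("nan"), 1.2])
    profile = {
        "min": prevent_nan(float(col.min())),
        "max": prevent_nan(float(col.max())),
        "mean": prevent_nan(float(col.mean())),
        "std": prevent_nan(float(col.std())),
    }
    print(profile)  # {'min': 1.2, 'max': 7.5, 'mean': 3.9, 'std': ~3.2}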

csv_detective/output/schema.py
CHANGED

@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import logging
 import os
 import tempfile
+from datetime import datetime
 from time import time
 from typing import Optional

 from botocore.exceptions import ClientError

-from csv_detective.s3_utils import
+from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time


@@ -26,13 +26,11 @@ def get_description(format: str) -> str:
         "insee_canton": "Le nom du canton",
         "latitude_l93": "La latitude au format Lambert 93",
         "latitude_wgs_fr_metropole": (
-            "La latitude au format WGS. Ne concerne que des latitudes "
-            "de la métropole française"
+            "La latitude au format WGS. Ne concerne que des latitudes de la métropole française"
         ),
         "longitude_l93": "La longitude au format Lambert 93",
         "longitude_wgs_fr_metropole": (
-            "La longitude au format WGS. Ne concerne que des longitudes "
-            "de la métropole française"
+            "La longitude au format WGS. Ne concerne que des longitudes de la métropole française"
         ),
         "pays": "Le nom du pays",
         "region": "Le nom de la région",
@@ -86,13 +84,13 @@ def get_pattern(format: str) -> str:
         ),
         "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
         "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
-        "twitter": r
-        "mongo_object_id": r
-        "uuid": r
+        "twitter": r"^@[A-Za-z0-9_]+$",
+        "mongo_object_id": r"^[0-9a-fA-F]{24}$",
+        "uuid": r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$",
         "url": (
-            r
-            r
-        )
+            r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
+            r"{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
+        ),
     }
     if format in format_to_pattern:
         return {"pattern": format_to_pattern[format]}
@@ -210,7 +208,7 @@ def generate_table_schema(
     key: Optional[str] = None,
     minio_user: Optional[str] = None,
     minio_pwd: Optional[str] = None,
-    verbose: bool = False
+    verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report

@@ -236,7 +234,7 @@ generate_table_schema(
             "example": get_example(field_report["format"]),
             "type": get_validata_type(field_report["format"]),
             "formatFR": field_report["format"],
-            "constraints": get_constraints(field_report["format"])
+            "constraints": get_constraints(field_report["format"]),
         }
         for header, field_report in analysis_report["columns"].items()
     ]
@@ -255,12 +253,9 @@
         "sources": [
             {
                 "title": "Spécification Tableschema",
-                "path": "https://specs.frictionlessdata.io/table-schema"
+                "path": "https://specs.frictionlessdata.io/table-schema",
             },
-            {
-                "title": "schema.data.gouv.fr",
-                "path": "https://schema.data.gouv.fr"
-            }
+            {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"},
         ],
         "created": datetime.today().strftime("%Y-%m-%d"),
         "lastModified": datetime.today().strftime("%Y-%m-%d"),
@@ -278,7 +273,9 @@
     }

     if verbose:
-        display_logs_depending_process_time(
+        display_logs_depending_process_time(
+            f"Created schema in {round(time() - start, 3)}s", time() - start
+        )

     if not save_file:
         return schema
@@ -301,9 +298,9 @@
     if "Contents" in tableschema_objects:
         tableschema_keys = [
             tableschema["Key"]
-            for tableschema in client.list_objects(
-
-
+            for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
+                "Contents"
+            ]
         ]
         tableschema_versions = [
             os.path.splitext(tableschema_key)[0].split("_")[-1]

csv_detective/output/utils.py
CHANGED

@@ -19,14 +19,17 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
         # no need to specify int and float everywhere, they are deprioritized anyway
         ("int", ("float",)),
         # bool over everything
-        (
-            "
-
-
-
-
-
-
+        (
+            "booleen",
+            (
+                "latitude_l93",
+                "latitude_wgs",
+                "latitude_wgs_fr_metropole",
+                "longitude_l93",
+                "longitude_wgs",
+                "longitude_wgs_fr_metropole",
+            ),
+        ),
         ("geojson", ("json",)),
         # latlon over lonlat if no longitude allows to discriminate
         ("latlon_wgs", ("json", "lonlat_wgs")),
@@ -49,13 +52,10 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
     for prio_format, secondary_formats in priorities:
         if prio_format in detected_formats:
             for secondary in secondary_formats:
-                if (
-
-
-
-                    >= return_dict_cols[column_name][secondary]
-                    or return_dict_cols[column_name][prio_format] >= 1
-                )
+                if secondary in detected_formats and (
+                    return_dict_cols[column_name][prio_format]
+                    >= return_dict_cols[column_name][secondary]
+                    or return_dict_cols[column_name][prio_format] >= 1
                 ):
                     formats_to_remove.add(secondary)

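
Note (illustrative sketch, not part of the diff): the new guard above only compares scores when the secondary format was actually detected for the column. With hypothetical scores and a trimmed-down priority table shaped like the hunk above:

    priorities = [("int", ("float",)), ("geojson", ("json",))]
    scores = {"int": 1.0, "float": 0.8, "json": 0.0}           # hypothetical per-column scores
    detected_formats = {f for f, s in scores.items() if s > 0}  # assumption about how detection is derived

    formats_to_remove = set()
    for prio_format, secondary_formats in priorities:
        if prio_format in detected_formats:
            for secondary in secondary_formats:
                # the added `secondary in detected_formats` check skips formats
                # that were never detected for this column
                if secondary in detected_formats and (
                    scores[prio_format] >= scores[secondary] or scores[prio_format] >= 1
                ):
                    formats_to_remove.add(secondary)

    print(formats_to_remove)  # {'float'}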

csv_detective/parsing/columns.py
CHANGED

@@ -28,6 +28,7 @@ def test_col_val(
     # TODO : change for a cleaner method and only test columns in modules labels
     def apply_test_func(serie: pd.Series, test_func: Callable, _range: int):
         return serie.sample(n=_range).apply(test_func)
+
     try:
         if skipna:
             serie = serie[serie.notnull()]
@@ -60,11 +61,13 @@ def test_col_val(
     if verbose and time() - start > 3:
         display_logs_depending_process_time(
             f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
-            time() - start
+            time() - start,
         )


-def test_col_label(
+def test_col_label(
+    label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False
+):
     """Tests label (from header) using test_func.
     - proportion : indicates the minimum score to pass the test for the serie
     to be detected as a certain format
@@ -76,7 +79,13 @@ def test_col_label(label: str, test_func: Callable, proportion: float = 1, limit
     return result if result >= proportion else 0


-def test_col(
+def test_col(
+    table: pd.DataFrame,
+    all_tests: list,
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+):
     if verbose:
         start = time()
         logging.info("Testing columns to get types")
@@ -106,11 +115,13 @@ def test_col(table: pd.DataFrame, all_tests: list, limited_output: bool, skipna:
         )
         if verbose:
             display_logs_depending_process_time(
-                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                time() - start_type
+                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
+                time() - start_type,
             )
     if verbose:
-        display_logs_depending_process_time(
+        display_logs_depending_process_time(
+            f"Done testing columns in {round(time() - start, 3)}s", time() - start
+        )
     return return_table


@@ -128,16 +139,16 @@ def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbo
         if verbose:
             start_type = time()
         return_table.loc[key] = [
-            test_col_label(
-                col_name, value["func"], value["prop"], limited_output=limited_output
-            )
+            test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output)
            for col_name in table.columns
         ]
         if verbose:
             display_logs_depending_process_time(
-                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                time() - start_type
+                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
+                time() - start_type,
             )
     if verbose:
-        display_logs_depending_process_time(
+        display_logs_depending_process_time(
+            f"Done testing labels in {round(time() - start, 3)}s", time() - start
+        )
     return return_table

csv_detective/parsing/csv.py
CHANGED

@@ -49,7 +49,7 @@ def parse_csv(
         raise ValueError("Could not load file")
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Table parsed successfully in {round(time() - start, 3)}s",
             time() - start,
         )
     return table, total_lines, nb_duplicates

csv_detective/parsing/excel.py
CHANGED

@@ -28,14 +28,13 @@ def parse_excel(
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
-    """"Excel-like parsing is really slow, could be a good improvement for future development"""
+    """ "Excel-like parsing is really slow, could be a good improvement for future development"""
     if verbose:
         start = time()
     no_sheet_specified = sheet_name is None

-    if (
-
-        any([file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT])
+    if engine in ["openpyxl", "xlrd"] or any(
+        [file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT]
     ):
         remote_content = None
         if is_url(file_path):
@@ -50,7 +49,7 @@
         if sheet_name is None:
             if verbose:
                 display_logs_depending_process_time(
-                    f
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
                     time() - start,
                 )
             try:
@@ -58,8 +57,8 @@
                 # openpyxl doesn't want to open files that don't have a valid extension
                 # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
                 # if the file is remote, we have a remote content anyway so it's fine
-                if not remote_content and
-                    with open(file_path,
+                if not remote_content and "." not in file_path.split("/")[-1]:
+                    with open(file_path, "rb") as f:
                         remote_content = BytesIO(f.read())
                 # faster than loading all sheets
                 wb = openpyxl.load_workbook(remote_content or file_path, read_only=True)
@@ -82,7 +81,7 @@
             # sometimes a xls file is recognized as ods
             if verbose:
                 display_logs_depending_process_time(
-
+                    "Could not read file with classic xls reader, trying with ODS",
                     time() - start,
                 )
             engine = "odf"
@@ -95,7 +94,7 @@
         if sheet_name is None:
             if verbose:
                 display_logs_depending_process_time(
-                    f
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
                     time() - start,
                 )
         tables = pd.read_excel(
@@ -132,7 +131,7 @@
         table = table.sample(num_rows, random_state=random_state)
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Table parsed successfully in {round(time() - start, 3)}s",
             time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
@@ -163,7 +162,7 @@
     table = table.sample(num_rows, random_state=random_state)
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Table parsed successfully in {round(time() - start, 3)}s",
             time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
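
Note (assumption-heavy sketch, not part of the diff): the hunk above opens the workbook with read_only=True so the largest sheet can be picked without loading every sheet. One way to do that with openpyxl; the selection criterion below is an assumption for illustration, not taken from the package:

    import openpyxl

    def largest_sheet_name(path: str) -> str:
        # read-only mode keeps memory usage low and is faster than loading all sheets
        wb = openpyxl.load_workbook(path, read_only=True)
        try:
            return max(wb.worksheets, key=lambda ws: ws.max_row or 0).title
        finally:
            wb.close()

    # largest_sheet_name("report.xlsx")  # e.g. "data_2024"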

csv_detective/parsing/load.py
CHANGED

@@ -14,6 +14,7 @@ from csv_detective.detection.engine import (
 from csv_detective.detection.headers import detect_headers
 from csv_detective.detection.separator import detect_separator
 from csv_detective.utils import is_url
+
 from .compression import unzip
 from .csv import parse_csv
 from .excel import (
@@ -30,9 +31,9 @@ def load_file(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> tuple[pd.DataFrame, dict]:
-    file_name = file_path.split(
+    file_name = file_path.split("/")[-1]
     engine = None
-    if
+    if "." not in file_name or not file_name.endswith("csv"):
         # file has no extension, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)

@@ -88,10 +89,12 @@ def load_file(
         "heading_columns": heading_columns,
         "trailing_columns": trailing_columns,
     }
-    analysis.update(
-
-
-
-
-
+    analysis.update(
+        {
+            "header_row_idx": header_row_idx,
+            "header": header,
+            "total_lines": total_lines,
+            "nb_duplicates": nb_duplicates,
+        }
+    )
     return table, analysis

csv_detective/parsing/text.py
CHANGED

@@ -2,9 +2,7 @@ from re import finditer


 def camel_case_split(identifier: str):
-    matches = finditer(
-        ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier
-    )
+    matches = finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
     return " ".join([m.group(0) for m in matches])


@@ -46,15 +44,12 @@ def header_score(header: str, words_combinations_list: list[str]) -> float:
     processed_header = _process_text(header)

     header_matches_words_combination = float(
-        any(
-            words_combination == processed_header for words_combination in words_combinations_list
-        )
+        any(words_combination == processed_header for words_combination in words_combinations_list)
     )
     words_combination_in_header = 0.5 * (
         any(
-            is_word_in_string(
-
-            ) for words_combination in words_combinations_list
+            is_word_in_string(words_combination, processed_header)
+            for words_combination in words_combinations_list
         )
     )

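
Note (not part of the diff): the regex collapsed onto one line above splits camelCase identifiers on case boundaries. A usage example, reproducing the function as it appears in the hunk:

    from re import finditer

    def camel_case_split(identifier: str):
        matches = finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
        return " ".join([m.group(0) for m in matches])

    print(camel_case_split("codePostal"))    # "code Postal"
    print(camel_case_split("INSEECommune"))  # "INSEE Commune"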

csv_detective/s3_utils.py
CHANGED

@@ -1,6 +1,6 @@
-import boto3
 import logging

+import boto3
 from botocore.client import Config
 from botocore.exceptions import ClientError

@@ -27,9 +27,7 @@ def download_from_minio(
     s3 = get_s3_client(netloc, minio_user, minio_pwd)
     try:
         s3.download_file(bucket, key, filepath)
-        logging.info(
-            f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}"
-        )
+        logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
     except ClientError as e:
         logging.error(e)

@@ -41,8 +39,6 @@ def upload_to_minio(
     s3 = get_s3_client(netloc, minio_user, minio_pwd)
     try:
         s3.upload_file(filepath, bucket, key)
-        logging.info(
-            f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}"
-        )
+        logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
     except ClientError as e:
         logging.error(e)

csv_detective/utils.py
CHANGED

@@ -4,7 +4,9 @@ from typing import Optional
 import pandas as pd

 logging.basicConfig(level=logging.INFO)
-logging.addLevelName(
+logging.addLevelName(
+    logging.CRITICAL, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.CRITICAL)
+)
 logging.addLevelName(logging.WARN, "\033[1;31m%s\033[1;0m" % logging.getLevelName(logging.WARN))

 THRESHOLD_WARN = 1
@@ -26,7 +28,7 @@ def display_logs_depending_process_time(prompt: str, duration: float) -> None:
 def is_url(file_path: str) -> bool:
     # could be more sophisticated if needed
     # using the URL detection test was considered but too broad (schema required to use requests)
-    return file_path.startswith(
+    return file_path.startswith("http")


 def prevent_nan(value: float) -> Optional[float]:
|