csv-detective 0.9.2.dev1874__py3-none-any.whl → 0.9.3.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +2 -2
- csv_detective/detect_fields/temp/date/__init__.py +1 -2
- csv_detective/detect_fields/temp/datetime_aware/__init__.py +7 -6
- csv_detective/detect_fields/temp/datetime_naive/__init__.py +4 -8
- csv_detective/detection/engine.py +1 -2
- csv_detective/detection/formats.py +14 -8
- csv_detective/detection/headers.py +2 -2
- csv_detective/explore_csv.py +11 -119
- csv_detective/load_tests.py +1 -2
- csv_detective/output/__init__.py +11 -14
- csv_detective/output/dataframe.py +1 -2
- csv_detective/output/example.py +12 -12
- csv_detective/output/profile.py +13 -10
- csv_detective/output/schema.py +7 -86
- csv_detective/parsing/excel.py +2 -3
- csv_detective/parsing/load.py +3 -4
- csv_detective/utils.py +4 -3
- csv_detective/validate.py +4 -5
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA +18 -26
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD +34 -36
- tests/test_fields.py +37 -4
- tests/test_file.py +68 -0
- venv/bin/activate_this.py +1 -1
- csv_detective/s3_utils.py +0 -44
- venv/bin/jp.py +0 -54
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt +0 -0
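Many of the smaller per-module edits below share one theme: typing.Optional and typing.Union imports are dropped and annotations are written as PEP 604 unions. An illustrative fragment (the names here are made up for the example, they are not lines from this diff):

    from typing import Optional, Union

    def detect(sep: Optional[str] = None, tests: Union[str, list[str]] = "ALL") -> Optional[dict]: ...

    # equivalent PEP 604 spelling, as used throughout 0.9.3.dev0 (no typing import needed):
    def detect(sep: str | None = None, tests: str | list[str] = "ALL") -> dict | None: ...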
csv_detective/__init__.py
CHANGED

csv_detective/detect_fields/other/email/__init__.py
CHANGED

@@ -1,10 +1,10 @@
 import re
 
-PROPORTION =
+PROPORTION = 0.9
 
 
 def _is(val):
     """Detects e-mails"""
     return isinstance(val, str) and bool(
-        re.match(r"^[a-
+        re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
     )
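For reference, the e-mail detector as reassembled from the added lines of the hunk above:

    import re

    PROPORTION = 0.9


    def _is(val):
        """Detects e-mails"""
        return isinstance(val, str) and bool(
            re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
        )


    # _is("User@Example.ORG") -> True ; _is("not-an-email") -> False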
csv_detective/detect_fields/temp/date/__init__.py
CHANGED

@@ -1,6 +1,5 @@
 import re
 from datetime import datetime
-from typing import Optional
 
 from dateparser import parse as date_parser
 from dateutil.parser import ParserError

@@ -10,7 +9,7 @@ PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils
 
 
-def date_casting(val: str) ->
+def date_casting(val: str) -> datetime | None:
     """For performance reasons, we try first with dateutil and fallback on dateparser"""
     try:
         return dateutil_parser(val)
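The docstring above describes a two-step parse: dateutil first, dateparser as fallback. A minimal self-contained sketch of that pattern (the exact exception handling in temp/date is not visible in this hunk, so the except clause here is an assumption):

    from datetime import datetime

    from dateparser import parse as date_parser
    from dateutil.parser import ParserError, parse as dateutil_parser


    def date_casting(val: str) -> datetime | None:
        """Try dateutil first (fast), fall back on dateparser (slower, more permissive)."""
        try:
            return dateutil_parser(val)
        except (ParserError, OverflowError):  # assumed failure modes, not taken from the diff
            return date_parser(val)


    # date_casting("2024-06-01") -> datetime(2024, 6, 1, 0, 0)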
csv_detective/detect_fields/temp/datetime_aware/__init__.py
CHANGED

@@ -1,24 +1,25 @@
 import re
-from typing import Any
+from typing import Any
 
 from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
 
 PROPORTION = 1
 threshold = 0.7
 
-# matches AAAA-MM-JJTHH:MM:SS(.dddddd)±HH:MM with any of the listed separators for the date OR NO SEPARATOR
+# matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR
 pat = (
     aaaammjj_pattern.replace("$", "")
-    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
+    + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
 )
 
 
-def _is(val:
+def _is(val: Any | None) -> bool:
     """Detects timezone-aware datetimes only"""
     # early stops, to cut processing time
-    #
+    # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
     # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
-    if not isinstance(val, str) or len(val) > 35 or len(val) <
+    if not isinstance(val, str) or len(val) > 35 or len(val) < 16:
         return False
     # if usual format, no need to parse
     if bool(re.match(pat, val)):
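The new pattern accepts either a numeric UTC offset or a literal Z. A quick way to see the effect, using an illustrative stand-in for aaaammjj_pattern (which is defined in temp/date and not shown in this diff):

    import re

    # stand-in for aaaammjj_pattern, an assumption made for the sake of the example
    date_part = r"^(\d{4})[-/\.]?(0[1-9]|1[0-2])[-/\.]?(0[1-9]|[12]\d|3[01])"
    pat = (
        date_part
        + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
        + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
    )

    for value in ("2024-06-01T12:30:00+02:00", "2024-06-01T12:30:00Z", "2024-06-01T12:30:00"):
        print(value, bool(re.match(pat, value)))
    # the offset-less value does not match here: it is the naive detector's job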
csv_detective/detect_fields/temp/datetime_naive/__init__.py
CHANGED

@@ -1,5 +1,5 @@
 import re
-from typing import Any
+from typing import Any
 
 from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
 

@@ -9,11 +9,11 @@ threshold = 0.7
 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
 pat = (
     aaaammjj_pattern.replace("$", "")
-    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?$"
 )
 
 
-def _is(val:
+def _is(val: Any | None) -> bool:
     """Detects naive datetimes only"""
     # early stops, to cut processing time
     # 15 is the minimal length of a datetime format YYMMDDTHH:MM:SS

@@ -26,8 +26,4 @@ def _is(val: Optional[Any]) -> bool:
     if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)
-    return (
-        res is not None
-        and bool(res.hour or res.minute or res.second or res.microsecond)
-        and not bool(res.tzinfo)
-    )
+    return res is not None and not bool(res.tzinfo)
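The last hunk relaxes the final check: a parsed value no longer needs a non-zero time component, it only needs to parse and carry no tzinfo. Isolating just that condition (the real _is also applies the length and character-ratio early exits shown above):

    from dateutil import parser

    res = parser.parse("2024-06-01 00:00:00")
    # old rule: also required bool(res.hour or res.minute or res.second or res.microsecond), False here
    # new rule: parseable and naive is enough
    print(res is not None and not bool(res.tzinfo))  # True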
csv_detective/detection/engine.py
CHANGED

@@ -1,5 +1,4 @@
 from time import time
-from typing import Optional
 
 import magic
 import requests

@@ -16,7 +15,7 @@ engine_to_file = {
 }
 
 
-def detect_engine(file_path: str, verbose=False) ->
+def detect_engine(file_path: str, verbose=False) -> str | None:
     if verbose:
         start = time()
     mapping = {
csv_detective/detection/formats.py
CHANGED

@@ -1,6 +1,5 @@
 import logging
 from collections import defaultdict
-from typing import Union
 
 import numpy as np
 import pandas as pd

@@ -22,7 +21,7 @@ def detect_formats(
     table: pd.DataFrame,
     analysis: dict,
     file_path: str,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,

@@ -30,7 +29,7 @@ def detect_formats(
     on_sample = len(table) > MAX_ROWS_ANALYSIS
     if on_sample:
         if verbose:
-            logging.warning(f"File is too long, analysing
+            logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
         table = build_sample(table)
 
     if table.empty:

@@ -183,13 +182,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
     samples = pd.concat(
         [
             # one row with the minimum of the column
-            table.loc[table[col] ==
+            table.loc[table[col] == val].iloc[[0]]
             for col in table.columns
+            if not pd.isna(val := table[col].dropna().min())
         ]
         + [
             # one row with the maximum of the column
-            table.loc[table[col] ==
+            table.loc[table[col] == val].iloc[[0]]
             for col in table.columns
+            if not pd.isna(val := table[col].dropna().max())
         ]
         + [
             # one row with a NaN value if the column has any

@@ -199,7 +200,12 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
         ],
         ignore_index=True,
     )
-    return
-
-
+    return (
+        pd.concat(
+            [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+            ignore_index=True,
+        )
+        # this is very unlikely but we never know
+        if len(samples) <= MAX_ROWS_ANALYSIS
+        else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
     )
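Condensed, the reworked build_sample logic above looks roughly like this. A sketch: MAX_ROWS_ANALYSIS is given an illustrative value, and the real function also keeps one row per column containing a NaN, which is omitted here.

    import pandas as pd

    MAX_ROWS_ANALYSIS = 5  # illustrative value; the real constant lives elsewhere in csv_detective


    def build_sample_sketch(table: pd.DataFrame) -> pd.DataFrame:
        # one row carrying each column's min and one carrying its max, skipping all-NaN columns
        samples = pd.concat(
            [
                table.loc[table[col] == val].iloc[[0]]
                for col in table.columns
                if not pd.isna(val := table[col].dropna().min())
            ]
            + [
                table.loc[table[col] == val].iloc[[0]]
                for col in table.columns
                if not pd.isna(val := table[col].dropna().max())
            ],
            ignore_index=True,
        )
        if len(samples) <= MAX_ROWS_ANALYSIS:
            # pad with random rows; the caller only samples tables longer than MAX_ROWS_ANALYSIS
            return pd.concat(
                [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
                ignore_index=True,
            )
        return samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)


    df = pd.DataFrame({"a": [5, 1, 3, 2, 9, 7, 4], "b": list("qwertyu")})
    print(build_sample_sketch(df))  # min/max rows of each column, padded to 5 rows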
csv_detective/detection/headers.py
CHANGED

@@ -1,11 +1,11 @@
 import logging
 from time import time
-from typing import
+from typing import TextIO
 
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
csv_detective/explore_csv.py
CHANGED

@@ -1,16 +1,11 @@
-import json
 import logging
-import os
-import tempfile
 from time import time
-from typing import Optional, Union
 
 import pandas as pd
 
 from csv_detective.detection.formats import detect_formats
-from csv_detective.output import generate_output
+from csv_detective.output import generate_output
 from csv_detective.parsing.load import load_file
-from csv_detective.s3_utils import download_from_minio, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time, is_url
 from csv_detective.validate import validate
 

@@ -20,24 +15,24 @@ logging.basicConfig(level=logging.INFO)
 def routine(
     file_path: str,
     num_rows: int = 500,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
-    save_results:
-    encoding:
-    sep:
+    save_results: bool | str = True,
+    encoding: str | None = None,
+    sep: str | None = None,
     skipna: bool = True,
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-    sheet_name:
-) ->
-    """Returns a dict with information about the
+    sheet_name: str | int | None = None,
+) -> dict | tuple[dict, pd.DataFrame]:
+    """Returns a dict with information about the table and possible
     column contents, and if requested the DataFrame with columns cast according to analysis.
 
     Args:
-        file_path: local path to
+        file_path: local path or URL to file
        num_rows: number of rows to sample from the file for analysis ; -1 for analysis
            of the whole file
        user_input_tests: tests to run on the file

@@ -111,9 +106,9 @@ def validate_then_detect(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
-    save_results:
+    save_results: bool | str = True,
     skipna: bool = True,
     output_profile: bool = False,
     output_schema: bool = False,
@@ -173,106 +168,3 @@ def validate_then_detect(
         display_logs_depending_process_time(
             f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
         )
-
-
-def routine_minio(
-    csv_minio_location: dict[str, str],
-    output_minio_location: dict[str, str],
-    tableschema_minio_location: dict[str, str],
-    minio_user: str,
-    minio_pwd: str,
-    **kwargs,
-):
-    """Returns a dict with information about the csv table and possible
-    column contents.
-
-    Args:
-        csv_minio_location: dict with Minio URL, bucket and key of the CSV file
-        output_minio_location: Minio URL, bucket and key to store output file. None if
-            not uploading to Minio.
-        tableschema_minio_location: Minio URL, bucket and key to store tableschema file.
-            None if not uploading the tableschema to Minio.
-        minio_user: user name for the minio instance
-        minio_pwd: password for the minio instance
-        kwargs: arguments for routine
-
-    Returns:
-        dict: a dict with information about the csv and possible types for each column
-    """
-
-    if (
-        (
-            any(
-                [
-                    location_dict is not None
-                    for location_dict in [
-                        csv_minio_location,
-                        output_minio_location,
-                        tableschema_minio_location,
-                    ]
-                ]
-            )
-        )
-        and (minio_user is None)
-        or (minio_pwd is None)
-    ):
-        raise ValueError("Minio credentials are required if using Minio")
-
-    for location_dict in [
-        csv_minio_location,
-        output_minio_location,
-        tableschema_minio_location,
-    ]:
-        if location_dict is not None:
-            if any(
-                [
-                    (location_key not in location_dict) or (location_dict[location_key] is None)
-                    for location_key in ["netloc", "bucket", "key"]
-                ]
-            ):
-                raise ValueError("Minio location dict must contain url, bucket and key")
-
-    file_path = tempfile.NamedTemporaryFile(delete=False).name
-    download_from_minio(
-        netloc=csv_minio_location["netloc"],
-        bucket=csv_minio_location["bucket"],
-        key=csv_minio_location["key"],
-        filepath=file_path,
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    analysis = routine(
-        file_path,
-        save_results=True,
-        **kwargs,
-    )
-
-    # Write report JSON file.
-    output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
-    with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-        json.dump(analysis, fp, indent=4, separators=(",", ": "))
-
-    upload_to_minio(
-        netloc=output_minio_location["netloc"],
-        bucket=output_minio_location["bucket"],
-        key=output_minio_location["key"],
-        filepath=output_path_to_store_minio_file,
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    os.remove(output_path_to_store_minio_file)
-    os.remove(file_path)
-
-    generate_table_schema(
-        analysis_report=analysis,
-        save_file=True,
-        netloc=tableschema_minio_location["netloc"],
-        bucket=tableschema_minio_location["bucket"],
-        key=tableschema_minio_location["key"],
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    return analysis
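With routine_minio removed, calling the library goes through routine. A usage sketch based on the updated signature above: the file name is a placeholder, and the top-level import assumes routine is still re-exported by the package (otherwise import it from csv_detective.explore_csv).

    from csv_detective import routine

    analysis, df = routine(
        "data.csv",            # placeholder path; URLs are also accepted per the updated docstring
        num_rows=-1,           # analyse the whole file (required when building a profile)
        save_results=False,
        output_profile=True,   # adds analysis["profile"]
        output_schema=True,    # adds analysis["schema"]
        output_df=True,        # also return the DataFrame cast according to the detected types
    )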
csv_detective/load_tests.py
CHANGED

@@ -1,5 +1,4 @@
 import os
-from typing import Union
 
 from csv_detective import detect_fields, detect_labels  # noqa
 

@@ -18,7 +17,7 @@ def get_all_packages(detect_type) -> list:
 
 
 def return_all_tests(
-    user_input_tests:
+    user_input_tests: str | list,
     detect_type: str,
 ) -> list:
     """
csv_detective/output/__init__.py
CHANGED

@@ -1,6 +1,5 @@
 import json
 import os
-from typing import Optional, Union
 
 import pandas as pd
 

@@ -17,22 +16,15 @@ def generate_output(
     file_path: str,
     num_rows: int = 500,
     limited_output: bool = True,
-    save_results:
+    save_results: bool | str = True,
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-    sheet_name:
-) ->
-    if output_profile
-        # to create the profile we have to cast columns, so using the dedicated function
-        table = cast_df(
-            df=table,
-            columns=analysis["columns"],
-            cast_json=cast_json,
-            verbose=verbose,
-        )
+    sheet_name: str | int | None = None,
+) -> dict | tuple[dict, pd.DataFrame]:
+    if output_profile:
         analysis["profile"] = create_profile(
             table=table,
             columns=analysis["columns"],

@@ -58,8 +50,13 @@ def generate_output(
         )
 
     if output_schema:
-        analysis["schema"] = generate_table_schema(analysis,
+        analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
 
     if output_df:
-        return analysis,
+        return analysis, cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
     return analysis
csv_detective/output/dataframe.py
CHANGED

@@ -1,7 +1,6 @@
 import json
 from datetime import date, datetime
 from time import time
-from typing import Optional, Union
 
 import pandas as pd
 

@@ -11,7 +10,7 @@ from csv_detective.detect_fields.temp.date import date_casting
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def cast(value: str, _type: str) ->
+def cast(value: str, _type: str) -> str | float | bool | date | datetime | None:
     if not isinstance(value, str) or not value:
         # None is the current default value in hydra, should we keep this?
         return None
csv_detective/output/example.py
CHANGED

@@ -3,7 +3,7 @@ import random
 import string
 import uuid
 from datetime import datetime
-from typing import Any,
+from typing import Any, Type
 
 import pandas as pd
 import requests

@@ -14,10 +14,10 @@ fake = Faker()
 
 
 def create_example_csv_file(
-    fields:
-    schema_path:
+    fields: dict | None = None,
+    schema_path: str | None = None,
     file_length: int = 10,
-    output_name:
+    output_name: str | None = "example_file.csv",
     output_sep: str = ";",
     encoding: str = "utf-8",
     ignore_required: bool = False,

@@ -49,8 +49,8 @@ create_example_csv_file(
     def _string(
         length: int = 10,
         required: bool = True,
-        pattern:
-        enum:
+        pattern: str | None = None,
+        enum: str | None = None,
     ) -> str:
         if potential_skip(required):
             return ""

@@ -70,7 +70,7 @@ create_example_csv_file(
         return str(uuid.uuid4())
 
     def _date(
-        date_range:
+        date_range: list[str] | None = None,
         format: str = "%Y-%m-%d",
         required: bool = True,
     ) -> str:

@@ -99,7 +99,7 @@ create_example_csv_file(
         return fake.time(format)
 
     def _datetime(
-        datetime_range:
+        datetime_range: list[str] | None = None,
         format: str = "%Y-%m-%d %H-%M-%S",
         required: bool = True,
     ) -> str:

@@ -123,11 +123,11 @@ create_example_csv_file(
         return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"
 
     def _number(
-        num_type: Type[
-        num_range:
-        enum:
+        num_type: Type[int | float] = int,
+        num_range: list[float] | None = None,
+        enum: list | None = None,
         required: bool = True,
-    ) ->
+    ) -> int | float:
         assert num_range is None or len(num_range) == 2
         if potential_skip(required):
             return ""
csv_detective/output/profile.py
CHANGED

@@ -4,7 +4,8 @@ from time import time
 
 import pandas as pd
 
-from csv_detective.
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time
 
 
 def create_profile(

@@ -18,11 +19,6 @@ create_profile(
     if verbose:
         start = time()
         logging.info("Creating profile")
-    map_python_types = {
-        "string": str,
-        "int": float,
-        "float": float,
-    }
 
     if num_rows > 0:
         raise ValueError("To create profiles num_rows has to be set to -1")

@@ -35,12 +31,19 @@ create_profile(
     for c in table.columns:
         # for numerical formats we want min, max, mean, std
         if columns[c]["python_type"] in ["float", "int"]:
+            # we locally cast the column to perform the operations, using the same method as in cast_df
+            cast_col = (
+                table[c].astype(pd.Int64Dtype())
+                if columns[c]["python_type"] == "int"
+                else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
+            )
             profile[c].update(
-                min=
-                max=
-                mean=
-                std=
+                min=cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
+                max=cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
+                mean=cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
+                std=cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
            )
+            del cast_col
         # for all formats we want most frequent values, nb unique values and nb missing values
         tops_bruts = (
             table.loc[table[c].notna(), c]