csv-detective 0.9.2.dev1896__py3-none-any.whl → 0.9.3.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +2 -2
- csv_detective/detect_fields/temp/date/__init__.py +1 -2
- csv_detective/detect_fields/temp/datetime_aware/__init__.py +7 -6
- csv_detective/detect_fields/temp/datetime_naive/__init__.py +4 -8
- csv_detective/detection/engine.py +1 -2
- csv_detective/detection/formats.py +14 -8
- csv_detective/detection/headers.py +2 -2
- csv_detective/explore_csv.py +11 -119
- csv_detective/load_tests.py +1 -2
- csv_detective/output/__init__.py +4 -5
- csv_detective/output/dataframe.py +1 -2
- csv_detective/output/example.py +12 -12
- csv_detective/output/schema.py +7 -86
- csv_detective/parsing/excel.py +2 -3
- csv_detective/parsing/load.py +3 -4
- csv_detective/utils.py +1 -2
- csv_detective/validate.py +4 -5
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA +18 -26
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD +33 -35
- tests/test_fields.py +37 -4
- tests/test_file.py +18 -0
- venv/bin/activate_this.py +1 -1
- csv_detective/s3_utils.py +0 -44
- venv/bin/jp.py +0 -54
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED
csv_detective/detect_fields/other/email/__init__.py
CHANGED

@@ -1,10 +1,10 @@
 import re

-PROPORTION =
+PROPORTION = 0.9


 def _is(val):
     """Detects e-mails"""
     return isinstance(val, str) and bool(
-        re.match(r"^[a-
+        re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
     )
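The new pattern keeps the detector strict about structure (local part, domain, TLD of at least two letters) but makes the match case-insensitive via re.IGNORECASE. A minimal standalone sketch of the same check (check_email is an illustrative helper, not part of the package), using values added to tests/test_fields.py further down:

```python
import re

# Same regex as the updated detector; IGNORECASE lets uppercase addresses through.
EMAIL_PATTERN = r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$"

def check_email(value) -> bool:
    # non-strings are rejected before the regex runs, as in the detector
    return isinstance(value, str) and bool(re.match(EMAIL_PATTERN, value, re.IGNORECASE))

assert check_email("cdo_intern@data.gouv.fr")
assert check_email("P.NOM@CIE.LONGDOMAIN")  # uppercase address, now accepted
assert not check_email("cdo@@gouv.sfd")     # double @ still rejected
```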
csv_detective/detect_fields/temp/date/__init__.py
CHANGED

@@ -1,6 +1,5 @@
 import re
 from datetime import datetime
-from typing import Optional

 from dateparser import parse as date_parser
 from dateutil.parser import ParserError
@@ -10,7 +9,7 @@ PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils


-def date_casting(val: str) ->
+def date_casting(val: str) -> datetime | None:
     """For performance reasons, we try first with dateutil and fallback on dateparser"""
     try:
         return dateutil_parser(val)
csv_detective/detect_fields/temp/datetime_aware/__init__.py
CHANGED

@@ -1,24 +1,25 @@
 import re
-from typing import Any
+from typing import Any

 from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting

 PROPORTION = 1
 threshold = 0.7

-# matches AAAA-MM-JJTHH:MM:SS(.dddddd)±HH:MM with any of the listed separators for the date OR NO SEPARATOR
+# matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR
 pat = (
     aaaammjj_pattern.replace("$", "")
-    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
+    + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
 )


-def _is(val:
+def _is(val: Any | None) -> bool:
     """Detects timezone-aware datetimes only"""
     # early stops, to cut processing time
-    #
+    # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
     # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
-    if not isinstance(val, str) or len(val) > 35 or len(val) <
+    if not isinstance(val, str) or len(val) > 35 or len(val) < 16:
         return False
     # if usual format, no need to parse
     if bool(re.match(pat, val)):
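The tail of the pattern now accepts either a numeric UTC offset or a literal Z, which is what lets values such as "2000-12-21 10:20:10.1Z" (added to the tests below) be detected as timezone-aware. A rough sketch of the behaviour, using a hypothetical stand-in for aaaammjj_pattern since that prefix is not shown in this diff:

```python
import re

# Hypothetical stand-in for aaaammjj_pattern (not shown in the diff): an ISO-like date
# with an optional separator that must repeat between year/month and month/day.
date_prefix = r"^(\d{4})([-/_ ]?)(0[1-9]|1[0-2])\2(0[1-9]|[12]\d|3[01])"

# Same time/offset tail as the new datetime_aware pattern: offset is now (±HH:MM | Z).
pat = (
    date_prefix
    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
    + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
)

assert re.match(pat, "2000-12-21 10:20:10.1Z")      # trailing 'Z' now matches
assert re.match(pat, "2025-08-20T14:30:00+02:00")   # numeric offsets still match
assert not re.match(pat, "2021-06-22 10:20:10")     # naive datetime: no offset, no match
```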
csv_detective/detect_fields/temp/datetime_naive/__init__.py
CHANGED

@@ -1,5 +1,5 @@
 import re
-from typing import Any
+from typing import Any

 from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting

@@ -9,11 +9,11 @@ threshold = 0.7
 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
 pat = (
     aaaammjj_pattern.replace("$", "")
-    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?$"
 )


-def _is(val:
+def _is(val: Any | None) -> bool:
     """Detects naive datetimes only"""
     # early stops, to cut processing time
     # 15 is the minimal length of a datetime format YYMMDDTHH:MM:SS
@@ -26,8 +26,4 @@ def _is(val: Optional[Any]) -> bool:
     if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)
-    return (
-        res is not None
-        and bool(res.hour or res.minute or res.second or res.microsecond)
-        and not bool(res.tzinfo)
-    )
+    return res is not None and not bool(res.tzinfo)
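Dropping the hour/minute/second/microsecond check means a parseable value now only has to be tz-naive to pass, so midnight timestamps such as "2030/06-22 00:00:00" (added to the True cases in the tests below) are accepted. A simplified sketch of that final condition using dateutil directly, without the package's regex pre-filtering or dateparser fallback:

```python
from dateutil import parser as dateutil_parser

def looks_naive(value: str) -> bool:
    # simplified: parse with dateutil only, then keep the new tz-only condition
    try:
        res = dateutil_parser.parse(value)
    except (ValueError, OverflowError):
        return False
    return res.tzinfo is None

assert looks_naive("2021-06-22 10:20:10")
assert looks_naive("2030-06-22 00:00:00")                   # midnight: accepted after this change
assert not looks_naive("2024-12-19T10:53:36.428000+00:00")  # timezone-aware
```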
csv_detective/detection/engine.py
CHANGED

@@ -1,5 +1,4 @@
 from time import time
-from typing import Optional

 import magic
 import requests
@@ -16,7 +15,7 @@ engine_to_file = {
 }


-def detect_engine(file_path: str, verbose=False) ->
+def detect_engine(file_path: str, verbose=False) -> str | None:
     if verbose:
         start = time()
     mapping = {
csv_detective/detection/formats.py
CHANGED

@@ -1,6 +1,5 @@
 import logging
 from collections import defaultdict
-from typing import Union

 import numpy as np
 import pandas as pd
@@ -22,7 +21,7 @@ def detect_formats(
     table: pd.DataFrame,
     analysis: dict,
     file_path: str,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,
@@ -30,7 +29,7 @@ def detect_formats(
     on_sample = len(table) > MAX_ROWS_ANALYSIS
     if on_sample:
         if verbose:
-            logging.warning(f"File is too long, analysing
+            logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
         table = build_sample(table)

     if table.empty:
@@ -183,13 +182,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
     samples = pd.concat(
         [
             # one row with the minimum of the column
-            table.loc[table[col] ==
+            table.loc[table[col] == val].iloc[[0]]
             for col in table.columns
+            if not pd.isna(val := table[col].dropna().min())
         ]
         + [
             # one row with the maximum of the column
-            table.loc[table[col] ==
+            table.loc[table[col] == val].iloc[[0]]
             for col in table.columns
+            if not pd.isna(val := table[col].dropna().max())
         ]
         + [
             # one row with a NaN value if the column has any
@@ -199,7 +200,12 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
         ],
         ignore_index=True,
     )
-    return
-
-
+    return (
+        pd.concat(
+            [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+            ignore_index=True,
+        )
+        # this is very unlikely but we never know
+        if len(samples) <= MAX_ROWS_ANALYSIS
+        else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
     )
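The walrus-operator guards added to build_sample skip any column whose minimum or maximum is NaN, i.e. columns containing no values at all, which is the case exercised by the new test_full_nan_column further down. A minimal standalone illustration of the guarded comprehension (the toy DataFrame is hypothetical):

```python
import pandas as pd

# A column that is entirely NaN: its dropna().min() is NaN, so the guard skips it
# instead of trying to select a row that does not exist.
table = pd.DataFrame({"only_nan": [float("nan")] * 3, "second_col": [3, 1, 2]})

min_rows = [
    table.loc[table[col] == val].iloc[[0]]
    for col in table.columns
    if not pd.isna(val := table[col].dropna().min())
]

assert len(min_rows) == 1                      # only "second_col" contributes a row
assert min_rows[0]["second_col"].iloc[0] == 1  # the row holding the column minimum
```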
csv_detective/detection/headers.py
CHANGED

@@ -1,11 +1,11 @@
 import logging
 from time import time
-from typing import
+from typing import TextIO

 from csv_detective.utils import display_logs_depending_process_time


-def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
csv_detective/explore_csv.py
CHANGED
@@ -1,16 +1,11 @@
-import json
 import logging
-import os
-import tempfile
 from time import time
-from typing import Optional, Union

 import pandas as pd

 from csv_detective.detection.formats import detect_formats
-from csv_detective.output import generate_output
+from csv_detective.output import generate_output
 from csv_detective.parsing.load import load_file
-from csv_detective.s3_utils import download_from_minio, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time, is_url
 from csv_detective.validate import validate

@@ -20,24 +15,24 @@ logging.basicConfig(level=logging.INFO)
 def routine(
     file_path: str,
     num_rows: int = 500,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
-    save_results:
-    encoding:
-    sep:
+    save_results: bool | str = True,
+    encoding: str | None = None,
+    sep: str | None = None,
     skipna: bool = True,
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-    sheet_name:
-) ->
-    """Returns a dict with information about the
+    sheet_name: str | int | None = None,
+) -> dict | tuple[dict, pd.DataFrame]:
+    """Returns a dict with information about the table and possible
     column contents, and if requested the DataFrame with columns cast according to analysis.

     Args:
-        file_path: local path to
+        file_path: local path or URL to file
         num_rows: number of rows to sample from the file for analysis ; -1 for analysis
             of the whole file
         user_input_tests: tests to run on the file
@@ -111,9 +106,9 @@ def validate_then_detect(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    user_input_tests:
+    user_input_tests: str | list[str] = "ALL",
     limited_output: bool = True,
-    save_results:
+    save_results: bool | str = True,
     skipna: bool = True,
     output_profile: bool = False,
     output_schema: bool = False,
@@ -173,106 +168,3 @@ def validate_then_detect(
     display_logs_depending_process_time(
         f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
     )
-
-
-def routine_minio(
-    csv_minio_location: dict[str, str],
-    output_minio_location: dict[str, str],
-    tableschema_minio_location: dict[str, str],
-    minio_user: str,
-    minio_pwd: str,
-    **kwargs,
-):
-    """Returns a dict with information about the csv table and possible
-    column contents.
-
-    Args:
-        csv_minio_location: dict with Minio URL, bucket and key of the CSV file
-        output_minio_location: Minio URL, bucket and key to store output file. None if
-            not uploading to Minio.
-        tableschema_minio_location: Minio URL, bucket and key to store tableschema file.
-            None if not uploading the tableschema to Minio.
-        minio_user: user name for the minio instance
-        minio_pwd: password for the minio instance
-        kwargs: arguments for routine
-
-    Returns:
-        dict: a dict with information about the csv and possible types for each column
-    """
-
-    if (
-        (
-            any(
-                [
-                    location_dict is not None
-                    for location_dict in [
-                        csv_minio_location,
-                        output_minio_location,
-                        tableschema_minio_location,
-                    ]
-                ]
-            )
-        )
-        and (minio_user is None)
-        or (minio_pwd is None)
-    ):
-        raise ValueError("Minio credentials are required if using Minio")
-
-    for location_dict in [
-        csv_minio_location,
-        output_minio_location,
-        tableschema_minio_location,
-    ]:
-        if location_dict is not None:
-            if any(
-                [
-                    (location_key not in location_dict) or (location_dict[location_key] is None)
-                    for location_key in ["netloc", "bucket", "key"]
-                ]
-            ):
-                raise ValueError("Minio location dict must contain url, bucket and key")
-
-    file_path = tempfile.NamedTemporaryFile(delete=False).name
-    download_from_minio(
-        netloc=csv_minio_location["netloc"],
-        bucket=csv_minio_location["bucket"],
-        key=csv_minio_location["key"],
-        filepath=file_path,
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    analysis = routine(
-        file_path,
-        save_results=True,
-        **kwargs,
-    )
-
-    # Write report JSON file.
-    output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
-    with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-        json.dump(analysis, fp, indent=4, separators=(",", ": "))
-
-    upload_to_minio(
-        netloc=output_minio_location["netloc"],
-        bucket=output_minio_location["bucket"],
-        key=output_minio_location["key"],
-        filepath=output_path_to_store_minio_file,
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    os.remove(output_path_to_store_minio_file)
-    os.remove(file_path)
-
-    generate_table_schema(
-        analysis_report=analysis,
-        save_file=True,
-        netloc=tableschema_minio_location["netloc"],
-        bucket=tableschema_minio_location["bucket"],
-        key=tableschema_minio_location["key"],
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    return analysis
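With routine_minio and the S3 helpers gone, routine() is called directly on a local path or URL. A hedged usage sketch with the new keyword defaults ("data.csv" is a placeholder path; the keyword values mirror the calls in the updated tests):

```python
from csv_detective import routine

# num_rows=-1 analyses the whole file; save_results can be False, True,
# or a string path where the JSON report should be written.
analysis = routine(
    file_path="data.csv",
    num_rows=-1,
    save_results=False,
    output_schema=True,  # embeds the generated table schema in the report
)
print(analysis["columns"])
```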
csv_detective/load_tests.py
CHANGED
@@ -1,5 +1,4 @@
 import os
-from typing import Union

 from csv_detective import detect_fields, detect_labels  # noqa

@@ -18,7 +17,7 @@ def get_all_packages(detect_type) -> list:


 def return_all_tests(
-    user_input_tests:
+    user_input_tests: str | list,
     detect_type: str,
 ) -> list:
     """
csv_detective/output/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 import json
 import os
-from typing import Optional, Union

 import pandas as pd

@@ -17,14 +16,14 @@ def generate_output(
     file_path: str,
     num_rows: int = 500,
     limited_output: bool = True,
-    save_results:
+    save_results: bool | str = True,
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-    sheet_name:
-) ->
+    sheet_name: str | int | None = None,
+) -> dict | tuple[dict, pd.DataFrame]:
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -51,7 +50,7 @@ def generate_output(
     )

     if output_schema:
-        analysis["schema"] = generate_table_schema(analysis,
+        analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)

     if output_df:
         return analysis, cast_df(
csv_detective/output/dataframe.py
CHANGED

@@ -1,7 +1,6 @@
 import json
 from datetime import date, datetime
 from time import time
-from typing import Optional, Union

 import pandas as pd

@@ -11,7 +10,7 @@ from csv_detective.detect_fields.temp.date import date_casting
 from csv_detective.utils import display_logs_depending_process_time


-def cast(value: str, _type: str) ->
+def cast(value: str, _type: str) -> str | float | bool | date | datetime | None:
     if not isinstance(value, str) or not value:
         # None is the current default value in hydra, should we keep this?
         return None
csv_detective/output/example.py
CHANGED
@@ -3,7 +3,7 @@ import random
 import string
 import uuid
 from datetime import datetime
-from typing import Any,
+from typing import Any, Type

 import pandas as pd
 import requests
@@ -14,10 +14,10 @@ fake = Faker()


 def create_example_csv_file(
-    fields:
-    schema_path:
+    fields: dict | None = None,
+    schema_path: str | None = None,
     file_length: int = 10,
-    output_name:
+    output_name: str | None = "example_file.csv",
     output_sep: str = ";",
     encoding: str = "utf-8",
     ignore_required: bool = False,
@@ -49,8 +49,8 @@ def create_example_csv_file(
     def _string(
         length: int = 10,
         required: bool = True,
-        pattern:
-        enum:
+        pattern: str | None = None,
+        enum: str | None = None,
     ) -> str:
         if potential_skip(required):
             return ""
@@ -70,7 +70,7 @@ def create_example_csv_file(
         return str(uuid.uuid4())

     def _date(
-        date_range:
+        date_range: list[str] | None = None,
         format: str = "%Y-%m-%d",
         required: bool = True,
     ) -> str:
@@ -99,7 +99,7 @@ def create_example_csv_file(
         return fake.time(format)

     def _datetime(
-        datetime_range:
+        datetime_range: list[str] | None = None,
         format: str = "%Y-%m-%d %H-%M-%S",
         required: bool = True,
     ) -> str:
@@ -123,11 +123,11 @@ def create_example_csv_file(
         return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"

     def _number(
-        num_type: Type[
-        num_range:
-        enum:
+        num_type: Type[int | float] = int,
+        num_range: list[float] | None = None,
+        enum: list | None = None,
         required: bool = True,
-    ) ->
+    ) -> int | float:
         assert num_range is None or len(num_range) == 2
         if potential_skip(required):
             return ""
csv_detective/output/schema.py
CHANGED
@@ -1,14 +1,8 @@
 import json
 import logging
-import os
-import tempfile
 from datetime import datetime
 from time import time
-from typing import Optional

-from botocore.exceptions import ClientError
-
-from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time


@@ -202,25 +196,14 @@ def get_constraints(format: str) -> dict:

 def generate_table_schema(
     analysis_report: dict,
-
-    netloc: Optional[str] = None,
-    bucket: Optional[str] = None,
-    key: Optional[str] = None,
-    minio_user: Optional[str] = None,
-    minio_pwd: Optional[str] = None,
+    save_results: bool | str = True,
     verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report

     Args:
         analysis_report (dict): The analysis report from csv_detective
-
-        netloc (str): The netloc of the minio instance to upload the tableschema
-        bucket (str): The bucket to save the schema in
-        key (str): The key to save the schema in (without extension as we will append
-            version number and extension)
-        minio_user (str): The minio user
-        minio_pwd (str): The minio password
+        save_results (bool or str): whether and where to save the results

     Returns:
     """
@@ -277,71 +260,9 @@ def generate_table_schema(
         f"Created schema in {round(time() - start, 3)}s", time() - start
     )

-    if
-
-
-
-        if not all([netloc, key, bucket, minio_user, minio_pwd]):
-            raise Exception(
-                "To save schema into minio, parameters : netloc, key, bucket, "
-                "minio_user, minio_pwd should be provided"
-            )
-
-        # Create bucket if does not exist
-        client = get_s3_client(netloc, minio_user, minio_pwd)
-        try:
-            client.head_bucket(Bucket=bucket)
-        except ClientError:
-            client.create_bucket(Bucket=bucket)
-
-        tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
-        if "Contents" in tableschema_objects:
-            tableschema_keys = [
-                tableschema["Key"]
-                for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
-                    "Contents"
-                ]
-            ]
-            tableschema_versions = [
-                os.path.splitext(tableschema_key)[0].split("_")[-1]
-                for tableschema_key in tableschema_keys
-            ]
-            latest_version = max(tableschema_versions)
+    if save_results:
+        output_path = save_results if isinstance(save_results, str) else "schema.json"
+        with open(output_path, "w", encoding="utf8") as fp:
+            json.dump(schema, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str)

-
-            with open(latest_schema_file.name, "w") as fp:
-                download_from_minio(
-                    netloc,
-                    bucket,
-                    f"{key}_{latest_version}.json",
-                    latest_schema_file.name,
-                    minio_user,
-                    minio_pwd,
-                )
-            # Check if files are different
-            with open(latest_schema_file.name, "r") as fp:
-                latest_schema = json.load(fp)
-            if latest_schema["fields"] != fields:
-                latest_version_split = latest_version.split(".")
-                new_version = (
-                    latest_version_split[0]
-                    + "."
-                    + latest_version_split[1]
-                    + "."
-                    + str(int(latest_version_split[2]) + 1)
-                )
-            else:
-                return None
-
-            schema["version"] = new_version
-
-            tableschema_file = tempfile.NamedTemporaryFile(delete=False)
-            with open(tableschema_file.name, "w") as fp:
-                json.dump(schema, fp, indent=4)
-
-            new_version_key = f"{key}_{new_version}.json"
-            upload_to_minio(
-                netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
-            )
-            os.unlink(tableschema_file.name)
-            return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
+    return schema
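After this change the schema is returned and, when save_results is truthy, written locally (to the given path, or schema.json by default) instead of being versioned and pushed to Minio. A hedged usage sketch, assuming generate_table_schema is importable from csv_detective.output.schema and that the report comes from routine() on a placeholder file:

```python
from csv_detective import routine
from csv_detective.output.schema import generate_table_schema  # assumed import path

report = routine(file_path="data.csv", save_results=False)  # "data.csv" is a placeholder

# save_results=False: just get the tableschema dict back
schema = generate_table_schema(report, save_results=False)

# save_results as a string: also dump the schema JSON to that path
generate_table_schema(report, save_results="my_table_schema.json")
```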
csv_detective/parsing/excel.py
CHANGED
@@ -1,6 +1,5 @@
 from io import BytesIO
 from time import time
-from typing import Optional

 import openpyxl
 import pandas as pd
@@ -23,8 +22,8 @@ XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
 def parse_excel(
     file_path: str,
     num_rows: int = -1,
-    engine:
-    sheet_name:
+    engine: str | None = None,
+    sheet_name: str | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py
CHANGED
@@ -1,5 +1,4 @@
 from io import BytesIO, StringIO
-from typing import Optional, Union

 import pandas as pd
 import requests
@@ -26,10 +25,10 @@ from .excel import (
 def load_file(
     file_path: str,
     num_rows: int = 500,
-    encoding:
-    sep:
+    encoding: str | None = None,
+    sep: str | None = None,
     verbose: bool = False,
-    sheet_name:
+    sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
     engine = None
csv_detective/utils.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional, Union

 import pandas as pd

@@ -31,7 +30,7 @@ def is_url(file_path: str) -> bool:
     return file_path.startswith("http")


-def cast_prevent_nan(value: float, _type: str) ->
+def cast_prevent_nan(value: float, _type: str) -> float | int | None:
     if _type not in {"int", "float"}:
         raise ValueError(f"Invalid type was passed: {_type}")
     return None if pd.isna(value) else eval(_type)(value)
csv_detective/validate.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional, Union

 import pandas as pd

@@ -22,12 +21,12 @@ def validate(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    encoding:
-    sep:
+    encoding: str | None = None,
+    sep: str | None = None,
     verbose: bool = False,
     skipna: bool = True,
-    sheet_name:
-) -> tuple[bool,
+    sheet_name: str | int | None = None,
+) -> tuple[bool, pd.DataFrame | None, dict | None]:
     """
     Verify is the given file has the same fields and types as in the previous analysis.
     """
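A pattern running through all of these hunks: typing.Optional and typing.Union imports are dropped in favour of PEP 604 unions (str | None, dict | tuple[dict, pd.DataFrame]), which is consistent with the Requires-Python: <3.14,>=3.10 line in the METADATA diff below. A minimal illustration (the helper is hypothetical, not from the package):

```python
# Requires Python >= 3.10: PEP 604 union syntax replaces typing.Optional / typing.Union
def sep_or_default(sep: str | None = None) -> str:
    """Hypothetical helper: fall back to ';' when no separator is given."""
    return sep if sep is not None else ";"

assert sep_or_default() == ";"
assert sep_or_default("\t") == "\t"
```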
{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA
CHANGED

@@ -1,15 +1,14 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.
+Version: 0.9.3.dev0
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
 Project-URL: Source, https://github.com/datagouv/csv_detective
 Keywords: CSV,data processing,encoding,guess,parser,tabular
-Requires-Python: <3.14,>=3.
+Requires-Python: <3.14,>=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: boto3<2,>=1.34.0
 Requires-Dist: dateparser<2,>=1.2.0
 Requires-Dist: faust-cchardet==2.1.19
 Requires-Dist: pandas<3,>=2.2.0
@@ -26,7 +25,6 @@ Requires-Dist: rstr==3.2.2
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
-Requires-Dist: bumpx>=0.3.10; extra == "dev"
 Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file

@@ -221,32 +219,26 @@ ruff check --fix .
 ruff format .
 ```

-
+### 🏷️ Release

-The release process uses `
+The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.

-```
-
-
-
-### Process
-
-1. `bumpx` will handle bumping the version according to your command (patch, minor, major)
-2. It will update the CHANGELOG according to the new version being published
-3. It will push a tag with the given version to github
-4. CircleCI will pickup this tag, build the package and publish it to pypi
-5. `bumpx` will have everything ready for the next version (version, changelog...)
+```bash
+# Create a new release
+./tag_version.sh <version>

-
+# Example
+./tag_version.sh 2.5.0

-
-
+# Dry run to see what would happen
+./tag_version.sh 2.5.0 --dry-run
 ```

-
-
-This will release a patch version:
+**Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.

-
-
-
+The script automatically:
+- Updates the version in pyproject.toml
+- Extracts commits since the last tag and formats them for CHANGELOG.md
+- Identifies breaking changes (commits with `!:` in the subject)
+- Creates a git tag and pushes it to the remote repository
+- Creates a GitHub release with the changelog content
{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD
CHANGED

@@ -1,10 +1,9 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=
-csv_detective/load_tests.py,sha256=
-csv_detective/
-csv_detective/
-csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
+csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
+csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
+csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
+csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
 csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,10 +16,10 @@ csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=9pR2tVS4J2Kryt
 csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=5vw4zjlmWaR2djxuQOUrmwsNIc9HgAE-zdxwerVR3S0,380
 csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=UsMEW1EVVgnw-daOc1jBkEaGKvqTONSAGnj1s3QgM8w,400
 csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=YsAGiblFexBxvu_E3XaXhy_bordc6c1oKPgDzTsDeXw,374
-csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=
-csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=
+csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=7ONo0MxrJY1gPWRwyPCX4ZDbCINmxnKRV85zscADxT8,435
+csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=lIgWebNapfrnPt0XeNUMs78Xa_csGNAtTk8VEk9wXXo,342
+csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=YXTWSymmcXW9eD2OfiSlmX7N-IUtZkDrNYHd6vTnJTc,439
+csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=7tZ8sgIkQ9zuSOZ-vGYBkH04Vv1xlPlJDM78xYfD57Y,342
 csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=85y-5qNRAWJrKqL0wh9iPMUBQjvPwc9lv1cYB2m0daQ,364
 csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=6mJRaGsCPBY5JHHe8EWxEjDpAOIfvBPTaZKJb3_n3gU,1077
 csv_detective/detect_fields/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,13 +49,13 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.
 csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=AnAridM4C8hcm4PeNdr8969czgrzM4KemGVZWAJSM1U,436
 csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
 csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
-csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=PI-wlTJmPk6nznzu_Fou_SSCET90wIf78mXwb1W1K70,325
 csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
-csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=B7YFfvEI48DfAn8xbc-vpVERQaKh9_59ERfieo2D6OY,328
 csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
-csv_detective/detect_fields/other/email/__init__.py,sha256=
+csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
 csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
@@ -67,9 +66,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
 csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=
+csv_detective/detect_fields/temp/date/__init__.py,sha256=j066luXADCti4Mbb-jvznrL1jf3p5TpEpVzW8vThRDE,2124
+csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=oDaZIhkW0SXSYeuK5R5TIzajvSmu-XjUn8GpqITFLnY,1250
+csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=z5wpuHiDl8j7ZeQjfZ5wO9lG6H9Ps6X218ANNw19Dag,1073
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
 csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
 csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -131,38 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
-csv_detective/detection/engine.py,sha256=
-csv_detective/detection/formats.py,sha256=
-csv_detective/detection/headers.py,sha256=
+csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
+csv_detective/detection/formats.py,sha256=QXdxdECU5uC_ytLBT_6-xe0VAiaMptXF4KYiShRUVCA,7702
+csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
-csv_detective/output/__init__.py,sha256=
-csv_detective/output/dataframe.py,sha256=
-csv_detective/output/example.py,sha256=
+csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
+csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
+csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
 csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
-csv_detective/output/schema.py,sha256=
+csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
-csv_detective/parsing/excel.py,sha256=
-csv_detective/parsing/load.py,sha256
+csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
+csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.
+csv_detective-0.9.3.dev0.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
+tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
 tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
-venv/bin/activate_this.py,sha256=
-venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
+venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
-csv_detective-0.9.
+csv_detective-0.9.3.dev0.dist-info/METADATA,sha256=Xga9fj8KjfrMOhp5ZIoXsJLcAI2Jz31yNsdfFJca2DU,9928
+csv_detective-0.9.3.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev0.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev0.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev0.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -79,6 +79,7 @@ from csv_detective.detection.variables import (
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.dataframe import cast
 from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col as col_test  # to prevent pytest from testing it


 def test_all_tests_return_bool():
@@ -284,7 +285,7 @@ fields = {
         False: ["nein", "ja", "2", "-0"],
     },
     email: {
-        True: ["cdo_intern@data.gouv.fr"],
+        True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
         False: ["cdo@@gouv.sfd"],
     },
     json: {
@@ -356,17 +357,25 @@ fields = {
         True: [
             "2021-06-22 10:20:10-04:00",
             "2030-06-22 00:00:00.0028+02:00",
+            "2000-12-21 10:20:10.1Z",
             "2024-12-19T10:53:36.428000+00:00",
             "1996/06/22 10:20:10 GMT",
         ],
         False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
     },
     datetime_naive: {
-        True: [
+        True: [
+            "2021-06-22 10:20:10",
+            "2030/06-22 00:00:00",
+            "2030/06/22 00:00:00.0028",
+        ],
         False: [
             "2021-06-22T30:20:10",
             "Sun, 06 Nov 1994 08:49:37 GMT",
             "2021-06-44 10:20:10+02:00",
+            "1999-12-01T00:00:00Z",
+            "2021-06-44",
+            "15 décembre 1985",
         ],
     },
     datetime_rfc822: {
@@ -451,8 +460,8 @@ def test_priority(args):
         ("28/01/2000", date),
         ("2025-08-20T14:30:00+02:00", datetime_aware),
         ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
-        ("1925_12_20T14:30:00.
-        ("1925 12 20 14:30:00Z",
+        ("1925_12_20T14:30:00.2763", datetime_naive),
+        ("1925 12 20 14:30:00Z", datetime_aware),
     ),
 )
 def test_early_detection(args):
@@ -461,3 +470,27 @@ def test_early_detection(args):
     res = module._is(value)
     assert res
     mock_func.assert_not_called()
+
+
+def test_all_proportion_1():
+    all_tests = return_all_tests("ALL", "detect_fields")
+    prop_1 = {
+        t.__name__.split(".")[-1]: eval(
+            t.__name__.split(".")[-1]
+            if t.__name__.split(".")[-1] not in ["int", "float"]
+            else "test_" + t.__name__.split(".")[-1]
+        )
+        for t in all_tests
+        if t.PROPORTION == 1
+    }
+    # building a table that uses only correct values for these formats, except on one row
+    table = pd.DataFrame(
+        {
+            test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"]
+            for test_name, test_module in prop_1.items()
+        }
+    )
+    # testing columns for all formats
+    returned_table = col_test(table, all_tests, limited_output=True)
+    # the analysis should have found no match on any format
+    assert all(returned_table[col].sum() == 0 for col in table.columns)
tests/test_file.py
CHANGED
@@ -6,6 +6,7 @@ import responses

 from csv_detective import routine
 from csv_detective.output.profile import create_profile
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS


 @pytest.mark.parametrize(
@@ -343,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
         save_results=False,
     )
     assert analysis["columns"][col_name]["format"] == "int"
+
+
+def test_full_nan_column(mocked_responses):
+    # we want a file that needs sampling
+    expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
+    mocked_responses.get(
+        "http://example.com/test.csv",
+        body=expected_content,
+        status=200,
+    )
+    # just testing it doesn't fail
+    routine(
+        file_path="http://example.com/test.csv",
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+    )
venv/bin/activate_this.py
CHANGED
@@ -29,7 +29,7 @@ os.environ["VIRTUAL_ENV_PROMPT"] = '' or os.path.basename(base)

 # add the virtual environments libraries to the host python import mechanism
 prev_length = len(sys.path)
-for lib in '../lib/python3.
+for lib in '../lib/python3.11/site-packages'.split(os.pathsep):
     path = os.path.realpath(os.path.join(bin_dir, lib))
     site.addsitedir(path.decode("utf-8") if '' else path)
 sys.path[:] = sys.path[prev_length:] + sys.path[0:prev_length]
csv_detective/s3_utils.py
DELETED
@@ -1,44 +0,0 @@
-import logging
-
-import boto3
-from botocore.client import Config
-from botocore.exceptions import ClientError
-
-
-def get_minio_url(netloc: str, bucket: str, key: str) -> str:
-    """Returns location of given resource in minio once it is saved"""
-    return netloc + "/" + bucket + "/" + key
-
-
-def get_s3_client(url: str, minio_user: str, minio_pwd: str) -> boto3.client:
-    return boto3.client(
-        "s3",
-        endpoint_url=url,
-        aws_access_key_id=minio_user,
-        aws_secret_access_key=minio_pwd,
-        config=Config(signature_version="s3v4"),
-    )
-
-
-def download_from_minio(
-    netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
-) -> None:
-    logging.info("Downloading from minio")
-    s3 = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        s3.download_file(bucket, key, filepath)
-        logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
-    except ClientError as e:
-        logging.error(e)
-
-
-def upload_to_minio(
-    netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
-) -> None:
-    logging.info("Saving to minio")
-    s3 = get_s3_client(netloc, minio_user, minio_pwd)
-    try:
-        s3.upload_file(filepath, bucket, key)
-        logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
-    except ClientError as e:
-        logging.error(e)
venv/bin/jp.py
DELETED
@@ -1,54 +0,0 @@
-#!/home/circleci/project/venv/bin/python
-
-import sys
-import json
-import argparse
-from pprint import pformat
-
-import jmespath
-from jmespath import exceptions
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('expression')
-    parser.add_argument('-f', '--filename',
-                        help=('The filename containing the input data. '
-                              'If a filename is not given then data is '
-                              'read from stdin.'))
-    parser.add_argument('--ast', action='store_true',
-                        help=('Pretty print the AST, do not search the data.'))
-    args = parser.parse_args()
-    expression = args.expression
-    if args.ast:
-        # Only print the AST
-        expression = jmespath.compile(args.expression)
-        sys.stdout.write(pformat(expression.parsed))
-        sys.stdout.write('\n')
-        return 0
-    if args.filename:
-        with open(args.filename, 'r') as f:
-            data = json.load(f)
-    else:
-        data = sys.stdin.read()
-        data = json.loads(data)
-    try:
-        sys.stdout.write(json.dumps(
-            jmespath.search(expression, data), indent=4, ensure_ascii=False))
-        sys.stdout.write('\n')
-    except exceptions.ArityError as e:
-        sys.stderr.write("invalid-arity: %s\n" % e)
-        return 1
-    except exceptions.JMESPathTypeError as e:
-        sys.stderr.write("invalid-type: %s\n" % e)
-        return 1
-    except exceptions.UnknownFunctionError as e:
-        sys.stderr.write("unknown-function: %s\n" % e)
-        return 1
-    except exceptions.ParseError as e:
-        sys.stderr.write("syntax-error: %s\n" % e)
-        return 1
-
-
-if __name__ == '__main__':
-    sys.exit(main())
{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL
RENAMED
File without changes

{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt
RENAMED
File without changes

{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE
RENAMED
File without changes

{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt
RENAMED
File without changes