csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.4.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/explore_csv.py +2 -2
- csv_detective/format.py +2 -0
- csv_detective/parsing/columns.py +3 -3
- csv_detective/parsing/csv.py +1 -1
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.4.dev2.dist-info}/METADATA +1 -1
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.4.dev2.dist-info}/RECORD +10 -10
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.4.dev2.dist-info}/WHEEL +0 -0
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.4.dev2.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.4.dev2.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.4.dev2.dist-info}/top_level.txt +0 -0
csv_detective/explore_csv.py
CHANGED
|
@@ -36,7 +36,7 @@ def routine(
|
|
|
36
36
|
file_path: local path or URL to file
|
|
37
37
|
num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
|
|
38
38
|
tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
|
|
39
|
-
limited_output: whether or not to return all possible
|
|
39
|
+
limited_output: whether or not to return all possible formats or only the most likely one for each column
|
|
40
40
|
save_results: whether or not to save the results in a json file, or the path where to dump the output
|
|
41
41
|
output_profile: whether or not to add the 'profile' field to the output
|
|
42
42
|
output_schema: whether or not to add the 'schema' field to the output (tableschema)
|
|
@@ -47,7 +47,7 @@ def routine(
|
|
|
47
47
|
skipna: whether to keep NaN (empty cells) for tests
|
|
48
48
|
|
|
49
49
|
Returns:
|
|
50
|
-
dict: a dict with information about the csv and possible
|
|
50
|
+
dict: a dict with information about the csv and possible formats for each column
|
|
51
51
|
"""
|
|
52
52
|
|
|
53
53
|
if not (
|
csv_detective/format.py
CHANGED
|
@@ -25,6 +25,8 @@ class Format:
|
|
|
25
25
|
labels: the dict of hint headers and their credibility for the header score (NB: credibility is relative within a single format, should be used to rank the valid labels)
|
|
26
26
|
proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
|
|
27
27
|
tags: to allow users to submit a file to only a subset of formats
|
|
28
|
+
mandatory_label: whether the format can only be considered if the column passes both field and label tests
|
|
29
|
+
python_type: the python type related to the format (less specific, used for downstream casting)
|
|
28
30
|
"""
|
|
29
31
|
self.name: str = name
|
|
30
32
|
self.func: Callable[[Any], bool] = func
|
csv_detective/parsing/columns.py
CHANGED
|
@@ -94,7 +94,7 @@ def test_col(
|
|
|
94
94
|
)
|
|
95
95
|
if verbose:
|
|
96
96
|
display_logs_depending_process_time(
|
|
97
|
-
f'\t> Done with
|
|
97
|
+
f'\t> Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
|
|
98
98
|
time() - start_type,
|
|
99
99
|
)
|
|
100
100
|
if verbose:
|
|
@@ -109,7 +109,7 @@ def test_label(
|
|
|
109
109
|
):
|
|
110
110
|
if verbose:
|
|
111
111
|
start = time()
|
|
112
|
-
logging.info("Testing labels to get
|
|
112
|
+
logging.info("Testing labels to get formats")
|
|
113
113
|
|
|
114
114
|
return_table = pd.DataFrame(columns=columns)
|
|
115
115
|
for idx, (label, format) in enumerate(formats.items()):
|
|
@@ -118,7 +118,7 @@ def test_label(
|
|
|
118
118
|
return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns]
|
|
119
119
|
if verbose:
|
|
120
120
|
display_logs_depending_process_time(
|
|
121
|
-
f'\t- Done with
|
|
121
|
+
f'\t- Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
|
|
122
122
|
time() - start_type,
|
|
123
123
|
)
|
|
124
124
|
if verbose:
|
csv_detective/parsing/csv.py
CHANGED
|
@@ -39,7 +39,7 @@ def parse_csv(
|
|
|
39
39
|
# branch between small and big files starts here
|
|
40
40
|
if total_lines == CHUNK_SIZE:
|
|
41
41
|
if verbose:
|
|
42
|
-
logging.warning(f"File is too long,
|
|
42
|
+
logging.warning(f"File is too long, loading in chunks of {CHUNK_SIZE} rows")
|
|
43
43
|
total_lines, nb_duplicates = None, None
|
|
44
44
|
else:
|
|
45
45
|
nb_duplicates = table.duplicated().sum()
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
|
|
2
2
|
csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
|
|
3
|
-
csv_detective/explore_csv.py,sha256=
|
|
4
|
-
csv_detective/format.py,sha256=
|
|
3
|
+
csv_detective/explore_csv.py,sha256=YjkQihSm1vgZbEfXHxJ-_bVJrtCUT3Ut8x8FX60ZK3k,7643
|
|
4
|
+
csv_detective/format.py,sha256=vYz4h-WDUZ3pZIfxUc1toXRbUiNnWqHHiUXBbZtnaUw,3140
|
|
5
5
|
csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
|
|
6
6
|
csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
|
|
7
7
|
csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -80,13 +80,13 @@ csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XN
|
|
|
80
80
|
csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
|
|
81
81
|
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
82
82
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
83
|
-
csv_detective/parsing/columns.py,sha256=
|
|
83
|
+
csv_detective/parsing/columns.py,sha256=ExabNAzQB-IvQfjmVLClBfj1GMkbAQxUUnournFpdy8,9350
|
|
84
84
|
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
85
|
-
csv_detective/parsing/csv.py,sha256=
|
|
85
|
+
csv_detective/parsing/csv.py,sha256=vfAHkpgzLkzeUXWVqrWlttZ1y-Hql0GNFSZZrA366mk,1714
|
|
86
86
|
csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
|
|
87
87
|
csv_detective/parsing/load.py,sha256=1Fk43ikIOJwtWJUY-e8oNeNOk4MMtpmZV7s-VbQBS1k,4345
|
|
88
88
|
csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
|
|
89
|
-
csv_detective-0.10.4.
|
|
89
|
+
csv_detective-0.10.4.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
90
90
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
91
91
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
92
92
|
tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
|
|
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
|
|
|
104
104
|
tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
|
|
105
105
|
tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
|
|
106
106
|
tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
|
|
107
|
-
csv_detective-0.10.4.
|
|
108
|
-
csv_detective-0.10.4.
|
|
109
|
-
csv_detective-0.10.4.
|
|
110
|
-
csv_detective-0.10.4.
|
|
111
|
-
csv_detective-0.10.4.
|
|
107
|
+
csv_detective-0.10.4.dev2.dist-info/METADATA,sha256=52ZirEUs9m49EuivGtEmtr_p2h_wVsSRbyQTs0SfoWE,10925
|
|
108
|
+
csv_detective-0.10.4.dev2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
109
|
+
csv_detective-0.10.4.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
110
|
+
csv_detective-0.10.4.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
|
|
111
|
+
csv_detective-0.10.4.dev2.dist-info/RECORD,,
|
|
File without changes
|
{csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.4.dev2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.4.dev2.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|