csv-detective 0.9.3.dev2447__py3-none-any.whl → 0.9.3.dev2473__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/parsing/columns.py +2 -2
- csv_detective/parsing/csv.py +1 -1
- csv_detective/parsing/excel.py +2 -2
- csv_detective/validate.py +2 -2
- {csv_detective-0.9.3.dev2447.dist-info → csv_detective-0.9.3.dev2473.dist-info}/METADATA +1 -1
- {csv_detective-0.9.3.dev2447.dist-info → csv_detective-0.9.3.dev2473.dist-info}/RECORD +8 -8
- {csv_detective-0.9.3.dev2447.dist-info → csv_detective-0.9.3.dev2473.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2447.dist-info → csv_detective-0.9.3.dev2473.dist-info}/entry_points.txt +0 -0
csv_detective/parsing/columns.py
CHANGED
@@ -151,7 +151,7 @@ def test_col_chunks(
     remaining_tests_per_col = build_remaining_tests_per_col(return_table)

     # hashing rows to get nb_duplicates
-    row_hashes_count =
+    row_hashes_count = pd.util.hash_pandas_object(table, index=False).value_counts()
     # getting values for profile to read the file only once
     col_values = {col: table[col].value_counts(dropna=False) for col in table.columns}

@@ -189,7 +189,7 @@ def test_col_chunks(
     batch = pd.concat(batch, ignore_index=True)
     analysis["total_lines"] += len(batch)
     row_hashes_count = row_hashes_count.add(
-
+        pd.util.hash_pandas_object(batch, index=False).value_counts(),
         fill_value=0,
     )
     for col in batch.columns:
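Both hunks fill in the chunked duplicate-counting logic: hash the first table's rows, then fold each later batch's hash counts into the running total. A minimal sketch of that pattern (not csv_detective's actual code; the function name and the final derivation of the duplicate count are illustrative):

    import pandas as pd

    def count_duplicates_in_chunks(chunks):
        # chunks: any iterable of DataFrames sharing the same columns
        row_hashes_count = None
        for chunk in chunks:
            # index=False hashes row values only, so identical rows in
            # different chunks (with different index labels) collide
            counts = pd.util.hash_pandas_object(chunk, index=False).value_counts()
            row_hashes_count = (
                counts
                if row_hashes_count is None
                else row_hashes_count.add(counts, fill_value=0)
            )
        # every occurrence of a hash beyond the first is a duplicate
        return int((row_hashes_count - 1).sum())

    df = pd.DataFrame({"a": [1, 2, 1, 2], "b": ["x", "y", "x", "y"]})
    assert count_duplicates_in_chunks([df.iloc[:2], df.iloc[2:]]) == 2
    assert df.duplicated().sum() == 2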
csv_detective/parsing/csv.py
CHANGED
@@ -42,7 +42,7 @@ def parse_csv(
         logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
         total_lines, nb_duplicates = None, None
     else:
-        nb_duplicates =
+        nb_duplicates = table.duplicated().sum()
     if num_rows > 0:
         num_rows = min(num_rows, total_lines or len(table))
         table = table.sample(num_rows, random_state=random_state)
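For tables small enough to load at once, the new line counts duplicates directly instead of hashing. A quick illustration of what `.duplicated().sum()` returns (standard pandas behavior, nothing specific to csv_detective):

    import pandas as pd

    table = pd.DataFrame({"a": [1, 1, 1], "b": ["x", "x", "x"]})
    # duplicated() flags every occurrence after the first, so .sum()
    # counts the extra rows: three identical rows -> two duplicates
    print(table.duplicated().sum())  # 2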
csv_detective/parsing/excel.py
CHANGED
@@ -124,7 +124,7 @@ def parse_excel(
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)
-    nb_duplicates =
+    nb_duplicates = table.duplicated().sum()
     if num_rows > 0:
         num_rows = min(num_rows - 1, total_lines)
         table = table.sample(num_rows, random_state=random_state)

@@ -155,7 +155,7 @@ def parse_excel(
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)
-    nb_duplicates =
+    nb_duplicates = table.duplicated().sum()
     if num_rows > 0:
         num_rows = min(num_rows - 1, total_lines)
         table = table.sample(num_rows, random_state=random_state)
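Both Excel hunks make the same fix as the CSV parser, just ahead of the existing sampling guard. A sketch of how that guard behaves (illustrative values; only the clamp and sample lines mirror the diff):

    import pandas as pd

    table = pd.DataFrame({"a": range(5)})
    total_lines = len(table)
    num_rows = 10  # a caller may request more rows than the sheet holds
    # clamp as parse_excel does; sample() raises if n exceeds len(table)
    num_rows = min(num_rows - 1, total_lines)
    # random_state keeps the sample reproducible across runs
    table = table.sample(num_rows, random_state=42)
    print(len(table))  # 5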
csv_detective/validate.py
CHANGED
@@ -82,7 +82,7 @@ def validate(
     )

     # hashing rows to get nb_duplicates
-    row_hashes_count =
+    row_hashes_count = pd.util.hash_pandas_object(first_chunk, index=False).value_counts()
     # getting values for profile to read the file only once
     col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
     analysis["total_lines"] = 0

@@ -91,7 +91,7 @@ def validate(
     logging.info(f"> Testing chunk number {idx}")
     analysis["total_lines"] += len(chunk)
     row_hashes_count = row_hashes_count.add(
-
+        pd.util.hash_pandas_object(chunk, index=False).value_counts(),
         fill_value=0,
     )
     for col in chunk.columns:
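The `index=False` argument, present in every new hash_pandas_object call, is what makes the hashes usable for duplicate detection across chunks. An illustration of the difference (plain pandas, default integer index assumed):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1], "b": ["x", "x"]})
    # the default index=True mixes the index into each row hash, so two
    # identical rows at different positions hash differently
    print(pd.util.hash_pandas_object(df).value_counts().max())               # 1
    # index=False hashes values only: the duplicate rows collide as intended
    print(pd.util.hash_pandas_object(df, index=False).value_counts().max())  # 2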
{csv_detective-0.9.3.dev2447.dist-info → csv_detective-0.9.3.dev2473.dist-info}/RECORD
CHANGED
@@ -78,15 +78,15 @@ csv_detective/output/profile.py,sha256=VUQp0VJ22dfY4R5TybTpuQW_TOX_rLEp98cOzu-Jf
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=
+csv_detective/parsing/columns.py,sha256=WwivsR4r-SAkugzVSmYeUkgbNXz3CWXnEl2ZmoX_tcs,9238
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=
-csv_detective/parsing/excel.py,sha256=
+csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
+csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
 csv_detective/parsing/load.py,sha256=f-8aKiNpy_47qg4Lq-UZUR4NNrbJ_-KEGvcUQZ8cmb0,4317
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=
-csv_detective-0.9.3.
-csv_detective-0.9.3.
-csv_detective-0.9.3.
-csv_detective-0.9.3.
+csv_detective/validate.py,sha256=CjZXhhDP-n6wGgEqbwrGRqebU8L5bidwnvQp-TbnvFA,5424
+csv_detective-0.9.3.dev2473.dist-info/WHEEL,sha256=z-mOpxbJHqy3cq6SvUThBZdaLGFZzdZPtgWLcP2NKjQ,79
+csv_detective-0.9.3.dev2473.dist-info/entry_points.txt,sha256=1J86TQNCanjsLMboAufdEUla03qEQaC9QmVGYgt2FCQ,57
+csv_detective-0.9.3.dev2473.dist-info/METADATA,sha256=qf-8rc4HEOh9ZdHD07eoWRcvFxcMrUYprp0hU73rzJ0,11063
+csv_detective-0.9.3.dev2473.dist-info/RECORD,,
{csv_detective-0.9.3.dev2447.dist-info → csv_detective-0.9.3.dev2473.dist-info}/WHEEL
RENAMED
File without changes
{csv_detective-0.9.3.dev2447.dist-info → csv_detective-0.9.3.dev2473.dist-info}/entry_points.txt
RENAMED
File without changes