csv-detective 0.9.3.dev2447__py3-none-any.whl → 0.9.3.dev2473__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csv_detective/parsing/columns.py CHANGED
@@ -151,7 +151,7 @@ def test_col_chunks(
     remaining_tests_per_col = build_remaining_tests_per_col(return_table)
 
     # hashing rows to get nb_duplicates
-    row_hashes_count = table.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    row_hashes_count = pd.util.hash_pandas_object(table, index=False).value_counts()
     # getting values for profile to read the file only once
     col_values = {col: table[col].value_counts(dropna=False) for col in table.columns}
 
@@ -189,7 +189,7 @@ def test_col_chunks(
         batch = pd.concat(batch, ignore_index=True)
         analysis["total_lines"] += len(batch)
         row_hashes_count = row_hashes_count.add(
-            batch.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            pd.util.hash_pandas_object(batch, index=False).value_counts(),
            fill_value=0,
        )
        for col in batch.columns:
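The two hunks above swap a per-row Python lambda, `table.apply(lambda row: hash(tuple(row)), axis=1)`, for pandas' vectorised row hasher. A minimal sketch of the resulting chunked duplicate count, written for illustration only (the function name and the `chunks` iterable are assumptions, not identifiers from the package):

    import pandas as pd

    def count_duplicate_rows(chunks):
        # `chunks` is any iterable of DataFrames, e.g. pd.read_csv(path, chunksize=50_000)
        row_hashes_count = None
        for chunk in chunks:
            # one uint64 hash per row, computed column-wise without a Python-level loop
            counts = pd.util.hash_pandas_object(chunk, index=False).value_counts()
            row_hashes_count = (
                counts if row_hashes_count is None
                else row_hashes_count.add(counts, fill_value=0)
            )
        if row_hashes_count is None:
            return 0
        # every occurrence of a hash beyond the first counts as a duplicate row
        return int((row_hashes_count - 1).sum())

The point of the change is that the hashing now stays in vectorised pandas/NumPy code instead of building a Python tuple for every row.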
csv_detective/parsing/csv.py CHANGED
@@ -42,7 +42,7 @@ def parse_csv(
         logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
         total_lines, nb_duplicates = None, None
     else:
-        nb_duplicates = len(table.loc[table.duplicated()])
+        nb_duplicates = table.duplicated().sum()
     if num_rows > 0:
         num_rows = min(num_rows, total_lines or len(table))
         table = table.sample(num_rows, random_state=random_state)
csv_detective/parsing/excel.py CHANGED
@@ -124,7 +124,7 @@ def parse_excel(
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)
-    nb_duplicates = len(table.loc[table.duplicated()])
+    nb_duplicates = table.duplicated().sum()
     if num_rows > 0:
         num_rows = min(num_rows - 1, total_lines)
         table = table.sample(num_rows, random_state=random_state)
@@ -155,7 +155,7 @@ def parse_excel(
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)
-    nb_duplicates = len(table.loc[table.duplicated()])
+    nb_duplicates = table.duplicated().sum()
     if num_rows > 0:
         num_rows = min(num_rows - 1, total_lines)
         table = table.sample(num_rows, random_state=random_state)
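Both parse_csv and parse_excel above switch from `len(table.loc[table.duplicated()])` to `table.duplicated().sum()`. The two expressions count the same thing, rows that repeat an earlier row, but the new form sums a boolean mask instead of materialising an intermediate filtered DataFrame. A tiny check on made-up data (the frame below is mine, not from the package's tests):

    import pandas as pd

    table = pd.DataFrame({"a": [1, 1, 2, 2, 2], "b": ["x", "x", "y", "y", "z"]})
    mask = table.duplicated()          # True for every repeat of an earlier row
    assert len(table.loc[mask]) == mask.sum() == 2
    nb_duplicates = int(mask.sum())    # the form used after this change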
csv_detective/validate.py CHANGED
@@ -82,7 +82,7 @@ def validate(
     )
 
     # hashing rows to get nb_duplicates
-    row_hashes_count = first_chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    row_hashes_count = pd.util.hash_pandas_object(first_chunk, index=False).value_counts()
     # getting values for profile to read the file only once
     col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
     analysis["total_lines"] = 0
@@ -91,7 +91,7 @@ def validate(
         logging.info(f"> Testing chunk number {idx}")
         analysis["total_lines"] += len(chunk)
         row_hashes_count = row_hashes_count.add(
-            chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            pd.util.hash_pandas_object(chunk, index=False).value_counts(),
            fill_value=0,
        )
        for col in chunk.columns:
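validate accumulates `row_hashes_count` across chunks with `.add(..., fill_value=0)`, so the change only works if a given row hashes to the same value no matter which chunk it arrives in. Passing `index=False` gives exactly that, since the hash then depends on cell values only. A small self-contained check (the example frame is mine):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 1], "b": ["x", "y", "x"]})
    whole = pd.util.hash_pandas_object(df, index=False)
    parts = pd.concat([
        pd.util.hash_pandas_object(df.iloc[:2], index=False),  # "first chunk"
        pd.util.hash_pandas_object(df.iloc[2:], index=False),  # "later chunk"
    ])
    assert (whole.values == parts.values).all()  # same rows hash identically across chunks
    assert whole.iloc[0] == whole.iloc[2]        # identical rows share one hash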
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: csv-detective
-Version: 0.9.3.dev2447
+Version: 0.9.3.dev2473
 Summary: Detect tabular files column content
 Keywords: CSV,data processing,encoding,guess,parser,tabular
 Author: data.gouv.fr
@@ -78,15 +78,15 @@
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=Eo8GUec5ykTuDTR15OC0S_hiMkcpuZnTPCPomW80aSQ,9244
+csv_detective/parsing/columns.py,sha256=WwivsR4r-SAkugzVSmYeUkgbNXz3CWXnEl2ZmoX_tcs,9238
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=0T0gpaXzwJo-sq41IoLQD704GiMUYeDVVASVbat-zWg,1726
-csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
+csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
+csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
 csv_detective/parsing/load.py,sha256=f-8aKiNpy_47qg4Lq-UZUR4NNrbJ_-KEGvcUQZ8cmb0,4317
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=XldlbGkUlPaIh0y4z9iaWlmmahwCrD1900s5Cxlq5wI,5430
-csv_detective-0.9.3.dev2447.dist-info/WHEEL,sha256=z-mOpxbJHqy3cq6SvUThBZdaLGFZzdZPtgWLcP2NKjQ,79
-csv_detective-0.9.3.dev2447.dist-info/entry_points.txt,sha256=1J86TQNCanjsLMboAufdEUla03qEQaC9QmVGYgt2FCQ,57
-csv_detective-0.9.3.dev2447.dist-info/METADATA,sha256=Pl4Yw1e2r6GcmmTp405LNmslBeHfsYAHNpc1kvGYz14,11063
-csv_detective-0.9.3.dev2447.dist-info/RECORD,,
+csv_detective/validate.py,sha256=CjZXhhDP-n6wGgEqbwrGRqebU8L5bidwnvQp-TbnvFA,5424
+csv_detective-0.9.3.dev2473.dist-info/WHEEL,sha256=z-mOpxbJHqy3cq6SvUThBZdaLGFZzdZPtgWLcP2NKjQ,79
+csv_detective-0.9.3.dev2473.dist-info/entry_points.txt,sha256=1J86TQNCanjsLMboAufdEUla03qEQaC9QmVGYgt2FCQ,57
+csv_detective-0.9.3.dev2473.dist-info/METADATA,sha256=qf-8rc4HEOh9ZdHD07eoWRcvFxcMrUYprp0hU73rzJ0,11063
+csv_detective-0.9.3.dev2473.dist-info/RECORD,,