valediction-1.0.0-py3-none-any.whl → valediction-1.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/__init__.py +8 -8
- valediction/convenience.py +50 -50
- valediction/data_types/data_type_helpers.py +75 -75
- valediction/data_types/data_types.py +58 -58
- valediction/data_types/type_inference.py +541 -541
- valediction/datasets/datasets.py +870 -870
- valediction/datasets/datasets_helpers.py +46 -46
- valediction/demo/DEMOGRAPHICS.csv +101 -101
- valediction/demo/DIAGNOSES.csv +650 -650
- valediction/demo/LAB_TESTS.csv +1001 -1001
- valediction/demo/VITALS.csv +1001 -1001
- valediction/demo/__init__.py +6 -6
- valediction/demo/demo_dictionary.py +129 -129
- valediction/dictionary/exporting.py +501 -501
- valediction/dictionary/exporting_helpers.py +371 -371
- valediction/dictionary/generation.py +357 -357
- valediction/dictionary/helpers.py +174 -174
- valediction/dictionary/importing.py +494 -494
- valediction/dictionary/integrity.py +37 -37
- valediction/dictionary/model.py +582 -582
- valediction/exceptions.py +22 -22
- valediction/integrity.py +97 -97
- valediction/io/csv_readers.py +307 -307
- valediction/progress.py +206 -206
- valediction/support.py +72 -72
- valediction/validation/helpers.py +315 -315
- valediction/validation/issues.py +280 -280
- valediction/validation/validation.py +598 -598
- {valediction-1.0.0.dist-info → valediction-1.0.3.dist-info}/METADATA +1 -1
- valediction-1.0.3.dist-info/RECORD +38 -0
- {valediction-1.0.0.dist-info → valediction-1.0.3.dist-info}/WHEEL +1 -1
- valediction-1.0.0.dist-info/RECORD +0 -38
valediction/validation/helpers.py: the single hunk below removes and re-adds all 315 lines with identical text, so the reconstructed content is shown once.

@@ -1,315 +1,315 @@
from __future__ import annotations

import re
from typing import List

from numpy import flatnonzero, round
from pandas import NA, DataFrame, Series, to_datetime, to_numeric
from pandas.util import hash_pandas_object

from valediction.data_types.data_types import DataType
from valediction.dictionary.model import Table
from valediction.integrity import get_config
from valediction.validation.issues import Range

# Remove Nulls
def _set_nulls(df: DataFrame) -> DataFrame:
    null_values = get_config().null_values
    token_set = {str(t).strip().casefold() for t in null_values}
    columns = df.select_dtypes(include=["string", "object"]).columns
    for column in columns:
        series = df[column]
        mask = series.notna() & series.str.casefold().isin(token_set)
        df[column] = series.mask(mask, NA)

    return df

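# Illustrative sketch (editor's example, not part of the wheel): assuming the
# configured null_values include "NA" and "NULL". Matching is case-insensitive;
# tokens are stripped on the config side only, so padded data values survive:
#   df = DataFrame({"x": ["na", "NULL", "ok", None]}, dtype="string")
#   _set_nulls(df)["x"].tolist()  ->  [<NA>, <NA>, 'ok', <NA>]
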
# Check for Nulls
def _column_has_values(column: Series):
    return column.notna().any()

# Range Setting
def mask_to_ranges(mask: Series, start_row: int) -> list[Range]:
    """Convert a boolean mask (over the current chunk) into 0-based contiguous
    ranges."""
    idx = flatnonzero(mask.to_numpy())
    if idx.size == 0:
        return []
    ranges: List[Range] = []
    run_start = idx[0]
    prev = idx[0]
    for i in idx[1:]:
        if i == prev + 1:
            prev = i
            continue
        ranges.append(Range(start=start_row + run_start, end=start_row + prev))
        run_start = prev = i
    ranges.append(Range(start=start_row + run_start, end=start_row + prev))
    return ranges

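# Illustrative sketch (editor's example, not part of the wheel): contiguous True
# runs collapse into inclusive Range objects, offset by the chunk's start row:
#   mask_to_ranges(Series([True, True, False, True]), start_row=100)
#   ->  [Range(start=100, end=101), Range(start=103, end=103)]
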
# PK Hashes
def create_pk_hashes(
    df_primaries: DataFrame,
) -> Series:
    """For PK hash collision assessment, compute a deterministic 128-bit hash per row
    over the provided PK columns. This is created by computing two 64-bit hashes,
    forwards and backwards, and then combining them. Rows with any NA across PK
    components are returned as None, flagging these for NULL violations.

    Args:
        df_primaries (DataFrame): DataFrame restricted to the primary-key columns.

    Returns:
        Series: Pandas Series with hashes or Nulls.
    """
    hash_col_name = "PK_HASH"
    if df_primaries.empty or df_primaries.shape[1] == 0:
        return Series([], dtype=object, name=hash_col_name)

    # Any NA in row => invalid PK -> None
    null_rows = df_primaries.isna().any(axis=1)

    # First Hash
    hash_1 = hash_pandas_object(df_primaries, index=False)  # uint64

    # Second Hash (columns reversed if multi-column, else salted single column)
    if df_primaries.shape[1] > 1:
        df_primaries_backwards = df_primaries.iloc[:, ::-1]
    else:
        s = df_primaries.iloc[:, 0]
        salt = Series(["§"] * len(s), index=s.index, dtype="string")
        df_primaries_backwards = DataFrame(
            {
                "_a": s,
                "_b": s.str.cat(salt),
            }
        )

    hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64

    a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
    a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)

    combined = (a1 << 64) | a2
    hashes = Series(
        combined, index=df_primaries.index, name=hash_col_name, dtype=object
    )
    hashes[null_rows] = None
    return hashes

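# Illustrative note (editor's, not part of the wheel): each row yields two
# uint64 hashes, h1 over the columns forwards and h2 over the columns reversed
# (or the salted pair for a single column); combining them as the Python int
# (h1 << 64) | h2 gives 128 bits, which is why the Series must hold objects
# rather than a fixed-width integer dtype.
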
def compute_pk_masks(pk_hashes: Series, seen_hashes: set[int]) -> dict[str, Series]:
    """Compute masks for PK hashes that are either null or have been seen before.

    Args:
        pk_hashes (Series): Series of PK hashes.
        seen_hashes (set[int]): Set of hashes that have been seen before.

    Returns:
        dict[str, Series]: Dictionary of boolean masks:
            - null: rows where PK is None / NA
            - in_chunk_collision: rows that are part of a within-chunk duplicate group
            - cross_chunk_collision: rows whose hash was seen in previous chunks
              (excluding in_chunk_collision rows)
            - first_appearance: rows that are the first occurrence of a hash
    """

    s = pk_hashes
    null = s.isna()
    valid = ~null
    if not valid.any():
        # no valid hashes: null mask as-is, all-False defaults for the rest
        return {
            "null": null,
            "in_chunk_collision": valid,
            "cross_chunk_collision": valid,
            "first_appearance": valid,
        }

    s_valid = s[valid]

    # Within-chunk duplicate membership (mark *all* members)
    dup_local = s_valid.duplicated(keep=False)

    # Across-chunk duplicates (exclude those already in a local dup group)
    seen_local = s_valid.isin(seen_hashes)
    cross_local = seen_local & ~dup_local

    # New first occurrences in this chunk (first time we see the hash here,
    # and not seen before)
    first_local = ~s_valid.duplicated(keep="first")
    new_first_local = first_local & ~seen_local

    # Lift back to full-length masks
    in_chunk_collision = valid.copy()
    in_chunk_collision.loc[valid] = dup_local

    cross_chunk_collision = valid.copy()
    cross_chunk_collision.loc[valid] = cross_local

    first_appearance = valid.copy()
    first_appearance.loc[valid] = new_first_local

    return {
        "null": null,
        "in_chunk_collision": in_chunk_collision,
        "cross_chunk_collision": cross_chunk_collision,
        "first_appearance": first_appearance,
    }

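# Illustrative sketch (editor's example, not part of the wheel): for hashes
# [7, 7, 9, None] with seen_hashes={9}:
#   masks = compute_pk_masks(Series([7, 7, 9, None]), seen_hashes={9})
#   masks["null"]                   ->  [False, False, False, True]
#   masks["in_chunk_collision"]     ->  [True,  True,  False, False]
#   masks["cross_chunk_collision"]  ->  [False, False, True,  False]
#   masks["first_appearance"]       ->  [True,  False, False, False]
# Note the first 7 counts as both a within-chunk collision and a first appearance.
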
# PK Whitespace
def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
    if df_primaries.empty or df_primaries.shape[1] == 0:
        return Series(False, index=df_primaries.index)

    col_masks = df_primaries.apply(lambda s: s.str.contains(r"\s", na=False))

    return col_masks.any(axis=1)

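# Illustrative sketch (editor's example, not part of the wheel): r"\s" is a
# substring search, so whitespace anywhere in any PK component is flagged:
#   df = DataFrame({"id": ["a b", "ab", None]}, dtype="string")
#   pk_contains_whitespace_mask(df).tolist()  ->  [True, False, False]
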
# Data Type Checks Numeric
def invalid_mask_integer(column: Series, *, tolerance: float = 1e-12) -> Series:
    """True where a non-null value cannot be treated as an integer without losing a
    non-zero fractional remainder.

    Accepts scientific notation (e.g. '1e2').
    """
    notnull = column.notna()
    numeric = to_numeric(column, errors="coerce")
    invalid = notnull & numeric.isna()

    conversion_mask = notnull & numeric.notna()
    if conversion_mask.any():
        vals = numeric[conversion_mask].astype("float64")
        frac = (vals - round(vals)).abs()
        invalid_conv = frac > tolerance
        invalid = invalid.copy()
        invalid.loc[conversion_mask] = invalid_conv.values
    return invalid

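# Illustrative sketch (editor's example, not part of the wheel): values are
# flagged only when numeric parsing fails or the fractional part exceeds the
# tolerance; nulls are never flagged:
#   col = Series(["12", "12.0", "1e2", "12.5", "abc", None], dtype="string")
#   invalid_mask_integer(col).tolist()
#   ->  [False, False, False, True, True, False]
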
def invalid_mask_float(column: Series) -> Series:
    """True where a non-null value is not convertible to a number."""
    notnull = column.notna()
    num = to_numeric(column, errors="coerce")
    return notnull & num.isna()

# Data Type Checks Date
def _allowed_formats_for(dtype: DataType) -> list[str]:
    """Return the list of formats from Config.date_formats allowed for the given
    DataType."""
    config = get_config()
    return [fmt for fmt, data_type in config.date_formats.items() if data_type == dtype]


def _parse_ok_any(column: Series, formats: list[str]) -> Series:
    """Vectorised check: True for values that parse under at least one of `formats`."""
    if not formats:
        return Series(False, index=column.index)
    ok_any = Series(False, index=column.index)
    for fmt in formats:
        parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
        ok_any = ok_any | parsed.notna()
    return ok_any

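# Illustrative sketch (editor's example, not part of the wheel): per-format
# parses are ORed together, so one matching format is enough. Assuming both
# formats below are configured against DataType.DATE:
#   col = Series(["2024-01-31", "31/01/2024", "Jan 31"], dtype="string")
#   _parse_ok_any(col, ["%Y-%m-%d", "%d/%m/%Y"]).tolist()  ->  [True, True, False]
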
def invalid_mask_date(column: Series, fmt: str | None) -> Series:
    """True where a non-null value fails to parse as a date or carries a non-zero
    time component."""
    notnull = column.notna()

    if fmt:
        parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
        ok = parsed.notna()
        has_time = ok & (
            (parsed.dt.hour != 0)
            | (parsed.dt.minute != 0)
            | (parsed.dt.second != 0)
            | (parsed.dt.microsecond != 0)
        )
        return notnull & (~ok | has_time)

    allowed = _allowed_formats_for(DataType.DATE)
    ok_any = _parse_ok_any(column, allowed)
    return notnull & (~ok_any)


def invalid_mask_datetime(column: Series, fmt: str | None) -> Series:
    notnull = column.notna()

    if fmt:
        parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
        ok = parsed.notna()
        return notnull & (~ok)

    allowed = _allowed_formats_for(DataType.DATETIME)
    ok_any = _parse_ok_any(column, allowed)
    return notnull & (~ok_any)

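# Illustrative sketch (editor's example, not part of the wheel): with an
# explicit format, a DATE value is invalid if it fails to parse or carries a
# non-midnight time:
#   col = Series(["2024-01-31 00:00", "2024-01-31 09:30"], dtype="string")
#   invalid_mask_date(col, "%Y-%m-%d %H:%M").tolist()  ->  [False, True]
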
# Other Text Checks
def invalid_mask_text_too_long(column: Series, max_len: int | None) -> Series:
    if max_len is None or max_len <= 0:
        # treat as unlimited length
        return Series(False, index=column.index)

    notnull = column.notna()
    lens = column.str.len()
    return notnull & (lens > max_len)


def invalid_mask_text_forbidden_characters(column: Series) -> Series:
    forbidden = get_config().forbidden_characters
    if not forbidden:
        # nothing configured: nothing to flag
        return Series(False, index=column.index)

    pattern = "[" + re.escape("".join(forbidden)) + "]"
    notnull = column.notna()
    has_forbidden = column.str.contains(pattern, regex=True, na=False)
    return notnull & has_forbidden

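# Illustrative sketch (editor's example, not part of the wheel): the forbidden
# characters form one escaped character class. Assuming forbidden_characters
# is ["|", ";"]:
#   col = Series(["a|b", "ab", None], dtype="string")
#   invalid_mask_text_forbidden_characters(col).tolist()  ->  [True, False, False]
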
# Apply Data Types
def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
    # name -> column object
    column_dictionary = {column.name: column for column in table_dictionary}

    for col in df.columns:
        data_type = column_dictionary.get(col).data_type
        datetime_format = column_dictionary.get(col).datetime_format

        if data_type in (DataType.TEXT, DataType.FILE):
            df[col] = df[col].astype("string")

        elif data_type == DataType.INTEGER:
            # Accepts '12', '12.0', '1e2' etc.; validation guarantees integer-equivalent
            nums = to_numeric(df[col], errors="raise")
            df[col] = nums.round().astype("Int64")

        elif data_type == DataType.FLOAT:
            nums = to_numeric(df[col], errors="raise")
            df[col] = nums.astype("Float64")

        elif data_type == DataType.DATE:
            dtv = to_datetime(
                df[col], format=datetime_format, errors="raise", utc=False
            )
            df[col] = dtv.dt.normalize()  # midnight

        elif data_type == DataType.DATETIME:
            df[col] = to_datetime(
                df[col], format=datetime_format, errors="raise", utc=False
            )

        else:
            # Fallback: keep as string
            df[col] = df[col].astype("string")

    return df
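# Illustrative sketch (editor's example, not part of the wheel): errors="raise"
# is deliberate: apply_data_types runs after validation, so any value that
# survived the invalid_mask_* checks must convert cleanly. Assuming `table` is
# a Table whose dictionary entry for "AGE" has data_type=DataType.INTEGER:
#   df = DataFrame({"AGE": ["12", "12.0", "1e2"]}, dtype="string")
#   apply_data_types(df, table)["AGE"].tolist()  ->  [12, 12, 100]  (Int64)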