valediction-1.0.0-py3-none-any.whl → valediction-1.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,315 +1,315 @@
- from __future__ import annotations
-
- import re
- from typing import List
-
- from numpy import flatnonzero, round
- from pandas import NA, DataFrame, Series, to_datetime, to_numeric
- from pandas.util import hash_pandas_object
-
- from valediction.data_types.data_types import DataType
- from valediction.dictionary.model import Table
- from valediction.integrity import get_config
- from valediction.validation.issues import Range
-
-
- # Remove Nulls
- def _set_nulls(df: DataFrame) -> DataFrame:
-     null_values = get_config().null_values
-     token_set = {str(t).strip().casefold() for t in null_values}
-     columns = df.select_dtypes(include=["string", "object"]).columns
-     for column in columns:
-         series = df[column]
-         mask = series.notna() & series.str.casefold().isin(token_set)
-         df[column] = series.mask(mask, NA)
-
-     return df
-
-
- # Check for Nulls
- def _column_has_values(column: Series):
-     return column.notna().any()
-
-
- # Range Setting
- def mask_to_ranges(mask: Series, start_row: int) -> list[Range]:
-     """Convert a boolean mask (over the current chunk) into 0-based contiguous
-     ranges."""
-     idx = flatnonzero(mask.to_numpy())
-     if idx.size == 0:
-         return []
-     ranges: List[Range] = []
-     run_start = idx[0]
-     prev = idx[0]
-     for i in idx[1:]:
-         if i == prev + 1:
-             prev = i
-             continue
-         ranges.append(Range(start=start_row + run_start, end=start_row + prev))
-         run_start = prev = i
-     ranges.append(Range(start=start_row + run_start, end=start_row + prev))
-     return ranges
-
-
- # PK Hashes
- def create_pk_hashes(
-     df_primaries: DataFrame,
- ) -> Series:
-     """For PK hash collision assessment, compute a deterministic 128-bit hash per row
-     over the provided PK columns. This is created by computing two 64-bit hashes,
-     forwards and backwards, and then combining them. Rows with any NA across PK
-     components are returned as None, flagging them for NULL violations.
-
-     Args:
-         df_primaries (DataFrame): DataFrame restricted to the primary-key columns.
-
-     Returns:
-         Series: Pandas Series with hashes or Nulls.
-     """
-     hash_col_name = "PK_HASH"
-     if df_primaries.empty or df_primaries.shape[1] == 0:
-         return Series([], dtype=object, name=hash_col_name)
-
-     # Any NA in row => invalid PK -> None
-     null_rows = df_primaries.isna().any(axis=1)
-
-     # First hash
-     hash_1 = hash_pandas_object(df_primaries, index=False)  # uint64
-
-     # Second hash (columns reversed if multiple PK columns, else salted)
-     if df_primaries.shape[1] > 1:
-         df_primaries_backwards = df_primaries.iloc[:, ::-1]
-     else:
-         s = df_primaries.iloc[:, 0]
-         salt = Series(["§"] * len(s), index=s.index, dtype="string")
-         df_primaries_backwards = DataFrame(
-             {
-                 "_a": s,
-                 "_b": s.str.cat(salt),
-             }
-         )
-
-     hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64
-
-     a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
-     a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
-
-     combined = (a1 << 64) | a2
-     hashes = Series(
-         combined, index=df_primaries.index, name=hash_col_name, dtype=object
-     )
-     hashes[null_rows] = None
-     return hashes
-
-
- def compute_pk_masks(pk_hashes: Series, seen_hashes: set[int]) -> dict[str, Series]:
-     """Compute masks for PK hashes that are either null or have been seen before.
-
-     Args:
-         pk_hashes (Series): Series of PK hashes.
-         seen_hashes (set[int]): Set of hashes that have been seen before.
-
-     Returns:
-         dict[str, Series]: Dictionary of boolean masks:
-             - null: rows where PK is None / NA
-             - in_chunk_collision: rows that are part of a within-chunk duplicate group
-             - cross_chunk_collision: rows whose hash was seen in previous chunks (excluding within-chunk duplicates)
-             - first_appearance: rows that are the first occurrence of a hash
-     """
-
-     s = pk_hashes
-     null = s.isna()
-     valid = ~null
-     if not valid.any():
-         # empty/default masks
-         return {
-             "null": null,
-             "in_chunk_collision": null,
-             "cross_chunk_collision": null,
-             "first_appearance": null,
-         }
-
-     s_valid = s[valid]
-
-     # Within-chunk duplicate membership (mark *all* members)
-     dup_local = s_valid.duplicated(keep=False)
-
-     # Across-chunk duplicates (exclude those already in a local dup group)
-     seen_local = s_valid.isin(seen_hashes)
-     cross_local = seen_local & ~dup_local
-
-     # New first occurrences in this chunk (first time we see the hash here, and not seen before)
-     first_local = ~s_valid.duplicated(keep="first")
-     new_first_local = first_local & ~seen_local
-
-     # Lift back to full-length masks
-     in_chunk_collision = valid.copy()
-     in_chunk_collision.loc[valid] = dup_local
-
-     cross_chunk_collision = valid.copy()
-     cross_chunk_collision.loc[valid] = cross_local
-
-     first_appearance = valid.copy()
-     first_appearance.loc[valid] = new_first_local
-
-     return {
-         "null": null,
-         "in_chunk_collision": in_chunk_collision,
-         "cross_chunk_collision": cross_chunk_collision,
-         "first_appearance": first_appearance,
-     }
-
-
-
- # PK Whitespace
- def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
-     if df_primaries.empty or df_primaries.shape[1] == 0:
-         return Series(False, index=df_primaries.index)
-
-     col_masks = df_primaries.apply(lambda s: s.str.contains(r"\s", na=False))
-
-     return col_masks.any(axis=1)
-
-
- # Data Type Checks: Numeric
- def invalid_mask_integer(column: Series, *, tolerance: float = 1e-12) -> Series:
-     """True where a non-null value cannot be treated as an integer without a
-     non-zero remainder.
-
-     Accepts scientific notation (e.g. '1e2').
-     """
-     notnull = column.notna()
-     numeric = to_numeric(column, errors="coerce")
-     invalid = notnull & numeric.isna()
-
-     conversion_mask = notnull & numeric.notna()
-     if conversion_mask.any():
-         vals = numeric[conversion_mask].astype("float64")
-         frac = (vals - round(vals)).abs()
-         invalid_conv = frac > tolerance
-         invalid = invalid.copy()
-         invalid.loc[conversion_mask] = invalid_conv.values
-     return invalid
-
-
- def invalid_mask_float(column: Series) -> Series:
-     """True where a non-null value is not convertible to a number."""
-     notnull = column.notna()
-     num = to_numeric(column, errors="coerce")
-     return notnull & num.isna()
-
-
- # Data Type Checks: Date
- def _allowed_formats_for(dtype: DataType) -> list[str]:
-     """Return the list of formats from Config.date_formats allowed for the given
-     DataType."""
-     config = get_config()
-     return [fmt for fmt, data_type in config.date_formats.items() if data_type == dtype]
-
-
- def _parse_ok_any(column: Series, formats: list[str]) -> Series:
-     """Vectorised check: True for values that parse under at least one of `formats`."""
-     if not formats:
-         return Series(False, index=column.index)
-     ok_any = Series(False, index=column.index)
-     for fmt in formats:
-         parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
-         ok_any = ok_any | parsed.notna()
-     return ok_any
-
-
- def invalid_mask_date(column: Series, fmt: str | None) -> Series:
-     """True where a non-null value does not parse as a date, or parses with a
-     non-zero time component."""
-     notnull = column.notna()
-
-     if fmt:
-         parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
-         ok = parsed.notna()
-         has_time = ok & (
-             (parsed.dt.hour != 0)
-             | (parsed.dt.minute != 0)
-             | (parsed.dt.second != 0)
-             | (parsed.dt.microsecond != 0)
-         )
-         return notnull & (~ok | has_time)
-
-     allowed = _allowed_formats_for(DataType.DATE)
-     ok_any = _parse_ok_any(column, allowed)
-     return notnull & (~ok_any)
-
-
- def invalid_mask_datetime(column: Series, fmt: str | None) -> Series:
-     notnull = column.notna()
-
-     if fmt:
-         parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
-         ok = parsed.notna()
-         return notnull & (~ok)
-
-     allowed = _allowed_formats_for(DataType.DATETIME)
-     ok_any = _parse_ok_any(column, allowed)
-     return notnull & (~ok_any)
-
-
- # Other Text Checks
- def invalid_mask_text_too_long(column: Series, max_len: int | None) -> Series:
-     if max_len is None or max_len <= 0:
-         # treat as unlimited length
-         return Series(False, index=column.index)
-
-     notnull = column.notna()
-     lens = column.str.len()
-     return notnull & (lens > max_len)
-
-
- def invalid_mask_text_forbidden_characters(column: Series) -> Series:
-     forbidden = get_config().forbidden_characters
-     if not forbidden:
-         return column.notna() & False
-
-     pattern = "[" + re.escape("".join(forbidden)) + "]"
-     notnull = column.notna()
-     has_forbidden = column.str.contains(pattern, regex=True, na=False)
-     return notnull & has_forbidden
-
-
- # Apply Data Types #
- def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
-     # name -> column object
-     column_dictionary = {column.name: column for column in table_dictionary}
-
-     for col in df.columns:
-         data_type = column_dictionary.get(col).data_type
-         datetime_format = column_dictionary.get(col).datetime_format
-
-         if data_type in (DataType.TEXT, DataType.FILE):
-             df[col] = df[col].astype("string")
-
-         elif data_type == DataType.INTEGER:
-             # Accepts '12', '12.0', '1e2' etc.; validation guarantees integer-equivalent
-             nums = to_numeric(df[col], errors="raise")
-             df[col] = nums.round().astype("Int64")
-
-         elif data_type == DataType.FLOAT:
-             nums = to_numeric(df[col], errors="raise")
-             df[col] = nums.astype("Float64")
-
-         elif data_type == DataType.DATE:
-             dtv = to_datetime(
-                 df[col], format=datetime_format, errors="raise", utc=False
-             )
-             df[col] = dtv.dt.normalize()  # midnight
-
-         elif data_type == DataType.DATETIME:
-             df[col] = to_datetime(
-                 df[col], format=datetime_format, errors="raise", utc=False
-             )
-
-         else:
-             # Fallback: keep as string
-             df[col] = df[col].astype("string")
-
-     return df
+ from __future__ import annotations
+
+ import re
+ from typing import List
+
+ from numpy import flatnonzero, round
+ from pandas import NA, DataFrame, Series, to_datetime, to_numeric
+ from pandas.util import hash_pandas_object
+
+ from valediction.data_types.data_types import DataType
+ from valediction.dictionary.model import Table
+ from valediction.integrity import get_config
+ from valediction.validation.issues import Range
+
+
+ # Remove Nulls
+ def _set_nulls(df: DataFrame) -> DataFrame:
+     null_values = get_config().null_values
+     token_set = {str(t).strip().casefold() for t in null_values}
+     columns = df.select_dtypes(include=["string", "object"]).columns
+     for column in columns:
+         series = df[column]
+         mask = series.notna() & series.str.casefold().isin(token_set)
+         df[column] = series.mask(mask, NA)
+
+     return df
+
+
+ # Check for Nulls
+ def _column_has_values(column: Series):
+     return column.notna().any()
+
+
+ # Range Setting
+ def mask_to_ranges(mask: Series, start_row: int) -> list[Range]:
+     """Convert a boolean mask (over the current chunk) into 0-based contiguous
+     ranges."""
+     idx = flatnonzero(mask.to_numpy())
+     if idx.size == 0:
+         return []
+     ranges: List[Range] = []
+     run_start = idx[0]
+     prev = idx[0]
+     for i in idx[1:]:
+         if i == prev + 1:
+             prev = i
+             continue
+         ranges.append(Range(start=start_row + run_start, end=start_row + prev))
+         run_start = prev = i
+     ranges.append(Range(start=start_row + run_start, end=start_row + prev))
+     return ranges
+
+
+ # PK Hashes
+ def create_pk_hashes(
+     df_primaries: DataFrame,
+ ) -> Series:
+     """For PK hash collision assessment, compute a deterministic 128-bit hash per row
+     over the provided PK columns. This is created by computing two 64-bit hashes,
+     forwards and backwards, and then combining them. Rows with any NA across PK
+     components are returned as None, flagging them for NULL violations.
+
+     Args:
+         df_primaries (DataFrame): DataFrame restricted to the primary-key columns.
+
+     Returns:
+         Series: Pandas Series with hashes or Nulls.
+     """
+     hash_col_name = "PK_HASH"
+     if df_primaries.empty or df_primaries.shape[1] == 0:
+         return Series([], dtype=object, name=hash_col_name)
+
+     # Any NA in row => invalid PK -> None
+     null_rows = df_primaries.isna().any(axis=1)
+
+     # First hash
+     hash_1 = hash_pandas_object(df_primaries, index=False)  # uint64
+
+     # Second hash (columns reversed if multiple PK columns, else salted)
+     if df_primaries.shape[1] > 1:
+         df_primaries_backwards = df_primaries.iloc[:, ::-1]
+     else:
+         s = df_primaries.iloc[:, 0]
+         salt = Series(["§"] * len(s), index=s.index, dtype="string")
+         df_primaries_backwards = DataFrame(
+             {
+                 "_a": s,
+                 "_b": s.str.cat(salt),
+             }
+         )
+
+     hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64
+
+     a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
+     a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
+
+     combined = (a1 << 64) | a2
+     hashes = Series(
+         combined, index=df_primaries.index, name=hash_col_name, dtype=object
+     )
+     hashes[null_rows] = None
+     return hashes
+
+
+ def compute_pk_masks(pk_hashes: Series, seen_hashes: set[int]) -> dict[str, Series]:
+     """Compute masks for PK hashes that are either null or have been seen before.
+
+     Args:
+         pk_hashes (Series): Series of PK hashes.
+         seen_hashes (set[int]): Set of hashes that have been seen before.
+
+     Returns:
+         dict[str, Series]: Dictionary of boolean masks:
+             - null: rows where PK is None / NA
+             - in_chunk_collision: rows that are part of a within-chunk duplicate group
+             - cross_chunk_collision: rows whose hash was seen in previous chunks (excluding within-chunk duplicates)
+             - first_appearance: rows that are the first occurrence of a hash
+     """
+
+     s = pk_hashes
+     null = s.isna()
+     valid = ~null
+     if not valid.any():
+         # empty/default masks
+         return {
+             "null": null,
+             "in_chunk_collision": null,
+             "cross_chunk_collision": null,
+             "first_appearance": null,
+         }
+
+     s_valid = s[valid]
+
+     # Within-chunk duplicate membership (mark *all* members)
+     dup_local = s_valid.duplicated(keep=False)
+
+     # Across-chunk duplicates (exclude those already in a local dup group)
+     seen_local = s_valid.isin(seen_hashes)
+     cross_local = seen_local & ~dup_local
+
+     # New first occurrences in this chunk (first time we see the hash here, and not seen before)
+     first_local = ~s_valid.duplicated(keep="first")
+     new_first_local = first_local & ~seen_local
+
+     # Lift back to full-length masks
+     in_chunk_collision = valid.copy()
+     in_chunk_collision.loc[valid] = dup_local
+
+     cross_chunk_collision = valid.copy()
+     cross_chunk_collision.loc[valid] = cross_local
+
+     first_appearance = valid.copy()
+     first_appearance.loc[valid] = new_first_local
+
+     return {
+         "null": null,
+         "in_chunk_collision": in_chunk_collision,
+         "cross_chunk_collision": cross_chunk_collision,
+         "first_appearance": first_appearance,
+     }
+
+
+
+ # PK Whitespace
+ def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
+     if df_primaries.empty or df_primaries.shape[1] == 0:
+         return Series(False, index=df_primaries.index)
+
+     col_masks = df_primaries.apply(lambda s: s.str.contains(r"\s", na=False))
+
+     return col_masks.any(axis=1)
+
+
+ # Data Type Checks: Numeric
+ def invalid_mask_integer(column: Series, *, tolerance: float = 1e-12) -> Series:
+     """True where a non-null value cannot be treated as an integer without a
+     non-zero remainder.
+
+     Accepts scientific notation (e.g. '1e2').
+     """
+     notnull = column.notna()
+     numeric = to_numeric(column, errors="coerce")
+     invalid = notnull & numeric.isna()
+
+     conversion_mask = notnull & numeric.notna()
+     if conversion_mask.any():
+         vals = numeric[conversion_mask].astype("float64")
+         frac = (vals - round(vals)).abs()
+         invalid_conv = frac > tolerance
+         invalid = invalid.copy()
+         invalid.loc[conversion_mask] = invalid_conv.values
+     return invalid
+
+
+ def invalid_mask_float(column: Series) -> Series:
+     """True where a non-null value is not convertible to a number."""
+     notnull = column.notna()
+     num = to_numeric(column, errors="coerce")
+     return notnull & num.isna()
+
+
+ # Data Type Checks: Date
+ def _allowed_formats_for(dtype: DataType) -> list[str]:
+     """Return the list of formats from Config.date_formats allowed for the given
+     DataType."""
+     config = get_config()
+     return [fmt for fmt, data_type in config.date_formats.items() if data_type == dtype]
+
+
+ def _parse_ok_any(column: Series, formats: list[str]) -> Series:
+     """Vectorised check: True for values that parse under at least one of `formats`."""
+     if not formats:
+         return Series(False, index=column.index)
+     ok_any = Series(False, index=column.index)
+     for fmt in formats:
+         parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
+         ok_any = ok_any | parsed.notna()
+     return ok_any
+
+
+ def invalid_mask_date(column: Series, fmt: str | None) -> Series:
+     """True where a non-null value does not parse as a date, or parses with a
+     non-zero time component."""
+     notnull = column.notna()
+
+     if fmt:
+         parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
+         ok = parsed.notna()
+         has_time = ok & (
+             (parsed.dt.hour != 0)
+             | (parsed.dt.minute != 0)
+             | (parsed.dt.second != 0)
+             | (parsed.dt.microsecond != 0)
+         )
+         return notnull & (~ok | has_time)
+
+     allowed = _allowed_formats_for(DataType.DATE)
+     ok_any = _parse_ok_any(column, allowed)
+     return notnull & (~ok_any)
+
+
+ def invalid_mask_datetime(column: Series, fmt: str | None) -> Series:
+     notnull = column.notna()
+
+     if fmt:
+         parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
+         ok = parsed.notna()
+         return notnull & (~ok)
+
+     allowed = _allowed_formats_for(DataType.DATETIME)
+     ok_any = _parse_ok_any(column, allowed)
+     return notnull & (~ok_any)
+
+
+ # Other Text Checks
+ def invalid_mask_text_too_long(column: Series, max_len: int | None) -> Series:
+     if max_len is None or max_len <= 0:
+         # treat as unlimited length
+         return Series(False, index=column.index)
+
+     notnull = column.notna()
+     lens = column.str.len()
+     return notnull & (lens > max_len)
+
+
+ def invalid_mask_text_forbidden_characters(column: Series) -> Series:
+     forbidden = get_config().forbidden_characters
+     if not forbidden:
+         return column.notna() & False
+
+     pattern = "[" + re.escape("".join(forbidden)) + "]"
+     notnull = column.notna()
+     has_forbidden = column.str.contains(pattern, regex=True, na=False)
+     return notnull & has_forbidden
+
+
+ # Apply Data Types #
+ def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
+     # name -> column object
+     column_dictionary = {column.name: column for column in table_dictionary}
+
+     for col in df.columns:
+         data_type = column_dictionary.get(col).data_type
+         datetime_format = column_dictionary.get(col).datetime_format
+
+         if data_type in (DataType.TEXT, DataType.FILE):
+             df[col] = df[col].astype("string")
+
+         elif data_type == DataType.INTEGER:
+             # Accepts '12', '12.0', '1e2' etc.; validation guarantees integer-equivalent
+             nums = to_numeric(df[col], errors="raise")
+             df[col] = nums.round().astype("Int64")
+
+         elif data_type == DataType.FLOAT:
+             nums = to_numeric(df[col], errors="raise")
+             df[col] = nums.astype("Float64")
+
+         elif data_type == DataType.DATE:
+             dtv = to_datetime(
+                 df[col], format=datetime_format, errors="raise", utc=False
+             )
+             df[col] = dtv.dt.normalize()  # midnight
+
+         elif data_type == DataType.DATETIME:
+             df[col] = to_datetime(
+                 df[col], format=datetime_format, errors="raise", utc=False
+             )
+
+         else:
+             # Fallback: keep as string
+             df[col] = df[col].astype("string")
+
+     return df
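
For orientation, here is a minimal sketch (not part of the package) of how the primary-key helpers above could combine in a chunked validation loop. It is illustrative only: the module path `valediction.validation.checks` is an assumed location for the file shown in this diff, and the one-element `chunks` list stands in for a real chunk reader.

    # Hypothetical usage sketch; the module path and sample data are assumptions.
    from pandas import DataFrame

    from valediction.validation.checks import (
        compute_pk_masks,
        create_pk_hashes,
        mask_to_ranges,
    )

    seen_hashes: set[int] = set()
    start_row = 0
    chunks = [DataFrame({"id": ["a", "b", "a", None]})]  # stand-in for a chunk reader

    for chunk in chunks:
        # 128-bit hash per row over the PK columns; None where any PK component is NA
        pk_hashes = create_pk_hashes(chunk[["id"]])
        masks = compute_pk_masks(pk_hashes, seen_hashes)

        # Translate boolean masks into contiguous row ranges for issue reporting
        null_ranges = mask_to_ranges(masks["null"], start_row)
        dup_ranges = mask_to_ranges(masks["in_chunk_collision"], start_row)
        cross_ranges = mask_to_ranges(masks["cross_chunk_collision"], start_row)

        # Remember first occurrences so later chunks can flag cross-chunk collisions
        seen_hashes.update(pk_hashes[masks["first_appearance"]])
        start_row += len(chunk)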