valediction 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,280 +1,280 @@
1
- from __future__ import annotations
2
-
3
- from dataclasses import dataclass, field
4
- from enum import Enum
5
- from typing import Iterable, Iterator, Optional
6
-
7
- from pandas import DataFrame, concat
8
-
9
- from valediction.datasets.datasets_helpers import DatasetItemLike
10
- from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
11
- from valediction.support import _normalise_name, list_as_bullets
12
-
13
-
14
- class IssueType(Enum):
15
- # Column / schema
16
- MISSING_COLUMN = "MissingColumn"
17
- EXTRA_COLUMN = "ExtraColumn"
18
- FULLY_NULL_COLUMN = "FullyNullColumn"
19
-
20
- # Keys
21
- PK_NULL = "PrimaryKeyNull"
22
- PK_COLLISION = "PrimaryKeyCollision"
23
- PK_WHITESPACE = "PrimaryKeyContainsWhitespace"
24
-
25
- # Types / content
26
- TYPE_MISMATCH = "TypeMismatch"
27
- TEXT_TOO_LONG = "TextTooLong"
28
- FORBIDDEN_CHARACTER = "ForbiddenCharacter"
29
-
30
-
31
- # Settings
32
- APPLIES_WHOLE_COLUMN = {
33
- IssueType.MISSING_COLUMN,
34
- IssueType.EXTRA_COLUMN,
35
- IssueType.FULLY_NULL_COLUMN,
36
- }
37
-
38
- PRIMARY_KEY_ISSUES = {
39
- IssueType.PK_NULL,
40
- IssueType.PK_COLLISION,
41
- IssueType.PK_WHITESPACE,
42
- }
43
-
44
-
45
- @dataclass
46
- class Range:
47
- start: int
48
- end: int
49
-
50
- def __init__(self, start: int, end: int):
51
- self.start: int = int(start)
52
- self.end: int = int(end)
53
-
54
-
55
- @dataclass
56
- class Issue:
57
- """
58
- Summary:
59
- Dataclass representing an issue in the dataset.
60
-
61
- Attributes:
62
- type (IssueType): type of issue
63
- table (str): name of the table where the issue was detected
64
- column (str | None): name of the column where the issue was detected, or None if not applicable
65
- ranges (list[Range]): list of contiguous ranges of rows where the issue was detected
66
- parent (DatasetItemLike | None): parent dataset item, or None if not applicable
67
- """
68
-
69
- type: IssueType
70
- table: str
71
- column: str | None
72
- ranges: list[Range] = field(default_factory=list)
73
- parent: DatasetItemLike | None = None
74
-
75
- # Magic
76
- def __repr__(self) -> str:
77
- column_part = f", column={self.column!r}" if self.column is not None else ""
78
- sum_ranges = sum(r.end - r.start + 1 for r in self.ranges)
79
- sum_range_part = f", total={sum_ranges}" if sum_ranges else ""
80
- return f"Issue(type={self.type.value!r}, table={self.table!r}{column_part}{sum_range_part})"
81
-
82
- # Methods
83
- def add_ranges(self, new_ranges: Iterable[Range]) -> None:
84
- """
85
- Summary:
86
- Merge new contiguous/overlapping ranges into self.ranges (kept sorted).
87
-
88
- Arguments:
89
- new_ranges (Iterable[Range]): new contiguous/overlapping ranges to be merged into self.ranges
90
-
91
- Raises:
92
- ValueError: if new_ranges is empty
93
- """
94
- all_ranges = self.ranges + list(new_ranges)
95
- if not all_ranges:
96
- self.ranges = []
97
- return
98
- all_ranges.sort(key=lambda r: (r.start, r.end))
99
- merged: list[Range] = []
100
- cur = all_ranges[0]
101
- for r in all_ranges[1:]:
102
- if r.start <= cur.end + 1: # contiguous/overlap
103
- cur.end = max(cur.end, r.end)
104
- else:
105
- merged.append(cur)
106
- cur = r
107
- merged.append(cur)
108
- self.ranges = merged
109
-
110
- def inspect(
111
- self,
112
- additional_columns: bool | str | list[str] | None = None,
113
- chunk_size: int = 1_000_000,
114
- print_header: bool = True,
115
- ) -> DataFrame | str:
116
- """
117
- Summary:
118
- Inspect an issue in the dataset by returning a DataFrame containing the relevant values.
119
-
120
- Arguments:
121
- additional_columns (bool | str | list[str] | None): whether to include additional columns in the DataFrame
122
- - if True, include all columns
123
- - if str or list[str], include only the specified columns
124
- - if None, do not include any additional columns
125
- chunk_size (int): the number of rows to include in the DataFrame at a time
126
- print_header (bool): whether to print the issue details as a header
127
-
128
- Returns:
129
- DataFrame: a DataFrame containing the relevant rows of the dataset
130
-
131
- Raises:
132
- ValueError: if the issue has no parent DatasetItem
133
- """
134
- # Guard
135
- if not self.parent:
136
- raise ValueError("Issue has no parent DatasetItem")
137
- header = self.__repr__() if print_header else ""
138
- # Not applicable
139
- if self.type in APPLIES_WHOLE_COLUMN:
140
- print(f"{header}: applies to whole column")
141
- return None
142
-
143
- # Column Inclusion
144
- if print_header:
145
- print(f"{header}:")
146
- if additional_columns is True:
147
- columns = None
148
- else:
149
- additional_columns = (
150
- [additional_columns]
151
- if isinstance(additional_columns, str)
152
- else additional_columns
153
- )
154
- base = (
155
- set(self.parent.primary_keys)
156
- if self.type in PRIMARY_KEY_ISSUES
157
- else {self.column}
158
- )
159
- base |= set(additional_columns or [])
160
- base.discard(None)
161
- columns = list(base) if base else None
162
-
163
- if not self.ranges:
164
- return DataFrame(columns=columns) if columns else DataFrame()
165
-
166
- spans: list[tuple[int, int]] = [(r.start, r.end) for r in self.ranges]
167
-
168
- # DataFrame source: slice directly
169
- if self.parent.is_dataframe:
170
- df: DataFrame = self.parent.data
171
- n = len(df)
172
- if n == 0:
173
- return DataFrame(columns=columns) if columns else DataFrame()
174
-
175
- # Clamp spans to df length; build parts
176
- parts: list[DataFrame] = []
177
- for s, e in spans:
178
- if s > e or s >= n or e < 0:
179
- continue
180
- lo = max(0, s)
181
- hi = min(n - 1, e)
182
- part: DataFrame = df.iloc[lo : hi + 1]
183
- parts.append(part if columns is None else part.loc[:, columns])
184
-
185
- if not parts:
186
- return DataFrame(columns=columns) if columns else DataFrame()
187
- return concat(parts, axis=0, ignore_index=False)
188
-
189
- # CSV source: delegate reading to csv_readers
190
- if self.parent.is_path:
191
- path = self.parent.data
192
- cfg = CsvReadConfig(usecols=columns)
193
- out = read_csv_ranges(path, spans, cfg=cfg, chunk_size=chunk_size)
194
-
195
- return out if columns is None else out.loc[:, columns]
196
-
197
-
198
- @dataclass
199
- class Issues:
200
- """List-like container holding Issues with case-insensitive get and range
201
- merging."""
202
-
203
- # Magic
204
- def __init__(self) -> None:
205
- self._items: list[Issue] = []
206
- self._index: dict[
207
- tuple[str, Optional[str], IssueType], Issue
208
- ] = {} # table, column, issue_type
209
-
210
- def __iter__(self) -> Iterator[Issue]:
211
- return iter(self._items)
212
-
213
- def __len__(self) -> int:
214
- return len(self._items)
215
-
216
- def __bool__(self) -> bool:
217
- return bool(self._items)
218
-
219
- def __getitem__(self, idx) -> Issue | list[Issue]:
220
- return self._items[idx]
221
-
222
- def __repr__(self) -> str:
223
- if not self._items:
224
- return "Issues([])"
225
- issues = list_as_bullets(elements=[repr(item) for item in self._items])
226
- return f"Issues({issues}\n)"
227
-
228
- # Methods
229
- def add(
230
- self,
231
- issue_type: IssueType,
232
- table: str,
233
- column: str | None = None,
234
- ranges: Iterable[Range] | None = None,
235
- parent: DatasetItemLike | None = None,
236
- ) -> Issue:
237
- key = (
238
- _normalise_name(table),
239
- _normalise_name(column) if column is not None else None,
240
- issue_type,
241
- )
242
- issue = self._index.get(key)
243
- if issue is None:
244
- issue = Issue(type=issue_type, table=table, column=column, parent=parent)
245
- self._items.append(issue)
246
- self._index[key] = issue
247
- if ranges:
248
- issue.add_ranges(ranges)
249
- return issue
250
-
251
- def get(
252
- self,
253
- table: str,
254
- column: str | None = None,
255
- issue_type: IssueType | None = None,
256
- ) -> list[Issue]:
257
- """Case-insensitive filter; any arg can be None to act as a wildcard."""
258
- table = _normalise_name(table)
259
- column = _normalise_name(column) if column is not None else None
260
- output: list[Issue] = []
261
- if issue_type is not None:
262
- # direct index lookup where possible
263
- key = (table, column, issue_type)
264
- hit = self._index.get(key)
265
- if hit:
266
- output.append(hit)
267
- return output
268
-
269
- # otherwise scan (still cheap; we maintain a compact list)
270
- for item in self._items:
271
- if _normalise_name(item.table) != table:
272
- continue
273
- if column is not None and (_normalise_name(item.column) or "") != column:
274
- continue
275
- output.append(item)
276
- return output
277
-
278
- def extend(self, issues: Issues) -> None:
279
- for issue in issues:
280
- self.add(issue.type, issue.table, issue.column, issue.ranges, issue.parent)
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from typing import Iterable, Iterator, Optional
6
+
7
+ from pandas import DataFrame, concat
8
+
9
+ from valediction.datasets.datasets_helpers import DatasetItemLike
10
+ from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
11
+ from valediction.support import _normalise_name, list_as_bullets
12
+
13
+
14
class IssueType(Enum):
    """Categories of data-quality issue that validation can attach to a table/column."""

    # Column / schema
    MISSING_COLUMN = "MissingColumn"
    EXTRA_COLUMN = "ExtraColumn"
    FULLY_NULL_COLUMN = "FullyNullColumn"

    # Keys
    PK_NULL = "PrimaryKeyNull"
    PK_COLLISION = "PrimaryKeyCollision"
    PK_WHITESPACE = "PrimaryKeyContainsWhitespace"

    # Types / content
    TYPE_MISMATCH = "TypeMismatch"
    TEXT_TOO_LONG = "TextTooLong"
    FORBIDDEN_CHARACTER = "ForbiddenCharacter"
29
+
30
+
31
# Settings
# Issue types that describe an entire column rather than specific row ranges;
# Issue.inspect() short-circuits for these (there are no rows to display).
APPLIES_WHOLE_COLUMN = {
    IssueType.MISSING_COLUMN,
    IssueType.EXTRA_COLUMN,
    IssueType.FULLY_NULL_COLUMN,
}

# Issue types that concern the table's primary key; Issue.inspect() selects the
# parent's primary-key columns (instead of self.column) for these.
PRIMARY_KEY_ISSUES = {
    IssueType.PK_NULL,
    IssueType.PK_COLLISION,
    IssueType.PK_WHITESPACE,
}
43
+
44
+
45
@dataclass
class Range:
    """
    Summary:
        Inclusive range of row indices [start, end].

    Attributes:
        start (int): first row index of the range (inclusive)
        end (int): last row index of the range (inclusive)
    """

    start: int
    end: int

    def __post_init__(self) -> None:
        # Coerce to int so callers may pass numeric strings/floats. Using
        # __post_init__ keeps the dataclass-generated __init__ (same
        # signature) instead of shadowing it with a handwritten one, which
        # previously made the `start: int / end: int` field declarations
        # and the decorator's init generation redundant.
        self.start = int(self.start)
        self.end = int(self.end)
53
+
54
+
55
@dataclass
class Issue:
    """
    Summary:
        Dataclass representing an issue detected in the dataset.

    Attributes:
        type (IssueType): type of issue
        table (str): name of the table where the issue was detected
        column (str | None): name of the column where the issue was detected, or None if not applicable
        ranges (list[Range]): list of contiguous ranges of rows where the issue was detected
        parent (DatasetItemLike | None): parent dataset item, or None if not applicable
    """

    type: IssueType
    table: str
    column: str | None
    ranges: list[Range] = field(default_factory=list)
    parent: DatasetItemLike | None = None

    # Magic
    def __repr__(self) -> str:
        column_part = f", column={self.column!r}" if self.column is not None else ""
        sum_ranges = sum(r.end - r.start + 1 for r in self.ranges)
        sum_range_part = f", total={sum_ranges}" if sum_ranges else ""
        return f"Issue(type={self.type.value!r}, table={self.table!r}{column_part}{sum_range_part})"

    # Methods
    def add_ranges(self, new_ranges: Iterable[Range]) -> None:
        """
        Summary:
            Merge new contiguous/overlapping ranges into self.ranges (kept sorted).

        Arguments:
            new_ranges (Iterable[Range]): new contiguous/overlapping ranges to be merged into self.ranges
        """
        all_ranges = self.ranges + list(new_ranges)
        if not all_ranges:
            self.ranges = []
            return
        all_ranges.sort(key=lambda r: (r.start, r.end))
        # Merge into fresh Range objects instead of mutating the inputs in
        # place: callers (e.g. Issues.extend) pass Range objects that are
        # still referenced by another Issue, and in-place mutation of
        # `cur.end` would silently corrupt those.
        merged: list[Range] = []
        cur_start, cur_end = all_ranges[0].start, all_ranges[0].end
        for r in all_ranges[1:]:
            if r.start <= cur_end + 1:  # contiguous or overlapping
                cur_end = max(cur_end, r.end)
            else:
                merged.append(Range(cur_start, cur_end))
                cur_start, cur_end = r.start, r.end
        merged.append(Range(cur_start, cur_end))
        self.ranges = merged

    def inspect(
        self,
        additional_columns: bool | str | list[str] | None = None,
        chunk_size: int = 1_000_000,
        print_header: bool = True,
    ) -> DataFrame | None:
        """
        Summary:
            Inspect an issue in the dataset by returning a DataFrame containing the relevant values.

        Arguments:
            additional_columns (bool | str | list[str] | None): whether to include additional columns in the DataFrame
                - if True, include all columns
                - if str or list[str], include only the specified columns
                - if None, do not include any additional columns
            chunk_size (int): number of rows to read per chunk when the parent's source is a CSV path
            print_header (bool): whether to print the issue details as a header

        Returns:
            DataFrame | None: a DataFrame containing the relevant rows of the dataset, or
                None for whole-column issues and for parents whose data source is neither
                a DataFrame nor a path

        Raises:
            ValueError: if the issue has no parent DatasetItem
        """
        # Guard
        if not self.parent:
            raise ValueError("Issue has no parent DatasetItem")
        header = self.__repr__() if print_header else ""
        # Whole-column issues carry no row ranges, so there is nothing to slice
        if self.type in APPLIES_WHOLE_COLUMN:
            print(f"{header}: applies to whole column")
            return None

        # Column Inclusion
        if print_header:
            print(f"{header}:")
        if additional_columns is True:
            columns = None  # None means "all columns" downstream
        else:
            additional_columns = (
                [additional_columns]
                if isinstance(additional_columns, str)
                else additional_columns
            )
            # Primary-key issues show the key columns; others show the issue column
            base = (
                set(self.parent.primary_keys)
                if self.type in PRIMARY_KEY_ISSUES
                else {self.column}
            )
            base |= set(additional_columns or [])
            base.discard(None)  # self.column may be None
            columns = list(base) if base else None

        if not self.ranges:
            return DataFrame(columns=columns) if columns else DataFrame()

        spans: list[tuple[int, int]] = [(r.start, r.end) for r in self.ranges]

        # DataFrame source: slice directly
        if self.parent.is_dataframe:
            df: DataFrame = self.parent.data
            n = len(df)
            if n == 0:
                return DataFrame(columns=columns) if columns else DataFrame()

            # Clamp spans to df length; build parts
            parts: list[DataFrame] = []
            for s, e in spans:
                if s > e or s >= n or e < 0:
                    continue  # span lies entirely outside the frame
                lo = max(0, s)
                hi = min(n - 1, e)
                part: DataFrame = df.iloc[lo : hi + 1]
                parts.append(part if columns is None else part.loc[:, columns])

            if not parts:
                return DataFrame(columns=columns) if columns else DataFrame()
            return concat(parts, axis=0, ignore_index=False)

        # CSV source: delegate reading to csv_readers
        if self.parent.is_path:
            path = self.parent.data
            cfg = CsvReadConfig(usecols=columns)
            out = read_csv_ranges(path, spans, cfg=cfg, chunk_size=chunk_size)
            return out if columns is None else out.loc[:, columns]

        # Parent data source is neither a DataFrame nor a path — nothing to show
        return None
196
+
197
+
198
class Issues:
    """List-like container holding Issues with case-insensitive get and range
    merging.

    NOTE: deliberately NOT a @dataclass — with no declared fields, the
    dataclass-generated __eq__ compares empty tuples, which made every
    Issues instance compare equal to every other regardless of contents.
    """

    # Magic
    def __init__(self) -> None:
        # Insertion-ordered list of unique Issue objects
        self._items: list[Issue] = []
        # (normalised table, normalised column or None, issue type) -> Issue
        self._index: dict[tuple[str, Optional[str], IssueType], Issue] = {}

    def __iter__(self) -> Iterator[Issue]:
        return iter(self._items)

    def __len__(self) -> int:
        return len(self._items)

    def __bool__(self) -> bool:
        return bool(self._items)

    def __getitem__(self, idx) -> Issue | list[Issue]:
        return self._items[idx]

    def __repr__(self) -> str:
        if not self._items:
            return "Issues([])"
        issues = list_as_bullets(elements=[repr(item) for item in self._items])
        return f"Issues({issues}\n)"

    # Methods
    def add(
        self,
        issue_type: IssueType,
        table: str,
        column: str | None = None,
        ranges: Iterable[Range] | None = None,
        parent: DatasetItemLike | None = None,
    ) -> Issue:
        """Fetch or create the Issue keyed by (table, column, issue_type),
        merge any given ranges into it, and return it."""
        key = (
            _normalise_name(table),
            _normalise_name(column) if column is not None else None,
            issue_type,
        )
        issue = self._index.get(key)
        if issue is None:
            issue = Issue(type=issue_type, table=table, column=column, parent=parent)
            self._items.append(issue)
            self._index[key] = issue
        if ranges:
            issue.add_ranges(ranges)
        return issue

    def get(
        self,
        table: str,
        column: str | None = None,
        issue_type: IssueType | None = None,
    ) -> list[Issue]:
        """Case-insensitive filter; column and issue_type act as wildcards
        when None."""
        table = _normalise_name(table)
        column = _normalise_name(column) if column is not None else None
        # Direct index lookup only when the key is fully specified. Previously
        # the lookup ran whenever issue_type was given, so column=None behaved
        # as "column is None" instead of as a wildcard and issues on named
        # columns were silently missed.
        if issue_type is not None and column is not None:
            hit = self._index.get((table, column, issue_type))
            return [hit] if hit else []

        # Otherwise scan (still cheap; we maintain a compact list)
        output: list[Issue] = []
        for item in self._items:
            if _normalise_name(item.table) != table:
                continue
            if column is not None and (_normalise_name(item.column) or "") != column:
                continue
            if issue_type is not None and item.type is not issue_type:
                continue
            output.append(item)
        return output

    def extend(self, issues: Issues) -> None:
        """Merge every Issue from another container into this one, deduping by
        (table, column, type) and merging row ranges."""
        for issue in issues:
            self.add(issue.type, issue.table, issue.column, issue.ranges, issue.parent)