valediction 1.0.0__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/__init__.py +8 -8
- valediction/convenience.py +50 -50
- valediction/data_types/data_type_helpers.py +75 -75
- valediction/data_types/data_types.py +58 -58
- valediction/data_types/type_inference.py +541 -541
- valediction/datasets/datasets.py +870 -870
- valediction/datasets/datasets_helpers.py +46 -46
- valediction/demo/DEMOGRAPHICS.csv +101 -101
- valediction/demo/DIAGNOSES.csv +650 -650
- valediction/demo/LAB_TESTS.csv +1001 -1001
- valediction/demo/VITALS.csv +1001 -1001
- valediction/demo/__init__.py +6 -6
- valediction/demo/demo_dictionary.py +129 -129
- valediction/dictionary/exporting.py +501 -501
- valediction/dictionary/exporting_helpers.py +371 -371
- valediction/dictionary/generation.py +357 -357
- valediction/dictionary/helpers.py +174 -174
- valediction/dictionary/importing.py +494 -494
- valediction/dictionary/integrity.py +37 -37
- valediction/dictionary/model.py +582 -582
- valediction/exceptions.py +22 -22
- valediction/integrity.py +97 -97
- valediction/io/csv_readers.py +307 -307
- valediction/progress.py +206 -206
- valediction/support.py +72 -72
- valediction/validation/helpers.py +315 -315
- valediction/validation/issues.py +280 -280
- valediction/validation/validation.py +598 -598
- {valediction-1.0.0.dist-info → valediction-1.0.3.dist-info}/METADATA +1 -1
- valediction-1.0.3.dist-info/RECORD +38 -0
- {valediction-1.0.0.dist-info → valediction-1.0.3.dist-info}/WHEEL +1 -1
- valediction-1.0.0.dist-info/RECORD +0 -38
valediction/validation/issues.py
CHANGED
|
@@ -1,280 +1,280 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from dataclasses import dataclass, field
|
|
4
|
-
from enum import Enum
|
|
5
|
-
from typing import Iterable, Iterator, Optional
|
|
6
|
-
|
|
7
|
-
from pandas import DataFrame, concat
|
|
8
|
-
|
|
9
|
-
from valediction.datasets.datasets_helpers import DatasetItemLike
|
|
10
|
-
from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
|
|
11
|
-
from valediction.support import _normalise_name, list_as_bullets
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class IssueType(Enum):
|
|
15
|
-
# Column / schema
|
|
16
|
-
MISSING_COLUMN = "MissingColumn"
|
|
17
|
-
EXTRA_COLUMN = "ExtraColumn"
|
|
18
|
-
FULLY_NULL_COLUMN = "FullyNullColumn"
|
|
19
|
-
|
|
20
|
-
# Keys
|
|
21
|
-
PK_NULL = "PrimaryKeyNull"
|
|
22
|
-
PK_COLLISION = "PrimaryKeyCollision"
|
|
23
|
-
PK_WHITESPACE = "PrimaryKeyContainsWhitespace"
|
|
24
|
-
|
|
25
|
-
# Types / content
|
|
26
|
-
TYPE_MISMATCH = "TypeMismatch"
|
|
27
|
-
TEXT_TOO_LONG = "TextTooLong"
|
|
28
|
-
FORBIDDEN_CHARACTER = "ForbiddenCharacter"
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
# Settings
|
|
32
|
-
APPLIES_WHOLE_COLUMN = {
|
|
33
|
-
IssueType.MISSING_COLUMN,
|
|
34
|
-
IssueType.EXTRA_COLUMN,
|
|
35
|
-
IssueType.FULLY_NULL_COLUMN,
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
PRIMARY_KEY_ISSUES = {
|
|
39
|
-
IssueType.PK_NULL,
|
|
40
|
-
IssueType.PK_COLLISION,
|
|
41
|
-
IssueType.PK_WHITESPACE,
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
@dataclass
|
|
46
|
-
class Range:
|
|
47
|
-
start: int
|
|
48
|
-
end: int
|
|
49
|
-
|
|
50
|
-
def __init__(self, start: int, end: int):
|
|
51
|
-
self.start: int = int(start)
|
|
52
|
-
self.end: int = int(end)
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
@dataclass
|
|
56
|
-
class Issue:
|
|
57
|
-
"""
|
|
58
|
-
Summary:
|
|
59
|
-
Dataclass representing an issue in the dataset.
|
|
60
|
-
|
|
61
|
-
Attributes:
|
|
62
|
-
type (IssueType): type of issue
|
|
63
|
-
table (str): name of the table where the issue was detected
|
|
64
|
-
column (str | None): name of the column where the issue was detected, or None if not applicable
|
|
65
|
-
ranges (list[Range]): list of contiguous ranges of rows where the issue was detected
|
|
66
|
-
parent (DatasetItemLike | None): parent dataset item, or None if not applicable
|
|
67
|
-
"""
|
|
68
|
-
|
|
69
|
-
type: IssueType
|
|
70
|
-
table: str
|
|
71
|
-
column: str | None
|
|
72
|
-
ranges: list[Range] = field(default_factory=list)
|
|
73
|
-
parent: DatasetItemLike | None = None
|
|
74
|
-
|
|
75
|
-
# Magic
|
|
76
|
-
def __repr__(self) -> str:
|
|
77
|
-
column_part = f", column={self.column!r}" if self.column is not None else ""
|
|
78
|
-
sum_ranges = sum(r.end - r.start + 1 for r in self.ranges)
|
|
79
|
-
sum_range_part = f", total={sum_ranges}" if sum_ranges else ""
|
|
80
|
-
return f"Issue(type={self.type.value!r}, table={self.table!r}{column_part}{sum_range_part})"
|
|
81
|
-
|
|
82
|
-
# Methods
|
|
83
|
-
def add_ranges(self, new_ranges: Iterable[Range]) -> None:
|
|
84
|
-
"""
|
|
85
|
-
Summary:
|
|
86
|
-
Merge new contiguous/overlapping ranges into self.ranges (kept sorted).
|
|
87
|
-
|
|
88
|
-
Arguments:
|
|
89
|
-
new_ranges (Iterable[Range]): new contiguous/overlapping ranges to be merged into self.ranges
|
|
90
|
-
|
|
91
|
-
Raises:
|
|
92
|
-
ValueError: if new_ranges is empty
|
|
93
|
-
"""
|
|
94
|
-
all_ranges = self.ranges + list(new_ranges)
|
|
95
|
-
if not all_ranges:
|
|
96
|
-
self.ranges = []
|
|
97
|
-
return
|
|
98
|
-
all_ranges.sort(key=lambda r: (r.start, r.end))
|
|
99
|
-
merged: list[Range] = []
|
|
100
|
-
cur = all_ranges[0]
|
|
101
|
-
for r in all_ranges[1:]:
|
|
102
|
-
if r.start <= cur.end + 1: # contiguous/overlap
|
|
103
|
-
cur.end = max(cur.end, r.end)
|
|
104
|
-
else:
|
|
105
|
-
merged.append(cur)
|
|
106
|
-
cur = r
|
|
107
|
-
merged.append(cur)
|
|
108
|
-
self.ranges = merged
|
|
109
|
-
|
|
110
|
-
def inspect(
|
|
111
|
-
self,
|
|
112
|
-
additional_columns: bool | str | list[str] | None = None,
|
|
113
|
-
chunk_size: int = 1_000_000,
|
|
114
|
-
print_header: bool = True,
|
|
115
|
-
) -> DataFrame | str:
|
|
116
|
-
"""
|
|
117
|
-
Summary:
|
|
118
|
-
Inspect an issue in the dataset by returning a DataFrame containing the relevant values.
|
|
119
|
-
|
|
120
|
-
Arguments:
|
|
121
|
-
additional_columns (bool | str | list[str] | None): whether to include additional columns in the DataFrame
|
|
122
|
-
- if True, include all columns
|
|
123
|
-
- if str or list[str], include only the specified columns
|
|
124
|
-
- if None, do not include any additional columns
|
|
125
|
-
chunk_size (int): the number of rows to include in the DataFrame at a time
|
|
126
|
-
print_header (bool): whether to print the issue details as a header
|
|
127
|
-
|
|
128
|
-
Returns:
|
|
129
|
-
DataFrame: a DataFrame containing the relevant rows of the dataset
|
|
130
|
-
|
|
131
|
-
Raises:
|
|
132
|
-
ValueError: if the issue has no parent DatasetItem
|
|
133
|
-
"""
|
|
134
|
-
# Guard
|
|
135
|
-
if not self.parent:
|
|
136
|
-
raise ValueError("Issue has no parent DatasetItem")
|
|
137
|
-
header = self.__repr__() if print_header else ""
|
|
138
|
-
# Not applicable
|
|
139
|
-
if self.type in APPLIES_WHOLE_COLUMN:
|
|
140
|
-
print(f"{header}: applies to whole column")
|
|
141
|
-
return None
|
|
142
|
-
|
|
143
|
-
# Column Inclusion
|
|
144
|
-
if print_header:
|
|
145
|
-
print(f"{header}:")
|
|
146
|
-
if additional_columns is True:
|
|
147
|
-
columns = None
|
|
148
|
-
else:
|
|
149
|
-
additional_columns = (
|
|
150
|
-
[additional_columns]
|
|
151
|
-
if isinstance(additional_columns, str)
|
|
152
|
-
else additional_columns
|
|
153
|
-
)
|
|
154
|
-
base = (
|
|
155
|
-
set(self.parent.primary_keys)
|
|
156
|
-
if self.type in PRIMARY_KEY_ISSUES
|
|
157
|
-
else {self.column}
|
|
158
|
-
)
|
|
159
|
-
base |= set(additional_columns or [])
|
|
160
|
-
base.discard(None)
|
|
161
|
-
columns = list(base) if base else None
|
|
162
|
-
|
|
163
|
-
if not self.ranges:
|
|
164
|
-
return DataFrame(columns=columns) if columns else DataFrame()
|
|
165
|
-
|
|
166
|
-
spans: list[tuple[int, int]] = [(r.start, r.end) for r in self.ranges]
|
|
167
|
-
|
|
168
|
-
# DataFrame source: slice directly
|
|
169
|
-
if self.parent.is_dataframe:
|
|
170
|
-
df: DataFrame = self.parent.data
|
|
171
|
-
n = len(df)
|
|
172
|
-
if n == 0:
|
|
173
|
-
return DataFrame(columns=columns) if columns else DataFrame()
|
|
174
|
-
|
|
175
|
-
# Clamp spans to df length; build parts
|
|
176
|
-
parts: list[DataFrame] = []
|
|
177
|
-
for s, e in spans:
|
|
178
|
-
if s > e or s >= n or e < 0:
|
|
179
|
-
continue
|
|
180
|
-
lo = max(0, s)
|
|
181
|
-
hi = min(n - 1, e)
|
|
182
|
-
part: DataFrame = df.iloc[lo : hi + 1]
|
|
183
|
-
parts.append(part if columns is None else part.loc[:, columns])
|
|
184
|
-
|
|
185
|
-
if not parts:
|
|
186
|
-
return DataFrame(columns=columns) if columns else DataFrame()
|
|
187
|
-
return concat(parts, axis=0, ignore_index=False)
|
|
188
|
-
|
|
189
|
-
# CSV source: delegate reading to csv_readers
|
|
190
|
-
if self.parent.is_path:
|
|
191
|
-
path = self.parent.data
|
|
192
|
-
cfg = CsvReadConfig(usecols=columns)
|
|
193
|
-
out = read_csv_ranges(path, spans, cfg=cfg, chunk_size=chunk_size)
|
|
194
|
-
|
|
195
|
-
return out if columns is None else out.loc[:, columns]
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
@dataclass
|
|
199
|
-
class Issues:
|
|
200
|
-
"""List-like container holding Issues with case-insensitive get and range
|
|
201
|
-
merging."""
|
|
202
|
-
|
|
203
|
-
# Magic
|
|
204
|
-
def __init__(self) -> None:
|
|
205
|
-
self._items: list[Issue] = []
|
|
206
|
-
self._index: dict[
|
|
207
|
-
tuple[str, Optional[str], IssueType], Issue
|
|
208
|
-
] = {} # table, column, issue_type
|
|
209
|
-
|
|
210
|
-
def __iter__(self) -> Iterator[Issue]:
|
|
211
|
-
return iter(self._items)
|
|
212
|
-
|
|
213
|
-
def __len__(self) -> int:
|
|
214
|
-
return len(self._items)
|
|
215
|
-
|
|
216
|
-
def __bool__(self) -> bool:
|
|
217
|
-
return bool(self._items)
|
|
218
|
-
|
|
219
|
-
def __getitem__(self, idx) -> Issue | list[Issue]:
|
|
220
|
-
return self._items[idx]
|
|
221
|
-
|
|
222
|
-
def __repr__(self) -> str:
|
|
223
|
-
if not self._items:
|
|
224
|
-
return "Issues([])"
|
|
225
|
-
issues = list_as_bullets(elements=[repr(item) for item in self._items])
|
|
226
|
-
return f"Issues({issues}\n)"
|
|
227
|
-
|
|
228
|
-
# Methods
|
|
229
|
-
def add(
|
|
230
|
-
self,
|
|
231
|
-
issue_type: IssueType,
|
|
232
|
-
table: str,
|
|
233
|
-
column: str | None = None,
|
|
234
|
-
ranges: Iterable[Range] | None = None,
|
|
235
|
-
parent: DatasetItemLike | None = None,
|
|
236
|
-
) -> Issue:
|
|
237
|
-
key = (
|
|
238
|
-
_normalise_name(table),
|
|
239
|
-
_normalise_name(column) if column is not None else None,
|
|
240
|
-
issue_type,
|
|
241
|
-
)
|
|
242
|
-
issue = self._index.get(key)
|
|
243
|
-
if issue is None:
|
|
244
|
-
issue = Issue(type=issue_type, table=table, column=column, parent=parent)
|
|
245
|
-
self._items.append(issue)
|
|
246
|
-
self._index[key] = issue
|
|
247
|
-
if ranges:
|
|
248
|
-
issue.add_ranges(ranges)
|
|
249
|
-
return issue
|
|
250
|
-
|
|
251
|
-
def get(
|
|
252
|
-
self,
|
|
253
|
-
table: str,
|
|
254
|
-
column: str | None = None,
|
|
255
|
-
issue_type: IssueType | None = None,
|
|
256
|
-
) -> list[Issue]:
|
|
257
|
-
"""Case-insensitive filter; any arg can be None to act as a wildcard."""
|
|
258
|
-
table = _normalise_name(table)
|
|
259
|
-
column = _normalise_name(column) if column is not None else None
|
|
260
|
-
output: list[Issue] = []
|
|
261
|
-
if issue_type is not None:
|
|
262
|
-
# direct index lookup where possible
|
|
263
|
-
key = (table, column, issue_type)
|
|
264
|
-
hit = self._index.get(key)
|
|
265
|
-
if hit:
|
|
266
|
-
output.append(hit)
|
|
267
|
-
return output
|
|
268
|
-
|
|
269
|
-
# otherwise scan (still cheap; we maintain a compact list)
|
|
270
|
-
for item in self._items:
|
|
271
|
-
if _normalise_name(item.table) != table:
|
|
272
|
-
continue
|
|
273
|
-
if column is not None and (_normalise_name(item.column) or "") != column:
|
|
274
|
-
continue
|
|
275
|
-
output.append(item)
|
|
276
|
-
return output
|
|
277
|
-
|
|
278
|
-
def extend(self, issues: Issues) -> None:
|
|
279
|
-
for issue in issues:
|
|
280
|
-
self.add(issue.type, issue.table, issue.column, issue.ranges, issue.parent)
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Iterable, Iterator, Optional
|
|
6
|
+
|
|
7
|
+
from pandas import DataFrame, concat
|
|
8
|
+
|
|
9
|
+
from valediction.datasets.datasets_helpers import DatasetItemLike
|
|
10
|
+
from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
|
|
11
|
+
from valediction.support import _normalise_name, list_as_bullets
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class IssueType(Enum):
    """Enumeration of all issue categories the validator can report.

    The string values are the human-readable labels shown in Issue reprs.
    """

    # Column / schema
    MISSING_COLUMN = "MissingColumn"
    EXTRA_COLUMN = "ExtraColumn"
    FULLY_NULL_COLUMN = "FullyNullColumn"

    # Keys
    PK_NULL = "PrimaryKeyNull"
    PK_COLLISION = "PrimaryKeyCollision"
    PK_WHITESPACE = "PrimaryKeyContainsWhitespace"

    # Types / content
    TYPE_MISMATCH = "TypeMismatch"
    TEXT_TOO_LONG = "TextTooLong"
    FORBIDDEN_CHARACTER = "ForbiddenCharacter"


# Settings
# Issue types that apply to an entire column rather than specific rows;
# Issue.inspect() prints a notice and returns no rows for these.
APPLIES_WHOLE_COLUMN: set[IssueType] = {
    IssueType.MISSING_COLUMN,
    IssueType.EXTRA_COLUMN,
    IssueType.FULLY_NULL_COLUMN,
}

# Issue types concerning primary-key integrity; Issue.inspect() displays the
# parent's primary-key columns for these instead of a single issue column.
PRIMARY_KEY_ISSUES: set[IssueType] = {
    IssueType.PK_NULL,
    IssueType.PK_COLLISION,
    IssueType.PK_WHITESPACE,
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class Range:
    """Inclusive row range [start, end]; endpoints are coerced to int.

    Attributes:
        start (int): first row index of the range (inclusive)
        end (int): last row index of the range (inclusive)
    """

    start: int
    end: int

    def __post_init__(self) -> None:
        # Coerce endpoints so callers may pass numpy integers, floats or
        # numeric strings; the dataclass-generated __init__ is kept intact
        # (previously a hand-written __init__ shadowed it redundantly).
        self.start = int(self.start)
        self.end = int(self.end)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class Issue:
    """
    Summary:
        Dataclass representing an issue in the dataset.

    Attributes:
        type (IssueType): type of issue
        table (str): name of the table where the issue was detected
        column (str | None): name of the column where the issue was detected, or None if not applicable
        ranges (list[Range]): list of contiguous ranges of rows where the issue was detected
        parent (DatasetItemLike | None): parent dataset item, or None if not applicable
    """

    type: IssueType
    table: str
    column: str | None
    ranges: list[Range] = field(default_factory=list)
    parent: DatasetItemLike | None = None

    # Magic
    def __repr__(self) -> str:
        column_part = f", column={self.column!r}" if self.column is not None else ""
        # Total affected rows across all (inclusive) ranges.
        sum_ranges = sum(r.end - r.start + 1 for r in self.ranges)
        sum_range_part = f", total={sum_ranges}" if sum_ranges else ""
        return f"Issue(type={self.type.value!r}, table={self.table!r}{column_part}{sum_range_part})"

    # Methods
    def add_ranges(self, new_ranges: Iterable[Range]) -> None:
        """
        Summary:
            Merge new contiguous/overlapping ranges into self.ranges (kept sorted).

        Arguments:
            new_ranges (Iterable[Range]): new contiguous/overlapping ranges to be merged into self.ranges

        Note:
            An empty ``new_ranges`` is a no-op (no exception is raised).
            Merging mutates Range objects in place (their ``end`` may be
            extended), so instances passed in may be modified and may end
            up shared with ``self.ranges``.
        """
        all_ranges = self.ranges + list(new_ranges)
        if not all_ranges:
            self.ranges = []
            return
        all_ranges.sort(key=lambda r: (r.start, r.end))
        merged: list[Range] = []
        cur = all_ranges[0]
        for r in all_ranges[1:]:
            if r.start <= cur.end + 1:  # contiguous/overlap
                cur.end = max(cur.end, r.end)
            else:
                merged.append(cur)
                cur = r
        merged.append(cur)
        self.ranges = merged

    def inspect(
        self,
        additional_columns: bool | str | list[str] | None = None,
        chunk_size: int = 1_000_000,
        print_header: bool = True,
    ) -> DataFrame | None:
        """
        Summary:
            Inspect an issue in the dataset by returning a DataFrame containing the relevant values.

        Arguments:
            additional_columns (bool | str | list[str] | None): whether to include additional columns in the DataFrame
                - if True, include all columns
                - if str or list[str], include only the specified columns
                - if None, do not include any additional columns
            chunk_size (int): the number of rows to read per chunk when the source is a CSV file
            print_header (bool): whether to print the issue details as a header

        Returns:
            DataFrame | None: a DataFrame containing the relevant rows of the dataset,
                or None when the issue applies to a whole column (nothing row-level
                to show) or the parent's data source is neither a DataFrame nor a path.

        Raises:
            ValueError: if the issue has no parent DatasetItem
        """
        # Guard
        if not self.parent:
            raise ValueError("Issue has no parent DatasetItem")
        header = self.__repr__() if print_header else ""
        # Whole-column issues have no row ranges to display.
        if self.type in APPLIES_WHOLE_COLUMN:
            print(f"{header}: applies to whole column")
            return None

        # Column Inclusion
        if print_header:
            print(f"{header}:")
        if additional_columns is True:
            columns = None  # None means "all columns" downstream
        else:
            additional_columns = (
                [additional_columns]
                if isinstance(additional_columns, str)
                else additional_columns
            )
            # Primary-key issues show the key columns; others show the issue column.
            base = (
                set(self.parent.primary_keys)
                if self.type in PRIMARY_KEY_ISSUES
                else {self.column}
            )
            base |= set(additional_columns or [])
            base.discard(None)
            # NOTE(review): column order derives from a set, so it is unspecified.
            columns = list(base) if base else None

        if not self.ranges:
            return DataFrame(columns=columns) if columns else DataFrame()

        spans: list[tuple[int, int]] = [(r.start, r.end) for r in self.ranges]

        # DataFrame source: slice directly
        if self.parent.is_dataframe:
            df: DataFrame = self.parent.data
            n = len(df)
            if n == 0:
                return DataFrame(columns=columns) if columns else DataFrame()

            # Clamp spans to df length; build parts
            parts: list[DataFrame] = []
            for s, e in spans:
                if s > e or s >= n or e < 0:
                    continue
                lo = max(0, s)
                hi = min(n - 1, e)
                part: DataFrame = df.iloc[lo : hi + 1]
                parts.append(part if columns is None else part.loc[:, columns])

            if not parts:
                return DataFrame(columns=columns) if columns else DataFrame()
            return concat(parts, axis=0, ignore_index=False)

        # CSV source: delegate reading to csv_readers
        if self.parent.is_path:
            path = self.parent.data
            cfg = CsvReadConfig(usecols=columns)
            out = read_csv_ranges(path, spans, cfg=cfg, chunk_size=chunk_size)

            return out if columns is None else out.loc[:, columns]

        # Unrecognised source type: nothing to show.
        return None
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
@dataclass
class Issues:
    """List-like container holding Issues with case-insensitive get and range
    merging."""

    # NOTE(review): @dataclass on this field-less class generates an __eq__
    # under which any two Issues instances compare equal; the decorator is
    # kept for backward compatibility.

    # Magic
    def __init__(self) -> None:
        self._items: list[Issue] = []
        # O(1) exact-lookup index keyed by normalised (table, column, issue_type).
        self._index: dict[
            tuple[str, Optional[str], IssueType], Issue
        ] = {}  # table, column, issue_type

    def __iter__(self) -> Iterator[Issue]:
        return iter(self._items)

    def __len__(self) -> int:
        return len(self._items)

    def __bool__(self) -> bool:
        return bool(self._items)

    def __getitem__(self, idx) -> Issue | list[Issue]:
        return self._items[idx]

    def __repr__(self) -> str:
        if not self._items:
            return "Issues([])"
        issues = list_as_bullets(elements=[repr(item) for item in self._items])
        return f"Issues({issues}\n)"

    # Methods
    def add(
        self,
        issue_type: IssueType,
        table: str,
        column: str | None = None,
        ranges: Iterable[Range] | None = None,
        parent: DatasetItemLike | None = None,
    ) -> Issue:
        """Create (or fetch) the Issue for (table, column, issue_type), merge
        any supplied ranges into it, and return it."""
        key = (
            _normalise_name(table),
            _normalise_name(column) if column is not None else None,
            issue_type,
        )
        issue = self._index.get(key)
        if issue is None:
            issue = Issue(type=issue_type, table=table, column=column, parent=parent)
            self._items.append(issue)
            self._index[key] = issue
        if ranges:
            issue.add_ranges(ranges)
        return issue

    def get(
        self,
        table: str,
        column: str | None = None,
        issue_type: IssueType | None = None,
    ) -> list[Issue]:
        """Case-insensitive filter; column and issue_type may be None to act
        as wildcards."""
        table = _normalise_name(table)
        column = _normalise_name(column) if column is not None else None
        # Fully specified query: direct index lookup.
        # (Previously this path was also taken when column was None, which
        # matched only table-level issues and silently missed column-level
        # issues of the requested type, contradicting the wildcard contract.)
        if issue_type is not None and column is not None:
            hit = self._index.get((table, column, issue_type))
            return [hit] if hit is not None else []

        # Wildcard query: scan (still cheap; we maintain a compact list).
        output: list[Issue] = []
        for item in self._items:
            if _normalise_name(item.table) != table:
                continue
            if column is not None and (_normalise_name(item.column) or "") != column:
                continue
            if issue_type is not None and item.type != issue_type:
                continue
            output.append(item)
        return output

    def extend(self, issues: Issues) -> None:
        """Merge every Issue from another Issues container into this one."""
        for issue in issues:
            self.add(issue.type, issue.table, issue.column, issue.ranges, issue.parent)
|