valediction-1.0.0-py3-none-any.whl → valediction-1.0.3-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.
- valediction/__init__.py +8 -8
- valediction/convenience.py +50 -50
- valediction/data_types/data_type_helpers.py +75 -75
- valediction/data_types/data_types.py +58 -58
- valediction/data_types/type_inference.py +541 -541
- valediction/datasets/datasets.py +870 -870
- valediction/datasets/datasets_helpers.py +46 -46
- valediction/demo/DEMOGRAPHICS.csv +101 -101
- valediction/demo/DIAGNOSES.csv +650 -650
- valediction/demo/LAB_TESTS.csv +1001 -1001
- valediction/demo/VITALS.csv +1001 -1001
- valediction/demo/__init__.py +6 -6
- valediction/demo/demo_dictionary.py +129 -129
- valediction/dictionary/exporting.py +501 -501
- valediction/dictionary/exporting_helpers.py +371 -371
- valediction/dictionary/generation.py +357 -357
- valediction/dictionary/helpers.py +174 -174
- valediction/dictionary/importing.py +494 -494
- valediction/dictionary/integrity.py +37 -37
- valediction/dictionary/model.py +582 -582
- valediction/exceptions.py +22 -22
- valediction/integrity.py +97 -97
- valediction/io/csv_readers.py +307 -307
- valediction/progress.py +206 -206
- valediction/support.py +72 -72
- valediction/validation/helpers.py +315 -315
- valediction/validation/issues.py +280 -280
- valediction/validation/validation.py +598 -598
- {valediction-1.0.0.dist-info → valediction-1.0.3.dist-info}/METADATA +1 -1
- valediction-1.0.3.dist-info/RECORD +38 -0
- {valediction-1.0.0.dist-info → valediction-1.0.3.dist-info}/WHEEL +1 -1
- valediction-1.0.0.dist-info/RECORD +0 -38
valediction/io/csv_readers.py
CHANGED
@@ -1,307 +1,307 @@
All 307 lines are removed and re-added by this hunk, but the removed and added bodies are textually identical, which is consistent with a wheel rebuild (matching the one-line METADATA and WHEEL changes above) rather than a source change. The file content is shown once below, with diff markers and per-line numbers dropped.

# valediction/io/csv_readers.py
from __future__ import annotations

import os
from dataclasses import dataclass
from math import ceil
from pathlib import Path
from typing import Iterator, NamedTuple

import pandas as pd
from pandas import DataFrame
from pandas.errors import ParserError

from valediction.support import _normalise_name


class FrameChunk(NamedTuple):
    """A chunk of rows + I/O metadata.

    - start/end are 0-based inclusive row numbers in the full dataset.
    - file_pos/total_size/bytes_read are None when not reading from disk.
    """

    df: DataFrame
    start: int  # 0-based, inclusive
    end: int  # 0-based, inclusive
    total_size: int | None  # bytes of the whole file
    file_pos: int | None  # f.tell() after producing this chunk
    bytes_read: int | None  # bytes consumed to produce this chunk
    chunk_index: int | None  # 1-based index of this chunk (the first chunk is 1)

    # Cumulative Totals
    total_bytes_read: int | None
    total_chunks_seen: int | None

    def estimate_chunk_count(self) -> int:
        # Buffers (accounting for CSV tails/bytes inaccuracy)
        EPS_ABS = 4096  # Fixed
        EPS_REL = 0.05  # 5% tail buffer

        bytes_seen = int(self.total_bytes_read)
        chunks_seen = max(1, int(self.total_chunks_seen))
        average = max(1.0, bytes_seen / float(chunks_seen))

        remaining = max(0, int(self.total_size) - bytes_seen)

        # Account for small tail if potentially complete
        tail_thresh = max(EPS_ABS, int(EPS_REL * average))
        if remaining <= tail_thresh:
            remaining = 0

        return chunks_seen + (0 if remaining == 0 else int(ceil(remaining / average)))

    def update_df(self, df: DataFrame) -> FrameChunk:
        return self._replace(df=df)


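A worked illustration of estimate_chunk_count, as a sketch separate from the module source (all sizes hypothetical): after one ~10 MB chunk of a 100 MB file, the running average projects ten chunks in total.

    from pandas import DataFrame
    from valediction.io.csv_readers import FrameChunk

    chunk = FrameChunk(
        df=DataFrame(),               # payload is irrelevant to the estimate
        start=0,
        end=99_999,                   # hypothetical row span
        total_size=100_000_000,       # 100 MB file
        file_pos=10_000_000,
        bytes_read=10_000_000,        # first ~10 MB chunk
        chunk_index=1,
        total_bytes_read=10_000_000,
        total_chunks_seen=1,
    )
    # average = 10 MB, remaining = 90 MB, tail threshold = max(4096, 0.05 * average)
    print(chunk.estimate_chunk_count())  # 1 + ceil(90 MB / 10 MB) = 10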
@dataclass(slots=True)
class CsvReadConfig:
    """Canonical CSV reading defaults for the overall project.

    Notes:
    - dtype="string" always reads columns as string, permitting downstream inference/validation.
    - keep_default_na=False and na_values=[] prevent pandas from coercing tokens like "NA".
    - We normalise headers and strip string values post-read (vectorised).
    """

    dtype: str = "string"
    keep_default_na: bool = False
    na_values: list[str] | None = None
    encoding: str = "utf-8"
    normalise_headers: bool = True
    strip_values: bool = True
    usecols: list[str] | None = None

    def __post_init__(self) -> None:
        if self.na_values is None:
            self.na_values = []


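A minimal configuration sketch, assuming hypothetical column names; the defaults keep the canonical behaviour (string dtype, no NA coercion, header normalisation, value stripping).

    from valediction.io.csv_readers import CsvReadConfig

    cfg = CsvReadConfig(usecols=["SUBJECT_ID", "VALUE"])  # hypothetical columns
    assert cfg.dtype == "string"
    assert cfg.keep_default_na is False and cfg.na_values == []  # "NA" stays a literal string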
def _kwargs(cfg: CsvReadConfig | None = None) -> dict:
    cfg = cfg or CsvReadConfig()
    return dict(
        dtype=cfg.dtype,
        keep_default_na=cfg.keep_default_na,
        na_values=cfg.na_values,
        encoding=cfg.encoding,
        usecols=cfg.usecols,
    )


def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
    """Apply header normalisation and vectorised value stripping after reading."""
    cfg = cfg or CsvReadConfig()
    if cfg.normalise_headers:
        df = df.rename(columns={c: _normalise_name(c) for c in df.columns})
    if cfg.strip_values:
        str_cols = df.select_dtypes(include=["string"]).columns
        if len(str_cols) > 0:
            df[str_cols] = df[str_cols].apply(lambda s: s.str.strip())
    return df


def read_csv_headers(path: str | Path, cfg: CsvReadConfig | None = None) -> DataFrame:
    """Read headers only (nrows=0) with canonical settings; returns a DataFrame."""
    cfg = cfg or CsvReadConfig()
    cfg.strip_values = False  # note: this also mutates the caller's cfg when one is passed in

    try:
        header = pd.read_csv(path, nrows=0, **_kwargs(cfg))
        return _post_read_processing(header, cfg)

    except ParserError as e:
        raise ParserError(
            f"Malformed CSV while reading header from '{path}': {e}"
        ) from e


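A header-inspection sketch; the path is illustrative (the package ships demo CSVs such as DEMOGRAPHICS.csv).

    from valediction.io.csv_readers import read_csv_headers

    header = read_csv_headers("DEMOGRAPHICS.csv")  # illustrative path
    print(list(header.columns))  # normalised column names; no data rows are read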
def read_csv_all(path: str | Path, cfg: CsvReadConfig | None = None) -> FrameChunk:
    """Read the entire CSV with canonical settings; returns a single FrameChunk."""
    cfg = cfg or CsvReadConfig()
    try:
        file_size = os.path.getsize(path)

        with open(path, "rb") as file:
            start_pos = file.tell()
            df = pd.read_csv(file, **_kwargs(cfg))
            end_pos = file.tell()
            df = _post_read_processing(df, cfg)
            n = len(df)

            return FrameChunk(
                df=df,
                start=0,
                end=n - 1,
                total_size=file_size,
                file_pos=end_pos,
                bytes_read=end_pos - start_pos,
                chunk_index=1,
                total_bytes_read=file_size,
                total_chunks_seen=1,
            )
    except ParserError as e:
        raise ParserError(f"Malformed CSV while reading '{path}': {e}") from e


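A whole-file read sketch with an illustrative path; the returned FrameChunk pairs the DataFrame with byte-level metadata.

    from valediction.io.csv_readers import read_csv_all

    chunk = read_csv_all("VITALS.csv")  # illustrative path
    df = chunk.df
    assert chunk.start == 0 and chunk.end == len(df) - 1
    assert chunk.total_bytes_read == chunk.total_size  # the whole file was consumed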
def read_csv_sample(
    path: str | Path, nrows: int, cfg: CsvReadConfig | None = None
) -> FrameChunk:
    """Read first `nrows` with canonical settings; returns a FrameChunk with I/O
    metadata."""
    cfg = cfg or CsvReadConfig()

    try:
        file_size = os.path.getsize(path)
        with open(path, "rb") as file:
            start_pos = file.tell()
            df = pd.read_csv(file, nrows=nrows, **_kwargs(cfg))
            end_pos = file.tell()

            df = _post_read_processing(df, cfg)
            n = len(df)

            bytes_read = (end_pos - start_pos) if end_pos > 0 else None
            file_pos = end_pos if end_pos > 0 else None

            return FrameChunk(
                df=df,
                start=0,
                end=n - 1,
                total_size=file_size,
                file_pos=file_pos,
                bytes_read=bytes_read,
                chunk_index=1,
                total_bytes_read=bytes_read or 0,
                total_chunks_seen=1,
            )

    except ParserError as e:
        raise ParserError(
            f"Malformed CSV while reading sample from '{path}': {e}"
        ) from e


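A sampling sketch, row count and path illustrative; useful when only the first rows are needed, e.g. for type inference.

    from valediction.io.csv_readers import read_csv_sample

    sample = read_csv_sample("LAB_TESTS.csv", nrows=500)  # illustrative
    print(len(sample.df), sample.bytes_read)  # rows returned, bytes consumed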
def iter_csv_chunks(
    path: str | Path, chunk_size: int | None, cfg: CsvReadConfig | None = None
) -> Iterator[FrameChunk]:
    """Yield FrameChunk with canonical settings.

    Behaviour:
    - If chunk_size is None or <= 0: yields a single chunk for the entire file.
    - Else: yields multiple chunks each with populated bytes/position metadata.
    """
    cfg = cfg or CsvReadConfig()
    try:
        file_size = os.path.getsize(path)

        # No chunking: one full-file chunk with metadata
        if not chunk_size or (isinstance(chunk_size, int) and chunk_size <= 0):
            with open(path, "rb") as file:
                start_pos = file.tell()
                df = pd.read_csv(file, **_kwargs(cfg))
                end_pos = file.tell()
                df = _post_read_processing(df, cfg)
                n = len(df)
                if n == 0:
                    return
                yield FrameChunk(
                    df=df,
                    start=0,
                    end=n - 1,
                    total_size=file_size,
                    file_pos=end_pos,
                    bytes_read=file_size,  # the single chunk spans the whole file
                    chunk_index=1,
                    total_bytes_read=end_pos - start_pos,
                    total_chunks_seen=1,
                )
                return

        # Chunking: stream with bytes/pos metadata
        with open(path, "rb") as file:
            reader = pd.read_csv(file, chunksize=chunk_size, **_kwargs(cfg))
            prev_pos = file.tell()
            offset = 0
            idx = 0
            cumulative_bytes = 0
            for raw in reader:
                idx += 1
                curr_pos = file.tell()
                bytes_read = max(0, curr_pos - prev_pos)
                prev_pos = curr_pos
                cumulative_bytes += bytes_read

                df = _post_read_processing(raw, cfg)
                n = len(df)
                if n == 0:
                    continue

                start = offset
                end = offset + n - 1
                offset += n

                yield FrameChunk(
                    df=df,
                    start=start,
                    end=end,
                    total_size=file_size,
                    file_pos=curr_pos,
                    bytes_read=bytes_read,
                    chunk_index=idx,
                    total_bytes_read=cumulative_bytes,
                    total_chunks_seen=idx,
                )

    except ParserError as e:
        raise ParserError(
            f"Malformed CSV while reading chunks from '{path}': {e}"
        ) from e


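A streaming sketch with progress reporting, chunk size and path illustrative; each yielded FrameChunk can feed estimate_chunk_count for a running total.

    from valediction.io.csv_readers import iter_csv_chunks

    for chunk in iter_csv_chunks("LAB_TESTS.csv", chunk_size=100_000):  # illustrative
        estimated = chunk.estimate_chunk_count()
        print(f"chunk {chunk.chunk_index}/~{estimated}: rows {chunk.start}-{chunk.end}")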
# Reading specific ranges
def _intersect_local_spans(
    ranges: list[tuple[int, int]],
    chunk_start: int,
    chunk_end: int,
) -> list[tuple[int, int]]:
    out: list[tuple[int, int]] = []
    for r_start, r_end in ranges:
        lo = max(r_start, chunk_start)
        hi = min(r_end, chunk_end)
        if hi >= lo:
            out.append((lo - chunk_start, hi - chunk_start))
    return out


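A worked example of the intersection helper: global ranges are clipped to a chunk covering global rows 10-24, then re-based to chunk-local offsets.

    # (5, 12) clips to (10, 12) -> local (0, 2); (20, 30) clips to (20, 24) -> local (10, 14)
    assert _intersect_local_spans([(5, 12), (20, 30)], 10, 24) == [(0, 2), (10, 14)]
    # a range that misses the chunk entirely contributes nothing
    assert _intersect_local_spans([(40, 50)], 10, 24) == []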
def read_csv_ranges(
    path: str | Path,
    ranges: list[tuple[int, int]],
    cfg: CsvReadConfig | None = None,
    chunk_size: int | None = 1_000_000,
) -> pd.DataFrame:
    """Read only the rows covered by `ranges` (global 0-based inclusive pairs).

    Respects CsvReadConfig (including usecols for column pruning). Returns a
    concatenated DataFrame (may be empty).
    """
    if not ranges:
        # honour columns if specified
        cols = cfg.usecols if (cfg and cfg.usecols) else None
        return pd.DataFrame(columns=cols) if cols else pd.DataFrame()

    parts: list[pd.DataFrame] = []
    for chunk in iter_csv_chunks(path, chunk_size=chunk_size, cfg=cfg):
        local_spans = _intersect_local_spans(ranges, chunk.start, chunk.end)
        if not local_spans:
            continue

        for lo, hi in local_spans:
            part = chunk.df.iloc[lo : hi + 1]
            parts.append(part)

    if not parts:
        cols = cfg.usecols if (cfg and cfg.usecols) else None
        return pd.DataFrame(columns=cols) if cols else pd.DataFrame()

    return pd.concat(parts, axis=0, ignore_index=False)
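A ranged-read sketch with column pruning, all names illustrative; the original row index is preserved (ignore_index=False).

    from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges

    cfg = CsvReadConfig(usecols=["SUBJECT_ID", "VALUE"])  # hypothetical columns
    df = read_csv_ranges("LAB_TESTS.csv", [(0, 99), (500, 599)], cfg=cfg)
    assert len(df) <= 200  # at most the rows the two ranges cover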