valediction-1.0.0-py3-none-any.whl → valediction-1.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,307 +1,307 @@
- # valediction/io/csv_readers.py
- from __future__ import annotations
-
- import os
- from dataclasses import dataclass
- from math import ceil
- from pathlib import Path
- from typing import Iterator, NamedTuple
-
- import pandas as pd
- from pandas import DataFrame
- from pandas.errors import ParserError
-
- from valediction.support import _normalise_name
-
-
- class FrameChunk(NamedTuple):
-     """A chunk of rows + I/O metadata.
-
-     - start/end are 0-based inclusive row numbers in the full dataset.
-     - file_pos/total_size/bytes_read are None when not reading from disk.
-     """
-
-     df: DataFrame
-     start: int  # 0-based, inclusive
-     end: int  # 0-based, inclusive
-     total_size: int | None  # bytes of the whole file
-     file_pos: int | None  # f.tell() after producing this chunk
-     bytes_read: int | None  # bytes consumed to produce this chunk
-     chunk_index: int | None  # 1-based index of this chunk
-
-     # Cumulative Totals
-     total_bytes_read: int | None
-     total_chunks_seen: int | None
-
-     def estimate_chunk_count(self) -> int:
-         # Buffers (accounting for CSV tails/bytes inaccuracy)
-         EPS_ABS = 4096  # Fixed
-         EPS_REL = 0.05  # 5% tail buffer
-
-         bytes_seen = int(self.total_bytes_read)
-         chunks_seen = max(1, int(self.total_chunks_seen))
-         average = max(1.0, bytes_seen / float(chunks_seen))
-
-         remaining = max(0, int(self.total_size) - bytes_seen)
-
-         # Account for small tail if potentially complete
-         tail_thresh = max(EPS_ABS, int(EPS_REL * average))
-         if remaining <= tail_thresh:
-             remaining = 0
-
-         return chunks_seen + (0 if remaining == 0 else int(ceil(remaining / average)))
-
-     def update_df(self, df: DataFrame) -> FrameChunk:
-         return self._replace(df=df)
-
-
- @dataclass(slots=True)
- class CsvReadConfig:
-     """Canonical CSV reading defaults for the overall project.
-
-     Notes:
-     - dtype="string" always reads columns as string, permitting downstream inference/validation.
-     - keep_default_na=False and na_values=[] prevent pandas from coercing tokens like "NA".
-     - We normalise headers and strip string values post-read (vectorised).
-     """
-
-     dtype: str = "string"
-     keep_default_na: bool = False
-     na_values: list[str] | None = None
-     encoding: str = "utf-8"
-     normalise_headers: bool = True
-     strip_values: bool = True
-     usecols: list[str] | None = None
-
-     def __post_init__(self) -> None:
-         if self.na_values is None:
-             self.na_values = []
-
-
- def _kwargs(cfg: CsvReadConfig | None = None) -> dict:
-     cfg = cfg or CsvReadConfig()
-     return dict(
-         dtype=cfg.dtype,
-         keep_default_na=cfg.keep_default_na,
-         na_values=cfg.na_values,
-         encoding=cfg.encoding,
-         usecols=cfg.usecols,
-     )
-
-
- def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
-     """Apply header normalisation and vectorised value stripping after reading."""
-     cfg = cfg or CsvReadConfig()
-     if cfg.normalise_headers:
-         df = df.rename(columns={c: _normalise_name(c) for c in df.columns})
-     if cfg.strip_values:
-         str_cols = df.select_dtypes(include=["string"]).columns
-         if len(str_cols) > 0:
-             df[str_cols] = df[str_cols].apply(lambda s: s.str.strip())
-     return df
-
-
- def read_csv_headers(path: str | Path, cfg: CsvReadConfig | None = None) -> DataFrame:
-     """Read headers only (nrows=0) with canonical settings; returns a DataFrame."""
-     cfg = cfg or CsvReadConfig()
-     cfg.strip_values = False
-
-     try:
-         header = pd.read_csv(path, nrows=0, **_kwargs(cfg))
-         return _post_read_processing(header, cfg)
-
-     except ParserError as e:
-         raise ParserError(
-             f"Malformed CSV while reading header from '{path}': {e}"
-         ) from e
-
-
- def read_csv_all(path: str | Path, cfg: CsvReadConfig | None = None) -> FrameChunk:
-     """Read the entire CSV with canonical settings; returns a single FrameChunk."""
-     cfg = cfg or CsvReadConfig()
-     try:
-         file_size = os.path.getsize(path)
-
-         with open(path, "rb") as file:
-             start_pos = file.tell()
-             df = pd.read_csv(file, **_kwargs(cfg))
-             end_pos = file.tell()
-             df = _post_read_processing(df, cfg)
-             n = len(df)
-
-             return FrameChunk(
-                 df=df,
-                 start=0,
-                 end=n - 1,
-                 total_size=file_size,
-                 file_pos=end_pos,
-                 bytes_read=end_pos - start_pos,
-                 chunk_index=1,
-                 total_bytes_read=file_size,
-                 total_chunks_seen=1,
-             )
-     except ParserError as e:
-         raise ParserError(f"Malformed CSV while reading '{path}': {e}") from e
-
-
- def read_csv_sample(
-     path: str | Path, nrows: int, cfg: CsvReadConfig | None = None
- ) -> FrameChunk:
-     """Read the first `nrows` rows with canonical settings; returns a FrameChunk
-     with I/O metadata."""
-     cfg = cfg or CsvReadConfig()
-
-     try:
-         file_size = os.path.getsize(path)
-         with open(path, "rb") as file:
-             start_pos = file.tell()
-             df = pd.read_csv(file, nrows=nrows, **_kwargs(cfg))
-             end_pos = file.tell()
-
-             df = _post_read_processing(df, cfg)
-             n = len(df)
-
-             bytes_read = (end_pos - start_pos) if end_pos > 0 else None
-             file_pos = end_pos if end_pos > 0 else None
-
-             return FrameChunk(
-                 df=df,
-                 start=0,
-                 end=n - 1,
-                 total_size=file_size,
-                 file_pos=file_pos,
-                 bytes_read=bytes_read,
-                 chunk_index=1,
-                 total_bytes_read=bytes_read or 0,
-                 total_chunks_seen=1,
-             )
-
-     except ParserError as e:
-         raise ParserError(
-             f"Malformed CSV while reading sample from '{path}': {e}"
-         ) from e
-
-
- def iter_csv_chunks(
-     path: str | Path, chunk_size: int | None, cfg: CsvReadConfig | None = None
- ) -> Iterator[FrameChunk]:
-     """Yield FrameChunk objects read with canonical settings.
-
-     Behaviour:
-     - If chunk_size is None or <= 0: yields a single chunk for the entire file.
-     - Else: yields multiple chunks, each with populated bytes/position metadata.
-     """
-     cfg = cfg or CsvReadConfig()
-     try:
-         file_size = os.path.getsize(path)
-
-         # No chunking: one full-file chunk with metadata
-         if not chunk_size or (isinstance(chunk_size, int) and chunk_size <= 0):
-             with open(path, "rb") as file:
-                 start_pos = file.tell()
-                 df = pd.read_csv(file, **_kwargs(cfg))
-                 end_pos = file.tell()
-                 df = _post_read_processing(df, cfg)
-                 n = len(df)
-                 if n == 0:
-                     return
-                 yield FrameChunk(
-                     df=df,
-                     start=0,
-                     end=n - 1,
-                     total_size=file_size,
-                     file_pos=end_pos,
-                     bytes_read=file_size,
-                     chunk_index=1,
-                     total_bytes_read=end_pos - start_pos,
-                     total_chunks_seen=1,
-                 )
-             return
-
-         # Chunking: stream with bytes/pos metadata
-         with open(path, "rb") as file:
-             reader = pd.read_csv(file, chunksize=chunk_size, **_kwargs(cfg))
-             prev_pos = file.tell()
-             offset = 0
-             idx = 0
-             cumulative_bytes = 0
-             for raw in reader:
-                 idx += 1
-                 curr_pos = file.tell()
-                 bytes_read = max(0, curr_pos - prev_pos)
-                 prev_pos = curr_pos
-                 cumulative_bytes += bytes_read
-
-                 df = _post_read_processing(raw, cfg)
-                 n = len(df)
-                 if n == 0:
-                     continue
-
-                 start = offset
-                 end = offset + n - 1
-                 offset += n
-
-                 yield FrameChunk(
-                     df=df,
-                     start=start,
-                     end=end,
-                     total_size=file_size,
-                     file_pos=curr_pos,
-                     bytes_read=bytes_read,
-                     chunk_index=idx,
-                     total_bytes_read=cumulative_bytes,
-                     total_chunks_seen=idx,
-                 )
-
-     except ParserError as e:
-         raise ParserError(
-             f"Malformed CSV while reading chunks from '{path}': {e}"
-         ) from e
-
-
- # Reading specific ranges
- def _intersect_local_spans(
-     ranges: list[tuple[int, int]],
-     chunk_start: int,
-     chunk_end: int,
- ) -> list[tuple[int, int]]:
-     out: list[tuple[int, int]] = []
-     for r_start, r_end in ranges:
-         lo = max(r_start, chunk_start)
-         hi = min(r_end, chunk_end)
-         if hi >= lo:
-             out.append((lo - chunk_start, hi - chunk_start))
-     return out
-
-
- def read_csv_ranges(
-     path: str | Path,
-     ranges: list[tuple[int, int]],
-     cfg: CsvReadConfig | None = None,
-     chunk_size: int | None = 1_000_000,
- ) -> pd.DataFrame:
-     """Read only the rows covered by `ranges` (global 0-based inclusive pairs).
-
-     Respects CsvReadConfig (including usecols for column pruning). Returns a
-     concatenated DataFrame (may be empty).
-     """
-     if not ranges:
-         # honour columns if specified
-         cols = cfg.usecols if (cfg and cfg.usecols) else None
-         return pd.DataFrame(columns=cols) if cols else pd.DataFrame()
-
-     parts: list[pd.DataFrame] = []
-     for chunk in iter_csv_chunks(path, chunk_size=chunk_size, cfg=cfg):
-         local_spans = _intersect_local_spans(ranges, chunk.start, chunk.end)
-         if not local_spans:
-             continue
-
-         for lo, hi in local_spans:
-             part = chunk.df.iloc[lo : hi + 1]
-             parts.append(part)
-
-     if not parts:
-         cols = cfg.usecols if (cfg and cfg.usecols) else None
-         return pd.DataFrame(columns=cols) if cols else pd.DataFrame()
-
-     return pd.concat(parts, axis=0, ignore_index=False)
+ # valediction/io/csv_readers.py
+ from __future__ import annotations
+
+ import os
+ from dataclasses import dataclass
+ from math import ceil
+ from pathlib import Path
+ from typing import Iterator, NamedTuple
+
+ import pandas as pd
+ from pandas import DataFrame
+ from pandas.errors import ParserError
+
+ from valediction.support import _normalise_name
+
+
+ class FrameChunk(NamedTuple):
+     """A chunk of rows + I/O metadata.
+
+     - start/end are 0-based inclusive row numbers in the full dataset.
+     - file_pos/total_size/bytes_read are None when not reading from disk.
+     """
+
+     df: DataFrame
+     start: int  # 0-based, inclusive
+     end: int  # 0-based, inclusive
+     total_size: int | None  # bytes of the whole file
+     file_pos: int | None  # f.tell() after producing this chunk
+     bytes_read: int | None  # bytes consumed to produce this chunk
+     chunk_index: int | None  # 1-based index of this chunk
+
+     # Cumulative Totals
+     total_bytes_read: int | None
+     total_chunks_seen: int | None
+
+     def estimate_chunk_count(self) -> int:
+         # Buffers (accounting for CSV tails/bytes inaccuracy)
+         EPS_ABS = 4096  # Fixed
+         EPS_REL = 0.05  # 5% tail buffer
+
+         bytes_seen = int(self.total_bytes_read)
+         chunks_seen = max(1, int(self.total_chunks_seen))
+         average = max(1.0, bytes_seen / float(chunks_seen))
+
+         remaining = max(0, int(self.total_size) - bytes_seen)
+
+         # Account for small tail if potentially complete
+         tail_thresh = max(EPS_ABS, int(EPS_REL * average))
+         if remaining <= tail_thresh:
+             remaining = 0
+
+         return chunks_seen + (0 if remaining == 0 else int(ceil(remaining / average)))
+
+     def update_df(self, df: DataFrame) -> FrameChunk:
+         return self._replace(df=df)
+
+
+ @dataclass(slots=True)
+ class CsvReadConfig:
+     """Canonical CSV reading defaults for the overall project.
+
+     Notes:
+     - dtype="string" always reads columns as string, permitting downstream inference/validation.
+     - keep_default_na=False and na_values=[] prevent pandas from coercing tokens like "NA".
+     - We normalise headers and strip string values post-read (vectorised).
+     """
+
+     dtype: str = "string"
+     keep_default_na: bool = False
+     na_values: list[str] | None = None
+     encoding: str = "utf-8"
+     normalise_headers: bool = True
+     strip_values: bool = True
+     usecols: list[str] | None = None
+
+     def __post_init__(self) -> None:
+         if self.na_values is None:
+             self.na_values = []
+
+
+ def _kwargs(cfg: CsvReadConfig | None = None) -> dict:
+     cfg = cfg or CsvReadConfig()
+     return dict(
+         dtype=cfg.dtype,
+         keep_default_na=cfg.keep_default_na,
+         na_values=cfg.na_values,
+         encoding=cfg.encoding,
+         usecols=cfg.usecols,
+     )
+
+
+ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
+     """Apply header normalisation and vectorised value stripping after reading."""
+     cfg = cfg or CsvReadConfig()
+     if cfg.normalise_headers:
+         df = df.rename(columns={c: _normalise_name(c) for c in df.columns})
+     if cfg.strip_values:
+         str_cols = df.select_dtypes(include=["string"]).columns
+         if len(str_cols) > 0:
+             df[str_cols] = df[str_cols].apply(lambda s: s.str.strip())
+     return df
+
+
+ def read_csv_headers(path: str | Path, cfg: CsvReadConfig | None = None) -> DataFrame:
+     """Read headers only (nrows=0) with canonical settings; returns a DataFrame."""
+     cfg = cfg or CsvReadConfig()
+     cfg.strip_values = False
+
+     try:
+         header = pd.read_csv(path, nrows=0, **_kwargs(cfg))
+         return _post_read_processing(header, cfg)
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading header from '{path}': {e}"
+         ) from e
+
+
+ def read_csv_all(path: str | Path, cfg: CsvReadConfig | None = None) -> FrameChunk:
+     """Read the entire CSV with canonical settings; returns a single FrameChunk."""
+     cfg = cfg or CsvReadConfig()
+     try:
+         file_size = os.path.getsize(path)
+
+         with open(path, "rb") as file:
+             start_pos = file.tell()
+             df = pd.read_csv(file, **_kwargs(cfg))
+             end_pos = file.tell()
+             df = _post_read_processing(df, cfg)
+             n = len(df)
+
+             return FrameChunk(
+                 df=df,
+                 start=0,
+                 end=n - 1,
+                 total_size=file_size,
+                 file_pos=end_pos,
+                 bytes_read=end_pos - start_pos,
+                 chunk_index=1,
+                 total_bytes_read=file_size,
+                 total_chunks_seen=1,
+             )
+     except ParserError as e:
+         raise ParserError(f"Malformed CSV while reading '{path}': {e}") from e
+
+
+ def read_csv_sample(
+     path: str | Path, nrows: int, cfg: CsvReadConfig | None = None
+ ) -> FrameChunk:
+     """Read the first `nrows` rows with canonical settings; returns a FrameChunk
+     with I/O metadata."""
+     cfg = cfg or CsvReadConfig()
+
+     try:
+         file_size = os.path.getsize(path)
+         with open(path, "rb") as file:
+             start_pos = file.tell()
+             df = pd.read_csv(file, nrows=nrows, **_kwargs(cfg))
+             end_pos = file.tell()
+
+             df = _post_read_processing(df, cfg)
+             n = len(df)
+
+             bytes_read = (end_pos - start_pos) if end_pos > 0 else None
+             file_pos = end_pos if end_pos > 0 else None
+
+             return FrameChunk(
+                 df=df,
+                 start=0,
+                 end=n - 1,
+                 total_size=file_size,
+                 file_pos=file_pos,
+                 bytes_read=bytes_read,
+                 chunk_index=1,
+                 total_bytes_read=bytes_read or 0,
+                 total_chunks_seen=1,
+             )
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading sample from '{path}': {e}"
+         ) from e
+
+
+ def iter_csv_chunks(
+     path: str | Path, chunk_size: int | None, cfg: CsvReadConfig | None = None
+ ) -> Iterator[FrameChunk]:
+     """Yield FrameChunk objects read with canonical settings.
+
+     Behaviour:
+     - If chunk_size is None or <= 0: yields a single chunk for the entire file.
+     - Else: yields multiple chunks, each with populated bytes/position metadata.
+     """
+     cfg = cfg or CsvReadConfig()
+     try:
+         file_size = os.path.getsize(path)
+
+         # No chunking: one full-file chunk with metadata
+         if not chunk_size or (isinstance(chunk_size, int) and chunk_size <= 0):
+             with open(path, "rb") as file:
+                 start_pos = file.tell()
+                 df = pd.read_csv(file, **_kwargs(cfg))
+                 end_pos = file.tell()
+                 df = _post_read_processing(df, cfg)
+                 n = len(df)
+                 if n == 0:
+                     return
+                 yield FrameChunk(
+                     df=df,
+                     start=0,
+                     end=n - 1,
+                     total_size=file_size,
+                     file_pos=end_pos,
+                     bytes_read=file_size,
+                     chunk_index=1,
+                     total_bytes_read=end_pos - start_pos,
+                     total_chunks_seen=1,
+                 )
+             return
+
+         # Chunking: stream with bytes/pos metadata
+         with open(path, "rb") as file:
+             reader = pd.read_csv(file, chunksize=chunk_size, **_kwargs(cfg))
+             prev_pos = file.tell()
+             offset = 0
+             idx = 0
+             cumulative_bytes = 0
+             for raw in reader:
+                 idx += 1
+                 curr_pos = file.tell()
+                 bytes_read = max(0, curr_pos - prev_pos)
+                 prev_pos = curr_pos
+                 cumulative_bytes += bytes_read
+
+                 df = _post_read_processing(raw, cfg)
+                 n = len(df)
+                 if n == 0:
+                     continue
+
+                 start = offset
+                 end = offset + n - 1
+                 offset += n
+
+                 yield FrameChunk(
+                     df=df,
+                     start=start,
+                     end=end,
+                     total_size=file_size,
+                     file_pos=curr_pos,
+                     bytes_read=bytes_read,
+                     chunk_index=idx,
+                     total_bytes_read=cumulative_bytes,
+                     total_chunks_seen=idx,
+                 )
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading chunks from '{path}': {e}"
+         ) from e
+
+
+ # Reading specific ranges
+ def _intersect_local_spans(
+     ranges: list[tuple[int, int]],
+     chunk_start: int,
+     chunk_end: int,
+ ) -> list[tuple[int, int]]:
+     out: list[tuple[int, int]] = []
+     for r_start, r_end in ranges:
+         lo = max(r_start, chunk_start)
+         hi = min(r_end, chunk_end)
+         if hi >= lo:
+             out.append((lo - chunk_start, hi - chunk_start))
+     return out
+
+
+ def read_csv_ranges(
+     path: str | Path,
+     ranges: list[tuple[int, int]],
+     cfg: CsvReadConfig | None = None,
+     chunk_size: int | None = 1_000_000,
+ ) -> pd.DataFrame:
+     """Read only the rows covered by `ranges` (global 0-based inclusive pairs).
+
+     Respects CsvReadConfig (including usecols for column pruning). Returns a
+     concatenated DataFrame (may be empty).
+     """
+     if not ranges:
+         # honour columns if specified
+         cols = cfg.usecols if (cfg and cfg.usecols) else None
+         return pd.DataFrame(columns=cols) if cols else pd.DataFrame()
+
+     parts: list[pd.DataFrame] = []
+     for chunk in iter_csv_chunks(path, chunk_size=chunk_size, cfg=cfg):
+         local_spans = _intersect_local_spans(ranges, chunk.start, chunk.end)
+         if not local_spans:
+             continue
+
+         for lo, hi in local_spans:
+             part = chunk.df.iloc[lo : hi + 1]
+             parts.append(part)
+
+     if not parts:
+         cols = cfg.usecols if (cfg and cfg.usecols) else None
+         return pd.DataFrame(columns=cols) if cols else pd.DataFrame()
+
+     return pd.concat(parts, axis=0, ignore_index=False)
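
Both versions ship an identical csv_readers.py, so the sketches below describe the API as it stands on either side of the diff. First, the CsvReadConfig notes are directly observable; a minimal sketch, assuming an in-memory CSV as a stand-in for a file on disk (the column names and values are illustrative):

import io

import pandas as pd

from valediction.io.csv_readers import CsvReadConfig, _kwargs

# "NA" survives as a literal string (keep_default_na=False, na_values=[])
# and every column arrives with the pandas "string" dtype.
buf = io.StringIO("code,value\nNA,  7 \n")
df = pd.read_csv(buf, **_kwargs(CsvReadConfig()))
assert df.loc[0, "code"] == "NA"
assert df["value"].dtype == "string"  # raw value is "  7 "; the module's readers strip it via _post_read_processing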
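
A hedged sketch of the streaming path; the file name data.csv, the column names, and the chunk size are assumptions for illustration:

from valediction.io.csv_readers import CsvReadConfig, iter_csv_chunks

# Hypothetical file; usecols prunes columns at parse time.
cfg = CsvReadConfig(usecols=["id", "name"])
for chunk in iter_csv_chunks("data.csv", chunk_size=100_000, cfg=cfg):
    done = chunk.total_bytes_read / chunk.total_size  # byte-level progress
    print(f"chunk {chunk.chunk_index} of ~{chunk.estimate_chunk_count()}: "
          f"rows {chunk.start}-{chunk.end} ({done:.0%})")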
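
The count behind estimate_chunk_count() is plain extrapolation from the running average. A worked example with assumed numbers: total_size = 1_000_000, total_bytes_read = 300_000 and total_chunks_seen = 3 give an average of 100_000 bytes per chunk; the remaining 700_000 bytes exceed the tail threshold max(4096, int(0.05 * 100_000)) = 5_000, so the estimate is 3 + ceil(700_000 / 100_000) = 10 chunks. A remainder at or below the threshold is treated as a ragged tail and the count is considered complete, which is why the estimate firms up as more chunks are read.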
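
Finally, a sketch of the range reader; the path and row numbers are again illustrative assumptions:

from valediction.io.csv_readers import read_csv_ranges

# Global 0-based inclusive row windows: rows 0-99 and 5_000-5_004.
# Assumes data.csv has at least 5_005 data rows.
df = read_csv_ranges("data.csv", ranges=[(0, 99), (5_000, 5_004)])
assert len(df) == 105  # 100 + 5 rows; ignore_index=False keeps the global row labels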