datablade-0.0.0-py3-none-any.whl → datablade-0.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablade/__init__.py +49 -1
- datablade/blade.py +322 -0
- datablade/core/__init__.py +28 -7
- datablade/core/frames.py +23 -236
- datablade/core/json.py +5 -10
- datablade/core/lists.py +5 -10
- datablade/core/messages.py +23 -11
- datablade/core/strings.py +5 -43
- datablade/core/zip.py +5 -24
- datablade/dataframes/__init__.py +51 -0
- datablade/dataframes/frames.py +585 -0
- datablade/dataframes/readers.py +1367 -0
- datablade/docs/ARCHITECTURE.md +102 -0
- datablade/docs/OBJECT_REGISTRY.md +194 -0
- datablade/docs/README.md +57 -0
- datablade/docs/TESTING.md +37 -0
- datablade/docs/USAGE.md +409 -0
- datablade/docs/__init__.py +87 -0
- datablade/docs/__main__.py +6 -0
- datablade/io/__init__.py +15 -0
- datablade/io/json.py +70 -0
- datablade/io/zip.py +111 -0
- datablade/registry.py +581 -0
- datablade/sql/__init__.py +56 -0
- datablade/sql/bulk_load.py +665 -0
- datablade/sql/ddl.py +402 -0
- datablade/sql/ddl_pyarrow.py +411 -0
- datablade/sql/dialects.py +12 -0
- datablade/sql/quoting.py +44 -0
- datablade/sql/schema_spec.py +65 -0
- datablade/sql/sqlserver.py +390 -0
- datablade/utils/__init__.py +38 -0
- datablade/utils/lists.py +32 -0
- datablade/utils/logging.py +204 -0
- datablade/utils/messages.py +29 -0
- datablade/utils/strings.py +249 -0
- datablade-0.0.6.dist-info/METADATA +406 -0
- datablade-0.0.6.dist-info/RECORD +41 -0
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
- datablade-0.0.0.dist-info/METADATA +0 -13
- datablade-0.0.0.dist-info/RECORD +0 -13
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/dataframes/readers.py
@@ -0,0 +1,1367 @@
+"""
+Memory-aware file reading utilities with Polars support.
+
+This module provides intelligent file reading that:
+- Estimates memory requirements before loading
+- Automatically chunks large files
+- Uses Polars for high-performance reading when available
+- Writes large files to multiple Parquet partitions
+"""
+
+import csv
+import json
+import pathlib
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Iterable, Iterator, List, Optional, Union
+
+import pandas as pd
+import pyarrow.parquet as pq
+
+from ..utils.logging import (
+    build_log_context,
+    format_log_context,
+    log_debug,
+    log_info,
+    log_warning,
+    timed_step,
+)
+from ..utils.strings import coerce_path, ensure_directory
+
+if TYPE_CHECKING:
+    import polars as pl
+
+
+# File-size guardrails for text-based memory estimation.
+_TEXT_MEDIUM_FILE_BYTES = 100 * 1024 * 1024  # 100 MB
+_TEXT_LARGE_FILE_BYTES = 1024 * 1024 * 1024  # 1 GB
+_TEXT_EXTREME_FILE_BYTES = 5 * 1024 * 1024 * 1024  # 5 GB
+_TEXT_LARGE_MULTIPLIER = 4.0
+_TEXT_EXTREME_MULTIPLIER = 6.0
+
+
+# Heuristic thresholds for text file sizing (bytes) and multipliers used
+# when estimating memory requirements without sampling.
+# These are intentionally exported (leading underscore) so tests can
+# reference the same thresholds.
+_TEXT_MEDIUM_FILE_BYTES = 5 * 1024 * 1024
+_TEXT_LARGE_FILE_BYTES = 50 * 1024 * 1024
+_TEXT_EXTREME_FILE_BYTES = 1 * 1024 * 1024 * 1024
+
+_TEXT_LARGE_MULTIPLIER = 3.0
+_TEXT_EXTREME_MULTIPLIER = 10.0
+
+
+def _normalize_text_delimiter_kwargs(suffix: str, read_kwargs: dict) -> dict:
+    """Normalize delimiter/sep kwargs for delimited text formats.
+
+    - For TSV, default to tab separator unless the caller supplied one.
+    - For CSV/TXT, leave pandas defaults unless the caller supplied one.
+    """
+    if suffix not in (".csv", ".tsv", ".txt"):
+        return read_kwargs
+
+    if "sep" in read_kwargs or "delimiter" in read_kwargs:
+        return read_kwargs
+
+    if suffix == ".tsv":
+        out = dict(read_kwargs)
+        out["sep"] = "\t"
+        return out
+
+    return read_kwargs
+
+
+def _detect_text_encoding(
+    file_path: pathlib.Path,
+    sample_size: int = 10000,
+) -> Optional[str]:
+    """Detect file encoding using optional dependencies."""
+    try:
+        sample = file_path.read_bytes()[:sample_size]
+    except Exception:
+        return None
+
+    try:
+        from charset_normalizer import from_bytes
+
+        best = from_bytes(sample).best()
+        if best and best.encoding:
+            return best.encoding
+    except Exception:
+        pass
+
+    try:
+        import chardet
+
+        detected = chardet.detect(sample)
+        return detected.get("encoding")
+    except Exception:
+        return None
+
+
+def _detect_text_delimiter(sample_text: str) -> Optional[str]:
+    """Detect delimiter using csv.Sniffer with common delimiters."""
+    try:
+        sniffer = csv.Sniffer()
+        dialect = sniffer.sniff(sample_text, delimiters=[",", "\t", ";", "|"])
+        return dialect.delimiter
+    except Exception:
+        return None
+
+
+def _prepare_text_read_kwargs(
+    file_path: pathlib.Path,
+    suffix: str,
+    read_kwargs: dict,
+    verbose: bool,
+) -> dict:
+    """Apply optional encoding/delimiter detection for text formats."""
+    if suffix not in (".csv", ".tsv", ".txt"):
+        return dict(read_kwargs)
+
+    kwargs = dict(read_kwargs)
+    detect_encoding = bool(kwargs.pop("detect_encoding", False))
+    detect_delimiter = bool(kwargs.pop("detect_delimiter", False))
+
+    if detect_encoding and "encoding" not in kwargs:
+        detected_encoding = _detect_text_encoding(file_path)
+        if detected_encoding:
+            kwargs["encoding"] = detected_encoding
+            log_debug(
+                f"Detected encoding '{detected_encoding}' for {file_path.name}.",
+                verbose,
+            )
+
+    if detect_delimiter and "sep" not in kwargs and "delimiter" not in kwargs:
+        encoding = kwargs.get("encoding") or "utf-8"
+        try:
+            with open(file_path, "r", encoding=encoding, errors="replace") as handle:
+                sample_text = handle.read(8192)
+        except Exception:
+            sample_text = ""
+
+        detected_delimiter = _detect_text_delimiter(sample_text)
+        if detected_delimiter:
+            kwargs["sep"] = detected_delimiter
+            log_debug(
+                f"Detected delimiter '{detected_delimiter}' for {file_path.name}.",
+                verbose,
+            )
+
+    return kwargs
+
+
+def _polars_scan_csv_kwargs(suffix: str, read_kwargs: dict) -> dict:
+    """Best-effort mapping of pandas-style kwargs to polars scan_csv kwargs."""
+    # Polars uses `separator` (not `sep`). We only map delimiters because other
+    # pandas kwargs are not generally compatible.
+    scan_kwargs: dict = {}
+
+    if "sep" in read_kwargs:
+        scan_kwargs["separator"] = read_kwargs["sep"]
+    elif "delimiter" in read_kwargs:
+        scan_kwargs["separator"] = read_kwargs["delimiter"]
+    elif suffix == ".tsv":
+        scan_kwargs["separator"] = "\t"
+
+    if "has_header" in read_kwargs:
+        scan_kwargs["has_header"] = read_kwargs["has_header"]
+    elif "header" in read_kwargs:
+        header = read_kwargs["header"]
+        if header is None:
+            scan_kwargs["has_header"] = False
+        elif header == "infer" or header == 0:
+            scan_kwargs["has_header"] = True
+        elif isinstance(header, int):
+            scan_kwargs["has_header"] = True
+            if header > 0:
+                scan_kwargs["skip_rows"] = header
+
+    if "infer_schema_length" in read_kwargs:
+        scan_kwargs["infer_schema_length"] = read_kwargs["infer_schema_length"]
+
+    if "encoding" in read_kwargs:
+        scan_kwargs["encoding"] = read_kwargs["encoding"]
+
+    dtype_value = read_kwargs.get("dtypes", read_kwargs.get("dtype"))
+    if isinstance(dtype_value, (dict, list, tuple)):
+        scan_kwargs["dtypes"] = dtype_value
+
+    return scan_kwargs
+
+
+def _polars_scan_source(path: pathlib.Path, suffix: str, read_kwargs: dict):
+    """Create a Polars LazyFrame for formats with scan support."""
+    import polars as pl
+
+    if suffix == ".parquet":
+        return pl.scan_parquet(path)
+    if suffix in (".csv", ".tsv", ".txt"):
+        return pl.scan_csv(path, **_polars_scan_csv_kwargs(suffix, read_kwargs))
+    if suffix in (".json", ".jsonl") and read_kwargs.get("lines"):
+        return pl.scan_ndjson(path)
+    return None
+
+
+def _normalized_chunks(
+    chunks: Iterable[pd.DataFrame],
+    *,
+    convert_types: bool,
+    verbose: bool,
+) -> Iterator[pd.DataFrame]:
+    """Normalize chunk schemas and optionally coerce numeric strings."""
+    from .frames import clean_dataframe_columns, try_cast_string_columns_to_numeric
+
+    for chunk in chunks:
+        chunk = clean_dataframe_columns(chunk, verbose=verbose)
+        if convert_types:
+            chunk = try_cast_string_columns_to_numeric(chunk, verbose=verbose)
+        yield chunk
+
+
+def _iter_json_objects(
+    file_path: pathlib.Path,
+    record_path: str,
+) -> Iterator[dict]:
+    """Yield JSON objects from a standard JSON file using ijson."""
+    try:
+        import ijson
+    except ImportError as exc:
+        raise ImportError(
+            "Streaming non-JSON Lines files requires the optional 'ijson' dependency."
+        ) from exc
+
+    with open(file_path, "rb") as handle:
+        yield from ijson.items(handle, record_path)
+
+
+def json_to_jsonl(
+    file_path: Union[str, pathlib.Path],
+    output_path: Union[str, pathlib.Path],
+    record_path: str = "item",
+    encoding: str = "utf-8",
+    verbose: bool = False,
+) -> pathlib.Path:
+    """Convert a standard JSON file to JSON Lines.
+
+    Args:
+        file_path: Input JSON file path.
+        output_path: Destination JSON Lines file path.
+        record_path: ijson record path for arrays (default: "item").
+        encoding: Output encoding.
+        verbose: If True, logs progress.
+    """
+    path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
+    output = coerce_path(
+        output_path, must_exist=False, verbose=verbose, label="output_path"
+    )
+
+    try:
+        iterator = _iter_json_objects(path, record_path)
+    except ImportError:
+        data = json.loads(path.read_text(encoding=encoding))
+        if isinstance(data, list):
+            iterator = iter(data)
+        elif isinstance(data, dict) and record_path in data:
+            iterator = iter(data[record_path])
+        else:
+            raise ValueError(
+                "JSON conversion requires a top-level list or a record_path key."
+            )
+
+    output.parent.mkdir(parents=True, exist_ok=True)
+    with open(output, "w", encoding=encoding) as handle:
+        count = 0
+        for obj in iterator:
+            handle.write(json.dumps(obj))
+            handle.write("\n")
+            count += 1
+
+    log_info(f"Wrote {count} JSON Lines records to {output}", verbose)
+    return output
+
+
+def _infer_parquet_batch_rows(
+    file_path: pathlib.Path,
+    parquet_file: pq.ParquetFile,
+    memory_fraction: float,
+    verbose: bool,
+) -> int:
+    """Infer an approximate Parquet batch size (rows) to keep memory bounded."""
+    try:
+        available_memory = _get_available_memory()
+        target_memory = int(available_memory * memory_fraction)
+        file_size = file_path.stat().st_size
+        num_rows = int(getattr(parquet_file.metadata, "num_rows", 0) or 0)
+        if num_rows <= 0 or file_size <= 0 or target_memory <= 0:
+            return 65_536
+
+        # Parquet is compressed on disk; materialized batches are larger.
+        # We use a conservative multiplier to avoid overshooting.
+        bytes_per_row_on_disk = file_size / num_rows
+        inflated_bytes_per_row = max(1.0, bytes_per_row_on_disk * 3.0)
+        batch_rows = int(target_memory / inflated_bytes_per_row)
+
+        # Keep within sane bounds.
+        batch_rows = max(1_024, min(1_000_000, batch_rows))
+        log_debug(
+            f"Auto Parquet batch_rows={batch_rows}"
+            f"{format_log_context(build_log_context(file_path=file_path))}",
+            verbose,
+        )
+        return batch_rows
+    except Exception:
+        return 65_536
+
+
+def _get_available_memory() -> int:
+    """Get available system memory in bytes."""
+    try:
+        import psutil
+
+        return psutil.virtual_memory().available
+    except ImportError:
+        log_warning("psutil not installed; assuming 4GB available memory", verbose=True)
+        return 4 * 1024 * 1024 * 1024
+
+
+def _estimate_text_column_count(
+    file_path: pathlib.Path,
+    delimiter: str,
+) -> int:
+    """Estimate the number of columns in a delimited text file."""
+    try:
+        with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
+            header = handle.readline()
+    except Exception:
+        return 1
+
+    if not header:
+        return 1
+
+    return max(1, header.count(delimiter) + 1)
+
+
+def _adaptive_sample_rows(file_size: int, sample_rows: int, column_count: int) -> int:
+    """Adapt sample rows to file size and width to limit costly inference."""
+    rows = max(25, sample_rows)
+
+    if file_size >= _TEXT_MEDIUM_FILE_BYTES:
+        rows = min(rows, 500)
+
+    if column_count >= 300:
+        rows = min(rows, 100)
+    elif column_count >= 100:
+        rows = min(rows, 200)
+
+    return max(25, rows)
+
+
+def _estimate_file_memory(file_path: pathlib.Path, sample_rows: int = 1000) -> int:
+    """
+    Estimate memory required to load a file by sampling.
+
+    Returns estimated bytes needed to load entire file.
+    """
+    file_size = file_path.stat().st_size
+    suffix = file_path.suffix.lower()
+
+    if suffix == ".parquet":
+        return file_size * 3
+
+    if suffix in (".csv", ".tsv", ".txt"):
+        if file_size >= _TEXT_EXTREME_FILE_BYTES:
+            return int(file_size * _TEXT_EXTREME_MULTIPLIER)
+
+        if file_size >= _TEXT_LARGE_FILE_BYTES:
+            return int(file_size * _TEXT_LARGE_MULTIPLIER)
+
+        try:
+            sample_kwargs = {}
+            if suffix == ".tsv":
+                sample_kwargs["sep"] = "\t"
+
+            delimiter = sample_kwargs.get("sep", ",")
+            column_count = _estimate_text_column_count(file_path, delimiter)
+            adaptive_rows = _adaptive_sample_rows(file_size, sample_rows, column_count)
+
+            if file_size >= _TEXT_MEDIUM_FILE_BYTES or column_count >= 200:
+                sample_kwargs["dtype"] = str
+
+            sample = pd.read_csv(file_path, nrows=adaptive_rows, **sample_kwargs)
+            if len(sample) == 0:
+                return file_size * 3
+
+            memory_per_row = sample.memory_usage(deep=True).sum() / len(sample)
+            estimated_rows = _count_lines_estimate(file_path)
+            return int(memory_per_row * estimated_rows * 1.2)
+        except Exception:
+            return file_size * 3
+
+    if suffix in (".xlsx", ".xls"):
+        return file_size * 10
+
+    return file_size * 3
+
+
+def _count_lines_estimate(file_path: pathlib.Path, sample_size: int = 65536) -> int:
+    """Estimate number of lines in a file by sampling."""
+    file_size = file_path.stat().st_size
+    with open(file_path, "rb") as f:
+        sample = f.read(sample_size)
+        lines_in_sample = sample.count(b"\n")
+
+    if lines_in_sample == 0:
+        return 1
+
+    return int(file_size * lines_in_sample / len(sample))
+
+
+def _read_file_chunked_path(
+    path: pathlib.Path,
+    chunksize: Optional[int] = None,
+    memory_fraction: float = 0.5,
+    verbose: bool = False,
+    **read_kwargs,
+) -> Iterator[pd.DataFrame]:
+    """Read a file in chunks from a validated Path."""
+    suffix = path.suffix.lower()
+
+    if suffix == ".parquet":
+        # Parquet can be read in row batches directly from metadata.
+        parquet_file = pq.ParquetFile(path)
+        batch_rows = chunksize
+        if batch_rows is None:
+            batch_rows = _infer_parquet_batch_rows(
+                file_path=path,
+                parquet_file=parquet_file,
+                memory_fraction=memory_fraction,
+                verbose=verbose,
+            )
+
+        for chunk_num, batch in enumerate(
+            parquet_file.iter_batches(batch_size=int(batch_rows), use_threads=True),
+            start=1,
+        ):
+            yield batch.to_pandas()
+            context = format_log_context(
+                build_log_context(
+                    file_path=path,
+                    chunk_index=chunk_num,
+                    chunk_rows=len(batch),
+                )
+            )
+            log_debug(f"Read parquet batch with {len(batch)} rows.{context}", verbose)
+        return
+
+    if suffix not in (".csv", ".tsv", ".txt"):
+        raise ValueError(f"Unsupported file format for chunked reading: {suffix}")
+
+    if chunksize is None:
+        # Auto-size chunks so that each chunk stays under the memory budget.
+        available_memory = _get_available_memory()
+        target_memory = int(available_memory * memory_fraction)
+        estimated_total = _estimate_file_memory(path)
+
+        if estimated_total <= target_memory:
+            context = format_log_context(
+                build_log_context(
+                    file_path=path,
+                    estimated_mb=f"{estimated_total / 1e6:.1f}",
+                    target_mb=f"{target_memory / 1e6:.1f}",
+                )
+            )
+            log_info(f"File fits in memory; reading all at once.{context}", verbose)
+            detected_kwargs = _prepare_text_read_kwargs(
+                file_path=path,
+                suffix=suffix,
+                read_kwargs=read_kwargs,
+                verbose=verbose,
+            )
+            normalized_kwargs = _normalize_text_delimiter_kwargs(
+                suffix, detected_kwargs
+            )
+            df = pd.read_csv(path, **normalized_kwargs)
+            yield df
+            return
+
+        total_lines = _count_lines_estimate(path)
+        memory_per_row = estimated_total / max(1, total_lines)
+        chunksize = max(1000, int(target_memory / memory_per_row))
+        context = format_log_context(
+            build_log_context(
+                file_path=path,
+                estimated_mb=f"{estimated_total / 1e6:.1f}",
+                target_mb=f"{target_memory / 1e6:.1f}",
+                chunk_rows=chunksize,
+            )
+        )
+        log_info(f"File too large; reading in chunks.{context}", verbose)
+
+    chunk_num = 0
+    detected_kwargs = _prepare_text_read_kwargs(
+        file_path=path,
+        suffix=suffix,
+        read_kwargs=read_kwargs,
+        verbose=verbose,
+    )
+    normalized_kwargs = _normalize_text_delimiter_kwargs(suffix, detected_kwargs)
+    for chunk in pd.read_csv(path, chunksize=chunksize, **normalized_kwargs):
+        chunk_num += 1
+        context = format_log_context(
+            build_log_context(file_path=path, chunk_index=chunk_num)
+        )
+        log_debug(f"Read chunk with {len(chunk)} rows.{context}", verbose)
+        yield chunk
+
+
+def read_file_chunked(
+    file_path: Union[str, pathlib.Path],
+    chunksize: Optional[int] = None,
+    memory_fraction: float = 0.5,
+    verbose: bool = False,
+    **read_kwargs,
+) -> Iterator[pd.DataFrame]:
+    """
+    Read a file in chunks, automatically determining chunk size based on available memory.
+
+    Args:
+        file_path: Path to the file to read.
+        chunksize: Optional explicit chunk size (rows). If None, auto-calculated.
+        memory_fraction: Fraction of available memory to use (default: 0.5).
+        verbose: If True, logs progress messages.
+        **read_kwargs: Additional arguments passed to pandas read function.
+
+    Yields:
+        DataFrame chunks.
+
+    Raises:
+        ValueError: If file does not exist or format is unsupported.
+    """
+    path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
+    yield from _read_file_chunked_path(
+        path=path,
+        chunksize=chunksize,
+        memory_fraction=memory_fraction,
+        verbose=verbose,
+        **read_kwargs,
+    )
+
+
+def read_file_iter(
+    file_path: Union[str, pathlib.Path],
+    chunksize: Optional[int] = None,
+    memory_fraction: float = 0.5,
+    verbose: bool = False,
+    **read_kwargs,
+) -> Iterator[pd.DataFrame]:
+    """Stream a file as an iterator of DataFrame chunks.
+
+    This is the "never materialize" API: unlike read_file_smart(), this function
+    does not concatenate chunks into a single DataFrame.
+
+    Supported streaming formats:
+    - .csv / .tsv / .txt (via pandas chunking)
+    - .parquet (via pyarrow iter_batches)
+    - .json (JSON Lines via pandas chunks; or standard JSON arrays via ijson)
+    - .xlsx / .xls (via openpyxl read-only streaming, if available)
+
+    Non-streaming formats:
+    - .xlsx / .xls are loaded fully and yielded as a single DataFrame if
+      openpyxl is unavailable and the file is estimated to fit within
+      memory_fraction of available memory.
+
+    Raises:
+        ValueError: If the file is missing, unsupported, or too large for a
+            non-streaming format.
+    """
+    path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
+
+    suffix = path.suffix.lower()
+
+    if suffix in (".csv", ".tsv", ".txt", ".parquet"):
+        yield from _read_file_chunked_path(
+            path=path,
+            chunksize=chunksize,
+            memory_fraction=memory_fraction,
+            verbose=verbose,
+            **read_kwargs,
+        )
+        return
+
+    if suffix in (".json", ".jsonl"):
+        # pandas can stream JSON only for JSON Lines (one JSON object per line).
+        lines = bool(read_kwargs.get("lines", False))
+        if suffix == ".jsonl" and "lines" not in read_kwargs:
+            read_kwargs = dict(read_kwargs)
+            read_kwargs["lines"] = True
+            lines = True
+        if not lines:
+            record_path = read_kwargs.get("record_path", "item")
+            if chunksize is None:
+                chunksize = 1_000
+                log_info(
+                    "Standard JSON streaming enabled; using default chunksize=1000. "
+                    f"Set record_path='{record_path}' if your JSON is nested.",
+                    verbose,
+                )
+
+            try:
+                iterator = _iter_json_objects(path, record_path)
+            except ImportError:
+                available_memory = _get_available_memory()
+                target_memory = int(available_memory * memory_fraction)
+                estimated_total = _estimate_file_memory(path)
+                if estimated_total > target_memory:
+                    raise ValueError(
+                        "Streaming standard JSON requires the optional 'ijson' dependency. "
+                        "Install ijson or convert the file to JSON Lines with "
+                        "`json_to_jsonl()` (then set lines=True)."
+                    )
+                yield pd.read_json(path, **read_kwargs)
+                return
+
+            buffer: List[dict] = []
+            chunk_num = 0
+            for obj in iterator:
+                buffer.append(obj)
+                if len(buffer) >= int(chunksize):
+                    chunk_num += 1
+                    yield pd.DataFrame(buffer)
+                    log_debug(
+                        f"Read json chunk {chunk_num} with {len(buffer)} rows.",
+                        verbose,
+                    )
+                    buffer = []
+
+            if buffer:
+                chunk_num += 1
+                yield pd.DataFrame(buffer)
+                log_debug(
+                    f"Read json chunk {chunk_num} with {len(buffer)} rows.",
+                    verbose,
+                )
+            return
+
+        # JSON Lines streaming.
+        if chunksize is None:
+            available_memory = _get_available_memory()
+            target_memory = int(available_memory * memory_fraction)
+            estimated_total = _estimate_file_memory(path)
+
+            if estimated_total <= target_memory:
+                yield pd.read_json(path, **read_kwargs)
+                return
+
+            total_lines = _count_lines_estimate(path)
+            memory_per_line = estimated_total / max(1, total_lines)
+            chunksize = max(1000, int(target_memory / max(1.0, memory_per_line)))
+            context = format_log_context(
+                build_log_context(
+                    file_path=path,
+                    estimated_mb=f"{estimated_total / 1e6:.1f}",
+                    target_mb=f"{target_memory / 1e6:.1f}",
+                    chunk_rows=chunksize,
+                )
+            )
+            log_info(f"JSON Lines too large; streaming in chunks.{context}", verbose)
+
+        # pandas returns a TextFileReader-like iterator when chunksize is provided.
+        json_iter = pd.read_json(path, chunksize=chunksize, **read_kwargs)
+        for i, chunk in enumerate(json_iter, start=1):
+            context = format_log_context(
+                build_log_context(file_path=path, chunk_index=i)
+            )
+            log_debug(f"Read json chunk with {len(chunk)} rows.{context}", verbose)
+            yield chunk
+        return
+
+    if suffix in (".xlsx", ".xls"):
+        available_memory = _get_available_memory()
+        target_memory = int(available_memory * memory_fraction)
+        estimated_total = _estimate_file_memory(path)
+        # If the file is estimated too large to load safely and no explicit
+        # chunksize was provided, avoid attempting to open the workbook
+        # (which would error on invalid files) and raise a clear ValueError.
+        if chunksize is None and estimated_total > target_memory:
+            raise ValueError("Excel streaming is not supported for very large files.")
+
+        if chunksize is None and estimated_total <= target_memory:
+            yield pd.read_excel(path, **read_kwargs)
+            return
+
+        try:
+            yield from _read_excel_streaming(
+                file_path=path,
+                chunksize=chunksize,
+                memory_fraction=memory_fraction,
+                verbose=verbose,
+                **read_kwargs,
+            )
+            return
+        except ImportError as exc:
+            if estimated_total > target_memory:
+                raise ValueError(
+                    "Excel streaming is not supported for very large files. "
+                    "Install openpyxl for streaming or convert to CSV/Parquet first."
+                ) from exc
+            yield pd.read_excel(path, **read_kwargs)
+            return
+
+    raise ValueError(f"Unsupported file format for streaming: {suffix}")
+
+
+def _read_excel_streaming(
+    file_path: Union[str, pathlib.Path],
+    chunksize: Optional[int],
+    memory_fraction: float,
+    verbose: bool,
+    **read_kwargs,
+) -> Iterator[pd.DataFrame]:
+    """Stream Excel files using openpyxl read-only mode."""
+    try:
+        import openpyxl
+    except ImportError as exc:  # pragma: no cover - depends on optional dependency
+        raise ImportError("openpyxl is required for Excel streaming") from exc
+
+    path = pathlib.Path(file_path)
+    if chunksize is None:
+        chunksize = 10_000
+        log_info(
+            f"Excel streaming enabled; using default chunksize={chunksize} rows.",
+            verbose,
+        )
+
+    sheet_name = read_kwargs.pop("sheet_name", 0)
+    header = read_kwargs.pop("header", 0)
+    data_only = read_kwargs.pop("data_only", True)
+
+    if isinstance(sheet_name, (list, tuple)):
+        raise ValueError("Excel streaming supports a single sheet_name at a time.")
+
+    @contextmanager
+    def _openpyxl_workbook() -> Iterator[openpyxl.Workbook]:
+        workbook = openpyxl.load_workbook(path, read_only=True, data_only=data_only)
+        try:
+            yield workbook
+        finally:
+            workbook.close()
+
+    with _openpyxl_workbook() as workbook:
+        try:
+            if sheet_name is None:
+                worksheet = workbook.active
+            elif isinstance(sheet_name, int):
+                worksheet = workbook.worksheets[sheet_name]
+            else:
+                worksheet = workbook[sheet_name]
+        except (KeyError, IndexError) as exc:
+            raise ValueError(f"Sheet not found: {sheet_name}") from exc
+
+        row_iter = worksheet.iter_rows(values_only=True)
+        columns: Optional[List[str]] = None
+        if header is not None:
+            header_index = int(header)
+            for _ in range(header_index):
+                next(row_iter, None)
+            header_row = next(row_iter, None)
+            if header_row is None:
+                return
+            columns = ["" if value is None else str(value) for value in header_row]
+
+        buffer: List[List[object]] = []
+        chunk_num = 0
+        for row in row_iter:
+            buffer.append(list(row))
+            if len(buffer) >= int(chunksize):
+                chunk_num += 1
+                yield pd.DataFrame(buffer, columns=columns)
+                log_debug(
+                    f"Read excel chunk {chunk_num} with {len(buffer)} rows.",
+                    verbose,
+                )
+                buffer = []
+
+        if buffer:
+            chunk_num += 1
+            yield pd.DataFrame(buffer, columns=columns)
+            log_debug(f"Read excel chunk {chunk_num} with {len(buffer)} rows.", verbose)
+
+
+def excel_to_parquets(
+    file_path: Union[str, pathlib.Path],
+    output_dir: Union[str, pathlib.Path],
+    output_prefix: str = "part",
+    rows_per_file: Optional[int] = None,
+    memory_fraction: float = 0.5,
+    convert_types: bool = True,
+    verbose: bool = False,
+    **read_kwargs,
+) -> List[pathlib.Path]:
+    """Stream an Excel file to multiple Parquet partitions.
+
+    This requires openpyxl and reads the Excel file in read-only mode.
+    """
+    path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
+    output_path = ensure_directory(output_dir, verbose=verbose, label="output_dir")
+
+    output_files: List[pathlib.Path] = []
+    part_num = 0
+
+    chunks = _read_excel_streaming(
+        file_path=path,
+        chunksize=rows_per_file,
+        memory_fraction=memory_fraction,
+        verbose=verbose,
+        **read_kwargs,
+    )
+    for chunk in _normalized_chunks(
+        chunks,
+        convert_types=convert_types,
+        verbose=verbose,
+    ):
+        output_file = output_path / f"{output_prefix}_{part_num:05d}.parquet"
+        chunk.to_parquet(output_file, index=False)
+        output_files.append(output_file)
+
+        log_info(f"Wrote {len(chunk)} rows to {output_file}", verbose)
+        part_num += 1
+
+    log_info(f"Created {len(output_files)} Parquet files in {output_path}", verbose)
+    return output_files
+
+
+def read_file_to_parquets(
+    file_path: Union[str, pathlib.Path],
+    output_dir: Union[str, pathlib.Path],
+    output_prefix: str = "part",
+    rows_per_file: Optional[int] = None,
+    memory_fraction: float = 0.5,
+    convert_types: bool = True,
+    verbose: bool = False,
+    **read_kwargs,
+) -> List[pathlib.Path]:
+    """
+    Read a large file and write it to multiple Parquet files if it doesn't fit in memory.
+
+    Args:
+        file_path: Path to the input file.
+        output_dir: Directory where Parquet files will be written.
+        output_prefix: Prefix for output file names (default: "part").
+        rows_per_file: Optional explicit rows per output file. If None, auto-calculated.
+        memory_fraction: Fraction of available memory to use.
+        convert_types: If True, attempts to convert string columns to numeric.
+        verbose: If True, logs progress messages.
+        **read_kwargs: Additional arguments passed to pandas read function.
+
+    Returns:
+        List of paths to the created Parquet files.
+
+    Raises:
+        ValueError: If file does not exist or format is unsupported.
+    """
+    path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
+    output_path = ensure_directory(output_dir, verbose=verbose, label="output_dir")
+
+    output_files: List[pathlib.Path] = []
+    part_num = 0
+
+    chunks = _read_file_chunked_path(
+        path=path,
+        chunksize=rows_per_file,
+        memory_fraction=memory_fraction,
+        verbose=verbose,
+        **read_kwargs,
+    )
+    for chunk in _normalized_chunks(
+        chunks,
+        convert_types=convert_types,
+        verbose=verbose,
+    ):
+        output_file = output_path / f"{output_prefix}_{part_num:05d}.parquet"
+        chunk.to_parquet(output_file, index=False)
+        output_files.append(output_file)
+
+        log_info(f"Wrote {len(chunk)} rows to {output_file}", verbose)
+        part_num += 1
+
+    log_info(f"Created {len(output_files)} Parquet files in {output_path}", verbose)
+    return output_files
+
+
+def stream_to_sink(
+    chunks: Iterable[pd.DataFrame],
+    output_dir: Union[str, pathlib.Path],
+    output_prefix: str = "part",
+    convert_types: bool = True,
+    verbose: bool = False,
+    sink: Optional[Callable[[pd.DataFrame, pathlib.Path], None]] = None,
+    output_suffix: str = ".parquet",
+) -> List[pathlib.Path]:
+    """Consume an iterator of DataFrames and write incremental partitions.
+
+    Args:
+        chunks: Iterable of DataFrame chunks (e.g., from read_file_iter()).
+        output_dir: Directory where partitions are written.
+        output_prefix: Filename prefix for partitions.
+        convert_types: If True, attempts to convert numeric-looking strings.
+        verbose: If True, logs progress.
+        sink: Optional custom sink function. If omitted, writes Parquet files via
+            DataFrame.to_parquet(output_file).
+        output_suffix: File suffix to use for output files (default: ".parquet").
+
+    Returns:
+        List of output file paths produced by the sink.
+    """
+    output_path = ensure_directory(output_dir, verbose=verbose, label="output_dir")
+
+    output_files: List[pathlib.Path] = []
+    part_num = 0
+
+    if not isinstance(output_suffix, str) or not output_suffix:
+        raise ValueError("output_suffix must be a non-empty string")
+    if not output_suffix.startswith("."):
+        output_suffix = f".{output_suffix}"
+
+    if sink is None:
+
+        def _default_sink(chunk: pd.DataFrame, output_file: pathlib.Path) -> None:
+            chunk.to_parquet(output_file, index=False)
+
+        sink = _default_sink
+
+    with timed_step("stream_to_sink", verbose=verbose):
+        for chunk in _normalized_chunks(
+            chunks,
+            convert_types=convert_types,
+            verbose=verbose,
+        ):
+            output_file = output_path / f"{output_prefix}_{part_num:05d}{output_suffix}"
+            sink(chunk, output_file)
+            output_files.append(output_file)
+
+            log_info(f"Wrote {len(chunk)} rows to {output_file}", verbose)
+            part_num += 1
+
+    log_info(f"Created {len(output_files)} partitions in {output_path}", verbose)
+    return output_files
+
+
+def stream_to_parquets(
+    file_path: Union[str, pathlib.Path],
+    output_dir: Union[str, pathlib.Path],
+    output_prefix: str = "part",
+    rows_per_file: Optional[int] = None,
+    memory_fraction: float = 0.5,
+    convert_types: bool = True,
+    verbose: bool = False,
+    **read_kwargs,
+) -> List[pathlib.Path]:
+    """Stream a file and write it to Parquet partitions without materializing.
+
+    This helper is the "no concat" companion to read_file_to_parquets(). It uses
+    read_file_iter() under the hood and writes each incoming chunk to a separate
+    Parquet file.
+
+    Args:
+        file_path: Input file path.
+        output_dir: Directory where Parquet partitions are written.
+        output_prefix: Output filename prefix.
+        rows_per_file: Desired rows per partition. For streaming formats this
+            is passed as chunksize; if None, chunk sizes are chosen automatically
+            based on memory_fraction.
+        memory_fraction: Fraction of available memory to use when auto-sizing.
+        convert_types: If True, attempts to convert numeric-looking strings.
+        verbose: If True, logs progress.
+        **read_kwargs: Passed to the underlying reader.
+
+    Returns:
+        List of Parquet file paths.
+
+    Raises:
+        ValueError: If the input is missing/unsupported.
+    """
+    chunk_iter = read_file_iter(
+        file_path=file_path,
+        chunksize=rows_per_file,
+        memory_fraction=memory_fraction,
+        verbose=verbose,
+        **read_kwargs,
+    )
+    return stream_to_sink(
+        chunks=chunk_iter,
+        output_dir=output_dir,
+        output_prefix=output_prefix,
+        convert_types=convert_types,
+        verbose=verbose,
+    )
+
+
+def parquet_to_csv_partitions(
+    file_path: Union[str, pathlib.Path],
+    output_dir: Union[str, pathlib.Path],
+    output_prefix: str = "part",
+    rows_per_file: Optional[int] = None,
+    memory_fraction: float = 0.5,
+    convert_types: bool = True,
+    verbose: bool = False,
+    delimiter: str = ",",
+    include_header: bool = True,
+    line_terminator: str = "\n",
+    drop_columns: Optional[list[str]] = None,
+    column_order: Optional[list[str]] = None,
+    drop_extra_columns: bool = False,
+) -> List[pathlib.Path]:
+    """Stream a Parquet file to CSV partitions without materializing.
+
+    Args:
+        file_path: Parquet file path.
+        output_dir: Directory where CSV partitions are written.
+        output_prefix: Output filename prefix.
+        rows_per_file: Desired rows per partition. If None, batch size is chosen
+            automatically based on memory_fraction.
+        memory_fraction: Fraction of available memory to use when auto-sizing.
+        convert_types: If True, attempts to convert numeric-looking strings.
+        verbose: If True, logs progress.
+        delimiter: CSV delimiter.
+        include_header: If True, include headers in each CSV file.
+        line_terminator: Line terminator used in CSV output.
+        drop_columns: Optional column names to drop before writing.
+        column_order: Optional column order to enforce in CSV output.
+        drop_extra_columns: If True, drop columns not in column_order.
+
+    Returns:
+        List of CSV file paths.
+    """
+    path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
+    if path.suffix.lower() != ".parquet":
+        raise ValueError("file_path must point to a .parquet file")
+
+    chunk_iter = read_file_iter(
+        file_path=path,
+        chunksize=rows_per_file,
+        memory_fraction=memory_fraction,
+        verbose=verbose,
+    )
+
+    drop_set = set(drop_columns or [])
+    if column_order is not None:
+        if not isinstance(column_order, list) or not all(
+            isinstance(col, str) and col.strip() for col in column_order
+        ):
+            raise ValueError("column_order must be a list of non-empty strings")
+        if len(column_order) != len(set(column_order)):
+            raise ValueError("column_order must not contain duplicates")
+
+    def _csv_sink(chunk: pd.DataFrame, output_file: pathlib.Path) -> None:
+        if drop_set:
+            chunk = chunk.drop(columns=[c for c in drop_set if c in chunk.columns])
+        if column_order is not None:
+            missing = [c for c in column_order if c not in chunk.columns]
+            if missing:
+                raise ValueError(
+                    f"Missing columns for CSV output: {missing}. "
+                    "Ensure all Parquet files share the same schema."
+                )
+            if drop_extra_columns:
+                chunk = chunk[[c for c in column_order]]
+            else:
+                extra = [c for c in chunk.columns if c not in column_order]
+                chunk = chunk[[*column_order, *extra]]
+        chunk.to_csv(
+            output_file,
+            index=False,
+            sep=delimiter,
+            header=include_header,
+            lineterminator=line_terminator,
+        )
+
+    return stream_to_sink(
+        chunks=chunk_iter,
+        output_dir=output_dir,
+        output_prefix=output_prefix,
+        convert_types=convert_types,
+        verbose=verbose,
+        sink=_csv_sink,
+        output_suffix=".csv",
+    )
+
+
+def _resolve_return_type(
+    return_polars: bool,
+    return_type: str,
+) -> tuple[str, bool, bool, str]:
+    if return_polars:
+        if return_type not in ("pandas", "polars"):
+            raise ValueError(
+                "return_polars cannot be combined with return_type other than "
+                "'pandas' or 'polars'."
+            )
+        return_type = "polars"
+    normalized_return_type = return_type.lower()
+    wants_polars = normalized_return_type in ("polars", "polars_lazy", "lazy")
+    return_lazy = normalized_return_type in ("polars_lazy", "lazy")
+    return normalized_return_type, wants_polars, return_lazy, return_type
+
+
+def _validate_return_type(normalized_return_type: str, return_type: str) -> None:
+    valid_return_types = {
+        "pandas",
+        "dataframe",
+        "iterator",
+        "polars",
+        "polars_lazy",
+        "lazy",
+    }
+    if normalized_return_type not in valid_return_types:
+        raise ValueError(
+            f"Invalid return_type: {return_type}. Expected one of "
+            f"{sorted(valid_return_types)}."
+        )
+
+
+def _require_polars():
+    try:
+        import polars as pl
+    except ImportError as exc:
+        raise ImportError(
+            "Polars is required for return_type='polars' or 'polars_lazy'."
+        ) from exc
+    return pl
+
+
+def _read_with_polars_return_type(
+    *,
+    path: pathlib.Path,
+    suffix: str,
+    text_read_kwargs: dict,
+    fits_in_memory: bool,
+    return_lazy: bool,
+    verbose: bool,
+    read_pandas_direct: Callable[[], pd.DataFrame],
+    read_pandas_chunked: Callable[[], pd.DataFrame],
+):
+    pl = _require_polars()
+    polars_scan = _polars_scan_source(path, suffix, text_read_kwargs)
+    if polars_scan is not None:
+        if return_lazy:
+            return polars_scan
+        with timed_step("polars_collect", verbose=verbose):
+            df_polars = polars_scan.collect(streaming=not fits_in_memory)
+        context = format_log_context(build_log_context(file_path=path))
+        log_info(f"Polars read {len(df_polars)} rows.{context}", verbose)
+        return df_polars
+
+    if fits_in_memory:
+        with timed_step("pandas_read_direct", verbose=verbose):
+            df_pandas = read_pandas_direct()
+    else:
+        with timed_step("pandas_read_chunked", verbose=verbose):
+            df_pandas = read_pandas_chunked()
+    if return_lazy:
+        return pl.from_pandas(df_pandas).lazy()
+    return pl.from_pandas(df_pandas)
+
+
+def _read_with_polars_fallback(
+    *,
+    path: pathlib.Path,
+    suffix: str,
+    text_read_kwargs: dict,
+    fits_in_memory: bool,
+    use_polars: bool,
+    estimated_memory: int,
+    target_memory: int,
+    verbose: bool,
+) -> Optional[pd.DataFrame]:
+    if not use_polars or fits_in_memory:
+        return None
+    try:
+        context = format_log_context(
+            build_log_context(
+                file_path=path,
+                estimated_mb=f"{estimated_memory / 1e6:.1f}",
+                target_mb=f"{target_memory / 1e6:.1f}",
+            )
+        )
+        log_info(f"Using Polars for large file.{context}", verbose)
+
+        lf = _polars_scan_source(path, suffix, text_read_kwargs)
+        if lf is None:
+            raise ValueError(f"Unsupported format for Polars: {suffix}")
+
+        with timed_step("polars_collect", verbose=verbose):
+            df_polars = lf.collect(streaming=True)
+        context = format_log_context(build_log_context(file_path=path))
+        log_info(f"Polars read {len(df_polars)} rows.{context}", verbose)
+        return df_polars.to_pandas()
+
+    except ImportError:
+        context = format_log_context(build_log_context(file_path=path))
+        log_warning(
+            "Polars not installed; falling back to pandas chunked reading."
+            f"{context}",
+            verbose,
+        )
+    except Exception as e:
+        context = format_log_context(build_log_context(file_path=path))
+        log_warning(f"Polars failed: {e}; falling back to pandas.{context}", verbose)
+    return None
+
+
+def read_file_smart(
+    file_path: Union[str, pathlib.Path],
+    use_polars: bool = True,
+    return_polars: bool = False,
+    return_type: str = "pandas",
+    memory_fraction: float = 0.5,
+    verbose: bool = False,
+    **read_kwargs,
+) -> Union[pd.DataFrame, "pl.DataFrame", "pl.LazyFrame"]:
+    """
+    Intelligently read a file, using Polars for large files if available.
+
+    For files that fit in memory, reads directly. For large files, uses
+    Polars lazy evaluation or pandas chunking as a fallback.
+
+    Args:
+        file_path: Path to the file to read.
+        use_polars: If True and Polars is available, uses Polars for large files.
+        return_polars: If True, return a Polars DataFrame (alias for return_type="polars").
+        return_type: "pandas" (default), "polars", "polars_lazy", or "iterator".
+        memory_fraction: Fraction of available memory to use.
+        verbose: If True, logs progress messages.
+        return_type: One of "pandas", "polars", "polars_lazy", or "iterator".
+        **read_kwargs: Additional arguments passed to the read function.
+
+    Returns:
+        DataFrame with the file contents, or an iterator when return_type="iterator".
+
+    Raises:
+        ValueError: If file does not exist or format is unsupported.
+    """
+    (
+        normalized_return_type,
+        wants_polars,
+        return_lazy,
+        return_type,
+    ) = _resolve_return_type(return_polars, return_type)
+
+    path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
+
+    suffix = path.suffix.lower()
+    if suffix == ".jsonl" and "lines" not in read_kwargs:
+        read_kwargs = dict(read_kwargs)
+        read_kwargs["lines"] = True
+
+    if normalized_return_type == "iterator":
+        return read_file_iter(
+            file_path=path,
+            memory_fraction=memory_fraction,
+            verbose=verbose,
+            **read_kwargs,
+        )
+    available_memory = _get_available_memory()
+    target_memory = int(available_memory * memory_fraction)
+    estimated_memory = _estimate_file_memory(path)
+
+    fits_in_memory = estimated_memory <= target_memory
+    _validate_return_type(normalized_return_type, return_type)
+
+    context = format_log_context(
+        build_log_context(
+            file_path=path,
+            estimated_mb=f"{estimated_memory / 1e6:.1f}",
+            target_mb=f"{target_memory / 1e6:.1f}",
+            fits=fits_in_memory,
+        )
+    )
+    log_debug(f"Computed file memory fit.{context}", verbose)
+
+    text_read_kwargs = (
+        _prepare_text_read_kwargs(
+            file_path=path,
+            suffix=suffix,
+            read_kwargs=read_kwargs,
+            verbose=verbose,
+        )
+        if suffix in (".csv", ".tsv", ".txt")
+        else dict(read_kwargs)
+    )
+
+    def _read_pandas_direct() -> pd.DataFrame:
+        if suffix == ".parquet":
+            return pd.read_parquet(path, **read_kwargs)
+        if suffix in (".csv", ".tsv", ".txt"):
+            normalized_kwargs = _normalize_text_delimiter_kwargs(
+                suffix, text_read_kwargs
+            )
+            return pd.read_csv(path, **normalized_kwargs)
+        if suffix in (".xlsx", ".xls"):
+            return pd.read_excel(path, **read_kwargs)
+        if suffix in (".json", ".jsonl"):
+            return pd.read_json(path, **read_kwargs)
+        raise ValueError(f"Unsupported file format: {suffix}")
+
+    def _read_pandas_chunked() -> pd.DataFrame:
+        normalized_kwargs = _normalize_text_delimiter_kwargs(suffix, text_read_kwargs)
+        return pd.concat(
+            _read_file_chunked_path(
+                path=path,
+                memory_fraction=memory_fraction,
+                verbose=verbose,
+                **normalized_kwargs,
+            ),
+            ignore_index=True,
+        )
+
+    def _read_pandas_with_logging() -> pd.DataFrame:
+        if fits_in_memory:
+            context = format_log_context(
+                build_log_context(
+                    file_path=path,
+                    estimated_mb=f"{estimated_memory / 1e6:.1f}",
+                    target_mb=f"{target_memory / 1e6:.1f}",
+                )
+            )
+            log_info(f"Reading file directly.{context}", verbose)
+            with timed_step("pandas_read_direct", verbose=verbose):
+                return _read_pandas_direct()
+
+        context = format_log_context(
+            build_log_context(
+                file_path=path,
+                estimated_mb=f"{estimated_memory / 1e6:.1f}",
+                target_mb=f"{target_memory / 1e6:.1f}",
+            )
+        )
+        log_info(f"Reading large file in chunks.{context}", verbose)
+        with timed_step("pandas_read_chunked", verbose=verbose):
+            return _read_pandas_chunked()
+
+    if wants_polars:
+        return _read_with_polars_return_type(
+            path=path,
+            suffix=suffix,
+            text_read_kwargs=text_read_kwargs,
+            fits_in_memory=fits_in_memory,
+            return_lazy=return_lazy,
+            verbose=verbose,
+            read_pandas_direct=_read_pandas_direct,
+            read_pandas_chunked=_read_pandas_chunked,
+        )
+
+    polars_df = _read_with_polars_fallback(
+        path=path,
+        suffix=suffix,
+        text_read_kwargs=text_read_kwargs,
+        fits_in_memory=fits_in_memory,
+        use_polars=use_polars,
+        estimated_memory=estimated_memory,
+        target_memory=target_memory,
+        verbose=verbose,
+    )
+    if polars_df is not None:
+        return polars_df
+
+    return _read_pandas_with_logging()
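
For orientation, a minimal usage sketch for read_file_smart() as added in this version, based only on the signature and docstring in the hunk above. The CSV path, the column name, and the direct import from datablade.dataframes.readers are illustrative assumptions, and return_type="polars_lazy" additionally needs the optional polars dependency.

from datablade.dataframes.readers import read_file_smart

# Small file: estimated to fit within memory_fraction, so it is read directly into pandas.
df = read_file_smart("events.csv", verbose=True)

# Large file: request a Polars LazyFrame so nothing is materialized until collect().
lazy = read_file_smart("events.csv", return_type="polars_lazy")
user_ids = lazy.select("user_id").collect()

# Or keep chunking in the caller's hands entirely.
for chunk in read_file_smart("events.csv", return_type="iterator", memory_fraction=0.25):
    print(len(chunk))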
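
A sketch of the streaming path introduced above: read_file_iter() yields pandas chunks without ever concatenating them, and stream_to_parquets() routes the same chunks into partitioned Parquet output. The file and directory names are illustrative.

from datablade.dataframes.readers import read_file_iter, stream_to_parquets

# Count rows of a large CSV chunk by chunk; chunk size is auto-sized from available memory.
total_rows = 0
for chunk in read_file_iter("transactions.csv", memory_fraction=0.25, verbose=True):
    total_rows += len(chunk)

# Write the same file out as Parquet partitions without ever materializing it.
parts = stream_to_parquets(
    "transactions.csv",
    output_dir="out/transactions_parquet",
    rows_per_file=250_000,
    verbose=True,
)
print(f"{total_rows} rows across {len(parts)} partitions")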
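
Standard JSON arrays stream only when the optional ijson dependency is installed; otherwise json_to_jsonl() can convert the file up front so it streams as JSON Lines. The file names are illustrative, and the input is assumed to be a top-level JSON array (the default record_path="item").

from datablade.dataframes.readers import json_to_jsonl, read_file_iter

# Convert a top-level JSON array into one object per line.
jsonl_path = json_to_jsonl("dump.json", "dump.jsonl", verbose=True)

# JSON Lines then streams through pandas chunking.
rows = 0
for chunk in read_file_iter(jsonl_path, lines=True, chunksize=50_000):
    rows += len(chunk)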
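
A sketch for the reverse direction, exporting a Parquet file as CSV partitions with parquet_to_csv_partitions(). The paths and the column names passed to drop_columns and column_order are hypothetical and would have to exist in the source schema.

from datablade.dataframes.readers import parquet_to_csv_partitions

csv_parts = parquet_to_csv_partitions(
    "warehouse/facts.parquet",
    output_dir="export/facts_csv",
    delimiter="|",
    drop_columns=["_ingested_at"],
    column_order=["id", "amount", "created_at"],
    drop_extra_columns=True,
    verbose=True,
)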
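
The chunk auto-sizing in _read_file_chunked_path() is plain arithmetic; here is the same calculation worked through under assumed numbers (8 GB available, memory_fraction=0.5, a CSV estimated at 12 GB in memory over 60 million lines).

# chunksize = max(1000, target_memory / memory_per_row)
available_memory = 8 * 1024**3                        # assumed free memory: 8 GB
target_memory = int(available_memory * 0.5)           # budget: ~4.29 GB
estimated_total = 12 * 1024**3                        # assumed in-memory estimate: 12 GB
total_lines = 60_000_000
memory_per_row = estimated_total / total_lines        # ~215 bytes per row
chunksize = max(1000, int(target_memory / memory_per_row))
print(chunksize)                                      # 20,000,000 rows per chunk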