datablade 0.0.0-py3-none-any.whl → 0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. datablade/__init__.py +49 -1
  2. datablade/blade.py +322 -0
  3. datablade/core/__init__.py +28 -7
  4. datablade/core/frames.py +23 -236
  5. datablade/core/json.py +5 -10
  6. datablade/core/lists.py +5 -10
  7. datablade/core/messages.py +23 -11
  8. datablade/core/strings.py +5 -43
  9. datablade/core/zip.py +5 -24
  10. datablade/dataframes/__init__.py +51 -0
  11. datablade/dataframes/frames.py +585 -0
  12. datablade/dataframes/readers.py +1367 -0
  13. datablade/docs/ARCHITECTURE.md +102 -0
  14. datablade/docs/OBJECT_REGISTRY.md +194 -0
  15. datablade/docs/README.md +57 -0
  16. datablade/docs/TESTING.md +37 -0
  17. datablade/docs/USAGE.md +409 -0
  18. datablade/docs/__init__.py +87 -0
  19. datablade/docs/__main__.py +6 -0
  20. datablade/io/__init__.py +15 -0
  21. datablade/io/json.py +70 -0
  22. datablade/io/zip.py +111 -0
  23. datablade/registry.py +581 -0
  24. datablade/sql/__init__.py +56 -0
  25. datablade/sql/bulk_load.py +665 -0
  26. datablade/sql/ddl.py +402 -0
  27. datablade/sql/ddl_pyarrow.py +411 -0
  28. datablade/sql/dialects.py +12 -0
  29. datablade/sql/quoting.py +44 -0
  30. datablade/sql/schema_spec.py +65 -0
  31. datablade/sql/sqlserver.py +390 -0
  32. datablade/utils/__init__.py +38 -0
  33. datablade/utils/lists.py +32 -0
  34. datablade/utils/logging.py +204 -0
  35. datablade/utils/messages.py +29 -0
  36. datablade/utils/strings.py +249 -0
  37. datablade-0.0.6.dist-info/METADATA +406 -0
  38. datablade-0.0.6.dist-info/RECORD +41 -0
  39. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
  40. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
  41. datablade-0.0.0.dist-info/METADATA +0 -13
  42. datablade-0.0.0.dist-info/RECORD +0 -13
  43. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/dataframes/readers.py
@@ -0,0 +1,1367 @@
1
+ """
2
+ Memory-aware file reading utilities with Polars support.
3
+
4
+ This module provides intelligent file reading that:
5
+ - Estimates memory requirements before loading
6
+ - Automatically chunks large files
7
+ - Uses Polars for high-performance reading when available
8
+ - Writes large files to multiple Parquet partitions
9
+ """
10
+
11
+ import csv
12
+ import json
13
+ import pathlib
14
+ from contextlib import contextmanager
15
+ from typing import TYPE_CHECKING, Callable, Iterable, Iterator, List, Optional, Union
16
+
17
+ import pandas as pd
18
+ import pyarrow.parquet as pq
19
+
20
+ from ..utils.logging import (
21
+ build_log_context,
22
+ format_log_context,
23
+ log_debug,
24
+ log_info,
25
+ log_warning,
26
+ timed_step,
27
+ )
28
+ from ..utils.strings import coerce_path, ensure_directory
29
+
30
+ if TYPE_CHECKING:
31
+ import polars as pl
32
+
33
+
42
+ # Heuristic thresholds for text file sizing (bytes) and multipliers used
43
+ # when estimating memory requirements without sampling.
44
+ # These are intentionally kept at module level (despite the leading
45
+ # underscore) so tests can reference the same thresholds.
46
+ _TEXT_MEDIUM_FILE_BYTES = 5 * 1024 * 1024
47
+ _TEXT_LARGE_FILE_BYTES = 50 * 1024 * 1024
48
+ _TEXT_EXTREME_FILE_BYTES = 1 * 1024 * 1024 * 1024
49
+
50
+ _TEXT_LARGE_MULTIPLIER = 3.0
51
+ _TEXT_EXTREME_MULTIPLIER = 10.0
52
+
53
+
54
+ def _normalize_text_delimiter_kwargs(suffix: str, read_kwargs: dict) -> dict:
55
+ """Normalize delimiter/sep kwargs for delimited text formats.
56
+
57
+ - For TSV, default to tab separator unless the caller supplied one.
58
+ - For CSV/TXT, leave pandas defaults unless the caller supplied one.
59
+ """
60
+ if suffix not in (".csv", ".tsv", ".txt"):
61
+ return read_kwargs
62
+
63
+ if "sep" in read_kwargs or "delimiter" in read_kwargs:
64
+ return read_kwargs
65
+
66
+ if suffix == ".tsv":
67
+ out = dict(read_kwargs)
68
+ out["sep"] = "\t"
69
+ return out
70
+
71
+ return read_kwargs
72
+
73
+
74
+ def _detect_text_encoding(
75
+ file_path: pathlib.Path,
76
+ sample_size: int = 10000,
77
+ ) -> Optional[str]:
78
+ """Detect file encoding using optional dependencies."""
79
+ try:
80
+ sample = file_path.read_bytes()[:sample_size]
81
+ except Exception:
82
+ return None
83
+
84
+ try:
85
+ from charset_normalizer import from_bytes
86
+
87
+ best = from_bytes(sample).best()
88
+ if best and best.encoding:
89
+ return best.encoding
90
+ except Exception:
91
+ pass
92
+
93
+ try:
94
+ import chardet
95
+
96
+ detected = chardet.detect(sample)
97
+ return detected.get("encoding")
98
+ except Exception:
99
+ return None
100
+
101
+
102
+ def _detect_text_delimiter(sample_text: str) -> Optional[str]:
103
+ """Detect delimiter using csv.Sniffer with common delimiters."""
104
+ try:
105
+ sniffer = csv.Sniffer()
106
+ dialect = sniffer.sniff(sample_text, delimiters=[",", "\t", ";", "|"])
107
+ return dialect.delimiter
108
+ except Exception:
109
+ return None
110
+
111
+
112
+ def _prepare_text_read_kwargs(
113
+ file_path: pathlib.Path,
114
+ suffix: str,
115
+ read_kwargs: dict,
116
+ verbose: bool,
117
+ ) -> dict:
118
+ """Apply optional encoding/delimiter detection for text formats."""
119
+ if suffix not in (".csv", ".tsv", ".txt"):
120
+ return dict(read_kwargs)
121
+
122
+ kwargs = dict(read_kwargs)
123
+ detect_encoding = bool(kwargs.pop("detect_encoding", False))
124
+ detect_delimiter = bool(kwargs.pop("detect_delimiter", False))
125
+
126
+ if detect_encoding and "encoding" not in kwargs:
127
+ detected_encoding = _detect_text_encoding(file_path)
128
+ if detected_encoding:
129
+ kwargs["encoding"] = detected_encoding
130
+ log_debug(
131
+ f"Detected encoding '{detected_encoding}' for {file_path.name}.",
132
+ verbose,
133
+ )
134
+
135
+ if detect_delimiter and "sep" not in kwargs and "delimiter" not in kwargs:
136
+ encoding = kwargs.get("encoding") or "utf-8"
137
+ try:
138
+ with open(file_path, "r", encoding=encoding, errors="replace") as handle:
139
+ sample_text = handle.read(8192)
140
+ except Exception:
141
+ sample_text = ""
142
+
143
+ detected_delimiter = _detect_text_delimiter(sample_text)
144
+ if detected_delimiter:
145
+ kwargs["sep"] = detected_delimiter
146
+ log_debug(
147
+ f"Detected delimiter '{detected_delimiter}' for {file_path.name}.",
148
+ verbose,
149
+ )
150
+
151
+ return kwargs
152
+
153
+
154
+ def _polars_scan_csv_kwargs(suffix: str, read_kwargs: dict) -> dict:
155
+ """Best-effort mapping of pandas-style kwargs to polars scan_csv kwargs."""
156
+ # Polars uses `separator` (not `sep`). Only a small set of pandas kwargs has
157
+ # a direct Polars equivalent here; the rest are intentionally ignored.
158
+ scan_kwargs: dict = {}
159
+
160
+ if "sep" in read_kwargs:
161
+ scan_kwargs["separator"] = read_kwargs["sep"]
162
+ elif "delimiter" in read_kwargs:
163
+ scan_kwargs["separator"] = read_kwargs["delimiter"]
164
+ elif suffix == ".tsv":
165
+ scan_kwargs["separator"] = "\t"
166
+
167
+ if "has_header" in read_kwargs:
168
+ scan_kwargs["has_header"] = read_kwargs["has_header"]
169
+ elif "header" in read_kwargs:
170
+ header = read_kwargs["header"]
171
+ if header is None:
172
+ scan_kwargs["has_header"] = False
173
+ elif header == "infer" or header == 0:
174
+ scan_kwargs["has_header"] = True
175
+ elif isinstance(header, int):
176
+ scan_kwargs["has_header"] = True
177
+ if header > 0:
178
+ scan_kwargs["skip_rows"] = header
179
+
180
+ if "infer_schema_length" in read_kwargs:
181
+ scan_kwargs["infer_schema_length"] = read_kwargs["infer_schema_length"]
182
+
183
+ if "encoding" in read_kwargs:
184
+ scan_kwargs["encoding"] = read_kwargs["encoding"]
185
+
186
+ dtype_value = read_kwargs.get("dtypes", read_kwargs.get("dtype"))
187
+ if isinstance(dtype_value, (dict, list, tuple)):
188
+ scan_kwargs["dtypes"] = dtype_value
189
+
190
+ return scan_kwargs
191
+
192
+
193
+ def _polars_scan_source(path: pathlib.Path, suffix: str, read_kwargs: dict):
194
+ """Create a Polars LazyFrame for formats with scan support."""
195
+ import polars as pl
196
+
197
+ if suffix == ".parquet":
198
+ return pl.scan_parquet(path)
199
+ if suffix in (".csv", ".tsv", ".txt"):
200
+ return pl.scan_csv(path, **_polars_scan_csv_kwargs(suffix, read_kwargs))
201
+ if suffix in (".json", ".jsonl") and read_kwargs.get("lines"):
202
+ return pl.scan_ndjson(path)
203
+ return None
204
+
205
+
206
+ def _normalized_chunks(
207
+ chunks: Iterable[pd.DataFrame],
208
+ *,
209
+ convert_types: bool,
210
+ verbose: bool,
211
+ ) -> Iterator[pd.DataFrame]:
212
+ """Normalize chunk schemas and optionally coerce numeric strings."""
213
+ from .frames import clean_dataframe_columns, try_cast_string_columns_to_numeric
214
+
215
+ for chunk in chunks:
216
+ chunk = clean_dataframe_columns(chunk, verbose=verbose)
217
+ if convert_types:
218
+ chunk = try_cast_string_columns_to_numeric(chunk, verbose=verbose)
219
+ yield chunk
220
+
221
+
222
+ def _iter_json_objects(
223
+ file_path: pathlib.Path,
224
+ record_path: str,
225
+ ) -> Iterator[dict]:
226
+ """Yield JSON objects from a standard JSON file using ijson."""
227
+ try:
228
+ import ijson
229
+ except ImportError as exc:
230
+ raise ImportError(
231
+ "Streaming non-JSON Lines files requires the optional 'ijson' dependency."
232
+ ) from exc
233
+
234
+ with open(file_path, "rb") as handle:
235
+ yield from ijson.items(handle, record_path)
236
+
237
+
238
+ def json_to_jsonl(
239
+ file_path: Union[str, pathlib.Path],
240
+ output_path: Union[str, pathlib.Path],
241
+ record_path: str = "item",
242
+ encoding: str = "utf-8",
243
+ verbose: bool = False,
244
+ ) -> pathlib.Path:
245
+ """Convert a standard JSON file to JSON Lines.
246
+
247
+ Args:
248
+ file_path: Input JSON file path.
249
+ output_path: Destination JSON Lines file path.
250
+ record_path: ijson record path for arrays (default: "item").
251
+ encoding: Output encoding.
252
+ verbose: If True, logs progress.
253
+ """
254
+ path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
255
+ output = coerce_path(
256
+ output_path, must_exist=False, verbose=verbose, label="output_path"
257
+ )
258
+
259
+ try:
260
+ iterator = _iter_json_objects(path, record_path)
261
+ except ImportError:
262
+ data = json.loads(path.read_text(encoding=encoding))
263
+ if isinstance(data, list):
264
+ iterator = iter(data)
265
+ elif isinstance(data, dict) and record_path in data:
266
+ iterator = iter(data[record_path])
267
+ else:
268
+ raise ValueError(
269
+ "JSON conversion requires a top-level list or a record_path key."
270
+ )
271
+
272
+ output.parent.mkdir(parents=True, exist_ok=True)
273
+ with open(output, "w", encoding=encoding) as handle:
274
+ count = 0
275
+ for obj in iterator:
276
+ handle.write(json.dumps(obj))
277
+ handle.write("\n")
278
+ count += 1
279
+
280
+ log_info(f"Wrote {count} JSON Lines records to {output}", verbose)
281
+ return output
282
+
283
+
284
+ def _infer_parquet_batch_rows(
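For orientation, a minimal usage sketch of json_to_jsonl() (paths are illustrative; it assumes the module is importable as datablade.dataframes.readers):

    from datablade.dataframes.readers import json_to_jsonl

    # Convert a standard JSON array file into JSON Lines so it can be streamed later.
    out = json_to_jsonl("events.json", "events.jsonl", record_path="item", verbose=True)
    print(out)  # pathlib.Path of the written .jsonl file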
285
+ file_path: pathlib.Path,
286
+ parquet_file: pq.ParquetFile,
287
+ memory_fraction: float,
288
+ verbose: bool,
289
+ ) -> int:
290
+ """Infer an approximate Parquet batch size (rows) to keep memory bounded."""
291
+ try:
292
+ available_memory = _get_available_memory()
293
+ target_memory = int(available_memory * memory_fraction)
294
+ file_size = file_path.stat().st_size
295
+ num_rows = int(getattr(parquet_file.metadata, "num_rows", 0) or 0)
296
+ if num_rows <= 0 or file_size <= 0 or target_memory <= 0:
297
+ return 65_536
298
+
299
+ # Parquet is compressed on disk; materialized batches are larger.
300
+ # We use a conservative multiplier to avoid overshooting.
301
+ bytes_per_row_on_disk = file_size / num_rows
302
+ inflated_bytes_per_row = max(1.0, bytes_per_row_on_disk * 3.0)
303
+ batch_rows = int(target_memory / inflated_bytes_per_row)
304
+
305
+ # Keep within sane bounds.
306
+ batch_rows = max(1_024, min(1_000_000, batch_rows))
307
+ log_debug(
308
+ f"Auto Parquet batch_rows={batch_rows}"
309
+ f"{format_log_context(build_log_context(file_path=file_path))}",
310
+ verbose,
311
+ )
312
+ return batch_rows
313
+ except Exception:
314
+ return 65_536
315
+
316
+
317
+ def _get_available_memory() -> int:
318
+ """Get available system memory in bytes."""
319
+ try:
320
+ import psutil
321
+
322
+ return psutil.virtual_memory().available
323
+ except ImportError:
324
+ log_warning("psutil not installed; assuming 4GB available memory", verbose=True)
325
+ return 4 * 1024 * 1024 * 1024
326
+
327
+
328
+ def _estimate_text_column_count(
329
+ file_path: pathlib.Path,
330
+ delimiter: str,
331
+ ) -> int:
332
+ """Estimate the number of columns in a delimited text file."""
333
+ try:
334
+ with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
335
+ header = handle.readline()
336
+ except Exception:
337
+ return 1
338
+
339
+ if not header:
340
+ return 1
341
+
342
+ return max(1, header.count(delimiter) + 1)
343
+
344
+
345
+ def _adaptive_sample_rows(file_size: int, sample_rows: int, column_count: int) -> int:
346
+ """Adapt sample rows to file size and width to limit costly inference."""
347
+ rows = max(25, sample_rows)
348
+
349
+ if file_size >= _TEXT_MEDIUM_FILE_BYTES:
350
+ rows = min(rows, 500)
351
+
352
+ if column_count >= 300:
353
+ rows = min(rows, 100)
354
+ elif column_count >= 100:
355
+ rows = min(rows, 200)
356
+
357
+ return max(25, rows)
358
+
359
+
360
+ def _estimate_file_memory(file_path: pathlib.Path, sample_rows: int = 1000) -> int:
361
+ """
362
+ Estimate memory required to load a file by sampling.
363
+
364
+ Returns estimated bytes needed to load entire file.
365
+ """
366
+ file_size = file_path.stat().st_size
367
+ suffix = file_path.suffix.lower()
368
+
369
+ if suffix == ".parquet":
370
+ return file_size * 3
371
+
372
+ if suffix in (".csv", ".tsv", ".txt"):
373
+ if file_size >= _TEXT_EXTREME_FILE_BYTES:
374
+ return int(file_size * _TEXT_EXTREME_MULTIPLIER)
375
+
376
+ if file_size >= _TEXT_LARGE_FILE_BYTES:
377
+ return int(file_size * _TEXT_LARGE_MULTIPLIER)
378
+
379
+ try:
380
+ sample_kwargs = {}
381
+ if suffix == ".tsv":
382
+ sample_kwargs["sep"] = "\t"
383
+
384
+ delimiter = sample_kwargs.get("sep", ",")
385
+ column_count = _estimate_text_column_count(file_path, delimiter)
386
+ adaptive_rows = _adaptive_sample_rows(file_size, sample_rows, column_count)
387
+
388
+ if file_size >= _TEXT_MEDIUM_FILE_BYTES or column_count >= 200:
389
+ sample_kwargs["dtype"] = str
390
+
391
+ sample = pd.read_csv(file_path, nrows=adaptive_rows, **sample_kwargs)
392
+ if len(sample) == 0:
393
+ return file_size * 3
394
+
395
+ memory_per_row = sample.memory_usage(deep=True).sum() / len(sample)
396
+ estimated_rows = _count_lines_estimate(file_path)
397
+ return int(memory_per_row * estimated_rows * 1.2)
398
+ except Exception:
399
+ return file_size * 3
400
+
401
+ if suffix in (".xlsx", ".xls"):
402
+ return file_size * 10
403
+
404
+ return file_size * 3
405
+
406
+
407
+ def _count_lines_estimate(file_path: pathlib.Path, sample_size: int = 65536) -> int:
408
+ """Estimate number of lines in a file by sampling."""
409
+ file_size = file_path.stat().st_size
410
+ with open(file_path, "rb") as f:
411
+ sample = f.read(sample_size)
412
+ lines_in_sample = sample.count(b"\n")
413
+
414
+ if lines_in_sample == 0:
415
+ return 1
416
+
417
+ return int(file_size * lines_in_sample / len(sample))
418
+
419
+
420
+ def _read_file_chunked_path(
421
+ path: pathlib.Path,
422
+ chunksize: Optional[int] = None,
423
+ memory_fraction: float = 0.5,
424
+ verbose: bool = False,
425
+ **read_kwargs,
426
+ ) -> Iterator[pd.DataFrame]:
427
+ """Read a file in chunks from a validated Path."""
428
+ suffix = path.suffix.lower()
429
+
430
+ if suffix == ".parquet":
431
+ # Parquet can be read in row batches directly from metadata.
432
+ parquet_file = pq.ParquetFile(path)
433
+ batch_rows = chunksize
434
+ if batch_rows is None:
435
+ batch_rows = _infer_parquet_batch_rows(
436
+ file_path=path,
437
+ parquet_file=parquet_file,
438
+ memory_fraction=memory_fraction,
439
+ verbose=verbose,
440
+ )
441
+
442
+ for chunk_num, batch in enumerate(
443
+ parquet_file.iter_batches(batch_size=int(batch_rows), use_threads=True),
444
+ start=1,
445
+ ):
446
+ yield batch.to_pandas()
447
+ context = format_log_context(
448
+ build_log_context(
449
+ file_path=path,
450
+ chunk_index=chunk_num,
451
+ chunk_rows=len(batch),
452
+ )
453
+ )
454
+ log_debug(f"Read parquet batch with {len(batch)} rows.{context}", verbose)
455
+ return
456
+
457
+ if suffix not in (".csv", ".tsv", ".txt"):
458
+ raise ValueError(f"Unsupported file format for chunked reading: {suffix}")
459
+
460
+ if chunksize is None:
461
+ # Auto-size chunks so that each chunk stays under the memory budget.
462
+ available_memory = _get_available_memory()
463
+ target_memory = int(available_memory * memory_fraction)
464
+ estimated_total = _estimate_file_memory(path)
465
+
466
+ if estimated_total <= target_memory:
467
+ context = format_log_context(
468
+ build_log_context(
469
+ file_path=path,
470
+ estimated_mb=f"{estimated_total / 1e6:.1f}",
471
+ target_mb=f"{target_memory / 1e6:.1f}",
472
+ )
473
+ )
474
+ log_info(f"File fits in memory; reading all at once.{context}", verbose)
475
+ detected_kwargs = _prepare_text_read_kwargs(
476
+ file_path=path,
477
+ suffix=suffix,
478
+ read_kwargs=read_kwargs,
479
+ verbose=verbose,
480
+ )
481
+ normalized_kwargs = _normalize_text_delimiter_kwargs(
482
+ suffix, detected_kwargs
483
+ )
484
+ df = pd.read_csv(path, **normalized_kwargs)
485
+ yield df
486
+ return
487
+
488
+ total_lines = _count_lines_estimate(path)
489
+ memory_per_row = estimated_total / max(1, total_lines)
490
+ chunksize = max(1000, int(target_memory / memory_per_row))
491
+ context = format_log_context(
492
+ build_log_context(
493
+ file_path=path,
494
+ estimated_mb=f"{estimated_total / 1e6:.1f}",
495
+ target_mb=f"{target_memory / 1e6:.1f}",
496
+ chunk_rows=chunksize,
497
+ )
498
+ )
499
+ log_info(f"File too large; reading in chunks.{context}", verbose)
500
+
501
+ chunk_num = 0
502
+ detected_kwargs = _prepare_text_read_kwargs(
503
+ file_path=path,
504
+ suffix=suffix,
505
+ read_kwargs=read_kwargs,
506
+ verbose=verbose,
507
+ )
508
+ normalized_kwargs = _normalize_text_delimiter_kwargs(suffix, detected_kwargs)
509
+ for chunk in pd.read_csv(path, chunksize=chunksize, **normalized_kwargs):
510
+ chunk_num += 1
511
+ context = format_log_context(
512
+ build_log_context(file_path=path, chunk_index=chunk_num)
513
+ )
514
+ log_debug(f"Read chunk with {len(chunk)} rows.{context}", verbose)
515
+ yield chunk
516
+
517
+
518
+ def read_file_chunked(
519
+ file_path: Union[str, pathlib.Path],
520
+ chunksize: Optional[int] = None,
521
+ memory_fraction: float = 0.5,
522
+ verbose: bool = False,
523
+ **read_kwargs,
524
+ ) -> Iterator[pd.DataFrame]:
525
+ """
526
+ Read a file in chunks, automatically determining chunk size based on available memory.
527
+
528
+ Args:
529
+ file_path: Path to the file to read.
530
+ chunksize: Optional explicit chunk size (rows). If None, auto-calculated.
531
+ memory_fraction: Fraction of available memory to use (default: 0.5).
532
+ verbose: If True, logs progress messages.
533
+ **read_kwargs: Additional arguments passed to pandas read function.
534
+
535
+ Yields:
536
+ DataFrame chunks.
537
+
538
+ Raises:
539
+ ValueError: If file does not exist or format is unsupported.
540
+ """
541
+ path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
542
+ yield from _read_file_chunked_path(
543
+ path=path,
544
+ chunksize=chunksize,
545
+ memory_fraction=memory_fraction,
546
+ verbose=verbose,
547
+ **read_kwargs,
548
+ )
549
+
550
+
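A short sketch of read_file_chunked() with auto-sized chunks (illustrative path; chunksize is left to the memory-based sizing described in the docstring):

    from datablade.dataframes.readers import read_file_chunked

    total_rows = 0
    for chunk in read_file_chunked("transactions.csv", memory_fraction=0.25, verbose=True):
        total_rows += len(chunk)  # each chunk is a pandas DataFrame
    print(total_rows)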
551
+ def read_file_iter(
552
+ file_path: Union[str, pathlib.Path],
553
+ chunksize: Optional[int] = None,
554
+ memory_fraction: float = 0.5,
555
+ verbose: bool = False,
556
+ **read_kwargs,
557
+ ) -> Iterator[pd.DataFrame]:
558
+ """Stream a file as an iterator of DataFrame chunks.
559
+
560
+ This is the "never materialize" API: unlike read_file_smart(), this function
561
+ does not concatenate chunks into a single DataFrame.
562
+
563
+ Supported streaming formats:
564
+ - .csv / .tsv / .txt (via pandas chunking)
565
+ - .parquet (via pyarrow iter_batches)
566
+ - .json (JSON Lines via pandas chunks; or standard JSON arrays via ijson)
567
+ - .xlsx / .xls (via openpyxl read-only streaming, if available)
568
+
569
+ Non-streaming formats:
570
+ - .xlsx / .xls are loaded fully and yielded as a single DataFrame if
571
+ openpyxl is unavailable and the file is estimated to fit within
572
+ memory_fraction of available memory.
573
+
574
+ Raises:
575
+ ValueError: If the file is missing, unsupported, or too large for a
576
+ non-streaming format.
577
+ """
578
+ path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
579
+
580
+ suffix = path.suffix.lower()
581
+
582
+ if suffix in (".csv", ".tsv", ".txt", ".parquet"):
583
+ yield from _read_file_chunked_path(
584
+ path=path,
585
+ chunksize=chunksize,
586
+ memory_fraction=memory_fraction,
587
+ verbose=verbose,
588
+ **read_kwargs,
589
+ )
590
+ return
591
+
592
+ if suffix in (".json", ".jsonl"):
593
+ # pandas can stream JSON only for JSON Lines (one JSON object per line).
594
+ lines = bool(read_kwargs.get("lines", False))
595
+ if suffix == ".jsonl" and "lines" not in read_kwargs:
596
+ read_kwargs = dict(read_kwargs)
597
+ read_kwargs["lines"] = True
598
+ lines = True
599
+ if not lines:
600
+ record_path = read_kwargs.pop("record_path", "item")
601
+ if chunksize is None:
602
+ chunksize = 1_000
603
+ log_info(
604
+ "Standard JSON streaming enabled; using default chunksize=1000. "
605
+ f"Set record_path='{record_path}' if your JSON is nested.",
606
+ verbose,
607
+ )
608
+
609
+ try:
610
+ iterator = _iter_json_objects(path, record_path)
611
+ except ImportError:
612
+ available_memory = _get_available_memory()
613
+ target_memory = int(available_memory * memory_fraction)
614
+ estimated_total = _estimate_file_memory(path)
615
+ if estimated_total > target_memory:
616
+ raise ValueError(
617
+ "Streaming standard JSON requires the optional 'ijson' dependency. "
618
+ "Install ijson or convert the file to JSON Lines with "
619
+ "`json_to_jsonl()` (then set lines=True)."
620
+ )
621
+ yield pd.read_json(path, **read_kwargs)
622
+ return
623
+
624
+ buffer: List[dict] = []
625
+ chunk_num = 0
626
+ for obj in iterator:
627
+ buffer.append(obj)
628
+ if len(buffer) >= int(chunksize):
629
+ chunk_num += 1
630
+ yield pd.DataFrame(buffer)
631
+ log_debug(
632
+ f"Read json chunk {chunk_num} with {len(buffer)} rows.",
633
+ verbose,
634
+ )
635
+ buffer = []
636
+
637
+ if buffer:
638
+ chunk_num += 1
639
+ yield pd.DataFrame(buffer)
640
+ log_debug(
641
+ f"Read json chunk {chunk_num} with {len(buffer)} rows.",
642
+ verbose,
643
+ )
644
+ return
645
+
646
+ # JSON Lines streaming.
647
+ if chunksize is None:
648
+ available_memory = _get_available_memory()
649
+ target_memory = int(available_memory * memory_fraction)
650
+ estimated_total = _estimate_file_memory(path)
651
+
652
+ if estimated_total <= target_memory:
653
+ yield pd.read_json(path, **read_kwargs)
654
+ return
655
+
656
+ total_lines = _count_lines_estimate(path)
657
+ memory_per_line = estimated_total / max(1, total_lines)
658
+ chunksize = max(1000, int(target_memory / max(1.0, memory_per_line)))
659
+ context = format_log_context(
660
+ build_log_context(
661
+ file_path=path,
662
+ estimated_mb=f"{estimated_total / 1e6:.1f}",
663
+ target_mb=f"{target_memory / 1e6:.1f}",
664
+ chunk_rows=chunksize,
665
+ )
666
+ )
667
+ log_info(f"JSON Lines too large; streaming in chunks.{context}", verbose)
668
+
669
+ # pandas returns a TextFileReader-like iterator when chunksize is provided.
670
+ json_iter = pd.read_json(path, chunksize=chunksize, **read_kwargs)
671
+ for i, chunk in enumerate(json_iter, start=1):
672
+ context = format_log_context(
673
+ build_log_context(file_path=path, chunk_index=i)
674
+ )
675
+ log_debug(f"Read json chunk with {len(chunk)} rows.{context}", verbose)
676
+ yield chunk
677
+ return
678
+
679
+ if suffix in (".xlsx", ".xls"):
680
+ available_memory = _get_available_memory()
681
+ target_memory = int(available_memory * memory_fraction)
682
+ estimated_total = _estimate_file_memory(path)
683
+ # If the file is estimated too large to load safely and no explicit
684
+ # chunksize was provided, avoid attempting to open the workbook
685
+ # (which would error on invalid files) and raise a clear ValueError.
686
+ if chunksize is None and estimated_total > target_memory:
687
+ raise ValueError("Excel streaming is not supported for very large files.")
688
+
689
+ if chunksize is None and estimated_total <= target_memory:
690
+ yield pd.read_excel(path, **read_kwargs)
691
+ return
692
+
693
+ try:
694
+ yield from _read_excel_streaming(
695
+ file_path=path,
696
+ chunksize=chunksize,
697
+ memory_fraction=memory_fraction,
698
+ verbose=verbose,
699
+ **read_kwargs,
700
+ )
701
+ return
702
+ except ImportError as exc:
703
+ if estimated_total > target_memory:
704
+ raise ValueError(
705
+ "Excel streaming is not supported for very large files. "
706
+ "Install openpyxl for streaming or convert to CSV/Parquet first."
707
+ ) from exc
708
+ yield pd.read_excel(path, **read_kwargs)
709
+ return
710
+
711
+ raise ValueError(f"Unsupported file format for streaming: {suffix}")
712
+
713
+
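A sketch of the streaming API on a JSON Lines file (illustrative path; for .jsonl the function forwards lines=True to pandas.read_json automatically):

    from datablade.dataframes.readers import read_file_iter

    for chunk in read_file_iter("logs.jsonl", chunksize=50_000, verbose=True):
        # Process each DataFrame chunk without materializing the whole file.
        print(chunk.shape)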
714
+ def _read_excel_streaming(
715
+ file_path: Union[str, pathlib.Path],
716
+ chunksize: Optional[int],
717
+ memory_fraction: float,
718
+ verbose: bool,
719
+ **read_kwargs,
720
+ ) -> Iterator[pd.DataFrame]:
721
+ """Stream Excel files using openpyxl read-only mode."""
722
+ try:
723
+ import openpyxl
724
+ except ImportError as exc: # pragma: no cover - depends on optional dependency
725
+ raise ImportError("openpyxl is required for Excel streaming") from exc
726
+
727
+ path = pathlib.Path(file_path)
728
+ if chunksize is None:
729
+ chunksize = 10_000
730
+ log_info(
731
+ f"Excel streaming enabled; using default chunksize={chunksize} rows.",
732
+ verbose,
733
+ )
734
+
735
+ sheet_name = read_kwargs.pop("sheet_name", 0)
736
+ header = read_kwargs.pop("header", 0)
737
+ data_only = read_kwargs.pop("data_only", True)
738
+
739
+ if isinstance(sheet_name, (list, tuple)):
740
+ raise ValueError("Excel streaming supports a single sheet_name at a time.")
741
+
742
+ @contextmanager
743
+ def _openpyxl_workbook() -> Iterator[openpyxl.Workbook]:
744
+ workbook = openpyxl.load_workbook(path, read_only=True, data_only=data_only)
745
+ try:
746
+ yield workbook
747
+ finally:
748
+ workbook.close()
749
+
750
+ with _openpyxl_workbook() as workbook:
751
+ try:
752
+ if sheet_name is None:
753
+ worksheet = workbook.active
754
+ elif isinstance(sheet_name, int):
755
+ worksheet = workbook.worksheets[sheet_name]
756
+ else:
757
+ worksheet = workbook[sheet_name]
758
+ except (KeyError, IndexError) as exc:
759
+ raise ValueError(f"Sheet not found: {sheet_name}") from exc
760
+
761
+ row_iter = worksheet.iter_rows(values_only=True)
762
+ columns: Optional[List[str]] = None
763
+ if header is not None:
764
+ header_index = int(header)
765
+ for _ in range(header_index):
766
+ next(row_iter, None)
767
+ header_row = next(row_iter, None)
768
+ if header_row is None:
769
+ return
770
+ columns = ["" if value is None else str(value) for value in header_row]
771
+
772
+ buffer: List[List[object]] = []
773
+ chunk_num = 0
774
+ for row in row_iter:
775
+ buffer.append(list(row))
776
+ if len(buffer) >= int(chunksize):
777
+ chunk_num += 1
778
+ yield pd.DataFrame(buffer, columns=columns)
779
+ log_debug(
780
+ f"Read excel chunk {chunk_num} with {len(buffer)} rows.",
781
+ verbose,
782
+ )
783
+ buffer = []
784
+
785
+ if buffer:
786
+ chunk_num += 1
787
+ yield pd.DataFrame(buffer, columns=columns)
788
+ log_debug(f"Read excel chunk {chunk_num} with {len(buffer)} rows.", verbose)
789
+
790
+
791
+ def excel_to_parquets(
792
+ file_path: Union[str, pathlib.Path],
793
+ output_dir: Union[str, pathlib.Path],
794
+ output_prefix: str = "part",
795
+ rows_per_file: Optional[int] = None,
796
+ memory_fraction: float = 0.5,
797
+ convert_types: bool = True,
798
+ verbose: bool = False,
799
+ **read_kwargs,
800
+ ) -> List[pathlib.Path]:
801
+ """Stream an Excel file to multiple Parquet partitions.
802
+
803
+ This requires openpyxl and reads the Excel file in read-only mode.
804
+ """
805
+ path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
806
+ output_path = ensure_directory(output_dir, verbose=verbose, label="output_dir")
807
+
808
+ output_files: List[pathlib.Path] = []
809
+ part_num = 0
810
+
811
+ chunks = _read_excel_streaming(
812
+ file_path=path,
813
+ chunksize=rows_per_file,
814
+ memory_fraction=memory_fraction,
815
+ verbose=verbose,
816
+ **read_kwargs,
817
+ )
818
+ for chunk in _normalized_chunks(
819
+ chunks,
820
+ convert_types=convert_types,
821
+ verbose=verbose,
822
+ ):
823
+ output_file = output_path / f"{output_prefix}_{part_num:05d}.parquet"
824
+ chunk.to_parquet(output_file, index=False)
825
+ output_files.append(output_file)
826
+
827
+ log_info(f"Wrote {len(chunk)} rows to {output_file}", verbose)
828
+ part_num += 1
829
+
830
+ log_info(f"Created {len(output_files)} Parquet files in {output_path}", verbose)
831
+ return output_files
832
+
833
+
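A minimal sketch of excel_to_parquets(), assuming openpyxl is installed (path and sheet name are illustrative):

    from datablade.dataframes.readers import excel_to_parquets

    parts = excel_to_parquets(
        "survey.xlsx",
        output_dir="survey_parquet",
        rows_per_file=100_000,
        sheet_name="Responses",  # forwarded to the openpyxl-based streaming reader
        verbose=True,
    )
    print(len(parts), "partitions written")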
834
+ def read_file_to_parquets(
835
+ file_path: Union[str, pathlib.Path],
836
+ output_dir: Union[str, pathlib.Path],
837
+ output_prefix: str = "part",
838
+ rows_per_file: Optional[int] = None,
839
+ memory_fraction: float = 0.5,
840
+ convert_types: bool = True,
841
+ verbose: bool = False,
842
+ **read_kwargs,
843
+ ) -> List[pathlib.Path]:
844
+ """
845
+ Read a large file and write it to multiple Parquet files if it doesn't fit in memory.
846
+
847
+ Args:
848
+ file_path: Path to the input file.
849
+ output_dir: Directory where Parquet files will be written.
850
+ output_prefix: Prefix for output file names (default: "part").
851
+ rows_per_file: Optional explicit rows per output file. If None, auto-calculated.
852
+ memory_fraction: Fraction of available memory to use.
853
+ convert_types: If True, attempts to convert string columns to numeric.
854
+ verbose: If True, logs progress messages.
855
+ **read_kwargs: Additional arguments passed to pandas read function.
856
+
857
+ Returns:
858
+ List of paths to the created Parquet files.
859
+
860
+ Raises:
861
+ ValueError: If file does not exist or format is unsupported.
862
+ """
863
+ path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
864
+ output_path = ensure_directory(output_dir, verbose=verbose, label="output_dir")
865
+
866
+ output_files: List[pathlib.Path] = []
867
+ part_num = 0
868
+
869
+ chunks = _read_file_chunked_path(
870
+ path=path,
871
+ chunksize=rows_per_file,
872
+ memory_fraction=memory_fraction,
873
+ verbose=verbose,
874
+ **read_kwargs,
875
+ )
876
+ for chunk in _normalized_chunks(
877
+ chunks,
878
+ convert_types=convert_types,
879
+ verbose=verbose,
880
+ ):
881
+ output_file = output_path / f"{output_prefix}_{part_num:05d}.parquet"
882
+ chunk.to_parquet(output_file, index=False)
883
+ output_files.append(output_file)
884
+
885
+ log_info(f"Wrote {len(chunk)} rows to {output_file}", verbose)
886
+ part_num += 1
887
+
888
+ log_info(f"Created {len(output_files)} Parquet files in {output_path}", verbose)
889
+ return output_files
890
+
891
+
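A usage sketch of read_file_to_parquets() for splitting a large CSV into Parquet partitions (illustrative paths):

    from datablade.dataframes.readers import read_file_to_parquets

    files = read_file_to_parquets(
        "large_export.csv",
        output_dir="export_parquet",
        output_prefix="export",
        memory_fraction=0.25,
        verbose=True,
    )
    # files is a list of pathlib.Path objects such as export_parquet/export_00000.parquet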
892
+ def stream_to_sink(
893
+ chunks: Iterable[pd.DataFrame],
894
+ output_dir: Union[str, pathlib.Path],
895
+ output_prefix: str = "part",
896
+ convert_types: bool = True,
897
+ verbose: bool = False,
898
+ sink: Optional[Callable[[pd.DataFrame, pathlib.Path], None]] = None,
899
+ output_suffix: str = ".parquet",
900
+ ) -> List[pathlib.Path]:
901
+ """Consume an iterator of DataFrames and write incremental partitions.
902
+
903
+ Args:
904
+ chunks: Iterable of DataFrame chunks (e.g., from read_file_iter()).
905
+ output_dir: Directory where partitions are written.
906
+ output_prefix: Filename prefix for partitions.
907
+ convert_types: If True, attempts to convert numeric-looking strings.
908
+ verbose: If True, logs progress.
909
+ sink: Optional custom sink function. If omitted, writes Parquet files via
910
+ DataFrame.to_parquet(output_file).
911
+ output_suffix: File suffix to use for output files (default: ".parquet").
912
+
913
+ Returns:
914
+ List of output file paths produced by the sink.
915
+ """
916
+ output_path = ensure_directory(output_dir, verbose=verbose, label="output_dir")
917
+
918
+ output_files: List[pathlib.Path] = []
919
+ part_num = 0
920
+
921
+ if not isinstance(output_suffix, str) or not output_suffix:
922
+ raise ValueError("output_suffix must be a non-empty string")
923
+ if not output_suffix.startswith("."):
924
+ output_suffix = f".{output_suffix}"
925
+
926
+ if sink is None:
927
+
928
+ def _default_sink(chunk: pd.DataFrame, output_file: pathlib.Path) -> None:
929
+ chunk.to_parquet(output_file, index=False)
930
+
931
+ sink = _default_sink
932
+
933
+ with timed_step("stream_to_sink", verbose=verbose):
934
+ for chunk in _normalized_chunks(
935
+ chunks,
936
+ convert_types=convert_types,
937
+ verbose=verbose,
938
+ ):
939
+ output_file = output_path / f"{output_prefix}_{part_num:05d}{output_suffix}"
940
+ sink(chunk, output_file)
941
+ output_files.append(output_file)
942
+
943
+ log_info(f"Wrote {len(chunk)} rows to {output_file}", verbose)
944
+ part_num += 1
945
+
946
+ log_info(f"Created {len(output_files)} partitions in {output_path}", verbose)
947
+ return output_files
948
+
949
+
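A sketch of pairing read_file_iter() with a custom sink; the gzip-compressed CSV sink below is purely illustrative:

    import pandas as pd
    from datablade.dataframes.readers import read_file_iter, stream_to_sink

    def csv_gz_sink(chunk: pd.DataFrame, output_file) -> None:
        # Write each chunk as a gzip-compressed CSV partition.
        chunk.to_csv(output_file, index=False, compression="gzip")

    stream_to_sink(
        read_file_iter("big.parquet", verbose=True),
        output_dir="csv_parts",
        sink=csv_gz_sink,
        output_suffix=".csv.gz",
    )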
950
+ def stream_to_parquets(
951
+ file_path: Union[str, pathlib.Path],
952
+ output_dir: Union[str, pathlib.Path],
953
+ output_prefix: str = "part",
954
+ rows_per_file: Optional[int] = None,
955
+ memory_fraction: float = 0.5,
956
+ convert_types: bool = True,
957
+ verbose: bool = False,
958
+ **read_kwargs,
959
+ ) -> List[pathlib.Path]:
960
+ """Stream a file and write it to Parquet partitions without materializing.
961
+
962
+ This helper is the "no concat" companion to read_file_to_parquets(). It uses
963
+ read_file_iter() under the hood and writes each incoming chunk to a separate
964
+ Parquet file.
965
+
966
+ Args:
967
+ file_path: Input file path.
968
+ output_dir: Directory where Parquet partitions are written.
969
+ output_prefix: Output filename prefix.
970
+ rows_per_file: Desired rows per partition. For streaming formats this
971
+ is passed as chunksize; if None, chunk sizes are chosen automatically
972
+ based on memory_fraction.
973
+ memory_fraction: Fraction of available memory to use when auto-sizing.
974
+ convert_types: If True, attempts to convert numeric-looking strings.
975
+ verbose: If True, logs progress.
976
+ **read_kwargs: Passed to the underlying reader.
977
+
978
+ Returns:
979
+ List of Parquet file paths.
980
+
981
+ Raises:
982
+ ValueError: If the input is missing/unsupported.
983
+ """
984
+ chunk_iter = read_file_iter(
985
+ file_path=file_path,
986
+ chunksize=rows_per_file,
987
+ memory_fraction=memory_fraction,
988
+ verbose=verbose,
989
+ **read_kwargs,
990
+ )
991
+ return stream_to_sink(
992
+ chunks=chunk_iter,
993
+ output_dir=output_dir,
994
+ output_prefix=output_prefix,
995
+ convert_types=convert_types,
996
+ verbose=verbose,
997
+ )
998
+
999
+
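A sketch of the no-concat path from a delimited file to Parquet partitions (illustrative paths; TSV input gets a tab separator by default):

    from datablade.dataframes.readers import stream_to_parquets

    parts = stream_to_parquets(
        "clickstream.tsv",
        output_dir="clickstream_parquet",
        rows_per_file=250_000,
        verbose=True,
    )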
1000
+ def parquet_to_csv_partitions(
1001
+ file_path: Union[str, pathlib.Path],
1002
+ output_dir: Union[str, pathlib.Path],
1003
+ output_prefix: str = "part",
1004
+ rows_per_file: Optional[int] = None,
1005
+ memory_fraction: float = 0.5,
1006
+ convert_types: bool = True,
1007
+ verbose: bool = False,
1008
+ delimiter: str = ",",
1009
+ include_header: bool = True,
1010
+ line_terminator: str = "\n",
1011
+ drop_columns: Optional[list[str]] = None,
1012
+ column_order: Optional[list[str]] = None,
1013
+ drop_extra_columns: bool = False,
1014
+ ) -> List[pathlib.Path]:
1015
+ """Stream a Parquet file to CSV partitions without materializing.
1016
+
1017
+ Args:
1018
+ file_path: Parquet file path.
1019
+ output_dir: Directory where CSV partitions are written.
1020
+ output_prefix: Output filename prefix.
1021
+ rows_per_file: Desired rows per partition. If None, batch size is chosen
1022
+ automatically based on memory_fraction.
1023
+ memory_fraction: Fraction of available memory to use when auto-sizing.
1024
+ convert_types: If True, attempts to convert numeric-looking strings.
1025
+ verbose: If True, logs progress.
1026
+ delimiter: CSV delimiter.
1027
+ include_header: If True, include headers in each CSV file.
1028
+ line_terminator: Line terminator used in CSV output.
1029
+ drop_columns: Optional column names to drop before writing.
1030
+ column_order: Optional column order to enforce in CSV output.
1031
+ drop_extra_columns: If True, drop columns not in column_order.
1032
+
1033
+ Returns:
1034
+ List of CSV file paths.
1035
+ """
1036
+ path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
1037
+ if path.suffix.lower() != ".parquet":
1038
+ raise ValueError("file_path must point to a .parquet file")
1039
+
1040
+ chunk_iter = read_file_iter(
1041
+ file_path=path,
1042
+ chunksize=rows_per_file,
1043
+ memory_fraction=memory_fraction,
1044
+ verbose=verbose,
1045
+ )
1046
+
1047
+ drop_set = set(drop_columns or [])
1048
+ if column_order is not None:
1049
+ if not isinstance(column_order, list) or not all(
1050
+ isinstance(col, str) and col.strip() for col in column_order
1051
+ ):
1052
+ raise ValueError("column_order must be a list of non-empty strings")
1053
+ if len(column_order) != len(set(column_order)):
1054
+ raise ValueError("column_order must not contain duplicates")
1055
+
1056
+ def _csv_sink(chunk: pd.DataFrame, output_file: pathlib.Path) -> None:
1057
+ if drop_set:
1058
+ chunk = chunk.drop(columns=[c for c in drop_set if c in chunk.columns])
1059
+ if column_order is not None:
1060
+ missing = [c for c in column_order if c not in chunk.columns]
1061
+ if missing:
1062
+ raise ValueError(
1063
+ f"Missing columns for CSV output: {missing}. "
1064
+ "Ensure all Parquet files share the same schema."
1065
+ )
1066
+ if drop_extra_columns:
1067
+ chunk = chunk[[c for c in column_order]]
1068
+ else:
1069
+ extra = [c for c in chunk.columns if c not in column_order]
1070
+ chunk = chunk[[*column_order, *extra]]
1071
+ chunk.to_csv(
1072
+ output_file,
1073
+ index=False,
1074
+ sep=delimiter,
1075
+ header=include_header,
1076
+ lineterminator=line_terminator,
1077
+ )
1078
+
1079
+ return stream_to_sink(
1080
+ chunks=chunk_iter,
1081
+ output_dir=output_dir,
1082
+ output_prefix=output_prefix,
1083
+ convert_types=convert_types,
1084
+ verbose=verbose,
1085
+ sink=_csv_sink,
1086
+ output_suffix=".csv",
1087
+ )
1088
+
1089
+
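A sketch of parquet_to_csv_partitions() with column shaping; the column names below are illustrative and must exist in the Parquet schema:

    from datablade.dataframes.readers import parquet_to_csv_partitions

    csv_parts = parquet_to_csv_partitions(
        "facts.parquet",
        output_dir="facts_csv",
        delimiter="|",
        include_header=False,
        drop_columns=["_ingest_ts"],
        column_order=["id", "amount", "currency"],
        drop_extra_columns=True,
    )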
1090
+ def _resolve_return_type(
1091
+ return_polars: bool,
1092
+ return_type: str,
1093
+ ) -> tuple[str, bool, bool, str]:
1094
+ if return_polars:
1095
+ if return_type not in ("pandas", "polars"):
1096
+ raise ValueError(
1097
+ "return_polars cannot be combined with return_type other than "
1098
+ "'pandas' or 'polars'."
1099
+ )
1100
+ return_type = "polars"
1101
+ normalized_return_type = return_type.lower()
1102
+ wants_polars = normalized_return_type in ("polars", "polars_lazy", "lazy")
1103
+ return_lazy = normalized_return_type in ("polars_lazy", "lazy")
1104
+ return normalized_return_type, wants_polars, return_lazy, return_type
1105
+
1106
+
1107
+ def _validate_return_type(normalized_return_type: str, return_type: str) -> None:
1108
+ valid_return_types = {
1109
+ "pandas",
1110
+ "dataframe",
1111
+ "iterator",
1112
+ "polars",
1113
+ "polars_lazy",
1114
+ "lazy",
1115
+ }
1116
+ if normalized_return_type not in valid_return_types:
1117
+ raise ValueError(
1118
+ f"Invalid return_type: {return_type}. Expected one of "
1119
+ f"{sorted(valid_return_types)}."
1120
+ )
1121
+
1122
+
1123
+ def _require_polars():
1124
+ try:
1125
+ import polars as pl
1126
+ except ImportError as exc:
1127
+ raise ImportError(
1128
+ "Polars is required for return_type='polars' or 'polars_lazy'."
1129
+ ) from exc
1130
+ return pl
1131
+
1132
+
1133
+ def _read_with_polars_return_type(
1134
+ *,
1135
+ path: pathlib.Path,
1136
+ suffix: str,
1137
+ text_read_kwargs: dict,
1138
+ fits_in_memory: bool,
1139
+ return_lazy: bool,
1140
+ verbose: bool,
1141
+ read_pandas_direct: Callable[[], pd.DataFrame],
1142
+ read_pandas_chunked: Callable[[], pd.DataFrame],
1143
+ ):
1144
+ pl = _require_polars()
1145
+ polars_scan = _polars_scan_source(path, suffix, text_read_kwargs)
1146
+ if polars_scan is not None:
1147
+ if return_lazy:
1148
+ return polars_scan
1149
+ with timed_step("polars_collect", verbose=verbose):
1150
+ df_polars = polars_scan.collect(streaming=not fits_in_memory)
1151
+ context = format_log_context(build_log_context(file_path=path))
1152
+ log_info(f"Polars read {len(df_polars)} rows.{context}", verbose)
1153
+ return df_polars
1154
+
1155
+ if fits_in_memory:
1156
+ with timed_step("pandas_read_direct", verbose=verbose):
1157
+ df_pandas = read_pandas_direct()
1158
+ else:
1159
+ with timed_step("pandas_read_chunked", verbose=verbose):
1160
+ df_pandas = read_pandas_chunked()
1161
+ if return_lazy:
1162
+ return pl.from_pandas(df_pandas).lazy()
1163
+ return pl.from_pandas(df_pandas)
1164
+
1165
+
1166
+ def _read_with_polars_fallback(
1167
+ *,
1168
+ path: pathlib.Path,
1169
+ suffix: str,
1170
+ text_read_kwargs: dict,
1171
+ fits_in_memory: bool,
1172
+ use_polars: bool,
1173
+ estimated_memory: int,
1174
+ target_memory: int,
1175
+ verbose: bool,
1176
+ ) -> Optional[pd.DataFrame]:
1177
+ if not use_polars or fits_in_memory:
1178
+ return None
1179
+ try:
1180
+ context = format_log_context(
1181
+ build_log_context(
1182
+ file_path=path,
1183
+ estimated_mb=f"{estimated_memory / 1e6:.1f}",
1184
+ target_mb=f"{target_memory / 1e6:.1f}",
1185
+ )
1186
+ )
1187
+ log_info(f"Using Polars for large file.{context}", verbose)
1188
+
1189
+ lf = _polars_scan_source(path, suffix, text_read_kwargs)
1190
+ if lf is None:
1191
+ raise ValueError(f"Unsupported format for Polars: {suffix}")
1192
+
1193
+ with timed_step("polars_collect", verbose=verbose):
1194
+ df_polars = lf.collect(streaming=True)
1195
+ context = format_log_context(build_log_context(file_path=path))
1196
+ log_info(f"Polars read {len(df_polars)} rows.{context}", verbose)
1197
+ return df_polars.to_pandas()
1198
+
1199
+ except ImportError:
1200
+ context = format_log_context(build_log_context(file_path=path))
1201
+ log_warning(
1202
+ "Polars not installed; falling back to pandas chunked reading."
1203
+ f"{context}",
1204
+ verbose,
1205
+ )
1206
+ except Exception as e:
1207
+ context = format_log_context(build_log_context(file_path=path))
1208
+ log_warning(f"Polars failed: {e}; falling back to pandas.{context}", verbose)
1209
+ return None
1210
+
1211
+
1212
+ def read_file_smart(
1213
+ file_path: Union[str, pathlib.Path],
1214
+ use_polars: bool = True,
1215
+ return_polars: bool = False,
1216
+ return_type: str = "pandas",
1217
+ memory_fraction: float = 0.5,
1218
+ verbose: bool = False,
1219
+ **read_kwargs,
1220
+ ) -> Union[pd.DataFrame, "pl.DataFrame", "pl.LazyFrame"]:
1221
+ """
1222
+ Intelligently read a file, using Polars for large files if available.
1223
+
1224
+ For files that fit in memory, reads directly. For large files, uses
1225
+ Polars lazy evaluation or pandas chunking as a fallback.
1226
+
1227
+ Args:
1228
+ file_path: Path to the file to read.
1229
+ use_polars: If True and Polars is available, uses Polars for large files.
1230
+ return_polars: If True, return a Polars DataFrame (alias for return_type="polars").
1231
+ return_type: "pandas" (default), "polars", "polars_lazy", or "iterator".
1232
+ memory_fraction: Fraction of available memory to use.
1233
+ verbose: If True, logs progress messages.
1235
+ **read_kwargs: Additional arguments passed to the read function.
1236
+
1237
+ Returns:
1238
+ DataFrame with the file contents, or an iterator when return_type="iterator".
1239
+
1240
+ Raises:
1241
+ ValueError: If file does not exist or format is unsupported.
1242
+ """
1243
+ (
1244
+ normalized_return_type,
1245
+ wants_polars,
1246
+ return_lazy,
1247
+ return_type,
1248
+ ) = _resolve_return_type(return_polars, return_type)
1249
+
1250
+ path = coerce_path(file_path, must_exist=True, verbose=verbose, label="file_path")
1251
+
1252
+ suffix = path.suffix.lower()
1253
+ if suffix == ".jsonl" and "lines" not in read_kwargs:
1254
+ read_kwargs = dict(read_kwargs)
1255
+ read_kwargs["lines"] = True
1256
+
1257
+ if normalized_return_type == "iterator":
1258
+ return read_file_iter(
1259
+ file_path=path,
1260
+ memory_fraction=memory_fraction,
1261
+ verbose=verbose,
1262
+ **read_kwargs,
1263
+ )
1264
+ available_memory = _get_available_memory()
1265
+ target_memory = int(available_memory * memory_fraction)
1266
+ estimated_memory = _estimate_file_memory(path)
1267
+
1268
+ fits_in_memory = estimated_memory <= target_memory
1269
+ _validate_return_type(normalized_return_type, return_type)
1270
+
1271
+ context = format_log_context(
1272
+ build_log_context(
1273
+ file_path=path,
1274
+ estimated_mb=f"{estimated_memory / 1e6:.1f}",
1275
+ target_mb=f"{target_memory / 1e6:.1f}",
1276
+ fits=fits_in_memory,
1277
+ )
1278
+ )
1279
+ log_debug(f"Computed file memory fit.{context}", verbose)
1280
+
1281
+ text_read_kwargs = (
1282
+ _prepare_text_read_kwargs(
1283
+ file_path=path,
1284
+ suffix=suffix,
1285
+ read_kwargs=read_kwargs,
1286
+ verbose=verbose,
1287
+ )
1288
+ if suffix in (".csv", ".tsv", ".txt")
1289
+ else dict(read_kwargs)
1290
+ )
1291
+
1292
+ def _read_pandas_direct() -> pd.DataFrame:
1293
+ if suffix == ".parquet":
1294
+ return pd.read_parquet(path, **read_kwargs)
1295
+ if suffix in (".csv", ".tsv", ".txt"):
1296
+ normalized_kwargs = _normalize_text_delimiter_kwargs(
1297
+ suffix, text_read_kwargs
1298
+ )
1299
+ return pd.read_csv(path, **normalized_kwargs)
1300
+ if suffix in (".xlsx", ".xls"):
1301
+ return pd.read_excel(path, **read_kwargs)
1302
+ if suffix in (".json", ".jsonl"):
1303
+ return pd.read_json(path, **read_kwargs)
1304
+ raise ValueError(f"Unsupported file format: {suffix}")
1305
+
1306
+ def _read_pandas_chunked() -> pd.DataFrame:
1307
+ normalized_kwargs = _normalize_text_delimiter_kwargs(suffix, text_read_kwargs)
1308
+ return pd.concat(
1309
+ _read_file_chunked_path(
1310
+ path=path,
1311
+ memory_fraction=memory_fraction,
1312
+ verbose=verbose,
1313
+ **normalized_kwargs,
1314
+ ),
1315
+ ignore_index=True,
1316
+ )
1317
+
1318
+ def _read_pandas_with_logging() -> pd.DataFrame:
1319
+ if fits_in_memory:
1320
+ context = format_log_context(
1321
+ build_log_context(
1322
+ file_path=path,
1323
+ estimated_mb=f"{estimated_memory / 1e6:.1f}",
1324
+ target_mb=f"{target_memory / 1e6:.1f}",
1325
+ )
1326
+ )
1327
+ log_info(f"Reading file directly.{context}", verbose)
1328
+ with timed_step("pandas_read_direct", verbose=verbose):
1329
+ return _read_pandas_direct()
1330
+
1331
+ context = format_log_context(
1332
+ build_log_context(
1333
+ file_path=path,
1334
+ estimated_mb=f"{estimated_memory / 1e6:.1f}",
1335
+ target_mb=f"{target_memory / 1e6:.1f}",
1336
+ )
1337
+ )
1338
+ log_info(f"Reading large file in chunks.{context}", verbose)
1339
+ with timed_step("pandas_read_chunked", verbose=verbose):
1340
+ return _read_pandas_chunked()
1341
+
1342
+ if wants_polars:
1343
+ return _read_with_polars_return_type(
1344
+ path=path,
1345
+ suffix=suffix,
1346
+ text_read_kwargs=text_read_kwargs,
1347
+ fits_in_memory=fits_in_memory,
1348
+ return_lazy=return_lazy,
1349
+ verbose=verbose,
1350
+ read_pandas_direct=_read_pandas_direct,
1351
+ read_pandas_chunked=_read_pandas_chunked,
1352
+ )
1353
+
1354
+ polars_df = _read_with_polars_fallback(
1355
+ path=path,
1356
+ suffix=suffix,
1357
+ text_read_kwargs=text_read_kwargs,
1358
+ fits_in_memory=fits_in_memory,
1359
+ use_polars=use_polars,
1360
+ estimated_memory=estimated_memory,
1361
+ target_memory=target_memory,
1362
+ verbose=verbose,
1363
+ )
1364
+ if polars_df is not None:
1365
+ return polars_df
1366
+
1367
+ return _read_pandas_with_logging()
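Finally, a sketch of the top-level reader and its return_type switch (illustrative path; Polars is only required for the last two calls):

    from datablade.dataframes.readers import read_file_smart

    df = read_file_smart("orders.csv", verbose=True)                 # pandas DataFrame
    chunks = read_file_smart("orders.csv", return_type="iterator")   # iterator of DataFrame chunks
    pl_df = read_file_smart("orders.csv", return_type="polars")      # polars DataFrame
    lazy = read_file_smart("orders.csv", return_type="polars_lazy")  # polars LazyFrame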