dataframe-textual 1.4.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataframe_textual/__main__.py +10 -4
- dataframe_textual/common.py +201 -163
- dataframe_textual/data_frame_table.py +1037 -881
- dataframe_textual/data_frame_viewer.py +321 -104
- dataframe_textual/sql_screen.py +50 -11
- dataframe_textual/table_screen.py +1 -1
- dataframe_textual/yes_no_screen.py +89 -8
- {dataframe_textual-1.4.0.dist-info → dataframe_textual-1.9.0.dist-info}/METADATA +141 -185
- dataframe_textual-1.9.0.dist-info/RECORD +14 -0
- dataframe_textual-1.4.0.dist-info/RECORD +0 -14
- {dataframe_textual-1.4.0.dist-info → dataframe_textual-1.9.0.dist-info}/WHEEL +0 -0
- {dataframe_textual-1.4.0.dist-info → dataframe_textual-1.9.0.dist-info}/entry_points.txt +0 -0
- {dataframe_textual-1.4.0.dist-info → dataframe_textual-1.9.0.dist-info}/licenses/LICENSE +0 -0
dataframe_textual/__main__.py
CHANGED
@@ -39,14 +39,18 @@ def cli() -> argparse.Namespace:
     parser.add_argument(
         "-I", "--no-inferrence", action="store_true", help="Do not infer data types when reading CSV/TSV"
     )
+    parser.add_argument("-E", "--ignore-errors", action="store_true", help="Ignore errors when reading CSV/TSV")
     parser.add_argument(
-        "-…
+        "-c", "--comment-prefix", nargs="?", const="#", help="Comment lines are skipped when reading CSV/TSV"
     )
-    parser.add_argument("-L", "--skip-lines", type=int, default=0, help="Skip lines when reading CSV/TSV")
     parser.add_argument(
-        "-…
+        "-q", "--quote-char", nargs="?", const=None, default='"', help="Quote character for reading CSV/TSV"
     )
-    parser.add_argument("-…
+    parser.add_argument("-l", "--skip-lines", type=int, default=0, help="Skip lines when reading CSV/TSV")
+    parser.add_argument(
+        "-a", "--skip-rows-after-header", type=int, default=0, help="Skip rows after header when reading CSV/TSV"
+    )
+    parser.add_argument("-n", "--null", nargs="+", help="Values to interpret as null values when reading CSV/TSV")

     args = parser.parse_args()
     if args.files is None:
@@ -78,9 +82,11 @@ def main() -> None:
         has_header=not args.no_header,
         infer_schema=not args.no_inferrence,
         comment_prefix=args.comment_prefix,
+        quote_char=args.quote_char,
         skip_lines=args.skip_lines,
         skip_rows_after_header=args.skip_rows_after_header,
         null_values=args.null,
+        ignore_errors=args.ignore_errors,
     )
     app = DataFrameViewer(*sources)
     app.run()
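The two optional-value flags above (`-c` and `-q`) use argparse's `nargs="?"` with a `const`, which distinguishes "flag absent", "bare flag", and "flag with value". A minimal standalone sketch of that behavior (plain argparse, independent of the package's cli()):

    import argparse

    # Reproduces just the two optional-value flags from the hunk above.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--comment-prefix", nargs="?", const="#")
    parser.add_argument("-q", "--quote-char", nargs="?", const=None, default='"')

    print(parser.parse_args([]).comment_prefix)           # None: comments not skipped
    print(parser.parse_args(["-c"]).comment_prefix)       # '#': bare flag falls back to const
    print(parser.parse_args(["-c", ";"]).comment_prefix)  # ';': explicit value wins
    print(parser.parse_args([]).quote_char)               # '"': normal quoting
    print(parser.parse_args(["-q"]).quote_char)           # None: bare -q disables quoting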
dataframe_textual/common.py
CHANGED
@@ -1,8 +1,10 @@
 """Common utilities and constants for dataframe_viewer."""

+import os
 import re
 import sys
 from dataclasses import dataclass
+from io import StringIO
 from pathlib import Path
 from typing import Any

@@ -34,6 +36,16 @@ NULL_DISPLAY = "-"

 @dataclass
 class DtypeClass:
+    """Data type class configuration.
+
+    Attributes:
+        gtype: Generic, high-level type as a string.
+        style: Style string for display purposes.
+        justify: Text justification for display.
+        itype: Input type for validation.
+        convert: Conversion function for the data type.
+    """
+
     gtype: str  # generic, high-level type
     style: str
     justify: str
@@ -71,7 +83,6 @@ STYLES = {
 }
 # fmt: on

-
 # Subscript digits mapping for sort indicators
 SUBSCRIPT_DIGITS = {
     0: "₀",
@@ -93,6 +104,21 @@ CURSOR_TYPES = ["row", "column", "cell"]
 RIDX = "^_ridx_^"


+@dataclass
+class Source:
+    """Data source representation.
+
+    Attributes:
+        frame: The Polars DataFrame or LazyFrame.
+        filename: The name of the source file.
+        tabname: The name of the tab to display.
+    """
+
+    frame: pl.DataFrame | pl.LazyFrame
+    filename: str
+    tabname: str
+
+
 def DtypeConfig(dtype: pl.DataType) -> DtypeClass:
     """Get the DtypeClass configuration for a given Polars data type.

@@ -222,14 +248,91 @@ def get_next_item(lst: list[Any], current, offset=1) -> Any:
     return lst[next_index]


-def parse_polars_expression(expression: str, columns: list[str], current_col_idx: int) -> str:
+def parse_placeholders(template: str, columns: list[str], current_cidx: int) -> list[str | pl.Expr]:
+    """Parse template string into a list of strings or Polars expressions
+
+    Supports multiple placeholder types:
+    - `$_` - Current column (based on current_cidx parameter)
+    - `$#` - Row index (1-based, requires '^__ridx__^' column to be present)
+    - `$1`, `$2`, etc. - Column index (1-based)
+    - `$name` - Column name (e.g., `$product_id`)
+
+    Args:
+        template: The template string containing placeholders and literal text
+        columns: List of column names in the dataframe
+        current_cidx: 0-based index of the current column for `$_` references in the columns list
+
+    Returns:
+        A list of strings (literal text) and Polars expressions (for column references)
+
+    Raises:
+        ValueError: If invalid column index or non-existent column name is referenced
+    """
+    if "$" not in template or template.endswith("$"):
+        return [template]
+
+    # Regex matches: $_ or $\d+ or $\w+ (column names)
+    placeholder_pattern = r"\$(_|#|\d+|[a-zA-Z_]\w*)"
+    placeholders = re.finditer(placeholder_pattern, template)
+
+    parts = []
+    last_end = 0
+
+    # Get current column name for $_ references
+    try:
+        col_name = columns[current_cidx]
+    except IndexError:
+        raise ValueError(f"Current column index {current_cidx} is out of range for columns list")
+
+    for match in placeholders:
+        # Add literal text before this placeholder
+        if match.start() > last_end:
+            parts.append(template[last_end : match.start()])
+
+        placeholder = match.group(1)  # Extract content after '$'
+
+        if placeholder == "_":
+            # $_ refers to current column (where cursor was)
+            parts.append(pl.col(col_name))
+        elif placeholder == "#":
+            # $# refers to row index (1-based)
+            parts.append((pl.col(RIDX)))
+        elif placeholder.isdigit():
+            # $1, $2, etc. refer to columns by 1-based position index
+            col_idx = int(placeholder) - 1  # Convert to 0-based
+            try:
+                col_ref = columns[col_idx]
+                parts.append(pl.col(col_ref))
+            except IndexError:
+                raise ValueError(f"Invalid column index: ${placeholder} (valid range: $1 to ${len(columns)})")
+        else:
+            # $name refers to column by name
+            if placeholder in columns:
+                parts.append(pl.col(placeholder))
+            else:
+                raise ValueError(f"Column not found: ${placeholder} (available columns: {', '.join(columns)})")
+
+        last_end = match.end()
+
+    # Add remaining literal text after last placeholder
+    if last_end < len(template):
+        parts.append(template[last_end:])
+
+    # If no placeholders found, treat entire template as literal
+    if not parts:
+        parts = [template]
+
+    return parts
+
+
+def parse_polars_expression(expression: str, columns: list[str], current_cidx: int) -> str:
     """Parse and convert an expression to Polars syntax.

     Replaces column references with Polars col() expressions:
     - $_ - Current selected column
     - $# - Row index (1-based, requires '^__ridx__^' column to be present)
-    - $1, $2, etc. - Column…
-    - $col_name - Column…
+    - $1, $2, etc. - Column index (1-based)
+    - $col_name - Column name (valid identifier starting with _ or letter)

     Examples:
         - "$_ > 50" -> "pl.col('current_col') > 50"
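To make the placeholder grammar above concrete, here is a hedged usage sketch of `parse_placeholders` (assuming the function and the module's `pl` import are in scope; the column names are made up):

    import polars as pl

    columns = ["product_id", "price", "qty"]

    # current_cidx=2 makes $_ resolve to "qty"; $2 is 1-based, so it means "price".
    parts = parse_placeholders("$product_id: $_ x $2", columns, current_cidx=2)

    # Expected shape per the implementation above:
    #   [pl.col("product_id"), ": ", pl.col("qty"), " x ", pl.col("price")]
    for part in parts:
        print("expr" if isinstance(part, pl.Expr) else "text", part)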
@@ -241,7 +344,7 @@ def parse_polars_expression(expression: str, columns: list[str], current_col_idx
     Args:
         expression: The input expression as a string.
         columns: The list of column names in the DataFrame.
-        current_col_idx: …
+        current_cidx: The index of the currently selected column (0-based). Used for $_ reference.

     Returns:
         A Python expression string with $references replaced by pl.col() calls.
@@ -258,38 +361,18 @@ def parse_polars_expression(expression: str, columns: list[str], current_col_idx
         # Return as a literal string
         return f"pl.lit({expression})"

-
-    # - _ (single underscore)
-    # - # (hash for row index)
-    # - digits (integer)
-    # - identifier (starts with letter or _, followed by letter/digit/_)
-    pattern = r"\$(_|#|\d+|[a-zA-Z_]\w*)"
-
-    def replace_column_ref(match):
-        col_ref = match.group(1)
-
-        if col_ref == "_":
-            # Current selected column
-            col_name = columns[current_col_idx]
-        elif col_ref == "#":
-            # RIDX is used to store 0-based row index; add 1 for 1-based index
-            return f"(pl.col('{RIDX}') + 1)"
-        elif col_ref.isdigit():
-            # Column by 1-based index
-            col_idx = int(col_ref) - 1
-            if col_idx < 0 or col_idx >= len(columns):
-                raise ValueError(f"Column index out of range: ${col_ref}")
-            col_name = columns[col_idx]
-        else:
-            # Column by name
-            if col_ref not in columns:
-                raise ValueError(f"Column not found: ${col_ref}")
-            col_name = col_ref
+    parts = parse_placeholders(expression, columns, current_cidx)

-
+    result = []
+    for part in parts:
+        if isinstance(part, pl.Expr):
+            col = part.meta.output_name()

-
-
+            result.append(f"pl.col('{col}')")
+        else:
+            result.append(part)
+
+    return "".join(result)


 def tentative_expr(term: str) -> bool:
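After the rewrite, `parse_polars_expression` is a thin serializer over `parse_placeholders`: expressions come back as `pl.col('…')` source text, literal fragments pass through verbatim. A hedged sketch matching the docstring's own examples (same assumptions as the sketch above):

    columns = ["name", "score"]

    # $_ points at the cursor column (current_cidx=1 -> "score").
    print(parse_polars_expression("$_ > 50", columns, current_cidx=1))
    # -> pl.col('score') > 50

    # $1 is a 1-based positional reference to "name".
    print(parse_polars_expression('$1 == "abc"', columns, current_cidx=0))
    # -> pl.col('name') == "abc"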
@@ -354,10 +437,12 @@ def load_dataframe(
     has_header: bool = True,
     infer_schema: bool = True,
     comment_prefix: str | None = None,
+    quote_char: str | None = '"',
     skip_lines: int = 0,
     skip_rows_after_header: int = 0,
     null_values: list[str] | None = None,
-) -> list[tuple[pl.DataFrame, str, str]]:
+    ignore_errors: bool = False,
+) -> list[Source]:
     """Load DataFrames from file specifications.

     Handles loading from multiple files, single files, or stdin. For Excel files,
@@ -369,42 +454,62 @@ def load_dataframe(
         has_header: Whether the input files have a header row. Defaults to True.
         infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
         comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        quote_char: Quote character for reading CSV/TSV files. Defaults to '"'.
         skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
         skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
+        null_values: List of values to interpret as null when reading CSV/TSV files. Defaults to None.
+        ignore_errors: Whether to ignore errors when reading CSV/TSV files. Defaults to False.

     Returns:
-        List of tuples of (DataFrame, filename, tabname) ready for display.
+        List of `Source` objects.
     """
-
+    data: list[Source] = []
     prefix_sheet = len(filenames) > 1

     for filename in filenames:
-
-
+        if filename == "-":
+            source = StringIO(sys.stdin.read())
+            file_format = file_format or "tsv"
+
+            # Reopen stdin to /dev/tty for proper terminal interaction
+            try:
+                tty = open("/dev/tty")
+                os.dup2(tty.fileno(), sys.stdin.fileno())
+            except (OSError, FileNotFoundError):
+                pass
+        else:
+            source = filename
+
+        # If not specified, determine file format (may be different for each file)
+        fmt = file_format
+        if not fmt:
             ext = Path(filename).suffix.lower()
-            if ext == ".gz"…
+            if ext == ".gz":
                 ext = Path(filename).with_suffix("").suffix.lower()
             fmt = ext.removeprefix(".")

         # Default to TSV
-
+        if not fmt or fmt not in SUPPORTED_FORMATS:
+            fmt = "tsv"

-        # Load…
-
+        # Load the file
+        data.extend(
             load_file(
-
+                source,
                 prefix_sheet=prefix_sheet,
-                file_format=…
+                file_format=fmt,
                 has_header=has_header,
                 infer_schema=infer_schema,
                 comment_prefix=comment_prefix,
+                quote_char=quote_char,
                 skip_lines=skip_lines,
                 skip_rows_after_header=skip_rows_after_header,
                 null_values=null_values,
+                ignore_errors=ignore_errors,
             )
         )

-    return…
+    return data


 RE_COMPUTE_ERROR = re.compile(r"at column '(.*?)' \(column number \d+\)")
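The stdin branch above reads the whole pipe into a `StringIO` first (a pipe is not seekable, and the `ComputeError` retry path needs to re-read the data), then re-points fd 0 at `/dev/tty` so the Textual UI still receives keystrokes. A standalone sketch of just that pattern, under the same assumptions:

    import os
    import sys
    from io import StringIO

    # Run as: printf 'a\tb\n1\t2\n' | python sketch.py
    buffered = StringIO(sys.stdin.read())  # seekable copy of the piped data

    try:
        tty = open("/dev/tty")  # reattach keyboard input for the TUI
        os.dup2(tty.fileno(), sys.stdin.fileno())
    except (OSError, FileNotFoundError):
        pass  # no controlling terminal (e.g. CI); proceed without one

    print(buffered.getvalue(), end="")  # captured data remains available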
@@ -435,12 +540,19 @@ def handle_compute_error(
     """
     # Already disabled schema inference, cannot recover
     if not infer_schema:
-        print(f"Error loading with schema inference disabled:\n{err_msg}", file=sys.stderr)
+        print(f"Error loading even with schema inference disabled:\n{err_msg}", file=sys.stderr)
+
+        if "CSV malformed" in err_msg:
+            print(
+                "\nSometimes quote characters might be mismatched. Try again with `-q` or `-E` to ignore errors",
+                file=sys.stderr,
+            )
+
         sys.exit(1)

     # Schema mismatch error
     if "found more fields than defined in 'Schema'" in err_msg:
-        print(f"Input might be malformed:\n{err_msg}", file=sys.stderr)
+        print(f"Input might be malformed:\n{err_msg}.\nTry again with `-E` to ignore errors", file=sys.stderr)
         sys.exit(1)

     # ComputeError: could not parse `n.a. as of 04.01.022` as `dtype` i64 at column 'PubChemCID' (column number 16)
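`handle_compute_error` drives the retry loop used by `load_file`: each `ComputeError` either aborts with a hint (the new `-q`/`-E` messages) or returns widened `schema_overrides` for another attempt. A hedged sketch of that loop's shape — the sample CSV and the narrow `infer_schema_length` are made up to force the "could not parse … at column" error quoted in the module's own comment, and the regex mirrors `RE_COMPUTE_ERROR`; the exact message shape can vary across polars versions:

    import re
    from io import StringIO

    import polars as pl

    RAW = "id,cid\n1,42\n2,n.a. as of 04.01.022\n"
    PATTERN = re.compile(r"at column '(.*?)' \(column number \d+\)")

    overrides: dict[str, pl.DataType] = {}
    while True:
        try:
            df = pl.read_csv(StringIO(RAW), infer_schema_length=1, schema_overrides=overrides)
            break
        except pl.exceptions.ComputeError as ce:
            found = PATTERN.search(str(ce))
            if found is None:
                raise  # not a parse error we know how to widen
            overrides[found.group(1)] = pl.String  # force the column to string and retry

    print(df.dtypes)  # [Int64, String] after one retry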
@@ -456,101 +568,21 @@ def handle_compute_error(
     return infer_schema, schema_overrides


-def load_stdin(
-    stdin_data=None,
-    file_format: str | None = None,
-    has_header: bool = True,
-    infer_schema: bool = True,
-    comment_prefix: str | None = None,
-    skip_lines: int = 0,
-    skip_rows_after_header: int = 0,
-    schema_overrides: dict[str, pl.DataType] | None = None,
-    null_values: list[str] | None = None,
-) -> list[tuple[pl.DataFrame, str, str]]:
-    """Load DataFrame from stdin.
-
-    If a ComputeError occurs during schema inference for a column, attempts to recover
-    by treating that column as a string and retrying the load. This process repeats until
-    all columns are successfully loaded or no further recovery is possible.
-
-    Args:
-        stdin_data: Optional stdin data as string. If None, read from sys.stdin.
-        file_format: Optional format specifier for input files (e.g., 'csv', 'excel').
-        has_header: Whether the input files have a header row. Defaults to True.
-        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
-        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
-        skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
-        skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
-
-    Returns:
-        List of tuples of (DataFrame, filename, tabname) ready for display.
-    """
-    import os
-    from io import StringIO
-
-    sources = []
-
-    # Read from stdin into memory first (stdin is not seekable)
-    if stdin_data is None:
-        stdin_data = sys.stdin.read()
-
-    # Reopen stdin to /dev/tty for proper terminal interaction
-    try:
-        tty = open("/dev/tty")
-        os.dup2(tty.fileno(), sys.stdin.fileno())
-    except (OSError, FileNotFoundError):
-        pass
-
-    lf = pl.scan_csv(
-        StringIO(stdin_data),
-        separator="," if file_format == "csv" else "\t",
-        has_header=has_header,
-        infer_schema=infer_schema,
-        comment_prefix=comment_prefix,
-        skip_lines=skip_lines,
-        skip_rows_after_header=skip_rows_after_header,
-        schema_overrides=schema_overrides,
-        null_values=null_values,
-    )
-
-    sources = [(lf, f"stdin.{file_format}" if file_format else "stdin", "stdin")]
-
-    # Attempt to collect, handling ComputeError for schema inference issues
-    try:
-        sources = [(lf.collect(), fn, tn) for lf, fn, tn in sources]
-    except pl.exceptions.ComputeError as ce:
-        # Handle the error and determine retry strategy
-        infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)
-
-        # Retry loading with updated schema overrides
-        return load_stdin(
-            stdin_data,
-            file_format=file_format,
-            has_header=has_header,
-            infer_schema=infer_schema,
-            comment_prefix=comment_prefix,
-            skip_lines=skip_lines,
-            skip_rows_after_header=skip_rows_after_header,
-            schema_overrides=schema_overrides,
-            null_values=null_values,
-        )
-
-    return sources
-
-
 def load_file(
-    filename: str,
+    source: str | StringIO,
     first_sheet: bool = False,
     prefix_sheet: bool = False,
     file_format: str | None = None,
     has_header: bool = True,
     infer_schema: bool = True,
     comment_prefix: str | None = None,
+    quote_char: str | None = '"',
     skip_lines: int = 0,
     skip_rows_after_header: int = 0,
     schema_overrides: dict[str, pl.DataType] | None = None,
     null_values: list[str] | None = None,
-) -> list[tuple[pl.DataFrame, str, str]]:
+    ignore_errors: bool = False,
+) -> list[Source]:
     """Load a single file.

     For Excel files, when `first_sheet` is True, returns only the first sheet. Otherwise, returns one entry per sheet.
@@ -569,86 +601,92 @@ def load_file(
         has_header: Whether the input files have a header row. Defaults to True.
         infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
         comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        quote_char: Quote character for reading CSV/TSV files. Defaults to '"'.
         skip_lines: Number of lines to skip when reading CSV/TSV files. The header will be parsed at this offset. Defaults to 0.
         skip_rows_after_header: Number of rows to skip after header when reading CSV/TSV files. Defaults to 0.
+        schema_overrides: Optional dictionary of column name to Polars data type to override inferred schema.
+        null_values: List of values to interpret as null when reading CSV/TSV files. Defaults to None.
+        ignore_errors: Whether to ignore errors when reading CSV/TSV files.

     Returns:
-        List of tuples of (DataFrame, filename, tabname) ready for display.
+        List of `Source` objects.
     """
-
-    if filename == "-":
-        return load_stdin(
-            file_format=file_format,
-            has_header=has_header,
-            infer_schema=infer_schema,
-            comment_prefix=comment_prefix,
-            skip_lines=skip_lines,
-            skip_rows_after_header=skip_rows_after_header,
-            schema_overrides=schema_overrides,
-            null_values=null_values,
-        )
+    data: list[Source] = []

+    filename = f"stdin.{file_format}" if isinstance(source, StringIO) else source
     filepath = Path(filename)

+    if not file_format:
+        ext = filepath.suffix.lower()
+        if ext == ".gz":
+            ext = Path(filename).with_suffix("").suffix.lower()
+        file_format = ext.removeprefix(".")
+
     # Load based on file format
-    if file_format in ("…
+    if file_format in ("csv", "tsv"):
         lf = pl.scan_csv(
-
+            source,
             separator="\t" if file_format == "tsv" else ",",
             has_header=has_header,
             infer_schema=infer_schema,
             comment_prefix=comment_prefix,
+            quote_char=quote_char,
             skip_lines=skip_lines,
             skip_rows_after_header=skip_rows_after_header,
             schema_overrides=schema_overrides,
             null_values=null_values,
+            ignore_errors=ignore_errors,
         )
-
+        data.append(Source(lf, filename, filepath.stem))
     elif file_format in ("xlsx", "xls", "excel"):
         if first_sheet:
             # Read only the first sheet for multiple files
-            lf = pl.read_excel(…
-
+            lf = pl.read_excel(source).lazy()
+            data.append(Source(lf, filename, filepath.stem))
         else:
             # For single file, expand all sheets
-            sheets = pl.read_excel(…
+            sheets = pl.read_excel(source, sheet_id=0)
             for sheet_name, df in sheets.items():
                 tabname = f"{filepath.stem}_{sheet_name}" if prefix_sheet else sheet_name
-
+                data.append(Source(df.lazy(), filename, tabname))
     elif file_format == "parquet":
-        lf = pl.scan_parquet(…
-
+        lf = pl.scan_parquet(source)
+        data.append(Source(lf, filename, filepath.stem))
     elif file_format == "json":
-        lf = pl.read_json(…
-
+        lf = pl.read_json(source).lazy()
+        data.append(Source(lf, filename, filepath.stem))
    elif file_format == "ndjson":
-        lf = pl.scan_ndjson(…
-
+        lf = pl.scan_ndjson(source, schema_overrides=schema_overrides)
+        data.append(Source(lf, filename, filepath.stem))
     else:
         raise ValueError(f"Unsupported file format: {file_format}. Supported formats are: {SUPPORTED_FORMATS}")

     # Attempt to collect, handling ComputeError for schema inference issues
     try:
-
+        data = [Source(src.frame.collect(), src.filename, src.tabname) for src in data]
     except pl.exceptions.ComputeError as ce:
         # Handle the error and determine retry strategy
         infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)

         # Retry loading with updated schema overrides
+        if isinstance(source, StringIO):
+            source.seek(0)
+
         return load_file(
-
+            source,
             file_format=file_format,
             has_header=has_header,
             infer_schema=infer_schema,
             comment_prefix=comment_prefix,
+            quote_char=quote_char,
             skip_lines=skip_lines,
             skip_rows_after_header=skip_rows_after_header,
             schema_overrides=schema_overrides,
             null_values=null_values,
+            ignore_errors=ignore_errors,
         )

-    return …
+    return data


 def now() -> str: