dataframe-textual 1.2.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataframe_textual/__main__.py +42 -20
- dataframe_textual/common.py +280 -72
- dataframe_textual/data_frame_help_panel.py +6 -4
- dataframe_textual/data_frame_table.py +633 -370
- dataframe_textual/data_frame_viewer.py +24 -28
- dataframe_textual/sql_screen.py +202 -0
- dataframe_textual/table_screen.py +31 -20
- dataframe_textual/yes_no_screen.py +12 -8
- {dataframe_textual-1.2.0.dist-info → dataframe_textual-1.4.0.dist-info}/METADATA +149 -13
- dataframe_textual-1.4.0.dist-info/RECORD +14 -0
- {dataframe_textual-1.2.0.dist-info → dataframe_textual-1.4.0.dist-info}/entry_points.txt +1 -0
- dataframe_textual-1.2.0.dist-info/RECORD +0 -13
- {dataframe_textual-1.2.0.dist-info → dataframe_textual-1.4.0.dist-info}/WHEEL +0 -0
- {dataframe_textual-1.2.0.dist-info → dataframe_textual-1.4.0.dist-info}/licenses/LICENSE +0 -0
dataframe_textual/__main__.py
CHANGED
```diff
@@ -4,23 +4,14 @@ import argparse
 import sys
 from pathlib import Path
 
-from .common import load_dataframe
+from .common import SUPPORTED_FORMATS, load_dataframe
 from .data_frame_viewer import DataFrameViewer
 
-SUPPORTED_FORMATS = ["csv", "excel", "tsv", "parquet", "json", "ndjson"]
 
+def cli() -> argparse.Namespace:
+    """Parse command-line arguments.
 
-def main() -> None:
-    """Run the DataFrame Viewer application.
-
-    Parses command-line arguments to determine input files or stdin, validates
-    file existence, and launches the interactive DataFrame Viewer application.
-
-    Returns:
-        None
-
-    Raises:
-        SystemExit: If invalid arguments are provided or required files are missing.
+    Determines input files or stdin and validates file existence
     """
     parser = argparse.ArgumentParser(
         prog="dv",
@@ -39,27 +30,58 @@ def main() -> None:
         choices=SUPPORTED_FORMATS,
         help="Specify the format of the input files (csv, excel, tsv etc.)",
     )
-    parser.add_argument(…)
+    parser.add_argument(
+        "-H",
+        "--no-header",
+        action="store_true",
+        help="Specify that input files have no header row when reading CSV/TSV",
+    )
+    parser.add_argument(
+        "-I", "--no-inferrence", action="store_true", help="Do not infer data types when reading CSV/TSV"
+    )
+    parser.add_argument(
+        "-C", "--comment-prefix", nargs="?", const="#", help="Comment lines are skipped when reading CSV/TSV"
+    )
+    parser.add_argument("-L", "--skip-lines", type=int, default=0, help="Skip lines when reading CSV/TSV")
+    parser.add_argument(
+        "-K", "--skip-rows-after-header", type=int, default=0, help="Skip rows after header when reading CSV/TSV"
+    )
+    parser.add_argument("-U", "--null", nargs="+", help="Values to interpret as null values when reading CSV/TSV")
 
     args = parser.parse_args()
-    filenames = []
+    if args.files is None:
+        args.files = []
 
     # Check if reading from stdin (pipe or redirect)
     if not sys.stdin.isatty():
-        filenames.append("-")
-    else:
+        args.files.append("-")
+    else:
         # Validate all files exist
         for filename in args.files:
            if not Path(filename).exists():
                 print(f"File not found: {filename}")
                 sys.exit(1)
-        filenames.extend(args.files)
 
-    if not filenames:
+    if not args.files:
         parser.print_help()
         sys.exit(1)
 
-    sources = load_dataframe(…)
+    return args
+
+
+def main() -> None:
+    """Run the DataFrame Viewer application."""
+    args = cli()
+    sources = load_dataframe(
+        args.files,
+        file_format=args.format,
+        has_header=not args.no_header,
+        infer_schema=not args.no_inferrence,
+        comment_prefix=args.comment_prefix,
+        skip_lines=args.skip_lines,
+        skip_rows_after_header=args.skip_rows_after_header,
+        null_values=args.null,
+    )
     app = DataFrameViewer(*sources)
     app.run()
 
```
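The new flags feed straight into `load_dataframe` as keyword arguments. A minimal sketch of the equivalent programmatic call, assuming the public import path `dataframe_textual.common` and a hypothetical `data.tsv` (the signature is the one introduced in `common.py` below):

```python
from dataframe_textual.common import load_dataframe

# Roughly what `dv -H -I -C -L 2 -U NA "" data.tsv` ends up calling via cli() and main()
sources = load_dataframe(
    ["data.tsv"],            # hypothetical input file
    file_format="tsv",
    has_header=False,        # -H / --no-header
    infer_schema=False,      # -I / --no-inferrence
    comment_prefix="#",      # -C / --comment-prefix (const="#" when given without a value)
    skip_lines=2,            # -L / --skip-lines
    null_values=["NA", ""],  # -U / --null
)
for df, filename, tabname in sources:
    print(tabname, df.shape)  # each source is a (DataFrame, filename, tabname) tuple
```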
dataframe_textual/common.py
CHANGED
```diff
@@ -9,9 +9,9 @@ from typing import Any
 import polars as pl
 from rich.text import Text
 
-#…
-
-
+# Supported file formats
+SUPPORTED_FORMATS = {"tsv", "csv", "excel", "xlsx", "xls", "parquet", "json", "ndjson"}
+
 
 # Boolean string mappings
 BOOLS = {
@@ -27,6 +27,10 @@ BOOLS = {
     "0": False,
 }
 
+# Special string to represent null value
+NULL = "NULL"
+NULL_DISPLAY = "-"
+
 
 @dataclass
 class DtypeClass:
@@ -53,15 +57,15 @@ STYLES = {
     pl.UInt32: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
     pl.UInt64: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
     # float
-    pl.Float32: DtypeClass(gtype="float", style="…
-    pl.Float64: DtypeClass(gtype="float", style="…
-    pl.Decimal: DtypeClass(gtype="float", style="…
+    pl.Float32: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
+    pl.Float64: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
+    pl.Decimal: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
     # bool
     pl.Boolean: DtypeClass(gtype="boolean", style="blue", justify="center", itype="text", convert=lambda x: BOOLS[x.lower()]),
     # temporal
-    pl.Date: DtypeClass(gtype="temporal", style="…
-    pl.Datetime: DtypeClass(gtype="temporal", style="…
-    pl.Time: DtypeClass(gtype="temporal", style="…
+    pl.Date: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
+    pl.Datetime: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
+    pl.Time: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
     # unknown
     pl.Unknown: DtypeClass(gtype="unknown", style="", justify="", itype="text", convert=str),
 }
@@ -133,7 +137,7 @@ def format_float(value: float, thousand_separator: bool = False, precision: int…
     return f"{value:,f}" if thousand_separator else str(value)
 
 
-def format_row(vals, dtypes, apply_justify=True, thousand_separator=False) -> list[Text]:
+def format_row(vals, dtypes, styles=None, apply_justify=True, thousand_separator=False) -> list[Text]:
     """Format a single row with proper styling and justification.
 
     Converts raw row values to formatted Rich Text objects with appropriate
@@ -149,7 +153,7 @@ def format_row(vals, dtypes, apply_justify=True, thousand_separator=False) -> list[Text]:
     """
     formatted_row = []
 
-    for val, dtype in zip(vals, dtypes, strict=True):
+    for idx, (val, dtype) in enumerate(zip(vals, dtypes, strict=True)):
         dc = DtypeConfig(dtype)
 
         # Format the value
@@ -165,8 +169,10 @@ def format_row(vals, dtypes, apply_justify=True, thousand_separator=False) -> list[Text]:
         formatted_row.append(
             Text(
                 text_val,
-                style=dc.style,
+                style=styles[idx] if styles and styles[idx] else dc.style,
                 justify=dc.justify if apply_justify else "",
+                overflow="ellipsis",
+                no_wrap=True,
             )
         )
 
```
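`format_row` now accepts an optional `styles` list, one entry per column, which overrides the dtype-derived Rich style where set. A hedged usage sketch (the values, dtypes, and style strings are invented; it assumes string dtypes are registered in `STYLES` like the numeric ones shown above):

```python
import polars as pl
from dataframe_textual.common import format_row

# Falsy entries in `styles` fall back to the dtype's default style, per the diff above.
cells = format_row(
    ["alice", 3.14],
    [pl.String, pl.Float64],
    styles=["bold red", None],
)
print([cell.plain for cell in cells])
```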
```diff
@@ -216,7 +222,7 @@ def get_next_item(lst: list[Any], current, offset=1) -> Any:
     return lst[next_index]
 
 
-def parse_polars_expression(expression: str, df: pl.DataFrame, current_col_idx: int) -> str:
+def parse_polars_expression(expression: str, columns: list[str], current_col_idx: int) -> str:
     """Parse and convert an expression to Polars syntax.
 
     Replaces column references with Polars col() expressions:
@@ -234,7 +240,7 @@ def parse_polars_expression(expression: str, df: pl.DataFrame, current_col_idx: int) -> str:
 
     Args:
         expression: The input expression as a string.
-        df: …
+        columns: The list of column names in the DataFrame.
         current_col_idx: The index of the currently selected column (0-based). Used for $_ reference.
 
     Returns:
@@ -264,19 +270,19 @@ def parse_polars_expression(expression: str, df: pl.DataFrame, current_col_idx: int) -> str:
 
         if col_ref == "_":
             # Current selected column
-            col_name = df.columns[current_col_idx]
+            col_name = columns[current_col_idx]
         elif col_ref == "#":
             # RIDX is used to store 0-based row index; add 1 for 1-based index
             return f"(pl.col('{RIDX}') + 1)"
         elif col_ref.isdigit():
             # Column by 1-based index
             col_idx = int(col_ref) - 1
-            if col_idx < 0 or col_idx >= len(df.columns):
+            if col_idx < 0 or col_idx >= len(columns):
                 raise ValueError(f"Column index out of range: ${col_ref}")
-            col_name = df.columns[col_idx]
+            col_name = columns[col_idx]
         else:
             # Column by name
-            if col_ref not in df.columns:
+            if col_ref not in columns:
                 raise ValueError(f"Column not found: ${col_ref}")
             col_name = col_ref
 
```
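The `$` reference rules above are easy to state in isolation. A minimal, self-contained re-implementation for illustration only (the real function also rewrites the expression string and resolves `$#` through the internal `RIDX` column, which is simplified away here):

```python
def resolve_ref(col_ref: str, columns: list[str], current_col_idx: int) -> str:
    """Map a $-reference to a column name, mirroring the rules in the diff above."""
    if col_ref == "_":
        return columns[current_col_idx]      # $_ : the currently selected column
    if col_ref == "#":
        return "<1-based row index>"         # $# : handled via the RIDX helper column
    if col_ref.isdigit():
        col_idx = int(col_ref) - 1           # $N : 1-based column index
        if not 0 <= col_idx < len(columns):
            raise ValueError(f"Column index out of range: ${col_ref}")
        return columns[col_idx]
    if col_ref not in columns:
        raise ValueError(f"Column not found: ${col_ref}")
    return col_ref                           # $name : column by name

cols = ["name", "score", "flag"]
assert resolve_ref("_", cols, 1) == "score"
assert resolve_ref("2", cols, 0) == "score"
assert resolve_ref("flag", cols, 0) == "flag"
```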
```diff
@@ -305,7 +311,7 @@ def tentative_expr(term: str) -> bool:
     return False
 
 
-def validate_expr(term: str, df: pl.DataFrame, current_col_idx: int) -> pl.Expr | None:
+def validate_expr(term: str, columns: list[str], current_col_idx: int) -> pl.Expr | None:
     """Validate and return the expression.
 
     Parses a user-provided expression string and validates it as a valid Polars expression.
@@ -313,7 +319,7 @@ def validate_expr(term: str, df: pl.DataFrame, current_col_idx: int) -> pl.Expr | None:
 
     Args:
         term: The input expression as a string.
-        df: …
+        columns: The list of column names in the DataFrame.
         current_col_idx: The index of the currently selected column (0-based). Used for $_ reference.
 
     Returns:
@@ -326,7 +332,7 @@ def validate_expr(term: str, df: pl.DataFrame, current_col_idx: int) -> pl.Expr | None:
 
     try:
         # Parse the expression
-        expr_str = parse_polars_expression(term, df, current_col_idx)
+        expr_str = parse_polars_expression(term, columns, current_col_idx)
 
         # Validate by evaluating it
         try:
```
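Since `validate_expr` no longer needs the DataFrame itself, callers can hand it just the column names. A hypothetical interactive-filter flow (data and expression are invented):

```python
import polars as pl
from dataframe_textual.common import validate_expr

df = pl.DataFrame({"name": ["a", "b"], "score": [0.2, 0.9]})
expr = validate_expr("$score > 0.5", df.columns, current_col_idx=1)
if expr is not None:  # None signals an invalid expression
    print(df.filter(expr))
```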
```diff
@@ -343,8 +349,15 @@ def validate_expr(term: str, df: pl.DataFrame, current_col_idx: int) -> pl.Expr | None:
 
 
 def load_dataframe(
-    filenames: list[str],
-    …
+    filenames: list[str],
+    file_format: str | None = None,
+    has_header: bool = True,
+    infer_schema: bool = True,
+    comment_prefix: str | None = None,
+    skip_lines: int = 0,
+    skip_rows_after_header: int = 0,
+    null_values: list[str] | None = None,
+) -> list[tuple[pl.DataFrame, str, str]]:
     """Load DataFrames from file specifications.
 
     Handles loading from multiple files, single files, or stdin. For Excel files,
@@ -354,16 +367,174 @@ def load_dataframe(
         filenames: List of filenames to load. If single filename is "-", read from stdin.
         file_format: Optional format specifier for input files (e.g., 'csv', 'excel').
         has_header: Whether the input files have a header row. Defaults to True.
+        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
+        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
+        skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
 
     Returns:
-        List of tuples of (…
+        List of tuples of (DataFrame, filename, tabname) ready for display.
     """
     sources = []
-
     prefix_sheet = len(filenames) > 1
 
     for filename in filenames:
-        …
+        # Determine file format if not specified
+        if not file_format:
+            ext = Path(filename).suffix.lower()
+            if ext == ".gz" or ext == ".bz2" or ext == ".xz":
+                ext = Path(filename).with_suffix("").suffix.lower()
+            fmt = ext.removeprefix(".")
+
+            # Default to TSV
+            file_format = fmt if fmt in SUPPORTED_FORMATS else "tsv"
+
+        # Load each file
+        sources.extend(
+            load_file(
+                filename,
+                prefix_sheet=prefix_sheet,
+                file_format=file_format,
+                has_header=has_header,
+                infer_schema=infer_schema,
+                comment_prefix=comment_prefix,
+                skip_lines=skip_lines,
+                skip_rows_after_header=skip_rows_after_header,
+                null_values=null_values,
+            )
+        )
+
+    return sources
+
+
+RE_COMPUTE_ERROR = re.compile(r"at column '(.*?)' \(column number \d+\)")
+
+
+def handle_compute_error(
+    err_msg: str,
+    file_format: str | None,
+    infer_schema: bool,
+    schema_overrides: dict[str, pl.DataType] | None = None,
+) -> tuple[bool, dict[str, pl.DataType] | None]:
+    """Handle ComputeError during schema inference and determine retry strategy.
+
+    Analyzes the error message and determines whether to retry with schema overrides,
+    disable schema inference, or exit with an error.
+
+    Args:
+        err_msg: The error message from the ComputeError exception.
+        file_format: The file format being loaded (tsv, csv, etc.).
+        infer_schema: Whether schema inference is currently enabled.
+        schema_overrides: Current schema overrides, if any.
+
+    Returns:
+        A tuple of (infer_schema, schema_overrides):
+
+    Raises:
+        SystemExit: If the error is unrecoverable.
+    """
+    # Already disabled schema inference, cannot recover
+    if not infer_schema:
+        print(f"Error loading with schema inference disabled:\n{err_msg}", file=sys.stderr)
+        sys.exit(1)
+
+    # Schema mismatch error
+    if "found more fields than defined in 'Schema'" in err_msg:
+        print(f"Input might be malformed:\n{err_msg}", file=sys.stderr)
+        sys.exit(1)
+
+    # ComputeError: could not parse `n.a. as of 04.01.022` as `dtype` i64 at column 'PubChemCID' (column number 16)
+    if file_format in ("tsv", "csv") and (m := RE_COMPUTE_ERROR.search(err_msg)):
+        col_name = m.group(1)
+
+        if schema_overrides is None:
+            schema_overrides = {}
+        schema_overrides.update({col_name: pl.String})
+    else:
+        infer_schema = False
+
+    return infer_schema, schema_overrides
+
+
+def load_stdin(
+    stdin_data=None,
+    file_format: str | None = None,
+    has_header: bool = True,
+    infer_schema: bool = True,
+    comment_prefix: str | None = None,
+    skip_lines: int = 0,
+    skip_rows_after_header: int = 0,
+    schema_overrides: dict[str, pl.DataType] | None = None,
+    null_values: list[str] | None = None,
+) -> list[tuple[pl.DataFrame, str, str]]:
+    """Load DataFrame from stdin.
+
+    If a ComputeError occurs during schema inference for a column, attempts to recover
+    by treating that column as a string and retrying the load. This process repeats until
+    all columns are successfully loaded or no further recovery is possible.
+
+    Args:
+        stdin_data: Optional stdin data as string. If None, read from sys.stdin.
+        file_format: Optional format specifier for input files (e.g., 'csv', 'excel').
+        has_header: Whether the input files have a header row. Defaults to True.
+        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
+        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
+        skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
+
+    Returns:
+        List of tuples of (DataFrame, filename, tabname) ready for display.
+    """
+    import os
+    from io import StringIO
+
+    sources = []
+
+    # Read from stdin into memory first (stdin is not seekable)
+    if stdin_data is None:
+        stdin_data = sys.stdin.read()
+
+        # Reopen stdin to /dev/tty for proper terminal interaction
+        try:
+            tty = open("/dev/tty")
+            os.dup2(tty.fileno(), sys.stdin.fileno())
+        except (OSError, FileNotFoundError):
+            pass
+
+    lf = pl.scan_csv(
+        StringIO(stdin_data),
+        separator="," if file_format == "csv" else "\t",
+        has_header=has_header,
+        infer_schema=infer_schema,
+        comment_prefix=comment_prefix,
+        skip_lines=skip_lines,
+        skip_rows_after_header=skip_rows_after_header,
+        schema_overrides=schema_overrides,
+        null_values=null_values,
+    )
+
+    sources = [(lf, f"stdin.{file_format}" if file_format else "stdin", "stdin")]
+
+    # Attempt to collect, handling ComputeError for schema inference issues
+    try:
+        sources = [(lf.collect(), fn, tn) for lf, fn, tn in sources]
+    except pl.exceptions.ComputeError as ce:
+        # Handle the error and determine retry strategy
+        infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)
+
+        # Retry loading with updated schema overrides
+        return load_stdin(
+            stdin_data,
+            file_format=file_format,
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+        )
 
     return sources
 
 
```
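The retry loop above can be reproduced with Polars alone. A standalone sketch using inline data; `infer_schema_length=1` deliberately makes the first read mis-infer the mixed column so the load fails, much like the `PubChemCID` example quoted in the code:

```python
import io
import polars as pl

data = "id\tvalue\n1\t2.5\n2\tn.a. as of 04.01.022\n"
try:
    df = pl.read_csv(io.StringIO(data), separator="\t", infer_schema_length=1)
except pl.exceptions.ComputeError:
    # First retry, as handle_compute_error arranges it: pin the offending column to String.
    df = pl.read_csv(io.StringIO(data), separator="\t", schema_overrides={"value": pl.String})
print(df.schema)
```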
```diff
@@ -373,49 +544,68 @@ def load_file(
     prefix_sheet: bool = False,
     file_format: str | None = None,
     has_header: bool = True,
-    …
-    …
+    infer_schema: bool = True,
+    comment_prefix: str | None = None,
+    skip_lines: int = 0,
+    skip_rows_after_header: int = 0,
+    schema_overrides: dict[str, pl.DataType] | None = None,
+    null_values: list[str] | None = None,
+) -> list[tuple[pl.DataFrame, str, str]]:
+    """Load a single file.
 
     For Excel files, when `first_sheet` is True, returns only the first sheet. Otherwise, returns one entry per sheet.
     For other files or multiple files, returns one entry per file.
 
+    If a ComputeError occurs during schema inference for a column, attempts to recover
+    by treating that column as a string and retrying the load. This process repeats until
+    all columns are successfully loaded or no further recovery is possible.
+
     Args:
         filename: Path to file to load.
         first_sheet: If True, only load first sheet for Excel files. Defaults to False.
         prefix_sheet: If True, prefix filename to sheet name as the tab name for Excel files. Defaults to False.
-        file_format: Optional format specifier (i.e., '…
+        file_format: Optional format specifier (i.e., 'tsv', 'csv', 'excel', 'parquet', 'json', 'ndjson') for input files.
             By default, infers from file extension.
         has_header: Whether the input files have a header row. Defaults to True.
+        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
+        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        skip_lines: Number of lines to skip when reading CSV/TSV files. The header will be parsed at this offset. Defaults to 0.
+        skip_rows_after_header: Number of rows to skip after header when reading CSV/TSV files. Defaults to 0.
 
     Returns:
-        List of tuples of (…
+        List of tuples of (DataFrame, filename, tabname).
     """
     sources = []
 
     if filename == "-":
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        try:
-            tty = open("/dev/tty")
-            os.dup2(tty.fileno(), sys.stdin.fileno())
-        except (OSError, FileNotFoundError):
-            pass
-
-        sources.append((lf, f"stdin.{file_format}" if file_format else "stdin", "stdin"))
-        return sources
+        return load_stdin(
+            file_format=file_format,
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+        )
 
     filepath = Path(filename)
 
-    …
-    …
+    # Load based on file format
+    if file_format in ("tsv", "csv"):
+        lf = pl.scan_csv(
+            filename,
+            separator="\t" if file_format == "tsv" else ",",
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+        )
         sources.append((lf, filename, filepath.stem))
-    elif file_format…
+    elif file_format in ("xlsx", "xls", "excel"):
         if first_sheet:
             # Read only the first sheet for multiple files
             lf = pl.read_excel(filename).lazy()
```
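The `load_stdin` delegation above rests on one POSIX trick: drain the pipe, then point file descriptor 0 back at the controlling terminal so the Textual UI can still read keystrokes. The trick in isolation (POSIX-only sketch; the package also catches `FileNotFoundError` explicitly):

```python
import os
import sys

data = sys.stdin.read()    # drain the piped data first; a pipe is not seekable
try:
    tty = open("/dev/tty")  # the controlling terminal, if there is one
    os.dup2(tty.fileno(), sys.stdin.fileno())  # fd 0 now reads from the keyboard again
except OSError:
    pass                    # headless (cron, CI): leave stdin as-is
```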
```diff
@@ -426,36 +616,54 @@ def load_file(
         for sheet_name, df in sheets.items():
             tabname = f"{filepath.stem}_{sheet_name}" if prefix_sheet else sheet_name
             sources.append((df.lazy(), filename, tabname))
-    elif file_format == "tsv":
-        lf = pl.scan_csv(filename, has_header=has_header, separator="\t")
-        sources.append((lf, filename, filepath.stem))
     elif file_format == "parquet":
         lf = pl.scan_parquet(filename)
         sources.append((lf, filename, filepath.stem))
     elif file_format == "json":
-        …
-        sources.append((…
+        lf = pl.read_json(filename).lazy()
+        sources.append((lf, filename, filepath.stem))
     elif file_format == "ndjson":
-        lf = pl.scan_ndjson(filename)
+        lf = pl.scan_ndjson(filename, schema_overrides=schema_overrides)
         sources.append((lf, filename, filepath.stem))
     else:
-        …
-        if ext == ".csv":
-            file_format = "csv"
-        elif ext in (".xlsx", ".xls"):
-            file_format = "excel"
-        elif ext in (".tsv", ".tab"):
-            file_format = "tsv"
-        elif ext == ".parquet":
-            file_format = "parquet"
-        elif ext == ".json":
-            file_format = "json"
-        elif ext == ".ndjson":
-            file_format = "ndjson"
-        else:
-            # Default to TSV
-            file_format = "tsv"
+        raise ValueError(f"Unsupported file format: {file_format}. Supported formats are: {SUPPORTED_FORMATS}")
 
-    …
+    # Attempt to collect, handling ComputeError for schema inference issues
+    try:
+        sources = [(lf.collect(), fn, tn) for lf, fn, tn in sources]
+    except pl.exceptions.ComputeError as ce:
+        # Handle the error and determine retry strategy
+        infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)
+
+        # Retry loading with updated schema overrides
+        return load_file(
+            filename,
+            file_format=file_format,
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+        )
 
     return sources
+
+
+def now() -> str:
+    """Get the current local time as a formatted string."""
+    import time
+
+    return time.strftime("%m/%d/%Y %H:%M:%S", time.localtime())
+
+
+async def sleep_async(seconds: float) -> None:
+    """Async sleep to yield control back to the event loop.
+
+    Args:
+        seconds: The number of seconds to sleep.
+    """
+    import asyncio
+
+    await asyncio.sleep(seconds)
```
dataframe_textual/data_frame_help_panel.py
CHANGED
```diff
@@ -79,10 +79,12 @@ class DataFrameHelpPanel(Widget):
             None
         """
 
-        def update_help(focused_widget: Widget | None):
-            self.update_help(focused_widget)
+        # def update_help(focused_widget: Widget | None):
+        #     self.update_help(focused_widget)
 
-        self.watch(self.screen, "focused", update_help)
+        # self.watch(self.screen, "focused", update_help)
+
+        self.update_help(self.screen.focused)
 
     def update_help(self, focused_widget: Widget | None) -> None:
         """Update the help for the focused widget.
@@ -96,7 +98,7 @@ class DataFrameHelpPanel(Widget):
             return
         self.set_class(focused_widget is not None, "-show-help")
         if focused_widget is not None:
-            help = self.app.HELP + "\n" + focused_widget.HELP or ""
+            help = (self.app.HELP or "") + "\n" + (focused_widget.HELP or "")
             if not help:
                 self.remove_class("-show-help")
             try:
```
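The one-line change in `update_help` fixes an operator-precedence bug: `+` binds tighter than `or`, so the old expression applied the `or ""` fallback to the whole concatenation and still raised `TypeError` whenever either `HELP` was `None`. A minimal reproduction:

```python
app_help, widget_help = "app keys", None

try:
    text = app_help + "\n" + widget_help or ""  # parses as (a + "\n" + b) or ""
except TypeError:
    text = (app_help or "") + "\n" + (widget_help or "")  # the fixed form
print(repr(text))
```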
|