dataframe-textual 1.0.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- dataframe_textual/__init__.py +1 -2
- dataframe_textual/__main__.py +48 -23
- dataframe_textual/common.py +372 -23
- dataframe_textual/data_frame_help_panel.py +6 -4
- dataframe_textual/data_frame_table.py +893 -449
- dataframe_textual/data_frame_viewer.py +39 -141
- dataframe_textual/sql_screen.py +202 -0
- dataframe_textual/table_screen.py +45 -28
- dataframe_textual/yes_no_screen.py +12 -8
- {dataframe_textual-1.0.0.dist-info → dataframe_textual-1.4.0.dist-info}/METADATA +205 -46
- dataframe_textual-1.4.0.dist-info/RECORD +14 -0
- {dataframe_textual-1.0.0.dist-info → dataframe_textual-1.4.0.dist-info}/entry_points.txt +1 -0
- dataframe_textual-1.0.0.dist-info/RECORD +0 -13
- {dataframe_textual-1.0.0.dist-info → dataframe_textual-1.4.0.dist-info}/WHEEL +0 -0
- {dataframe_textual-1.0.0.dist-info → dataframe_textual-1.4.0.dist-info}/licenses/LICENSE +0 -0
dataframe_textual/__init__.py
CHANGED

```diff
@@ -2,7 +2,7 @@
 
 from .data_frame_help_panel import DataFrameHelpPanel
 from .data_frame_table import DataFrameTable, History
-from .data_frame_viewer import DataFrameViewer
+from .data_frame_viewer import DataFrameViewer
 from .table_screen import FrequencyScreen, RowDetailScreen, TableScreen
 from .yes_no_screen import (
     ConfirmScreen,
@@ -31,5 +31,4 @@ __all__ = [
     "FilterScreen",
     "FreezeScreen",
     "OpenFileScreen",
-    "_load_dataframe",
 ]
```
dataframe_textual/__main__.py
CHANGED

```diff
@@ -4,31 +4,24 @@ import argparse
 import sys
 from pathlib import Path
 
+from .common import SUPPORTED_FORMATS, load_dataframe
 from .data_frame_viewer import DataFrameViewer
 
-SUPPORTED_FORMATS = ["csv", "excel", "tsv", "parquet", "json", "ndjson"]
 
+def cli() -> argparse.Namespace:
+    """Parse command-line arguments.
 
-
-    """Run the DataFrame Viewer application.
-
-    Parses command-line arguments to determine input files or stdin, validates
-    file existence, and launches the interactive DataFrame Viewer application.
-
-    Returns:
-        None
-
-    Raises:
-        SystemExit: If invalid arguments are provided or required files are missing.
+    Determines input files or stdin and validates file existence
     """
     parser = argparse.ArgumentParser(
+        prog="dv",
         description="Interactive terminal based viewer/editor for tabular data (e.g., CSV/Excel).",
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="Examples:\n"
-        "
-        "
-        "
-        " cat data.csv |
+        " %(prog)s data.csv\n"
+        " %(prog)s file1.csv file2.csv file3.csv\n"
+        " %(prog)s data.xlsx (opens each sheet in separate tab)\n"
+        " cat data.csv | %(prog)s --format csv\n",
     )
     parser.add_argument("files", nargs="*", help="Files to view (or read from stdin)")
     parser.add_argument(
@@ -37,27 +30,59 @@ def main() -> None:
         choices=SUPPORTED_FORMATS,
         help="Specify the format of the input files (csv, excel, tsv etc.)",
     )
-    parser.add_argument(
+    parser.add_argument(
+        "-H",
+        "--no-header",
+        action="store_true",
+        help="Specify that input files have no header row when reading CSV/TSV",
+    )
+    parser.add_argument(
+        "-I", "--no-inferrence", action="store_true", help="Do not infer data types when reading CSV/TSV"
+    )
+    parser.add_argument(
+        "-C", "--comment-prefix", nargs="?", const="#", help="Comment lines are skipped when reading CSV/TSV"
+    )
+    parser.add_argument("-L", "--skip-lines", type=int, default=0, help="Skip lines when reading CSV/TSV")
+    parser.add_argument(
+        "-K", "--skip-rows-after-header", type=int, default=0, help="Skip rows after header when reading CSV/TSV"
+    )
+    parser.add_argument("-U", "--null", nargs="+", help="Values to interpret as null values when reading CSV/TSV")
 
     args = parser.parse_args()
-
+    if args.files is None:
+        args.files = []
 
     # Check if reading from stdin (pipe or redirect)
     if not sys.stdin.isatty():
-
-
+        args.files.append("-")
+    else:
         # Validate all files exist
         for filename in args.files:
             if not Path(filename).exists():
                 print(f"File not found: {filename}")
                 sys.exit(1)
-    filenames.extend(args.files)
 
-    if not
+    if not args.files:
        parser.print_help()
        sys.exit(1)
 
-
+    return args
+
+
+def main() -> None:
+    """Run the DataFrame Viewer application."""
+    args = cli()
+    sources = load_dataframe(
+        args.files,
+        file_format=args.format,
+        has_header=not args.no_header,
+        infer_schema=not args.no_inferrence,
+        comment_prefix=args.comment_prefix,
+        skip_lines=args.skip_lines,
+        skip_rows_after_header=args.skip_rows_after_header,
+        null_values=args.null,
+    )
+    app = DataFrameViewer(*sources)
     app.run()
 
 
```
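The `-` sentinel that `cli()` appends when stdin is not a TTY is what later routes loading through `load_stdin` in `common.py` (see below). A minimal standalone sketch of that pattern, not taken from the package:

```python
# Sketch: when input is piped, append a "-" sentinel so the loader
# knows to read from stdin instead of a named file.
import argparse
import sys

parser = argparse.ArgumentParser(prog="dv")
parser.add_argument("files", nargs="*")
args = parser.parse_args(["data.csv"])  # simulate `dv data.csv`

if not sys.stdin.isatty():
    # Piped input, e.g. `cat data.csv | dv --format csv`
    args.files.append("-")

print(args.files)
```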
dataframe_textual/common.py
CHANGED

```diff
@@ -1,15 +1,17 @@
 """Common utilities and constants for dataframe_viewer."""
 
 import re
+import sys
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Any
 
 import polars as pl
 from rich.text import Text
 
-#
-
-
+# Supported file formats
+SUPPORTED_FORMATS = {"tsv", "csv", "excel", "xlsx", "xls", "parquet", "json", "ndjson"}
+
 
 # Boolean string mappings
 BOOLS = {
@@ -25,6 +27,10 @@ BOOLS = {
     "0": False,
 }
 
+# Special string to represent null value
+NULL = "NULL"
+NULL_DISPLAY = "-"
+
 
 @dataclass
 class DtypeClass:
@@ -51,15 +57,15 @@ STYLES = {
     pl.UInt32: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
     pl.UInt64: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
     # float
-    pl.Float32: DtypeClass(gtype="float", style="
-    pl.Float64: DtypeClass(gtype="float", style="
-    pl.Decimal: DtypeClass(gtype="float", style="
+    pl.Float32: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
+    pl.Float64: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
+    pl.Decimal: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
     # bool
     pl.Boolean: DtypeClass(gtype="boolean", style="blue", justify="center", itype="text", convert=lambda x: BOOLS[x.lower()]),
     # temporal
-    pl.Date: DtypeClass(gtype="temporal", style="
-    pl.Datetime: DtypeClass(gtype="temporal", style="
-    pl.Time: DtypeClass(gtype="temporal", style="
+    pl.Date: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
+    pl.Datetime: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
+    pl.Time: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
     # unknown
     pl.Unknown: DtypeClass(gtype="unknown", style="", justify="", itype="text", convert=str),
 }
```
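The `STYLES` table above is a registry pattern: each Polars dtype maps to one `DtypeClass` bundling a display style, justification, and a `convert` callable for parsing edited cell text back into a typed value. A simplified standalone sketch (fewer fields than the real `DtypeClass`, with plain Python types standing in for the Polars dtype keys):

```python
# Simplified dtype registry: style metadata plus a converter per type.
from dataclasses import dataclass
from typing import Any, Callable


@dataclass
class DtypeClass:
    style: str
    justify: str
    convert: Callable[[str], Any]


BOOLS = {"true": True, "false": False, "1": True, "0": False}

STYLES = {
    int: DtypeClass(style="cyan", justify="right", convert=int),
    float: DtypeClass(style="yellow", justify="right", convert=float),
    bool: DtypeClass(style="blue", justify="center", convert=lambda x: BOOLS[x.lower()]),
}

print(STYLES[float].convert("3.14"))  # 3.14
print(STYLES[bool].convert("True"))   # True
```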
```diff
@@ -111,7 +117,27 @@ def DtypeConfig(dtype: pl.DataType) -> DtypeClass:
     return STYLES[pl.Unknown]
 
 
-def
+def format_float(value: float, thousand_separator: bool = False, precision: int = 2) -> str:
+    """Format a float value, keeping integers without decimal point.
+
+    Args:
+        val: The float value to format.
+        thousand_separator: Whether to include thousand separators. Defaults to False.
+
+    Returns:
+        The formatted float as a string.
+    """
+
+    if (val := int(value)) == value:
+        return f"{val:,}" if thousand_separator else str(val)
+    else:
+        if precision > 0:
+            return f"{value:,.{precision}f}" if thousand_separator else f"{value:.{precision}f}"
+        else:
+            return f"{value:,f}" if thousand_separator else str(value)
+
+
+def format_row(vals, dtypes, styles=None, apply_justify=True, thousand_separator=False) -> list[Text]:
     """Format a single row with proper styling and justification.
 
     Converts raw row values to formatted Rich Text objects with appropriate
```
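For reference, the new `format_float` behaves as follows (standalone copy of the function above, restructured without the `else` branches; behavior is identical):

```python
def format_float(value: float, thousand_separator: bool = False, precision: int = 2) -> str:
    # Integral floats are rendered without a decimal point.
    if (val := int(value)) == value:
        return f"{val:,}" if thousand_separator else str(val)
    if precision > 0:
        return f"{value:,.{precision}f}" if thousand_separator else f"{value:.{precision}f}"
    return f"{value:,f}" if thousand_separator else str(value)


print(format_float(1234567.0))                           # 1234567
print(format_float(1234567.0, thousand_separator=True))  # 1,234,567
print(format_float(3.14159))                             # 3.14  (default precision 2)
print(format_float(1234.5678, thousand_separator=True))  # 1,234.57
```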
```diff
@@ -127,7 +153,7 @@ def format_row(vals, dtypes, apply_justify=True, thousand_separator=False) -> li
     """
     formatted_row = []
 
-    for val, dtype in zip(vals, dtypes, strict=True):
+    for idx, (val, dtype) in enumerate(zip(vals, dtypes, strict=True)):
         dc = DtypeConfig(dtype)
 
         # Format the value
@@ -135,16 +161,18 @@
             text_val = NULL_DISPLAY
         elif dc.gtype == "integer" and thousand_separator:
             text_val = f"{val:,}"
-        elif dc.gtype == "float"
-            text_val =
+        elif dc.gtype == "float":
+            text_val = format_float(val, thousand_separator)
         else:
             text_val = str(val)
 
         formatted_row.append(
             Text(
                 text_val,
-                style=dc.style,
+                style=styles[idx] if styles and styles[idx] else dc.style,
                 justify=dc.justify if apply_justify else "",
+                overflow="ellipsis",
+                no_wrap=True,
             )
         )
 
```
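The `format_row` changes add per-cell style overrides and single-line ellipsis truncation. A standalone sketch of the cell construction (`make_cell` is a hypothetical helper; `dc_style` stands in for the `DtypeConfig` lookup):

```python
from rich.text import Text


def make_cell(text_val: str, dc_style: str, override: str | None) -> Text:
    # An explicit per-cell style wins over the dtype's default style;
    # long values are truncated with an ellipsis instead of wrapping.
    return Text(
        text_val,
        style=override if override else dc_style,
        justify="right",
        overflow="ellipsis",
        no_wrap=True,
    )


cell = make_cell("1,234,567", dc_style="cyan", override="bold red")
print(cell.style)  # bold red -- the override takes precedence
```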
```diff
@@ -194,7 +222,7 @@ def get_next_item(lst: list[Any], current, offset=1) -> Any:
     return lst[next_index]
 
 
-def parse_polars_expression(expression: str,
+def parse_polars_expression(expression: str, columns: list[str], current_col_idx: int) -> str:
     """Parse and convert an expression to Polars syntax.
 
     Replaces column references with Polars col() expressions:
@@ -212,7 +240,7 @@ def parse_polars_expression(expression: str, df: pl.DataFrame, current_col_idx:
 
     Args:
         expression: The input expression as a string.
-
+        columns: The list of column names in the DataFrame.
         current_col_idx: The index of the currently selected column (0-based). Used for $_ reference.
 
     Returns:
@@ -242,19 +270,19 @@ def parse_polars_expression(expression: str, df: pl.DataFrame, current_col_idx:
 
         if col_ref == "_":
             # Current selected column
-            col_name =
+            col_name = columns[current_col_idx]
         elif col_ref == "#":
             # RIDX is used to store 0-based row index; add 1 for 1-based index
             return f"(pl.col('{RIDX}') + 1)"
         elif col_ref.isdigit():
             # Column by 1-based index
             col_idx = int(col_ref) - 1
-            if col_idx < 0 or col_idx >= len(
+            if col_idx < 0 or col_idx >= len(columns):
                 raise ValueError(f"Column index out of range: ${col_ref}")
-            col_name =
+            col_name = columns[col_idx]
         else:
             # Column by name
-            if col_ref not in
+            if col_ref not in columns:
                 raise ValueError(f"Column not found: ${col_ref}")
             col_name = col_ref
 
@@ -283,7 +311,7 @@ def tentative_expr(term: str) -> bool:
     return False
 
 
-def validate_expr(term: str,
+def validate_expr(term: str, columns: list[str], current_col_idx: int) -> pl.Expr | None:
     """Validate and return the expression.
 
     Parses a user-provided expression string and validates it as a valid Polars expression.
@@ -291,7 +319,7 @@ def validate_expr(term: str, df: pl.DataFrame, current_col_idx: int) -> pl.Expr
 
     Args:
         term: The input expression as a string.
-
+        columns: The list of column names in the DataFrame.
         current_col_idx: The index of the currently selected column (0-based). Used for $_ reference.
 
     Returns:
@@ -304,7 +332,7 @@ def validate_expr(term: str, df: pl.DataFrame, current_col_idx: int) -> pl.Expr
 
     try:
         # Parse the expression
-        expr_str = parse_polars_expression(term,
+        expr_str = parse_polars_expression(term, columns, current_col_idx)
 
         # Validate by evaluating it
         try:
```
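The `$` references documented above resolve to `pl.col()` expressions. A simplified, hypothetical re-implementation of just the substitution step (the real `parse_polars_expression` also handles `$#` row indexing via `RIDX`, bounds checks, and error reporting):

```python
import re


def resolve_refs(expression: str, columns: list[str], current_col_idx: int) -> str:
    # $_ -> current column, $3 -> 1-based index, $name -> column by name.
    def repl(m: re.Match) -> str:
        col_ref = m.group(1)
        if col_ref == "_":
            col_name = columns[current_col_idx]
        elif col_ref.isdigit():
            col_name = columns[int(col_ref) - 1]
        else:
            col_name = col_ref
        return f"pl.col('{col_name}')"

    return re.sub(r"\$(\w+)", repl, expression)


print(resolve_refs("$_ * 2 + $price", ["qty", "price"], 0))
# pl.col('qty') * 2 + pl.col('price')
```

The diff for `common.py` continues with the new loading helpers.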
```diff
@@ -318,3 +346,324 @@ def validate_expr(term: str, df: pl.DataFrame, current_col_idx: int) -> pl.Expr
             raise ValueError(f"Failed to evaluate expression `{expr_str}`: {e}") from e
     except Exception as ve:
         raise ValueError(f"Failed to validate expression `{term}`: {ve}") from ve
+
+
+def load_dataframe(
+    filenames: list[str],
+    file_format: str | None = None,
+    has_header: bool = True,
+    infer_schema: bool = True,
+    comment_prefix: str | None = None,
+    skip_lines: int = 0,
+    skip_rows_after_header: int = 0,
+    null_values: list[str] | None = None,
+) -> list[tuple[pl.DataFrame, str, str]]:
+    """Load DataFrames from file specifications.
+
+    Handles loading from multiple files, single files, or stdin. For Excel files,
+    loads all sheets as separate entries. For other formats, loads as single file.
+
+    Args:
+        filenames: List of filenames to load. If single filename is "-", read from stdin.
+        file_format: Optional format specifier for input files (e.g., 'csv', 'excel').
+        has_header: Whether the input files have a header row. Defaults to True.
+        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
+        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
+        skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
+
+    Returns:
+        List of tuples of (DataFrame, filename, tabname) ready for display.
+    """
+    sources = []
+    prefix_sheet = len(filenames) > 1
+
+    for filename in filenames:
+        # Determine file format if not specified
+        if not file_format:
+            ext = Path(filename).suffix.lower()
+            if ext == ".gz" or ext == ".bz2" or ext == ".xz":
+                ext = Path(filename).with_suffix("").suffix.lower()
+            fmt = ext.removeprefix(".")
+
+            # Default to TSV
+            file_format = fmt if fmt in SUPPORTED_FORMATS else "tsv"
+
+        # Load each file
+        sources.extend(
+            load_file(
+                filename,
+                prefix_sheet=prefix_sheet,
+                file_format=file_format,
+                has_header=has_header,
+                infer_schema=infer_schema,
+                comment_prefix=comment_prefix,
+                skip_lines=skip_lines,
+                skip_rows_after_header=skip_rows_after_header,
+                null_values=null_values,
+            )
+        )
+
+    return sources
```
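The format inference in `load_dataframe` strips one compression suffix before looking at the extension and falls back to TSV. Standalone copy of that logic as a hypothetical helper:

```python
from pathlib import Path

SUPPORTED_FORMATS = {"tsv", "csv", "excel", "xlsx", "xls", "parquet", "json", "ndjson"}


def infer_format(filename: str) -> str:
    ext = Path(filename).suffix.lower()
    if ext in (".gz", ".bz2", ".xz"):
        # Strip the compression suffix and look at the inner extension.
        ext = Path(filename).with_suffix("").suffix.lower()
    fmt = ext.removeprefix(".")
    return fmt if fmt in SUPPORTED_FORMATS else "tsv"


print(infer_format("data.csv.gz"))  # csv
print(infer_format("report.xlsx"))  # xlsx
print(infer_format("notes.txt"))    # tsv (default)
```

Note that in `load_dataframe` the inferred value is assigned to `file_format` inside the loop, so the format inferred for the first file carries over to any remaining files.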
```diff
+
+
+RE_COMPUTE_ERROR = re.compile(r"at column '(.*?)' \(column number \d+\)")
+
+
+def handle_compute_error(
+    err_msg: str,
+    file_format: str | None,
+    infer_schema: bool,
+    schema_overrides: dict[str, pl.DataType] | None = None,
+) -> tuple[bool, dict[str, pl.DataType] | None]:
+    """Handle ComputeError during schema inference and determine retry strategy.
+
+    Analyzes the error message and determines whether to retry with schema overrides,
+    disable schema inference, or exit with an error.
+
+    Args:
+        err_msg: The error message from the ComputeError exception.
+        file_format: The file format being loaded (tsv, csv, etc.).
+        infer_schema: Whether schema inference is currently enabled.
+        schema_overrides: Current schema overrides, if any.
+
+    Returns:
+        A tuple of (infer_schema, schema_overrides):
+
+    Raises:
+        SystemExit: If the error is unrecoverable.
+    """
+    # Already disabled schema inference, cannot recover
+    if not infer_schema:
+        print(f"Error loading with schema inference disabled:\n{err_msg}", file=sys.stderr)
+        sys.exit(1)
+
+    # Schema mismatch error
+    if "found more fields than defined in 'Schema'" in err_msg:
+        print(f"Input might be malformed:\n{err_msg}", file=sys.stderr)
+        sys.exit(1)
+
+    # ComputeError: could not parse `n.a. as of 04.01.022` as `dtype` i64 at column 'PubChemCID' (column number 16)
+    if file_format in ("tsv", "csv") and (m := RE_COMPUTE_ERROR.search(err_msg)):
+        col_name = m.group(1)
+
+        if schema_overrides is None:
+            schema_overrides = {}
+        schema_overrides.update({col_name: pl.String})
+    else:
+        infer_schema = False
+
+    return infer_schema, schema_overrides
+
+
```
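The recovery path hinges on `RE_COMPUTE_ERROR` extracting the offending column name from the Polars error text (sample message taken from the comment in the source):

```python
import re

RE_COMPUTE_ERROR = re.compile(r"at column '(.*?)' \(column number \d+\)")

err_msg = (
    "could not parse `n.a. as of 04.01.022` as `dtype` i64 "
    "at column 'PubChemCID' (column number 16)"
)

if m := RE_COMPUTE_ERROR.search(err_msg):
    # This column gets retried as pl.String via schema_overrides.
    print(m.group(1))  # PubChemCID
```

When the message does not match, or the format is not CSV/TSV, schema inference is disabled wholesale and the load is retried once more.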
```diff
+def load_stdin(
+    stdin_data=None,
+    file_format: str | None = None,
+    has_header: bool = True,
+    infer_schema: bool = True,
+    comment_prefix: str | None = None,
+    skip_lines: int = 0,
+    skip_rows_after_header: int = 0,
+    schema_overrides: dict[str, pl.DataType] | None = None,
+    null_values: list[str] | None = None,
+) -> list[tuple[pl.DataFrame, str, str]]:
+    """Load DataFrame from stdin.
+
+    If a ComputeError occurs during schema inference for a column, attempts to recover
+    by treating that column as a string and retrying the load. This process repeats until
+    all columns are successfully loaded or no further recovery is possible.
+
+    Args:
+        stdin_data: Optional stdin data as string. If None, read from sys.stdin.
+        file_format: Optional format specifier for input files (e.g., 'csv', 'excel').
+        has_header: Whether the input files have a header row. Defaults to True.
+        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
+        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
+        skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
+
+    Returns:
+        List of tuples of (DataFrame, filename, tabname) ready for display.
+    """
+    import os
+    from io import StringIO
+
+    sources = []
+
+    # Read from stdin into memory first (stdin is not seekable)
+    if stdin_data is None:
+        stdin_data = sys.stdin.read()
+
+    # Reopen stdin to /dev/tty for proper terminal interaction
+    try:
+        tty = open("/dev/tty")
+        os.dup2(tty.fileno(), sys.stdin.fileno())
+    except (OSError, FileNotFoundError):
+        pass
+
+    lf = pl.scan_csv(
+        StringIO(stdin_data),
+        separator="," if file_format == "csv" else "\t",
+        has_header=has_header,
+        infer_schema=infer_schema,
+        comment_prefix=comment_prefix,
+        skip_lines=skip_lines,
+        skip_rows_after_header=skip_rows_after_header,
+        schema_overrides=schema_overrides,
+        null_values=null_values,
+    )
+
+    sources = [(lf, f"stdin.{file_format}" if file_format else "stdin", "stdin")]
+
+    # Attempt to collect, handling ComputeError for schema inference issues
+    try:
+        sources = [(lf.collect(), fn, tn) for lf, fn, tn in sources]
+    except pl.exceptions.ComputeError as ce:
+        # Handle the error and determine retry strategy
+        infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)
+
+        # Retry loading with updated schema overrides
+        return load_stdin(
+            stdin_data,
+            file_format=file_format,
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+        )
+
+    return sources
```
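`load_stdin` drains the pipe into memory and then re-points stdin at the controlling terminal so the Textual UI can still read keystrokes. The trick in isolation (a hypothetical `rewire_stdin_to_tty` helper; POSIX-only, silently skipped when no TTY is available, as in the source):

```python
import os
import sys


def rewire_stdin_to_tty() -> str:
    data = sys.stdin.read()  # consume the pipe completely
    try:
        tty = open("/dev/tty")  # controlling terminal, if any
        os.dup2(tty.fileno(), sys.stdin.fileno())
    except (OSError, FileNotFoundError):
        pass  # not attached to a terminal; leave stdin as-is
    return data
```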
```diff
+
+
+def load_file(
+    filename: str,
+    first_sheet: bool = False,
+    prefix_sheet: bool = False,
+    file_format: str | None = None,
+    has_header: bool = True,
+    infer_schema: bool = True,
+    comment_prefix: str | None = None,
+    skip_lines: int = 0,
+    skip_rows_after_header: int = 0,
+    schema_overrides: dict[str, pl.DataType] | None = None,
+    null_values: list[str] | None = None,
+) -> list[tuple[pl.DataFrame, str, str]]:
+    """Load a single file.
+
+    For Excel files, when `first_sheet` is True, returns only the first sheet. Otherwise, returns one entry per sheet.
+    For other files or multiple files, returns one entry per file.
+
+    If a ComputeError occurs during schema inference for a column, attempts to recover
+    by treating that column as a string and retrying the load. This process repeats until
+    all columns are successfully loaded or no further recovery is possible.
+
+    Args:
+        filename: Path to file to load.
+        first_sheet: If True, only load first sheet for Excel files. Defaults to False.
+        prefix_sheet: If True, prefix filename to sheet name as the tab name for Excel files. Defaults to False.
+        file_format: Optional format specifier (i.e., 'tsv', 'csv', 'excel', 'parquet', 'json', 'ndjson') for input files.
+            By default, infers from file extension.
+        has_header: Whether the input files have a header row. Defaults to True.
+        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
+        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        skip_lines: Number of lines to skip when reading CSV/TSV files. The header will be parsed at this offset. Defaults to 0.
+        skip_rows_after_header: Number of rows to skip after header when reading CSV/TSV files. Defaults to 0.
+
+    Returns:
+        List of tuples of (DataFrame, filename, tabname).
+    """
+    sources = []
+
+    if filename == "-":
+        return load_stdin(
+            file_format=file_format,
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+        )
+
+    filepath = Path(filename)
+
+    # Load based on file format
+    if file_format in ("tsv", "csv"):
+        lf = pl.scan_csv(
+            filename,
+            separator="\t" if file_format == "tsv" else ",",
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+        )
+        sources.append((lf, filename, filepath.stem))
+    elif file_format in ("xlsx", "xls", "excel"):
+        if first_sheet:
+            # Read only the first sheet for multiple files
+            lf = pl.read_excel(filename).lazy()
+            sources.append((lf, filename, filepath.stem))
+        else:
+            # For single file, expand all sheets
+            sheets = pl.read_excel(filename, sheet_id=0)
+            for sheet_name, df in sheets.items():
+                tabname = f"{filepath.stem}_{sheet_name}" if prefix_sheet else sheet_name
+                sources.append((df.lazy(), filename, tabname))
+    elif file_format == "parquet":
+        lf = pl.scan_parquet(filename)
+        sources.append((lf, filename, filepath.stem))
+    elif file_format == "json":
+        lf = pl.read_json(filename).lazy()
+        sources.append((lf, filename, filepath.stem))
+    elif file_format == "ndjson":
+        lf = pl.scan_ndjson(filename, schema_overrides=schema_overrides)
+        sources.append((lf, filename, filepath.stem))
+    else:
+        raise ValueError(f"Unsupported file format: {file_format}. Supported formats are: {SUPPORTED_FORMATS}")
+
+    # Attempt to collect, handling ComputeError for schema inference issues
+    try:
+        sources = [(lf.collect(), fn, tn) for lf, fn, tn in sources]
+    except pl.exceptions.ComputeError as ce:
+        # Handle the error and determine retry strategy
+        infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)
+
+        # Retry loading with updated schema overrides
+        return load_file(
+            filename,
+            file_format=file_format,
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+        )
+
+    return sources
+
+
+def now() -> str:
+    """Get the current local time as a formatted string."""
+    import time
+
+    return time.strftime("%m/%d/%Y %H:%M:%S", time.localtime())
+
+
+async def sleep_async(seconds: float) -> None:
+    """Async sleep to yield control back to the event loop.
+
+    Args:
+        seconds: The number of seconds to sleep.
+    """
+    import asyncio
+
+    await asyncio.sleep(seconds)
```
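In `load_file`, a single Excel workbook is expanded into one tab per sheet: with `sheet_id=0`, `pl.read_excel` returns a dict mapping sheet names to DataFrames. Illustrative usage (`workbook.xlsx` is a placeholder path):

```python
import polars as pl

# sheet_id=0 returns every sheet as {sheet_name: DataFrame}.
sheets = pl.read_excel("workbook.xlsx", sheet_id=0)
for sheet_name, df in sheets.items():
    tabname = f"workbook_{sheet_name}"  # prefixed when several files are open
    print(tabname, df.shape)
```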
dataframe_textual/data_frame_help_panel.py
CHANGED

```diff
@@ -79,10 +79,12 @@ class DataFrameHelpPanel(Widget):
         None
         """
 
-        def update_help(focused_widget: Widget | None):
-
+        # def update_help(focused_widget: Widget | None):
+        #     self.update_help(focused_widget)
 
-        self.watch(self.screen, "focused", update_help)
+        # self.watch(self.screen, "focused", update_help)
+
+        self.update_help(self.screen.focused)
 
     def update_help(self, focused_widget: Widget | None) -> None:
         """Update the help for the focused widget.
@@ -96,7 +98,7 @@ class DataFrameHelpPanel(Widget):
             return
         self.set_class(focused_widget is not None, "-show-help")
         if focused_widget is not None:
-            help = self.app.HELP + "\n" + focused_widget.HELP or ""
+            help = (self.app.HELP or "") + "\n" + (focused_widget.HELP or "")
             if not help:
                 self.remove_class("-show-help")
             try:
```