dataframe-textual 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataframe_textual/__init__.py +60 -0
- dataframe_textual/__main__.py +107 -0
- dataframe_textual/common.py +786 -0
- dataframe_textual/data_frame_help_panel.py +115 -0
- dataframe_textual/data_frame_table.py +3940 -0
- dataframe_textual/data_frame_viewer.py +625 -0
- dataframe_textual/sql_screen.py +238 -0
- dataframe_textual/table_screen.py +527 -0
- dataframe_textual/yes_no_screen.py +752 -0
- dataframe_textual-2.2.1.dist-info/METADATA +846 -0
- dataframe_textual-2.2.1.dist-info/RECORD +14 -0
- dataframe_textual-2.2.1.dist-info/WHEEL +4 -0
- dataframe_textual-2.2.1.dist-info/entry_points.txt +3 -0
- dataframe_textual-2.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,786 @@
"""Common utilities and constants for dataframe_viewer."""

import os
import re
import sys
from dataclasses import dataclass
from io import StringIO
from pathlib import Path
from typing import Any

import polars as pl
from rich.text import Text

# Supported file formats
SUPPORTED_FORMATS = {"tsv", "tab", "csv", "excel", "xlsx", "xls", "parquet", "json", "ndjson"}


# Boolean string mappings
BOOLS = {
    "true": True,
    "t": True,
    "yes": True,
    "y": True,
    "1": True,
    "false": False,
    "f": False,
    "no": False,
    "n": False,
    "0": False,
}

# Special strings for representing and displaying null values
NULL = "NULL"
NULL_DISPLAY = "-"


def format_float(value: float, thousand_separator: bool = False, precision: int = 2) -> str:
    """Format a float value, keeping integral values without a decimal point.

    Args:
        value: The float value to format.
        thousand_separator: Whether to include thousand separators. Defaults to False.
        precision: Number of decimal places for non-integral values; when non-positive,
            integral values are shown with ``-precision`` decimal places instead. Defaults to 2.

    Returns:
        The formatted float as a string.
    """
    if (val := int(value)) == value:
        if precision > 0:
            return f"{val:,}" if thousand_separator else str(val)
        else:
            return f"{val:,.{-precision}f}" if thousand_separator else f"{val:.{-precision}f}"
    else:
        if precision > 0:
            return f"{value:,.{precision}f}" if thousand_separator else f"{value:.{precision}f}"
        else:
            return f"{value:,f}" if thousand_separator else str(value)


@dataclass
class DtypeClass:
    """Data type class configuration.

    Attributes:
        gtype: Generic, high-level type as a string.
        style: Style string for display purposes.
        justify: Text justification for display.
        itype: Input type for validation.
        convert: Conversion function for the data type.
    """

    gtype: str  # generic, high-level type
    style: str
    justify: str
    itype: str
    convert: Any

    def format(
        self, val: Any, style: str | None = None, justify: str | None = None, thousand_separator: bool = False
    ) -> Text:
        """Format the value according to its data type.

        Args:
            val: The value to format.
            style: Optional style override; an empty string clears the default style.
            justify: Optional justification override; an empty string clears the default.
            thousand_separator: Whether to include thousand separators for numbers. Defaults to False.

        Returns:
            The formatted value as a Text.
        """
        # Format the value
        if val is None:
            text_val = NULL_DISPLAY
        elif self.gtype == "integer" and thousand_separator:
            text_val = f"{val:,}"
        elif self.gtype == "float":
            text_val = format_float(val, thousand_separator)
        else:
            text_val = str(val)

        return Text(
            text_val,
            style="" if style == "" else (style or self.style),
            justify="" if justify == "" else (justify or self.justify),
            overflow="ellipsis",
            no_wrap=True,
        )


# itype is used by Input widget for input validation
# fmt: off
STYLES = {
    # str
    pl.String: DtypeClass(gtype="string", style="green", justify="left", itype="text", convert=str),
    # int
    pl.Int8: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    pl.Int16: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    pl.Int32: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    pl.Int64: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    pl.Int128: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    pl.UInt8: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    pl.UInt16: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    pl.UInt32: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    pl.UInt64: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
    # float
    pl.Float32: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
    pl.Float64: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
    pl.Decimal: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
    # bool
    pl.Boolean: DtypeClass(gtype="boolean", style="blue", justify="center", itype="text", convert=lambda x: BOOLS[x.lower()]),
    # temporal
    pl.Date: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
    pl.Datetime: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
    pl.Time: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
    # unknown
    pl.Unknown: DtypeClass(gtype="unknown", style="", justify="", itype="text", convert=str),
}
# fmt: on

# Subscript digits mapping for sort indicators
SUBSCRIPT_DIGITS = {
    0: "₀",
    1: "₁",
    2: "₂",
    3: "₃",
    4: "₄",
    5: "₅",
    6: "₆",
    7: "₇",
    8: "₈",
    9: "₉",
}
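
# Illustrative sketch (not part of the module): render a multi-digit sort
# order, e.g. 12, as a subscript suffix for a sort indicator.
#   "".join(SUBSCRIPT_DIGITS[int(d)] for d in str(12))  # -> "₁₂"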

# Cursor types ("none" removed)
CURSOR_TYPES = ["row", "column", "cell"]

# Name of the hidden column that maps row indices between the filtered and original dataframes
RID = "^_RID_^"


@dataclass
class Source:
    """Data source representation.

    Attributes:
        frame: The Polars DataFrame or LazyFrame.
        filename: The name of the source file.
        tabname: The name of the tab to display.
    """

    frame: pl.DataFrame | pl.LazyFrame
    filename: str
    tabname: str


def DtypeConfig(dtype: pl.DataType) -> DtypeClass:
    """Get the DtypeClass configuration for a given Polars data type.

    Retrieves styling and formatting configuration based on the Polars data type,
    including style (color), justification, and type conversion function.

    Args:
        dtype: A Polars data type to get configuration for.

    Returns:
        A DtypeClass containing style, justification, input type, and conversion function.
    """
    if dc := STYLES.get(dtype):
        return dc
    elif isinstance(dtype, pl.Datetime):
        return STYLES[pl.Datetime]
    elif isinstance(dtype, pl.Date):
        return STYLES[pl.Date]
    elif isinstance(dtype, pl.Time):
        return STYLES[pl.Time]
    else:
        return STYLES[pl.Unknown]
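
# Usage sketch (illustrative, not part of the module):
#   DtypeConfig(pl.Int64).format(1234567, thousand_separator=True)
#   # -> Text("1,234,567", style="cyan", justify="right")
#   DtypeConfig(pl.Datetime("us"))  # parametrized temporal dtypes resolve via the isinstance fallbacks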


def format_row(vals, dtypes, styles: list[str | None] | None = None, thousand_separator=False) -> list[Text]:
    """Format a single row with proper styling and justification.

    Converts raw row values to formatted Rich Text objects with appropriate
    styling (colors), justification, and null value handling based on data types.

    Args:
        vals: The list of values in the row.
        dtypes: The list of data types corresponding to each value.
        styles: Optional list of style overrides for each value. Defaults to None.
        thousand_separator: Whether to include thousand separators for numbers. Defaults to False.

    Returns:
        A list of Rich Text objects with proper formatting applied.
    """
    formatted_row = []

    for idx, (val, dtype) in enumerate(zip(vals, dtypes, strict=True)):
        dc = DtypeConfig(dtype)
        formatted_row.append(
            dc.format(
                val,
                style=styles[idx] if styles and styles[idx] else None,
                thousand_separator=thousand_separator,
            )
        )

    return formatted_row
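
# Usage sketch (illustrative, not part of the module):
#   format_row([1234, None], [pl.Int64, pl.String], thousand_separator=True)
#   # -> [Text("1,234", style="cyan"), Text("-", style="green")]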


def rindex(lst: list, value, pos: int | None = None) -> int:
    """Return the last index of value in a list. Return -1 if not found.

    Searches through the list in reverse order to find the last occurrence
    of the given value.

    Args:
        lst: The list to search through.
        value: The value to find.
        pos: Optional upper bound; positions greater than `pos` are ignored. Defaults to None.

    Returns:
        The index (0-based) of the last occurrence, or -1 if not found.
    """
    n = len(lst)
    for i, item in enumerate(reversed(lst)):
        if pos is not None and (n - 1 - i) > pos:
            continue
        if item == value:
            return n - 1 - i
    return -1
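
# Usage sketch (illustrative, not part of the module):
#   rindex([1, 2, 1, 3], 1)         # -> 2 (last occurrence)
#   rindex([1, 2, 1, 3], 1, pos=1)  # -> 0 (positions after index 1 are ignored)
#   rindex([1, 2, 1, 3], 9)         # -> -1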


def get_next_item(lst: list[Any], current, offset=1) -> Any:
    """Return the next item in the list after the current item, cycling if needed.

    Finds the current item in the list and returns the item at position (current_index + offset),
    wrapping around to the beginning if necessary.

    Args:
        lst: The list to cycle through.
        current: The current item (must be in the list).
        offset: The number of positions to advance. Defaults to 1.

    Returns:
        The next item in the list after advancing by the offset.

    Raises:
        ValueError: If the current item is not found in the list.
    """
    if current not in lst:
        raise ValueError("Current item not in list")
    current_index = lst.index(current)
    next_index = (current_index + offset) % len(lst)
    return lst[next_index]
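
# Usage sketch (illustrative, not part of the module):
#   get_next_item(CURSOR_TYPES, "cell")      # -> "row" (wraps past the end)
#   get_next_item(CURSOR_TYPES, "row", -1)   # -> "cell" (negative offsets cycle backwards)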


def parse_placeholders(template: str, columns: list[str], current_cidx: int) -> list[str | pl.Expr]:
    """Parse a template string into a list of strings or Polars expressions.

    Supports multiple placeholder types:
    - `$_` - Current column (based on current_cidx parameter)
    - `$#` - Row index (1-based)
    - `$1`, `$2`, etc. - Column index (1-based)
    - `$name` - Column name (e.g., `$product_id`)
    - `` $`col name` `` - Column name with spaces (e.g., `` $`product id` ``)

    Args:
        template: The template string containing placeholders and literal text
        columns: List of column names in the dataframe
        current_cidx: 0-based index into `columns` of the current column, used for `$_` references

    Returns:
        A list of strings (literal text) and Polars expressions (for column references)

    Raises:
        ValueError: If an invalid column index or non-existent column name is referenced
    """
    if "$" not in template or template.endswith("$"):
        return [template]

    # Regex matches: $_ or $# or $\d+ or $`...` (backtick-quoted names with spaces) or $\w+ (column names)
    # Pattern explanation:
    # \$(_|#|\d+|`[^`]+`|[a-zA-Z_]\w*)
    # - $_ : current column
    # - $# : row index
    # - $\d+ : column by index (1-based)
    # - $`[^`]+` : column by name with spaces (backtick quoted)
    # - $[a-zA-Z_]\w* : column by name without spaces
    placeholder_pattern = r"\$(_|#|\d+|`[^`]+`|[a-zA-Z_]\w*)"
    placeholders = re.finditer(placeholder_pattern, template)

    parts = []
    last_end = 0

    # Get current column name for $_ references
    try:
        col_name = columns[current_cidx]
    except IndexError:
        raise ValueError(f"Current column index {current_cidx} is out of range for columns list")

    for match in placeholders:
        # Add literal text before this placeholder
        if match.start() > last_end:
            parts.append(template[last_end : match.start()])

        placeholder = match.group(1)  # Extract content after '$'

        if placeholder == "_":
            # $_ refers to current column (where cursor was)
            parts.append(pl.col(col_name))
        elif placeholder == "#":
            # $# refers to row index (1-based)
            parts.append(pl.col(RID))
        elif placeholder.isdigit():
            # $1, $2, etc. refer to columns by 1-based position index
            col_idx = int(placeholder) - 1  # Convert to 0-based
            try:
                col_ref = columns[col_idx]
                parts.append(pl.col(col_ref))
            except IndexError:
                raise ValueError(f"Invalid column index: ${placeholder} (valid range: $1 to ${len(columns)})")
        elif placeholder.startswith("`") and placeholder.endswith("`"):
            # $`col name` refers to column by name with spaces
            col_ref = placeholder[1:-1]  # Remove backticks
            if col_ref in columns:
                parts.append(pl.col(col_ref))
            else:
                raise ValueError(f"Column not found: ${placeholder} (available columns: {', '.join(columns)})")
        else:
            # $name refers to column by name
            if placeholder in columns:
                parts.append(pl.col(placeholder))
            else:
                raise ValueError(f"Column not found: ${placeholder} (available columns: {', '.join(columns)})")

        last_end = match.end()

    # Add remaining literal text after last placeholder
    if last_end < len(template):
        parts.append(template[last_end:])

    # If no placeholders found, treat entire template as literal
    if not parts:
        parts = [template]

    return parts
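
# Usage sketch (illustrative, not part of the module):
#   parse_placeholders("$age vs $1", ["name", "age"], 0)
#   # -> [pl.col("age"), " vs ", pl.col("name")]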


def parse_polars_expression(expression: str, columns: list[str], current_cidx: int) -> str:
    """Parse and convert an expression to Polars syntax.

    Replaces column references with Polars col() expressions:
    - $_ - Current selected column
    - $# - Row index (1-based)
    - $1, $2, etc. - Column index (1-based)
    - $col_name - Column name (valid identifier starting with _ or letter)
    - $`col name` - Column name with spaces (backtick quoted)

    Examples:
        - "$_ > 50" -> "pl.col('current_col') > 50"
        - "$# > 10" -> "(pl.col('^_RID_^') + 1) > 10"
        - "$1 > 50" -> "pl.col('col0') > 50"
        - "$name == 'Alex'" -> "pl.col('name') == 'Alex'"
        - "$age < $salary" -> "pl.col('age') < pl.col('salary')"
        - "$`product id` > 100" -> "pl.col('product id') > 100"

    Args:
        expression: The input expression as a string.
        columns: The list of column names in the DataFrame.
        current_cidx: The index of the currently selected column (0-based). Used for $_ reference.

    Returns:
        A Python expression string with $references replaced by pl.col() calls.

    Raises:
        ValueError: If a column reference is invalid.
    """
    # Early return if no $ present
    if "$" not in expression:
        if "pl." in expression:
            # This may be a valid Polars expression already
            return expression
        else:
            # Return as a literal string
            return f"pl.lit({expression})"

    parts = parse_placeholders(expression, columns, current_cidx)

    result = []
    for part in parts:
        if isinstance(part, pl.Expr):
            col = part.meta.output_name()

            if col == RID:  # Convert to 1-based
                result.append(f"(pl.col('{col}') + 1)")
            else:
                result.append(f"pl.col('{col}')")
        else:
            result.append(part)

    return "".join(result)


def tentative_expr(term: str) -> bool:
    """Check if the given term could be a Polars expression.

    Heuristically determines whether a string might represent a Polars expression
    based on common patterns like column references ($) or direct Polars syntax (pl.).

    Args:
        term: The string to check.

    Returns:
        True if the term appears to be a Polars expression, False otherwise.
    """
    if "$" in term and not term.endswith("$"):
        return True
    if "pl." in term:
        return True
    return False
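
# Usage sketch (illustrative, not part of the module):
#   tentative_expr("$age > 5")      # -> True (column reference)
#   tentative_expr("pl.len() > 5")  # -> True (direct Polars syntax)
#   tentative_expr("plain text")    # -> False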


def validate_expr(term: str, columns: list[str], current_col_idx: int) -> pl.Expr | None:
    """Validate and return the expression.

    Parses a user-provided expression string and validates it as a valid Polars expression.
    Converts special syntax like $_ references to proper Polars col() expressions.

    Args:
        term: The input expression as a string.
        columns: The list of column names in the DataFrame.
        current_col_idx: The index of the currently selected column (0-based). Used for $_ reference.

    Returns:
        A valid Polars expression object if validation succeeds.

    Raises:
        ValueError: If the expression is invalid, contains non-existent column references, or cannot be evaluated.
    """
    term = term.strip()

    try:
        # Parse the expression
        expr_str = parse_polars_expression(term, columns, current_col_idx)

        # Validate by evaluating it
        try:
            expr = eval(expr_str, {"pl": pl})
            if not isinstance(expr, pl.Expr):
                raise ValueError(f"Expression evaluated to `{type(expr).__name__}` instead of a Polars expression")

            # Expression is valid
            return expr
        except Exception as e:
            raise ValueError(f"Failed to evaluate expression `{expr_str}`: {e}") from e
    except Exception as ve:
        raise ValueError(f"Failed to validate expression `{term}`: {ve}") from ve


def load_dataframe(
    filenames: list[str],
    file_format: str | None = None,
    has_header: bool = True,
    infer_schema: bool = True,
    comment_prefix: str | None = None,
    quote_char: str | None = '"',
    skip_lines: int = 0,
    skip_rows_after_header: int = 0,
    null_values: list[str] | None = None,
    ignore_errors: bool = False,
    truncate_ragged_lines: bool = False,
) -> list[Source]:
    """Load DataFrames from file specifications.

    Handles loading from multiple files, single files, or stdin. For Excel files,
    loads all sheets as separate entries. For other formats, loads as single file.

    Args:
        filenames: List of filenames to load. If single filename is "-", read from stdin.
        file_format: Optional format specifier for input files (e.g., 'csv', 'excel').
        has_header: Whether the input files have a header row. Defaults to True.
        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
        quote_char: Quote character for reading CSV/TSV files. Defaults to '"'.
        skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
        skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
        null_values: List of values to interpret as null when reading CSV/TSV files. Defaults to None.
        ignore_errors: Whether to ignore errors when reading CSV/TSV files. Defaults to False.
        truncate_ragged_lines: Whether to truncate lines with too many fields in CSV/TSV files. Defaults to False.

    Returns:
        List of `Source` objects.
    """
    data: list[Source] = []
    prefix_sheet = len(filenames) > 1

    for filename in filenames:
        if filename == "-":
            source = StringIO(sys.stdin.read())
            file_format = file_format or "tsv"

            # Reopen stdin to /dev/tty for proper terminal interaction
            try:
                tty = open("/dev/tty")
                os.dup2(tty.fileno(), sys.stdin.fileno())
            except (OSError, FileNotFoundError):
                pass
        else:
            source = filename

        # If not specified, determine file format (may be different for each file)
        fmt = file_format
        if not fmt:
            ext = Path(filename).suffix.lower()
            if ext == ".gz":
                ext = Path(filename).with_suffix("").suffix.lower()
            fmt = ext.removeprefix(".")

            # Default to TSV
            if not fmt or fmt not in SUPPORTED_FORMATS:
                fmt = "tsv"

        # Load the file
        data.extend(
            load_file(
                source,
                prefix_sheet=prefix_sheet,
                file_format=fmt,
                has_header=has_header,
                infer_schema=infer_schema,
                comment_prefix=comment_prefix,
                quote_char=quote_char,
                skip_lines=skip_lines,
                skip_rows_after_header=skip_rows_after_header,
                null_values=null_values,
                ignore_errors=ignore_errors,
                truncate_ragged_lines=truncate_ragged_lines,
            )
        )

    return data
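
# Usage sketch (illustrative, not part of the module; filenames are hypothetical):
#   for src in load_dataframe(["data.csv", "book.xlsx"]):
#       print(src.tabname, src.frame.shape)  # one Source per file (or per Excel sheet)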


RE_COMPUTE_ERROR = re.compile(r"at column '(.*?)' \(column number \d+\)")
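# Illustrative match (not part of the module):
#   RE_COMPUTE_ERROR.search("could not parse `n.a.` as dtype `i64` at column 'PubChemCID' (column number 16)").group(1)
#   # -> "PubChemCID"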


def handle_compute_error(
    err_msg: str,
    file_format: str | None,
    infer_schema: bool,
    schema_overrides: dict[str, pl.DataType] | None = None,
) -> tuple[bool, dict[str, pl.DataType] | None]:
    """Handle ComputeError during schema inference and determine retry strategy.

    Analyzes the error message and determines whether to retry with schema overrides,
    disable schema inference, or exit with an error.

    Args:
        err_msg: The error message from the ComputeError exception.
        file_format: The file format being loaded (tsv, csv, etc.).
        infer_schema: Whether schema inference is currently enabled.
        schema_overrides: Current schema overrides, if any.

    Returns:
        A tuple of (infer_schema, schema_overrides): the updated inference flag and
        the schema overrides to use for the retry.

    Raises:
        SystemExit: If the error is unrecoverable.
    """
    # Already disabled schema inference, cannot recover
    if not infer_schema:
        print(f"Error loading even with schema inference disabled:\n{err_msg}", file=sys.stderr)

        if "CSV malformed" in err_msg:
            print(
                "\nSometimes quote characters might be mismatched. Try again with `-q` or `-E` to ignore errors",
                file=sys.stderr,
            )

        sys.exit(1)

    # Schema mismatch error
    if "found more fields than defined in 'Schema'" in err_msg:
        print(f"{err_msg}.\n\nInput might be malformed. Try again with `-t` to truncate ragged lines", file=sys.stderr)
        sys.exit(1)

    # Field ... is not properly escaped
    if "is not properly escaped" in err_msg:
        print(
            f"{err_msg}\n\nQuoting might be causing the issue. Try again with `-q` to disable quoting", file=sys.stderr
        )
        sys.exit(1)

    # ComputeError: could not parse `n.a. as of 04.01.022` as `dtype` i64 at column 'PubChemCID' (column number 16)
    if file_format in ("tsv", "csv") and (m := RE_COMPUTE_ERROR.search(err_msg)):
        col_name = m.group(1)

        if schema_overrides is None:
            schema_overrides = {}
        schema_overrides.update({col_name: pl.String})
    else:
        infer_schema = False

    return infer_schema, schema_overrides


def load_file(
    source: str | StringIO,
    first_sheet: bool = False,
    prefix_sheet: bool = False,
    file_format: str | None = None,
    has_header: bool = True,
    infer_schema: bool = True,
    comment_prefix: str | None = None,
    quote_char: str | None = '"',
    skip_lines: int = 0,
    skip_rows_after_header: int = 0,
    schema_overrides: dict[str, pl.DataType] | None = None,
    null_values: list[str] | None = None,
    ignore_errors: bool = False,
    truncate_ragged_lines: bool = False,
) -> list[Source]:
    """Load a single file.

    For Excel files, when `first_sheet` is True, returns only the first sheet. Otherwise, returns one entry per sheet.
    For other files or multiple files, returns one entry per file.

    If a ComputeError occurs during schema inference for a column, attempts to recover
    by treating that column as a string and retrying the load. This process repeats until
    all columns are successfully loaded or no further recovery is possible.

    Args:
        source: Path to the file to load, or an in-memory buffer for stdin input.
        first_sheet: If True, only load first sheet for Excel files. Defaults to False.
        prefix_sheet: If True, prefix filename to sheet name as the tab name for Excel files. Defaults to False.
        file_format: Optional format specifier (i.e., 'tsv', 'csv', 'excel', 'parquet', 'json', 'ndjson') for input files.
            By default, infers from file extension.
        has_header: Whether the input files have a header row. Defaults to True.
        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
        quote_char: Quote character for reading CSV/TSV files. Defaults to '"'.
        skip_lines: Number of lines to skip when reading CSV/TSV files. The header will be parsed at this offset. Defaults to 0.
        skip_rows_after_header: Number of rows to skip after header when reading CSV/TSV files. Defaults to 0.
        schema_overrides: Optional dictionary of column name to Polars data type to override inferred schema.
        null_values: List of values to interpret as null when reading CSV/TSV files. Defaults to None.
        ignore_errors: Whether to ignore errors when reading CSV/TSV files. Defaults to False.
        truncate_ragged_lines: Whether to truncate lines with too many fields in CSV/TSV files. Defaults to False.

    Returns:
        List of `Source` objects.
    """
    data: list[Source] = []

    filename = f"stdin.{file_format}" if isinstance(source, StringIO) else source
    filepath = Path(filename)

    if not file_format:
        ext = filepath.suffix.lower()
        if ext == ".gz":
            ext = Path(filename).with_suffix("").suffix.lower()
        file_format = ext.removeprefix(".")

    # Load based on file format
    if file_format in ("csv", "tsv"):
        lf = pl.scan_csv(
            source,
            separator="\t" if file_format == "tsv" else ",",
            has_header=has_header,
            infer_schema=infer_schema,
            comment_prefix=comment_prefix,
            quote_char=quote_char,
            skip_lines=skip_lines,
            skip_rows_after_header=skip_rows_after_header,
            schema_overrides=schema_overrides,
            null_values=null_values,
            ignore_errors=ignore_errors,
            truncate_ragged_lines=truncate_ragged_lines,
        )
        data.append(Source(lf, filename, filepath.stem))
    elif file_format in ("xlsx", "xls", "excel"):
        if first_sheet:
            # Read only the first sheet for multiple files
            lf = pl.read_excel(source).lazy()
            data.append(Source(lf, filename, filepath.stem))
        else:
            # For a single file, expand all sheets
            sheets = pl.read_excel(source, sheet_id=0)
            for sheet_name, df in sheets.items():
                tabname = f"{filepath.stem}_{sheet_name}" if prefix_sheet else sheet_name
                data.append(Source(df.lazy(), filename, tabname))
    elif file_format == "parquet":
        lf = pl.scan_parquet(source)
        data.append(Source(lf, filename, filepath.stem))
    elif file_format == "json":
        lf = pl.read_json(source).lazy()
        data.append(Source(lf, filename, filepath.stem))
    elif file_format == "ndjson":
        lf = pl.scan_ndjson(source, schema_overrides=schema_overrides)
        data.append(Source(lf, filename, filepath.stem))
    else:
        raise ValueError(f"Unsupported file format: {file_format}. Supported formats are: {SUPPORTED_FORMATS}")

    # Attempt to collect, handling ComputeError for schema inference issues
    try:
        data = [Source(src.frame.collect(), src.filename, src.tabname) for src in data]
    except pl.exceptions.ComputeError as ce:
        # Handle the error and determine retry strategy
        infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)

        # Retry loading with updated schema overrides, forwarding all options
        if isinstance(source, StringIO):
            source.seek(0)

        return load_file(
            source,
            first_sheet=first_sheet,
            prefix_sheet=prefix_sheet,
            file_format=file_format,
            has_header=has_header,
            infer_schema=infer_schema,
            comment_prefix=comment_prefix,
            quote_char=quote_char,
            skip_lines=skip_lines,
            skip_rows_after_header=skip_rows_after_header,
            schema_overrides=schema_overrides,
            null_values=null_values,
            ignore_errors=ignore_errors,
            truncate_ragged_lines=truncate_ragged_lines,
        )

    return data


def now() -> str:
    """Get the current local time as a formatted string."""
    import time

    return time.strftime("%m/%d/%Y %H:%M:%S", time.localtime())


async def sleep_async(seconds: float) -> None:
    """Async sleep to yield control back to the event loop.

    Args:
        seconds: The number of seconds to sleep.
    """
    import asyncio

    await asyncio.sleep(seconds)


def round_to_nearest_hundreds(num: int, N: int = 100) -> tuple[int, int]:
    """Round a number to the nearest multiple-of-N boundaries.

    Given a number, return a tuple of the two closest multiples of N that bracket it.

    Args:
        num: The number to round.
        N: The bracket size. Defaults to 100.

    Returns:
        A tuple (lower, upper) where:
        - lower is the largest multiple of N <= num
        - upper is the smallest multiple of N > num

    Examples:
        >>> round_to_nearest_hundreds(0)
        (0, 100)
        >>> round_to_nearest_hundreds(150)
        (100, 200)
        >>> round_to_nearest_hundreds(200)
        (200, 300)
    """
    lower = (num // N) * N
    upper = lower + N
    return (lower, upper)