dataframe-textual 0.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,20 @@
 """Common utilities and constants for dataframe_viewer."""
 
+import os
 import re
+import sys
 from dataclasses import dataclass
+from io import StringIO
+from pathlib import Path
 from typing import Any
 
 import polars as pl
 from rich.text import Text
 
+# Supported file formats
+SUPPORTED_FORMATS = {"tsv", "csv", "excel", "xlsx", "xls", "parquet", "json", "ndjson"}
+
+
 # Boolean string mappings
 BOOLS = {
     "true": True,
@@ -21,35 +29,59 @@ BOOLS = {
     "0": False,
 }
 
-# itype is used by Input widget for input validation
-# fmt: off
-STYLES = {
-    "Int64": {"style": "cyan", "justify": "right", "itype": "integer", "convert": int},
-    "Float64": {"style": "magenta", "justify": "right", "itype": "number", "convert": float},
-    "String": {"style": "green", "justify": "left", "itype": "text", "convert": str},
-    "Boolean": {"style": "blue", "justify": "center", "itype": "text", "convert": lambda x: BOOLS[x.lower()]},
-    "Date": {"style": "blue", "justify": "center", "itype": "text", "convert": str},
-    "Datetime": {"style": "blue", "justify": "center", "itype": "text", "convert": str},
-}
-# fmt: on
+# Special string to represent null value
+NULL = "NULL"
+NULL_DISPLAY = "-"
 
 
 @dataclass
-class DtypeConfig:
+class DtypeClass:
+    """Data type class configuration.
+
+    Attributes:
+        gtype: Generic, high-level type as a string.
+        style: Style string for display purposes.
+        justify: Text justification for display.
+        itype: Input type for validation.
+        convert: Conversion function for the data type.
+    """
+
+    gtype: str  # generic, high-level type
     style: str
     justify: str
     itype: str
     convert: Any
 
-    def __init__(self, dtype: pl.DataType):
-        dc = STYLES.get(
-            str(dtype), {"style": "", "justify": "", "itype": "text", "convert": str}
-        )
-        self.style = dc["style"]
-        self.justify = dc["justify"]
-        self.itype = dc["itype"]
-        self.convert = dc["convert"]
 
+# itype is used by Input widget for input validation
+# fmt: off
+STYLES = {
+    # str
+    pl.String: DtypeClass(gtype="string", style="green", justify="left", itype="text", convert=str),
+    # int
+    pl.Int8: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    pl.Int16: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    pl.Int32: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    pl.Int64: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    pl.Int128: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    pl.UInt8: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    pl.UInt16: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    pl.UInt32: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    pl.UInt64: DtypeClass(gtype="integer", style="cyan", justify="right", itype="integer", convert=int),
+    # float
+    pl.Float32: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
+    pl.Float64: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
+    pl.Decimal: DtypeClass(gtype="float", style="yellow", justify="right", itype="number", convert=float),
+    # bool
+    pl.Boolean: DtypeClass(gtype="boolean", style="blue", justify="center", itype="text", convert=lambda x: BOOLS[x.lower()]),
+    # temporal
+    pl.Date: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
+    pl.Datetime: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
+    pl.Time: DtypeClass(gtype="temporal", style="magenta", justify="center", itype="text", convert=str),
+    # unknown
+    pl.Unknown: DtypeClass(gtype="unknown", style="", justify="", itype="text", convert=str),
+}
+# fmt: on
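
A quick sketch of how the new dtype table is meant to be consumed (hypothetical REPL lines, not from the package):

    dc = STYLES[pl.Int64]               # DtypeClass(gtype="integer", style="cyan", ...)
    dc.convert("42")                    # -> 42
    STYLES[pl.Boolean].convert("TRUE")  # -> True, via the BOOLS mapping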
 
 # Subscript digits mapping for sort indicators
 SUBSCRIPT_DIGITS = {
@@ -68,53 +100,147 @@ SUBSCRIPT_DIGITS = {
 # Cursor types ("none" removed)
 CURSOR_TYPES = ["row", "column", "cell"]
 
-# Pagination settings
-INITIAL_BATCH_SIZE = 100  # Load this many rows initially
-BATCH_SIZE = 50  # Load this many rows when scrolling
+# For row index column
+RIDX = "^_ridx_^"
+
+
+@dataclass
+class Source:
+    """Data source representation.
+
+    Attributes:
+        frame: The Polars DataFrame or LazyFrame.
+        filename: The name of the source file.
+        tabname: The name of the tab to display.
+    """
+
+    frame: pl.DataFrame | pl.LazyFrame
+    filename: str
+    tabname: str
+
 
+def DtypeConfig(dtype: pl.DataType) -> DtypeClass:
+    """Get the DtypeClass configuration for a given Polars data type.
 
-def _format_row(vals, dtypes, apply_justify=True) -> list[Text]:
+    Retrieves styling and formatting configuration based on the Polars data type,
+    including style (color), justification, and type conversion function.
+
+    Args:
+        dtype: A Polars data type to get configuration for.
+
+    Returns:
+        A DtypeClass containing style, justification, input type, and conversion function.
+    """
+    if dc := STYLES.get(dtype):
+        return dc
+    elif isinstance(dtype, pl.Datetime):
+        return STYLES[pl.Datetime]
+    elif isinstance(dtype, pl.Date):
+        return STYLES[pl.Date]
+    elif isinstance(dtype, pl.Time):
+        return STYLES[pl.Time]
+    else:
+        return STYLES[pl.Unknown]
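
The `isinstance` branches above are presumably there for parametrized dtypes such as `pl.Datetime("us", "UTC")`, which don't appear as keys in STYLES. Expected behavior, sketched with hypothetical inputs:

    DtypeConfig(pl.Int64).gtype                  # "integer"
    DtypeConfig(pl.Datetime("us", "UTC")).gtype  # "temporal"
    DtypeConfig(pl.List(pl.Int64)).gtype         # "unknown"  (final else branch)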
+
+
+def format_float(value: float, thousand_separator: bool = False, precision: int = 2) -> str:
+    """Format a float value, keeping integers without decimal point.
+
+    Args:
+        value: The float value to format.
+        thousand_separator: Whether to include thousand separators. Defaults to False.
+        precision: Number of decimal places for non-integral values. Defaults to 2.
+
+    Returns:
+        The formatted float as a string.
+    """
+
+    if (val := int(value)) == value:
+        return f"{val:,}" if thousand_separator else str(val)
+    else:
+        if precision > 0:
+            return f"{value:,.{precision}f}" if thousand_separator else f"{value:.{precision}f}"
+        else:
+            return f"{value:,f}" if thousand_separator else str(value)
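
A few input/output pairs to pin down the intent (hypothetical doctest, not from the package):

    format_float(1234.0)                             # "1234"    -- integral floats drop the decimal point
    format_float(1234.0, thousand_separator=True)    # "1,234"
    format_float(1234.5678)                          # "1234.57" -- default precision=2
    format_float(1234.5678, True, precision=1)       # "1,234.6"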
+
+
+def format_row(vals, dtypes, styles=None, apply_justify=True, thousand_separator=False) -> list[Text]:
     """Format a single row with proper styling and justification.
 
+    Converts raw row values to formatted Rich Text objects with appropriate
+    styling (colors), justification, and null value handling based on data types.
+
     Args:
         vals: The list of values in the row.
         dtypes: The list of data types corresponding to each value.
+        styles: Optional list of per-column style overrides; falsy entries fall back to the dtype style.
         apply_justify: Whether to apply justification styling. Defaults to True.
+        thousand_separator: Whether to format numbers with thousand separators. Defaults to False.
+
+    Returns:
+        A list of Rich Text objects with proper formatting applied.
     """
     formatted_row = []
 
-    for val, dtype in zip(vals, dtypes, strict=True):
+    for idx, (val, dtype) in enumerate(zip(vals, dtypes, strict=True)):
         dc = DtypeConfig(dtype)
 
         # Format the value
         if val is None:
-            text_val = "-"
-        elif str(dtype).startswith("Float"):
-            text_val = f"{val:.4g}"
+            text_val = NULL_DISPLAY
+        elif dc.gtype == "integer" and thousand_separator:
+            text_val = f"{val:,}"
+        elif dc.gtype == "float":
+            text_val = format_float(val, thousand_separator)
         else:
             text_val = str(val)
 
         formatted_row.append(
             Text(
                 text_val,
-                style=dc.style,
+                style=styles[idx] if styles and styles[idx] else dc.style,
                 justify=dc.justify if apply_justify else "",
+                overflow="ellipsis",
+                no_wrap=True,
             )
         )
 
     return formatted_row
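
For instance (hypothetical call; `Text` is rich.text.Text):

    format_row([None, 1234.5, "abc"], [pl.Int64, pl.Float64, pl.String])
    # -> three Text cells: "-", "1234.50", "abc"
    #    (null display, float formatting, plain str fallthrough)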
 
 
-def _rindex(lst: list, value) -> int:
-    """Return the last index of value in a list. Return -1 if not found."""
+def rindex(lst: list, value) -> int:
+    """Return the last index of value in a list. Return -1 if not found.
+
+    Searches through the list in reverse order to find the last occurrence
+    of the given value.
+
+    Args:
+        lst: The list to search through.
+        value: The value to find.
+
+    Returns:
+        The index (0-based) of the last occurrence, or -1 if not found.
+    """
    for i, item in enumerate(reversed(lst)):
         if item == value:
             return len(lst) - 1 - i
     return -1
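
E.g., `rindex(["a", "b", "a"], "a")` returns 2, and `rindex(["a"], "x")` returns -1.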
 
 
-def _next(lst: list[Any], current, offset=1) -> Any:
-    """Return the next item in the list after the current item, cycling if needed."""
+def get_next_item(lst: list[Any], current, offset=1) -> Any:
+    """Return the next item in the list after the current item, cycling if needed.
+
+    Finds the current item in the list and returns the item at position (current_index + offset),
+    wrapping around to the beginning if necessary.
+
+    Args:
+        lst: The list to cycle through.
+        current: The current item (must be in the list).
+        offset: The number of positions to advance. Defaults to 1.
+
+    Returns:
+        The next item in the list after advancing by the offset.
+
+    Raises:
+        ValueError: If the current item is not found in the list.
+    """
     if current not in lst:
         raise ValueError("Current item not in list")
     current_index = lst.index(current)
@@ -122,83 +248,452 @@ def _next(lst: list[Any], current, offset=1) -> Any:
     return lst[next_index]
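
Per the docstring, the lookup wraps around: e.g. `get_next_item(CURSOR_TYPES, "cell")` cycles back to "row".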
 
 
-def parse_filter_expression(
-    expression: str, df: pl.DataFrame, current_col_idx: int
-) -> str:
-    """Parse and convert a filter expression to Polars syntax.
+def parse_placeholders(template: str, columns: list[str], current_cidx: int) -> list[str | pl.Expr]:
+    """Parse a template string into a list of strings or Polars expressions.
+
+    Supports multiple placeholder types:
+    - `$_` - Current column (based on current_cidx parameter)
+    - `$#` - Row index (1-based, requires the '^_ridx_^' column to be present)
+    - `$1`, `$2`, etc. - Column index (1-based)
+    - `$name` - Column name (e.g., `$product_id`)
+
+    Args:
+        template: The template string containing placeholders and literal text
+        columns: List of column names in the dataframe
+        current_cidx: 0-based index into `columns` of the current column, used for `$_` references
+
+    Returns:
+        A list of strings (literal text) and Polars expressions (for column references)
+
+    Raises:
+        ValueError: If an invalid column index or a non-existent column name is referenced
+    """
+    if "$" not in template or template.endswith("$"):
+        return [template]
+
+    # Regex matches: $_, $#, $<digits>, or $<identifier> (column names)
+    placeholder_pattern = r"\$(_|#|\d+|[a-zA-Z_]\w*)"
+    placeholders = re.finditer(placeholder_pattern, template)
+
+    parts = []
+    last_end = 0
+
+    # Get current column name for $_ references
+    try:
+        col_name = columns[current_cidx]
+    except IndexError:
+        raise ValueError(f"Current column index {current_cidx} is out of range for columns list")
+
+    for match in placeholders:
+        # Add literal text before this placeholder
+        if match.start() > last_end:
+            parts.append(template[last_end : match.start()])
+
+        placeholder = match.group(1)  # Extract content after '$'
+
+        if placeholder == "_":
+            # $_ refers to current column (where cursor was)
+            parts.append(pl.col(col_name))
+        elif placeholder == "#":
+            # $# refers to row index (1-based)
+            parts.append(pl.col(RIDX))
+        elif placeholder.isdigit():
+            # $1, $2, etc. refer to columns by 1-based position index
+            col_idx = int(placeholder) - 1  # Convert to 0-based
+            try:
+                col_ref = columns[col_idx]
+                parts.append(pl.col(col_ref))
+            except IndexError:
+                raise ValueError(f"Invalid column index: ${placeholder} (valid range: $1 to ${len(columns)})")
+        else:
+            # $name refers to column by name
+            if placeholder in columns:
+                parts.append(pl.col(placeholder))
+            else:
+                raise ValueError(f"Column not found: ${placeholder} (available columns: {', '.join(columns)})")
+
+        last_end = match.end()
+
+    # Add remaining literal text after last placeholder
+    if last_end < len(template):
+        parts.append(template[last_end:])
 
-    Supports:
+    # If no placeholders found, treat entire template as literal
+    if not parts:
+        parts = [template]
+
+    return parts
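
A sketch of the round trip (hypothetical columns, following the docstring's placeholder rules):

    parse_placeholders("$_ + $2", ["a", "b", "c"], current_cidx=0)
    # -> [pl.col("a"), " + ", pl.col("b")]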
+
+
+def parse_polars_expression(expression: str, columns: list[str], current_cidx: int) -> str:
+    """Parse and convert an expression to Polars syntax.
+
+    Replaces column references with Polars col() expressions:
     - $_ - Current selected column
-    - $1, $2, etc. - Column by 1-based index
-    - $col_name - Column by name
-    - Comparison operators: ==, !=, <, >, <=, >=
-    - Logical operators: &&, ||
-    - String literals: 'text', "text"
-    - Numeric literals: integers and floats
+    - $# - Row index (1-based, requires the '^_ridx_^' column to be present)
+    - $1, $2, etc. - Column index (1-based)
+    - $col_name - Column name (valid identifier starting with _ or letter)
 
     Examples:
     - "$_ > 50" -> "pl.col('current_col') > 50"
+    - "$# > 10" -> "pl.col('^_ridx_^') > 10"
     - "$1 > 50" -> "pl.col('col0') > 50"
     - "$name == 'Alex'" -> "pl.col('name') == 'Alex'"
-    - "$1 > 3 && $name == 'Alex'" -> "(pl.col('col0') > 3) & (pl.col('name') == 'Alex')"
     - "$age < $salary" -> "pl.col('age') < pl.col('salary')"
 
     Args:
-        expression: The filter expression as a string.
-        df: The DataFrame to validate column references.
+        expression: The input expression as a string.
+        columns: The list of column names in the DataFrame.
+        current_cidx: The index of the currently selected column (0-based). Used for $_ reference.
+
+    Returns:
+        A Python expression string with $references replaced by pl.col() calls.
+
+    Raises:
+        ValueError: If a column reference is invalid.
+    """
+    # Early return if no $ present
+    if "$" not in expression:
+        if "pl." in expression:
+            # This may already be a valid Polars expression
+            return expression
+        else:
+            # Return as a literal string
+            return f"pl.lit({expression})"
+
+    parts = parse_placeholders(expression, columns, current_cidx)
+
+    result = []
+    for part in parts:
+        if isinstance(part, pl.Expr):
+            col = part.meta.output_name()
+
+            result.append(f"pl.col('{col}')")
+        else:
+            result.append(part)
+
+    return "".join(result)
+
+
+def tentative_expr(term: str) -> bool:
+    """Check if the given term could be a Polars expression.
+
+    Heuristically determines whether a string might represent a Polars expression
+    based on common patterns like column references ($) or direct Polars syntax (pl.).
+
+    Args:
+        term: The string to check.
+
+    Returns:
+        True if the term appears to be a Polars expression, False otherwise.
+    """
+    if "$" in term and not term.endswith("$"):
+        return True
+    if "pl." in term:
+        return True
+    return False
+
+
+def validate_expr(term: str, columns: list[str], current_col_idx: int) -> pl.Expr | None:
+    """Validate and return the expression.
+
+    Parses a user-provided expression string and validates it as a valid Polars expression.
+    Converts special syntax like $_ references to proper Polars col() expressions.
+
+    Args:
+        term: The input expression as a string.
+        columns: The list of column names in the DataFrame.
         current_col_idx: The index of the currently selected column (0-based). Used for $_ reference.
 
     Returns:
-        A Python expression string that can be eval'd with Polars symbols.
+        A valid Polars expression object if validation succeeds.
 
     Raises:
-        ValueError: If the expression contains invalid column references.
-        SyntaxError: If the expression has invalid syntax.
+        ValueError: If the expression is invalid, contains non-existent column references, or cannot be evaluated.
     """
-    # Tokenize the expression
-    # Pattern matches: $_, $index, $identifier, strings, operators, numbers, etc.
-    token_pattern = r'\$_|\$\d+|\$\w+|\'[^\']*\'|"[^"]*"|&&|\|\||<=|>=|!=|==|[+\-*/%<>=()]|\d+\.?\d*|\w+|.'
-
-    tokens = re.findall(token_pattern, expression)
-
-    if not tokens:
-        raise ValueError("Expression is empty")
-
-    # Convert tokens to Polars expression syntax
-    converted_tokens = []
-    for token in tokens:
-        if token.startswith("$"):
-            # Column reference
-            col_ref = token[1:]
-
-            # Special case: $_ refers to the current selected column
-            if col_ref == "_":
-                col_name = df.columns[current_col_idx]
-            # Check if it's a numeric index
-            elif col_ref.isdigit():
-                col_idx = int(col_ref) - 1  # Convert to 0-based index
-                if col_idx < 0 or col_idx >= len(df.columns):
-                    raise ValueError(f"Column index out of range: ${col_ref}")
-                col_name = df.columns[col_idx]
-            else:
-                # It's a column name
-                if col_ref not in df.columns:
-                    raise ValueError(f"Column not found: ${col_ref}")
-                col_name = col_ref
+    term = term.strip()
+
+    try:
+        # Parse the expression
+        expr_str = parse_polars_expression(term, columns, current_col_idx)
+
+        # Validate by evaluating it
+        try:
+            expr = eval(expr_str, {"pl": pl})
+            if not isinstance(expr, pl.Expr):
+                raise ValueError(f"Expression evaluated to `{type(expr).__name__}` instead of a Polars expression")
+
+            # Expression is valid
+            return expr
+        except Exception as e:
+            raise ValueError(f"Failed to evaluate expression `{expr_str}`: {e}") from e
+    except Exception as ve:
+        raise ValueError(f"Failed to validate expression `{term}`: {ve}") from ve
+ raise ValueError(f"Failed to validate expression `{term}`: {ve}") from ve
432
+
433
+
434
+ def load_dataframe(
435
+ filenames: list[str],
436
+ file_format: str | None = None,
437
+ has_header: bool = True,
438
+ infer_schema: bool = True,
439
+ comment_prefix: str | None = None,
440
+ quote_char: str | None = '"',
441
+ skip_lines: int = 0,
442
+ skip_rows_after_header: int = 0,
443
+ null_values: list[str] | None = None,
444
+ ignore_errors: bool = False,
445
+ ) -> list[Source]:
446
+ """Load DataFrames from file specifications.
447
+
448
+ Handles loading from multiple files, single files, or stdin. For Excel files,
449
+ loads all sheets as separate entries. For other formats, loads as single file.
188
450
 
189
- converted_tokens.append(f"pl.col('{col_name}')")
451
+ Args:
452
+ filenames: List of filenames to load. If single filename is "-", read from stdin.
453
+ file_format: Optional format specifier for input files (e.g., 'csv', 'excel').
454
+ has_header: Whether the input files have a header row. Defaults to True.
455
+ infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
456
+ comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
457
+ quote_char: Quote character for reading CSV/TSV files. Defaults to '"'.
458
+ skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
459
+ skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
460
+ null_values: List of values to interpret as null when reading CSV/TSV files. Defaults to None.
461
+ ignore_errors: Whether to ignore errors when reading CSV/TSV files. Defaults to False.
190
462
 
191
- elif token in ("&&", "||"):
192
- # Convert logical operators and wrap surrounding expressions in parentheses
193
- if token == "&&":
194
- converted_tokens.append(") & (")
195
- else:
196
- converted_tokens.append(") | (")
463
+ Returns:
464
+ List of `Source` objects.
465
+ """
466
+ data: list[Source] = []
467
+ prefix_sheet = len(filenames) > 1
468
+
469
+ for filename in filenames:
470
+ if filename == "-":
471
+ source = StringIO(sys.stdin.read())
472
+ file_format = file_format or "tsv"
473
+
474
+ # Reopen stdin to /dev/tty for proper terminal interaction
475
+ try:
476
+ tty = open("/dev/tty")
477
+ os.dup2(tty.fileno(), sys.stdin.fileno())
478
+ except (OSError, FileNotFoundError):
479
+ pass
480
+ else:
481
+ source = filename
482
+
483
+ # Load from file
484
+ # Determine file format if not specified
485
+ if not file_format:
486
+ ext = Path(filename).suffix.lower()
487
+ if ext == ".gz" or ext == ".bz2" or ext == ".xz":
488
+ ext = Path(filename).with_suffix("").suffix.lower()
489
+ fmt = ext.removeprefix(".")
490
+
491
+ # Default to TSV
492
+ file_format = fmt if fmt in SUPPORTED_FORMATS else "tsv"
493
+
494
+ # Load the file
495
+ data.extend(
496
+ load_file(
497
+ source,
498
+ prefix_sheet=prefix_sheet,
499
+ file_format=file_format,
500
+ has_header=has_header,
501
+ infer_schema=infer_schema,
502
+ comment_prefix=comment_prefix,
503
+ quote_char=quote_char,
504
+ skip_lines=skip_lines,
505
+ skip_rows_after_header=skip_rows_after_header,
506
+ null_values=null_values,
507
+ ignore_errors=ignore_errors,
508
+ )
509
+ )
510
+
511
+ return data
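
A likely invocation from the CLI layer (hypothetical filenames; `Source.frame` holds a collected DataFrame by the time `load_file` returns):

    for src in load_dataframe(["a.csv", "b.parquet"]):
        print(src.tabname, src.frame.shape)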
+
+
+RE_COMPUTE_ERROR = re.compile(r"at column '(.*?)' \(column number \d+\)")
+
+
+def handle_compute_error(
+    err_msg: str,
+    file_format: str | None,
+    infer_schema: bool,
+    schema_overrides: dict[str, pl.DataType] | None = None,
+) -> tuple[bool, dict[str, pl.DataType] | None]:
+    """Handle ComputeError during schema inference and determine retry strategy.
+
+    Analyzes the error message and determines whether to retry with schema overrides,
+    disable schema inference, or exit with an error.
+
+    Args:
+        err_msg: The error message from the ComputeError exception.
+        file_format: The file format being loaded (tsv, csv, etc.).
+        infer_schema: Whether schema inference is currently enabled.
+        schema_overrides: Current schema overrides, if any.
+
+    Returns:
+        A tuple of (infer_schema, schema_overrides) to apply when retrying the load.
+
+    Raises:
+        SystemExit: If the error is unrecoverable.
+    """
+    # Already disabled schema inference, cannot recover
+    if not infer_schema:
+        print(f"Error loading even with schema inference disabled:\n{err_msg}", file=sys.stderr)
+
+        if "CSV malformed" in err_msg:
+            print(
+                "\nSometimes quote characters might be mismatched. Try again with `-q` or `-E` to ignore errors",
+                file=sys.stderr,
+            )
 
+        sys.exit(1)
+
+    # Schema mismatch error
+    if "found more fields than defined in 'Schema'" in err_msg:
+        print(f"Input might be malformed:\n{err_msg}.\nTry again with `-E` to ignore errors", file=sys.stderr)
+        sys.exit(1)
+
+    # ComputeError: could not parse `n.a. as of 04.01.022` as `dtype` i64 at column 'PubChemCID' (column number 16)
+    if file_format in ("tsv", "csv") and (m := RE_COMPUTE_ERROR.search(err_msg)):
+        col_name = m.group(1)
+
+        if schema_overrides is None:
+            schema_overrides = {}
+        schema_overrides.update({col_name: pl.String})
+    else:
+        infer_schema = False
+
+    return infer_schema, schema_overrides
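
For example, a parse failure on one column (as in the 'PubChemCID' message above) would come back as (True, {"PubChemCID": pl.String}) for the retry, while an unrecognized error flips to (False, ...) to force a reload without schema inference.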
+
+
+def load_file(
+    source: str | StringIO,
+    first_sheet: bool = False,
+    prefix_sheet: bool = False,
+    file_format: str | None = None,
+    has_header: bool = True,
+    infer_schema: bool = True,
+    comment_prefix: str | None = None,
+    quote_char: str | None = '"',
+    skip_lines: int = 0,
+    skip_rows_after_header: int = 0,
+    schema_overrides: dict[str, pl.DataType] | None = None,
+    null_values: list[str] | None = None,
+    ignore_errors: bool = False,
+) -> list[Source]:
+    """Load a single file.
+
+    For Excel files, when `first_sheet` is True, returns only the first sheet. Otherwise, returns one entry per sheet.
+    For other formats, returns a single entry for the file.
+
+    If a ComputeError occurs during schema inference for a column, attempts to recover
+    by treating that column as a string and retrying the load. This process repeats until
+    all columns are successfully loaded or no further recovery is possible.
+
+    Args:
+        source: Path to the file to load, or a StringIO buffer (e.g., stdin).
+        first_sheet: If True, only load first sheet for Excel files. Defaults to False.
+        prefix_sheet: If True, prefix filename to sheet name as the tab name for Excel files. Defaults to False.
+        file_format: Optional format specifier (i.e., 'tsv', 'csv', 'excel', 'parquet', 'json', 'ndjson') for input files.
+            By default, infers from file extension.
+        has_header: Whether the input files have a header row. Defaults to True.
+        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
+        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        quote_char: Quote character for reading CSV/TSV files. Defaults to '"'.
+        skip_lines: Number of lines to skip when reading CSV/TSV files. The header will be parsed at this offset. Defaults to 0.
+        skip_rows_after_header: Number of rows to skip after header when reading CSV/TSV files. Defaults to 0.
+        schema_overrides: Optional dictionary of column name to Polars data type to override inferred schema.
+        null_values: List of values to interpret as null when reading CSV/TSV files. Defaults to None.
+        ignore_errors: Whether to ignore errors when reading CSV/TSV files. Defaults to False.
+
+    Returns:
+        List of `Source` objects.
+    """
+    data: list[Source] = []
+    filename = f"stdin.{file_format}" if isinstance(source, StringIO) else source
+    filepath = Path(filename)
+
+    # Load based on file format
+    if file_format in ("tsv", "csv"):
+        lf = pl.scan_csv(
+            source,
+            separator="\t" if file_format == "tsv" else ",",
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            quote_char=quote_char,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+            ignore_errors=ignore_errors,
+        )
+        data.append(Source(lf, filename, filepath.stem))
+    elif file_format in ("xlsx", "xls", "excel"):
+        if first_sheet:
+            # Read only the first sheet for multiple files
+            lf = pl.read_excel(source).lazy()
+            data.append(Source(lf, filename, filepath.stem))
         else:
-            # Keep as-is (operators, numbers, strings, parentheses)
-            converted_tokens.append(token)
+            # For single file, expand all sheets
+            sheets = pl.read_excel(source, sheet_id=0)
+            for sheet_name, df in sheets.items():
+                tabname = f"{filepath.stem}_{sheet_name}" if prefix_sheet else sheet_name
+                data.append(Source(df.lazy(), filename, tabname))
+    elif file_format == "parquet":
+        lf = pl.scan_parquet(source)
+        data.append(Source(lf, filename, filepath.stem))
+    elif file_format == "json":
+        lf = pl.read_json(source).lazy()
+        data.append(Source(lf, filename, filepath.stem))
+    elif file_format == "ndjson":
+        lf = pl.scan_ndjson(source, schema_overrides=schema_overrides)
+        data.append(Source(lf, filename, filepath.stem))
+    else:
+        raise ValueError(f"Unsupported file format: {file_format}. Supported formats are: {SUPPORTED_FORMATS}")
+
+    # Attempt to collect, handling ComputeError for schema inference issues
+    try:
+        data = [Source(src.frame.collect(), src.filename, src.tabname) for src in data]
+    except pl.exceptions.ComputeError as ce:
+        # Handle the error and determine retry strategy
+        infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)
+
+        # Retry loading with updated schema overrides
+        if isinstance(source, StringIO):
+            source.seek(0)
+
+        return load_file(
+            source,
+            first_sheet=first_sheet,
+            prefix_sheet=prefix_sheet,
+            file_format=file_format,
+            has_header=has_header,
+            infer_schema=infer_schema,
+            comment_prefix=comment_prefix,
+            quote_char=quote_char,
+            skip_lines=skip_lines,
+            skip_rows_after_header=skip_rows_after_header,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+            ignore_errors=ignore_errors,
+        )
+
+    return data
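
A minimal stdin-style round trip (hypothetical data; relies on `pl.scan_csv` accepting file-like objects, which is how the "-" path feeds it):

    buf = StringIO("a\tb\n1\t2\n")
    srcs = load_file(buf, file_format="tsv")
    srcs[0].frame.shape  # (1, 2), already collected to a DataFrame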
+
+
+def now() -> str:
+    """Get the current local time as a formatted string."""
+    import time
+
+    return time.strftime("%m/%d/%Y %H:%M:%S", time.localtime())
+
+
+async def sleep_async(seconds: float) -> None:
+    """Async sleep to yield control back to the event loop.
+
+    Args:
+        seconds: The number of seconds to sleep.
+    """
+    import asyncio
 
-    # Join tokens with space to ensure proper separation
-    result = "(" + " ".join(converted_tokens) + ")"
-    return result
+    await asyncio.sleep(seconds)