dataframe-textual 1.5.0__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,32 @@
1
1
  """DataFrame Viewer - Interactive CSV/Excel viewer for the terminal."""
2
2
 
3
+ from importlib.metadata import version
4
+
5
+ __version__ = version("dataframe-textual")
6
+
3
7
  from .data_frame_help_panel import DataFrameHelpPanel
4
8
  from .data_frame_table import DataFrameTable, History
5
9
  from .data_frame_viewer import DataFrameViewer
6
- from .table_screen import FrequencyScreen, RowDetailScreen, TableScreen
10
+ from .table_screen import (
11
+ FrequencyScreen,
12
+ MetaColumnScreen,
13
+ MetaShape,
14
+ RowDetailScreen,
15
+ StatisticsScreen,
16
+ TableScreen,
17
+ )
7
18
  from .yes_no_screen import (
19
+ AddColumnScreen,
20
+ AddLinkScreen,
8
21
  ConfirmScreen,
9
22
  EditCellScreen,
23
+ EditColumnScreen,
10
24
  FilterScreen,
25
+ FindReplaceScreen,
11
26
  FreezeScreen,
12
27
  OpenFileScreen,
28
+ RenameColumnScreen,
29
+ RenameTabScreen,
13
30
  SaveFileScreen,
14
31
  SearchScreen,
15
32
  YesNoScreen,
@@ -23,6 +40,9 @@ __all__ = [
23
40
  "TableScreen",
24
41
  "RowDetailScreen",
25
42
  "FrequencyScreen",
43
+ "StatisticsScreen",
44
+ "MetaShape",
45
+ "MetaColumnScreen",
26
46
  "YesNoScreen",
27
47
  "SaveFileScreen",
28
48
  "ConfirmScreen",
@@ -31,4 +51,10 @@ __all__ = [
31
51
  "FilterScreen",
32
52
  "FreezeScreen",
33
53
  "OpenFileScreen",
54
+ "RenameColumnScreen",
55
+ "EditColumnScreen",
56
+ "AddColumnScreen",
57
+ "AddLinkScreen",
58
+ "FindReplaceScreen",
59
+ "RenameTabScreen",
34
60
  ]
@@ -4,6 +4,7 @@ import argparse
4
4
  import sys
5
5
  from pathlib import Path
6
6
 
7
+ from . import __version__
7
8
  from .common import SUPPORTED_FORMATS, load_dataframe
8
9
  from .data_frame_viewer import DataFrameViewer
9
10
 
@@ -24,6 +25,12 @@ def cli() -> argparse.Namespace:
24
25
  " cat data.csv | %(prog)s --format csv\n",
25
26
  )
26
27
  parser.add_argument("files", nargs="*", help="Files to view (or read from stdin)")
28
+ parser.add_argument(
29
+ "-V",
30
+ "--version",
31
+ action="version",
32
+ version=f"%(prog)s {__version__}",
33
+ )
27
34
  parser.add_argument(
28
35
  "-f",
29
36
  "--format",
@@ -37,7 +44,10 @@ def cli() -> argparse.Namespace:
37
44
  help="Specify that input files have no header row when reading CSV/TSV",
38
45
  )
39
46
  parser.add_argument(
40
- "-I", "--no-inferrence", action="store_true", help="Do not infer data types when reading CSV/TSV"
47
+ "-I", "--no-inference", action="store_true", help="Do not infer data types when reading CSV/TSV"
48
+ )
49
+ parser.add_argument(
50
+ "-t", "--truncate-ragged-lines", action="store_true", help="Truncate ragged lines when reading CSV/TSV"
41
51
  )
42
52
  parser.add_argument("-E", "--ignore-errors", action="store_true", help="Ignore errors when reading CSV/TSV")
43
53
  parser.add_argument(
@@ -50,7 +60,7 @@ def cli() -> argparse.Namespace:
50
60
  parser.add_argument(
51
61
  "-a", "--skip-rows-after-header", type=int, default=0, help="Skip rows after header when reading CSV/TSV"
52
62
  )
53
- parser.add_argument("-u", "--null", nargs="+", help="Values to interpret as null values when reading CSV/TSV")
63
+ parser.add_argument("-n", "--null", nargs="+", help="Values to interpret as null values when reading CSV/TSV")
54
64
 
55
65
  args = parser.parse_args()
56
66
  if args.files is None:
@@ -80,13 +90,14 @@ def main() -> None:
80
90
  args.files,
81
91
  file_format=args.format,
82
92
  has_header=not args.no_header,
83
- infer_schema=not args.no_inferrence,
93
+ infer_schema=not args.no_inference,
84
94
  comment_prefix=args.comment_prefix,
85
95
  quote_char=args.quote_char,
86
96
  skip_lines=args.skip_lines,
87
97
  skip_rows_after_header=args.skip_rows_after_header,
88
98
  null_values=args.null,
89
99
  ignore_errors=args.ignore_errors,
100
+ truncate_ragged_lines=args.truncate_ragged_lines,
90
101
  )
91
102
  app = DataFrameViewer(*sources)
92
103
  app.run()
@@ -12,7 +12,7 @@ import polars as pl
12
12
  from rich.text import Text
13
13
 
14
14
  # Supported file formats
15
- SUPPORTED_FORMATS = {"tsv", "csv", "excel", "xlsx", "xls", "parquet", "json", "ndjson"}
15
+ SUPPORTED_FORMATS = {"tsv", "tab", "csv", "excel", "xlsx", "xls", "parquet", "json", "ndjson"}
16
16
 
17
17
 
18
18
  # Boolean string mappings
@@ -34,6 +34,29 @@ NULL = "NULL"
34
34
  NULL_DISPLAY = "-"
35
35
 
36
36
 
37
+ def format_float(value: float, thousand_separator: bool = False, precision: int = 2) -> str:
38
+ """Format a float value, keeping integers without decimal point.
39
+
40
+ Args:
41
+ val: The float value to format.
42
+ thousand_separator: Whether to include thousand separators. Defaults to False.
43
+
44
+ Returns:
45
+ The formatted float as a string.
46
+ """
47
+
48
+ if (val := int(value)) == value:
49
+ if precision > 0:
50
+ return f"{val:,}" if thousand_separator else str(val)
51
+ else:
52
+ return f"{val:,.{-precision}f}" if thousand_separator else f"{val:.{-precision}f}"
53
+ else:
54
+ if precision > 0:
55
+ return f"{value:,.{precision}f}" if thousand_separator else f"{value:.{precision}f}"
56
+ else:
57
+ return f"{value:,f}" if thousand_separator else str(value)
58
+
59
+
37
60
  @dataclass
38
61
  class DtypeClass:
39
62
  """Data type class configuration.
@@ -52,6 +75,35 @@ class DtypeClass:
52
75
  itype: str
53
76
  convert: Any
54
77
 
78
+ def format(
79
+ self, val: Any, style: str | None = None, justify: str | None = None, thousand_separator: bool = False
80
+ ) -> str:
81
+ """Format the value according to its data type.
82
+
83
+ Args:
84
+ val: The value to format.
85
+
86
+ Returns:
87
+ The formatted value as a Text.
88
+ """
89
+ # Format the value
90
+ if val is None:
91
+ text_val = NULL_DISPLAY
92
+ elif self.gtype == "integer" and thousand_separator:
93
+ text_val = f"{val:,}"
94
+ elif self.gtype == "float":
95
+ text_val = format_float(val, thousand_separator)
96
+ else:
97
+ text_val = str(val)
98
+
99
+ return Text(
100
+ text_val,
101
+ style="" if style == "" else (style or self.style),
102
+ justify="" if justify == "" else (justify or self.justify),
103
+ overflow="ellipsis",
104
+ no_wrap=True,
105
+ )
106
+
55
107
 
56
108
  # itype is used by Input widget for input validation
57
109
  # fmt: off
@@ -100,8 +152,8 @@ SUBSCRIPT_DIGITS = {
100
152
  # Cursor types ("none" removed)
101
153
  CURSOR_TYPES = ["row", "column", "cell"]
102
154
 
103
- # For row index column
104
- RIDX = "^_ridx_^"
155
+ # Row index mapping between filtered and original dataframe
156
+ RID = "^_RID_^"
105
157
 
106
158
 
107
159
  @dataclass
@@ -143,27 +195,7 @@ def DtypeConfig(dtype: pl.DataType) -> DtypeClass:
143
195
  return STYLES[pl.Unknown]
144
196
 
145
197
 
146
- def format_float(value: float, thousand_separator: bool = False, precision: int = 2) -> str:
147
- """Format a float value, keeping integers without decimal point.
148
-
149
- Args:
150
- val: The float value to format.
151
- thousand_separator: Whether to include thousand separators. Defaults to False.
152
-
153
- Returns:
154
- The formatted float as a string.
155
- """
156
-
157
- if (val := int(value)) == value:
158
- return f"{val:,}" if thousand_separator else str(val)
159
- else:
160
- if precision > 0:
161
- return f"{value:,.{precision}f}" if thousand_separator else f"{value:.{precision}f}"
162
- else:
163
- return f"{value:,f}" if thousand_separator else str(value)
164
-
165
-
166
- def format_row(vals, dtypes, styles=None, apply_justify=True, thousand_separator=False) -> list[Text]:
198
+ def format_row(vals, dtypes, styles: list[str | None] | None = None, thousand_separator=False) -> list[Text]:
167
199
  """Format a single row with proper styling and justification.
168
200
 
169
201
  Converts raw row values to formatted Rich Text objects with appropriate
@@ -172,7 +204,7 @@ def format_row(vals, dtypes, styles=None, apply_justify=True, thousand_separator
172
204
  Args:
173
205
  vals: The list of values in the row.
174
206
  dtypes: The list of data types corresponding to each value.
175
- apply_justify: Whether to apply justification styling. Defaults to True.
207
+ styles: Optional list of style overrides for each value. Defaults to None.
176
208
 
177
209
  Returns:
178
210
  A list of Rich Text objects with proper formatting applied.
@@ -181,31 +213,18 @@ def format_row(vals, dtypes, styles=None, apply_justify=True, thousand_separator
181
213
 
182
214
  for idx, (val, dtype) in enumerate(zip(vals, dtypes, strict=True)):
183
215
  dc = DtypeConfig(dtype)
184
-
185
- # Format the value
186
- if val is None:
187
- text_val = NULL_DISPLAY
188
- elif dc.gtype == "integer" and thousand_separator:
189
- text_val = f"{val:,}"
190
- elif dc.gtype == "float":
191
- text_val = format_float(val, thousand_separator)
192
- else:
193
- text_val = str(val)
194
-
195
216
  formatted_row.append(
196
- Text(
197
- text_val,
198
- style=styles[idx] if styles and styles[idx] else dc.style,
199
- justify=dc.justify if apply_justify else "",
200
- overflow="ellipsis",
201
- no_wrap=True,
217
+ dc.format(
218
+ val,
219
+ style=styles[idx] if styles and styles[idx] else None,
220
+ thousand_separator=thousand_separator,
202
221
  )
203
222
  )
204
223
 
205
224
  return formatted_row
206
225
 
207
226
 
208
- def rindex(lst: list, value) -> int:
227
+ def rindex(lst: list, value, pos: int | None = None) -> int:
209
228
  """Return the last index of value in a list. Return -1 if not found.
210
229
 
211
230
  Searches through the list in reverse order to find the last occurrence
@@ -218,9 +237,12 @@ def rindex(lst: list, value) -> int:
218
237
  Returns:
219
238
  The index (0-based) of the last occurrence, or -1 if not found.
220
239
  """
240
+ n = len(lst)
221
241
  for i, item in enumerate(reversed(lst)):
242
+ if pos is not None and (n - 1 - i) > pos:
243
+ continue
222
244
  if item == value:
223
- return len(lst) - 1 - i
245
+ return n - 1 - i
224
246
  return -1
225
247
 
226
248
 
@@ -253,9 +275,10 @@ def parse_placeholders(template: str, columns: list[str], current_cidx: int) ->
253
275
 
254
276
  Supports multiple placeholder types:
255
277
  - `$_` - Current column (based on current_cidx parameter)
256
- - `$#` - Row index (1-based, requires '^__ridx__^' column to be present)
278
+ - `$#` - Row index (1-based)
257
279
  - `$1`, `$2`, etc. - Column index (1-based)
258
280
  - `$name` - Column name (e.g., `$product_id`)
281
+ - `` $`col name` `` - Column name with spaces (e.g., `` $`product id` ``)
259
282
 
260
283
  Args:
261
284
  template: The template string containing placeholders and literal text
@@ -271,8 +294,15 @@ def parse_placeholders(template: str, columns: list[str], current_cidx: int) ->
271
294
  if "$" not in template or template.endswith("$"):
272
295
  return [template]
273
296
 
274
- # Regex matches: $_ or $\d+ or $\w+ (column names)
275
- placeholder_pattern = r"\$(_|#|\d+|[a-zA-Z_]\w*)"
297
+ # Regex matches: $_ or $# or $\d+ or $`...` (backtick-quoted names with spaces) or $\w+ (column names)
298
+ # Pattern explanation:
299
+ # \$(_|#|\d+|`[^`]+`|[a-zA-Z_]\w*)
300
+ # - $_ : current column
301
+ # - $# : row index
302
+ # - $\d+ : column by index (1-based)
303
+ # - $`[^`]+` : column by name with spaces (backtick quoted)
304
+ # - $[a-zA-Z_]\w* : column by name without spaces
305
+ placeholder_pattern = r"\$(_|#|\d+|`[^`]+`|[a-zA-Z_]\w*)"
276
306
  placeholders = re.finditer(placeholder_pattern, template)
277
307
 
278
308
  parts = []
@@ -296,7 +326,7 @@ def parse_placeholders(template: str, columns: list[str], current_cidx: int) ->
296
326
  parts.append(pl.col(col_name))
297
327
  elif placeholder == "#":
298
328
  # $# refers to row index (1-based)
299
- parts.append((pl.col(RIDX)))
329
+ parts.append(pl.col(RID))
300
330
  elif placeholder.isdigit():
301
331
  # $1, $2, etc. refer to columns by 1-based position index
302
332
  col_idx = int(placeholder) - 1 # Convert to 0-based
@@ -305,6 +335,13 @@ def parse_placeholders(template: str, columns: list[str], current_cidx: int) ->
305
335
  parts.append(pl.col(col_ref))
306
336
  except IndexError:
307
337
  raise ValueError(f"Invalid column index: ${placeholder} (valid range: $1 to ${len(columns)})")
338
+ elif placeholder.startswith("`") and placeholder.endswith("`"):
339
+ # $`col name` refers to column by name with spaces
340
+ col_ref = placeholder[1:-1] # Remove backticks
341
+ if col_ref in columns:
342
+ parts.append(pl.col(col_ref))
343
+ else:
344
+ raise ValueError(f"Column not found: ${placeholder} (available columns: {', '.join(columns)})")
308
345
  else:
309
346
  # $name refers to column by name
310
347
  if placeholder in columns:
@@ -330,16 +367,18 @@ def parse_polars_expression(expression: str, columns: list[str], current_cidx: i
330
367
 
331
368
  Replaces column references with Polars col() expressions:
332
369
  - $_ - Current selected column
333
- - $# - Row index (1-based, requires '^__ridx__^' column to be present)
370
+ - $# - Row index (1-based)
334
371
  - $1, $2, etc. - Column index (1-based)
335
372
  - $col_name - Column name (valid identifier starting with _ or letter)
373
+ - $`col name` - Column name with spaces (backtick quoted)
336
374
 
337
375
  Examples:
338
376
  - "$_ > 50" -> "pl.col('current_col') > 50"
339
- - "$# > 10" -> "pl.col('^__ridx__^') > 10"
377
+ - "$# > 10" -> "pl.col('^_RID_^') > 10"
340
378
  - "$1 > 50" -> "pl.col('col0') > 50"
341
379
  - "$name == 'Alex'" -> "pl.col('name') == 'Alex'"
342
380
  - "$age < $salary" -> "pl.col('age') < pl.col('salary')"
381
+ - "$`product id` > 100" -> "pl.col('product id') > 100"
343
382
 
344
383
  Args:
345
384
  expression: The input expression as a string.
@@ -368,7 +407,10 @@ def parse_polars_expression(expression: str, columns: list[str], current_cidx: i
368
407
  if isinstance(part, pl.Expr):
369
408
  col = part.meta.output_name()
370
409
 
371
- result.append(f"pl.col('{col}')")
410
+ if col == RID: # Convert to 1-based
411
+ result.append(f"(pl.col('{col}') + 1)")
412
+ else:
413
+ result.append(f"pl.col('{col}')")
372
414
  else:
373
415
  result.append(part)
374
416
 
@@ -442,6 +484,7 @@ def load_dataframe(
442
484
  skip_rows_after_header: int = 0,
443
485
  null_values: list[str] | None = None,
444
486
  ignore_errors: bool = False,
487
+ truncate_ragged_lines: bool = False,
445
488
  ) -> list[Source]:
446
489
  """Load DataFrames from file specifications.
447
490
 
@@ -480,23 +523,24 @@ def load_dataframe(
480
523
  else:
481
524
  source = filename
482
525
 
483
- # Load from file
484
- # Determine file format if not specified
485
- if not file_format:
526
+ # If not specified, determine file format (may be different for each file)
527
+ fmt = file_format
528
+ if not fmt:
486
529
  ext = Path(filename).suffix.lower()
487
- if ext == ".gz" or ext == ".bz2" or ext == ".xz":
530
+ if ext == ".gz":
488
531
  ext = Path(filename).with_suffix("").suffix.lower()
489
532
  fmt = ext.removeprefix(".")
490
533
 
491
534
  # Default to TSV
492
- file_format = fmt if fmt in SUPPORTED_FORMATS else "tsv"
535
+ if not fmt or fmt not in SUPPORTED_FORMATS:
536
+ fmt = "tsv"
493
537
 
494
538
  # Load the file
495
539
  data.extend(
496
540
  load_file(
497
541
  source,
498
542
  prefix_sheet=prefix_sheet,
499
- file_format=file_format,
543
+ file_format=fmt,
500
544
  has_header=has_header,
501
545
  infer_schema=infer_schema,
502
546
  comment_prefix=comment_prefix,
@@ -505,6 +549,7 @@ def load_dataframe(
505
549
  skip_rows_after_header=skip_rows_after_header,
506
550
  null_values=null_values,
507
551
  ignore_errors=ignore_errors,
552
+ truncate_ragged_lines=truncate_ragged_lines,
508
553
  )
509
554
  )
510
555
 
@@ -551,7 +596,14 @@ def handle_compute_error(
551
596
 
552
597
  # Schema mismatch error
553
598
  if "found more fields than defined in 'Schema'" in err_msg:
554
- print(f"Input might be malformed:\n{err_msg}.\nTry again with `-E` to ignore errors", file=sys.stderr)
599
+ print(f"{err_msg}.\n\nInput might be malformed. Try again with `-t` to truncate ragged lines", file=sys.stderr)
600
+ sys.exit(1)
601
+
602
+ # Field ... is not properly escaped
603
+ if "is not properly escaped" in err_msg:
604
+ print(
605
+ f"{err_msg}\n\nQuoting might be causing the issue. Try again with `-q` to disable quoting", file=sys.stderr
606
+ )
555
607
  sys.exit(1)
556
608
 
557
609
  # ComputeError: could not parse `n.a. as of 04.01.022` as `dtype` i64 at column 'PubChemCID' (column number 16)
@@ -581,6 +633,7 @@ def load_file(
581
633
  schema_overrides: dict[str, pl.DataType] | None = None,
582
634
  null_values: list[str] | None = None,
583
635
  ignore_errors: bool = False,
636
+ truncate_ragged_lines: bool = False,
584
637
  ) -> list[Source]:
585
638
  """Load a single file.
586
639
 
@@ -611,11 +664,18 @@ def load_file(
611
664
  List of `Source` objects.
612
665
  """
613
666
  data: list[Source] = []
667
+
614
668
  filename = f"stdin.{file_format}" if isinstance(source, StringIO) else source
615
669
  filepath = Path(filename)
616
670
 
671
+ if not file_format:
672
+ ext = filepath.suffix.lower()
673
+ if ext == ".gz":
674
+ ext = Path(filename).with_suffix("").suffix.lower()
675
+ file_format = ext.removeprefix(".")
676
+
617
677
  # Load based on file format
618
- if file_format in ("tsv", "csv"):
678
+ if file_format in ("csv", "tsv"):
619
679
  lf = pl.scan_csv(
620
680
  source,
621
681
  separator="\t" if file_format == "tsv" else ",",
@@ -628,6 +688,7 @@ def load_file(
628
688
  schema_overrides=schema_overrides,
629
689
  null_values=null_values,
630
690
  ignore_errors=ignore_errors,
691
+ truncate_ragged_lines=truncate_ragged_lines,
631
692
  )
632
693
  data.append(Source(lf, filename, filepath.stem))
633
694
  elif file_format in ("xlsx", "xls", "excel"):
@@ -656,6 +717,14 @@ def load_file(
656
717
  # Attempt to collect, handling ComputeError for schema inference issues
657
718
  try:
658
719
  data = [Source(src.frame.collect(), src.filename, src.tabname) for src in data]
720
+ except pl.exceptions.NoDataError:
721
+ print(
722
+ "Warning: No data from stdin."
723
+ if isinstance(source, StringIO)
724
+ else f"Warning: No data found in file `{filename}`.",
725
+ file=sys.stderr,
726
+ )
727
+ sys.exit()
659
728
  except pl.exceptions.ComputeError as ce:
660
729
  # Handle the error and determine retry strategy
661
730
  infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)
@@ -697,3 +766,29 @@ async def sleep_async(seconds: float) -> None:
697
766
  import asyncio
698
767
 
699
768
  await asyncio.sleep(seconds)
769
+
770
+
771
+ def round_to_nearest_hundreds(num: int, N: int = 100) -> tuple[int, int]:
772
+ """Round a number to the nearest hundred boundaries.
773
+
774
+ Given a number, return a tuple of the two closest hundreds that bracket it.
775
+
776
+ Args:
777
+ num: The number to round.
778
+
779
+ Returns:
780
+ A tuple (lower_hundred, upper_hundred) where:
781
+ - lower_hundred is the largest multiple of 100 <= num
782
+ - upper_hundred is the smallest multiple of 100 > num
783
+
784
+ Examples:
785
+ >>> round_to_nearest_hundreds(0)
786
+ (0, 100)
787
+ >>> round_to_nearest_hundreds(150)
788
+ (100, 200)
789
+ >>> round_to_nearest_hundreds(200)
790
+ (200, 300)
791
+ """
792
+ lower = (num // N) * N
793
+ upper = lower + N
794
+ return (lower, upper)
@@ -74,9 +74,6 @@ class DataFrameHelpPanel(Widget):
74
74
 
75
75
  Initializes the help panel by setting up a watcher for focused widget changes
76
76
  to dynamically update help text based on which widget has focus.
77
-
78
- Returns:
79
- None
80
77
  """
81
78
 
82
79
  # def update_help(focused_widget: Widget | None):