dataframe-textual 1.4.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataframe_textual/__main__.py +10 -4
- dataframe_textual/common.py +201 -163
- dataframe_textual/data_frame_table.py +1037 -881
- dataframe_textual/data_frame_viewer.py +321 -104
- dataframe_textual/sql_screen.py +50 -11
- dataframe_textual/table_screen.py +1 -1
- dataframe_textual/yes_no_screen.py +89 -8
- {dataframe_textual-1.4.0.dist-info → dataframe_textual-1.9.0.dist-info}/METADATA +141 -185
- dataframe_textual-1.9.0.dist-info/RECORD +14 -0
- dataframe_textual-1.4.0.dist-info/RECORD +0 -14
- {dataframe_textual-1.4.0.dist-info → dataframe_textual-1.9.0.dist-info}/WHEEL +0 -0
- {dataframe_textual-1.4.0.dist-info → dataframe_textual-1.9.0.dist-info}/entry_points.txt +0 -0
- {dataframe_textual-1.4.0.dist-info → dataframe_textual-1.9.0.dist-info}/licenses/LICENSE +0 -0
dataframe_textual/__main__.py
CHANGED
@@ -39,14 +39,18 @@ def cli() -> argparse.Namespace:
     parser.add_argument(
         "-I", "--no-inferrence", action="store_true", help="Do not infer data types when reading CSV/TSV"
     )
+    parser.add_argument("-E", "--ignore-errors", action="store_true", help="Ignore errors when reading CSV/TSV")
     parser.add_argument(
-        "-…
+        "-c", "--comment-prefix", nargs="?", const="#", help="Comment lines are skipped when reading CSV/TSV"
     )
-    parser.add_argument("-L", "--skip-lines", type=int, default=0, help="Skip lines when reading CSV/TSV")
     parser.add_argument(
-        "-…
+        "-q", "--quote-char", nargs="?", const=None, default='"', help="Quote character for reading CSV/TSV"
     )
-    parser.add_argument("-…
+    parser.add_argument("-l", "--skip-lines", type=int, default=0, help="Skip lines when reading CSV/TSV")
+    parser.add_argument(
+        "-a", "--skip-rows-after-header", type=int, default=0, help="Skip rows after header when reading CSV/TSV"
+    )
+    parser.add_argument("-n", "--null", nargs="+", help="Values to interpret as null values when reading CSV/TSV")

     args = parser.parse_args()
     if args.files is None:
@@ -78,9 +82,11 @@ def main() -> None:
         has_header=not args.no_header,
         infer_schema=not args.no_inferrence,
         comment_prefix=args.comment_prefix,
+        quote_char=args.quote_char,
         skip_lines=args.skip_lines,
         skip_rows_after_header=args.skip_rows_after_header,
         null_values=args.null,
+        ignore_errors=args.ignore_errors,
     )
     app = DataFrameViewer(*sources)
     app.run()
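The two optional-value flags above (`-c` and `-q`) use argparse's `nargs="?"` with a `const`, which distinguishes "flag absent", "bare flag", and "flag with value". A minimal standalone sketch of that behavior (plain argparse, independent of the package's cli()):

    import argparse

    # Reproduces just the two optional-value flags from the hunk above.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--comment-prefix", nargs="?", const="#")
    parser.add_argument("-q", "--quote-char", nargs="?", const=None, default='"')

    print(parser.parse_args([]).comment_prefix)           # None: comments not skipped
    print(parser.parse_args(["-c"]).comment_prefix)       # '#': bare flag falls back to const
    print(parser.parse_args(["-c", ";"]).comment_prefix)  # ';': explicit value wins
    print(parser.parse_args([]).quote_char)               # '"': normal quoting
    print(parser.parse_args(["-q"]).quote_char)           # None: bare -q disables quoting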
dataframe_textual/common.py
CHANGED
@@ -1,8 +1,10 @@
 """Common utilities and constants for dataframe_viewer."""

+import os
 import re
 import sys
 from dataclasses import dataclass
+from io import StringIO
 from pathlib import Path
 from typing import Any

@@ -34,6 +36,16 @@ NULL_DISPLAY = "-"

 @dataclass
 class DtypeClass:
+    """Data type class configuration.
+
+    Attributes:
+        gtype: Generic, high-level type as a string.
+        style: Style string for display purposes.
+        justify: Text justification for display.
+        itype: Input type for validation.
+        convert: Conversion function for the data type.
+    """
+
     gtype: str  # generic, high-level type
     style: str
     justify: str
@@ -71,7 +83,6 @@ STYLES = {
 }
 # fmt: on

-
 # Subscript digits mapping for sort indicators
 SUBSCRIPT_DIGITS = {
     0: "₀",
@@ -93,6 +104,21 @@ CURSOR_TYPES = ["row", "column", "cell"]
 RIDX = "^_ridx_^"


+@dataclass
+class Source:
+    """Data source representation.
+
+    Attributes:
+        frame: The Polars DataFrame or LazyFrame.
+        filename: The name of the source file.
+        tabname: The name of the tab to display.
+    """
+
+    frame: pl.DataFrame | pl.LazyFrame
+    filename: str
+    tabname: str
+
+
 def DtypeConfig(dtype: pl.DataType) -> DtypeClass:
     """Get the DtypeClass configuration for a given Polars data type.

@@ -222,14 +248,91 @@ def get_next_item(lst: list[Any], current, offset=1) -> Any:
     return lst[next_index]


-def parse_polars_expression(expression: str, columns: list[str], current_col_idx: int) -> str:
+def parse_placeholders(template: str, columns: list[str], current_cidx: int) -> list[str | pl.Expr]:
+    """Parse template string into a list of strings or Polars expressions
+
+    Supports multiple placeholder types:
+    - `$_` - Current column (based on current_cidx parameter)
+    - `$#` - Row index (1-based, requires '^__ridx__^' column to be present)
+    - `$1`, `$2`, etc. - Column index (1-based)
+    - `$name` - Column name (e.g., `$product_id`)
+
+    Args:
+        template: The template string containing placeholders and literal text
+        columns: List of column names in the dataframe
+        current_cidx: 0-based index of the current column for `$_` references in the columns list
+
+    Returns:
+        A list of strings (literal text) and Polars expressions (for column references)
+
+    Raises:
+        ValueError: If invalid column index or non-existent column name is referenced
+    """
+    if "$" not in template or template.endswith("$"):
+        return [template]
+
+    # Regex matches: $_ or $\d+ or $\w+ (column names)
+    placeholder_pattern = r"\$(_|#|\d+|[a-zA-Z_]\w*)"
+    placeholders = re.finditer(placeholder_pattern, template)
+
+    parts = []
+    last_end = 0
+
+    # Get current column name for $_ references
+    try:
+        col_name = columns[current_cidx]
+    except IndexError:
+        raise ValueError(f"Current column index {current_cidx} is out of range for columns list")
+
+    for match in placeholders:
+        # Add literal text before this placeholder
+        if match.start() > last_end:
+            parts.append(template[last_end : match.start()])
+
+        placeholder = match.group(1)  # Extract content after '$'
+
+        if placeholder == "_":
+            # $_ refers to current column (where cursor was)
+            parts.append(pl.col(col_name))
+        elif placeholder == "#":
+            # $# refers to row index (1-based)
+            parts.append((pl.col(RIDX)))
+        elif placeholder.isdigit():
+            # $1, $2, etc. refer to columns by 1-based position index
+            col_idx = int(placeholder) - 1  # Convert to 0-based
+            try:
+                col_ref = columns[col_idx]
+                parts.append(pl.col(col_ref))
+            except IndexError:
+                raise ValueError(f"Invalid column index: ${placeholder} (valid range: $1 to ${len(columns)})")
+        else:
+            # $name refers to column by name
+            if placeholder in columns:
+                parts.append(pl.col(placeholder))
+            else:
+                raise ValueError(f"Column not found: ${placeholder} (available columns: {', '.join(columns)})")
+
+        last_end = match.end()
+
+    # Add remaining literal text after last placeholder
+    if last_end < len(template):
+        parts.append(template[last_end:])
+
+    # If no placeholders found, treat entire template as literal
+    if not parts:
+        parts = [template]
+
+    return parts
+
+
+def parse_polars_expression(expression: str, columns: list[str], current_cidx: int) -> str:
     """Parse and convert an expression to Polars syntax.

     Replaces column references with Polars col() expressions:
     - $_ - Current selected column
     - $# - Row index (1-based, requires '^__ridx__^' column to be present)
-    - $1, $2, etc. - Column…
-    - $col_name - Column…
+    - $1, $2, etc. - Column index (1-based)
+    - $col_name - Column name (valid identifier starting with _ or letter)

     Examples:
         - "$_ > 50" -> "pl.col('current_col') > 50"
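To make the placeholder grammar above concrete, here is a hedged usage sketch of `parse_placeholders` (assuming the function and the module's `pl` import are in scope; the column names are made up):

    import polars as pl

    columns = ["product_id", "price", "qty"]

    # current_cidx=2 makes $_ resolve to "qty"; $2 is 1-based, so it means "price".
    parts = parse_placeholders("$product_id: $_ x $2", columns, current_cidx=2)

    # Expected shape per the implementation above:
    #   [pl.col("product_id"), ": ", pl.col("qty"), " x ", pl.col("price")]
    for part in parts:
        print("expr" if isinstance(part, pl.Expr) else "text", part)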
@@ -241,7 +344,7 @@ def parse_polars_expression(expression: str, columns: list[str], current_col_idx
     Args:
         expression: The input expression as a string.
         columns: The list of column names in the DataFrame.
-        current_col_idx: …
+        current_cidx: The index of the currently selected column (0-based). Used for $_ reference.

     Returns:
         A Python expression string with $references replaced by pl.col() calls.
@@ -258,38 +361,18 @@ def parse_polars_expression(expression: str, columns: list[str], current_col_idx
         # Return as a literal string
         return f"pl.lit({expression})"

-
-    # - _ (single underscore)
-    # - # (hash for row index)
-    # - digits (integer)
-    # - identifier (starts with letter or _, followed by letter/digit/_)
-    pattern = r"\$(_|#|\d+|[a-zA-Z_]\w*)"
-
-    def replace_column_ref(match):
-        col_ref = match.group(1)
-
-        if col_ref == "_":
-            # Current selected column
-            col_name = columns[current_col_idx]
-        elif col_ref == "#":
-            # RIDX is used to store 0-based row index; add 1 for 1-based index
-            return f"(pl.col('{RIDX}') + 1)"
-        elif col_ref.isdigit():
-            # Column by 1-based index
-            col_idx = int(col_ref) - 1
-            if col_idx < 0 or col_idx >= len(columns):
-                raise ValueError(f"Column index out of range: ${col_ref}")
-            col_name = columns[col_idx]
-        else:
-            # Column by name
-            if col_ref not in columns:
-                raise ValueError(f"Column not found: ${col_ref}")
-            col_name = col_ref
+    parts = parse_placeholders(expression, columns, current_cidx)

-
+    result = []
+    for part in parts:
+        if isinstance(part, pl.Expr):
+            col = part.meta.output_name()

-
-
+            result.append(f"pl.col('{col}')")
+        else:
+            result.append(part)
+
+    return "".join(result)


 def tentative_expr(term: str) -> bool:
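After the rewrite, `parse_polars_expression` is a thin serializer over `parse_placeholders`: expressions come back as `pl.col('…')` source text, literal fragments pass through verbatim. A hedged sketch matching the docstring's own examples (same assumptions as the sketch above):

    columns = ["name", "score"]

    # $_ points at the cursor column (current_cidx=1 -> "score").
    print(parse_polars_expression("$_ > 50", columns, current_cidx=1))
    # -> pl.col('score') > 50

    # $1 is a 1-based positional reference to "name".
    print(parse_polars_expression('$1 == "abc"', columns, current_cidx=0))
    # -> pl.col('name') == "abc"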
@@ -354,10 +437,12 @@ def load_dataframe(
     has_header: bool = True,
     infer_schema: bool = True,
     comment_prefix: str | None = None,
+    quote_char: str | None = '"',
     skip_lines: int = 0,
     skip_rows_after_header: int = 0,
     null_values: list[str] | None = None,
-) -> list[tuple[pl.DataFrame, str, str]]:
+    ignore_errors: bool = False,
+) -> list[Source]:
     """Load DataFrames from file specifications.

     Handles loading from multiple files, single files, or stdin. For Excel files,
@@ -369,42 +454,62 @@ def load_dataframe(
         has_header: Whether the input files have a header row. Defaults to True.
         infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
         comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        quote_char: Quote character for reading CSV/TSV files. Defaults to '"'.
         skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
         skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
+        null_values: List of values to interpret as null when reading CSV/TSV files. Defaults to None.
+        ignore_errors: Whether to ignore errors when reading CSV/TSV files. Defaults to False.

     Returns:
-        List of tuples of (DataFrame, filename, tabname) ready for display.
+        List of `Source` objects.
     """
-
+    data: list[Source] = []
     prefix_sheet = len(filenames) > 1

     for filename in filenames:
-
-
+        if filename == "-":
+            source = StringIO(sys.stdin.read())
+            file_format = file_format or "tsv"
+
+            # Reopen stdin to /dev/tty for proper terminal interaction
+            try:
+                tty = open("/dev/tty")
+                os.dup2(tty.fileno(), sys.stdin.fileno())
+            except (OSError, FileNotFoundError):
+                pass
+        else:
+            source = filename
+
+        # If not specified, determine file format (may be different for each file)
+        fmt = file_format
+        if not fmt:
             ext = Path(filename).suffix.lower()
-            if ext == ".gz"…
+            if ext == ".gz":
                 ext = Path(filename).with_suffix("").suffix.lower()
             fmt = ext.removeprefix(".")

         # Default to TSV
-
+        if not fmt or fmt not in SUPPORTED_FORMATS:
+            fmt = "tsv"

-        # Load…
-
+        # Load the file
+        data.extend(
             load_file(
-
+                source,
                 prefix_sheet=prefix_sheet,
-                file_format=…
+                file_format=fmt,
                 has_header=has_header,
                 infer_schema=infer_schema,
                 comment_prefix=comment_prefix,
+                quote_char=quote_char,
                 skip_lines=skip_lines,
                 skip_rows_after_header=skip_rows_after_header,
                 null_values=null_values,
+                ignore_errors=ignore_errors,
             )
         )

-    return…
+    return data


 RE_COMPUTE_ERROR = re.compile(r"at column '(.*?)' \(column number \d+\)")
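The stdin branch above reads the whole pipe into a `StringIO` first (a pipe is not seekable, and the `ComputeError` retry path needs to re-read the data), then re-points fd 0 at `/dev/tty` so the Textual UI still receives keystrokes. A standalone sketch of just that pattern, under the same assumptions:

    import os
    import sys
    from io import StringIO

    # Run as: printf 'a\tb\n1\t2\n' | python sketch.py
    buffered = StringIO(sys.stdin.read())  # seekable copy of the piped data

    try:
        tty = open("/dev/tty")  # reattach keyboard input for the TUI
        os.dup2(tty.fileno(), sys.stdin.fileno())
    except (OSError, FileNotFoundError):
        pass  # no controlling terminal (e.g. CI); proceed without one

    print(buffered.getvalue(), end="")  # captured data remains available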
@@ -435,12 +540,19 @@ def handle_compute_error(
     """
     # Already disabled schema inference, cannot recover
     if not infer_schema:
-        print(f"Error loading with schema inference disabled:\n{err_msg}", file=sys.stderr)
+        print(f"Error loading even with schema inference disabled:\n{err_msg}", file=sys.stderr)
+
+        if "CSV malformed" in err_msg:
+            print(
+                "\nSometimes quote characters might be mismatched. Try again with `-q` or `-E` to ignore errors",
+                file=sys.stderr,
+            )
+
         sys.exit(1)

     # Schema mismatch error
     if "found more fields than defined in 'Schema'" in err_msg:
-        print(f"Input might be malformed:\n{err_msg}", file=sys.stderr)
+        print(f"Input might be malformed:\n{err_msg}.\nTry again with `-E` to ignore errors", file=sys.stderr)
         sys.exit(1)

     # ComputeError: could not parse `n.a. as of 04.01.022` as `dtype` i64 at column 'PubChemCID' (column number 16)
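`handle_compute_error` drives the retry loop used by `load_file`: each `ComputeError` either aborts with a hint (the new `-q`/`-E` messages) or returns widened `schema_overrides` for another attempt. A hedged sketch of that loop's shape — the sample CSV and the narrow `infer_schema_length` are made up to force the "could not parse … at column" error quoted in the module's own comment, and the regex mirrors `RE_COMPUTE_ERROR`; the exact message shape can vary across polars versions:

    import re
    from io import StringIO

    import polars as pl

    RAW = "id,cid\n1,42\n2,n.a. as of 04.01.022\n"
    PATTERN = re.compile(r"at column '(.*?)' \(column number \d+\)")

    overrides: dict[str, pl.DataType] = {}
    while True:
        try:
            df = pl.read_csv(StringIO(RAW), infer_schema_length=1, schema_overrides=overrides)
            break
        except pl.exceptions.ComputeError as ce:
            found = PATTERN.search(str(ce))
            if found is None:
                raise  # not a parse error we know how to widen
            overrides[found.group(1)] = pl.String  # force the column to string and retry

    print(df.dtypes)  # [Int64, String] after one retry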
@@ -456,101 +568,21 @@ def handle_compute_error(
     return infer_schema, schema_overrides


-def load_stdin(
-    stdin_data=None,
-    file_format: str | None = None,
-    has_header: bool = True,
-    infer_schema: bool = True,
-    comment_prefix: str | None = None,
-    skip_lines: int = 0,
-    skip_rows_after_header: int = 0,
-    schema_overrides: dict[str, pl.DataType] | None = None,
-    null_values: list[str] | None = None,
-) -> list[tuple[pl.DataFrame, str, str]]:
-    """Load DataFrame from stdin.
-
-    If a ComputeError occurs during schema inference for a column, attempts to recover
-    by treating that column as a string and retrying the load. This process repeats until
-    all columns are successfully loaded or no further recovery is possible.
-
-    Args:
-        stdin_data: Optional stdin data as string. If None, read from sys.stdin.
-        file_format: Optional format specifier for input files (e.g., 'csv', 'excel').
-        has_header: Whether the input files have a header row. Defaults to True.
-        infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
-        comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
-        skip_lines: Number of lines to skip when reading CSV/TSV files. Defaults to 0.
-        skip_rows_after_header: Number of rows to skip after header. Defaults to 0.
-
-    Returns:
-        List of tuples of (DataFrame, filename, tabname) ready for display.
-    """
-    import os
-    from io import StringIO
-
-    sources = []
-
-    # Read from stdin into memory first (stdin is not seekable)
-    if stdin_data is None:
-        stdin_data = sys.stdin.read()
-
-    # Reopen stdin to /dev/tty for proper terminal interaction
-    try:
-        tty = open("/dev/tty")
-        os.dup2(tty.fileno(), sys.stdin.fileno())
-    except (OSError, FileNotFoundError):
-        pass
-
-    lf = pl.scan_csv(
-        StringIO(stdin_data),
-        separator="," if file_format == "csv" else "\t",
-        has_header=has_header,
-        infer_schema=infer_schema,
-        comment_prefix=comment_prefix,
-        skip_lines=skip_lines,
-        skip_rows_after_header=skip_rows_after_header,
-        schema_overrides=schema_overrides,
-        null_values=null_values,
-    )
-
-    sources = [(lf, f"stdin.{file_format}" if file_format else "stdin", "stdin")]
-
-    # Attempt to collect, handling ComputeError for schema inference issues
-    try:
-        sources = [(lf.collect(), fn, tn) for lf, fn, tn in sources]
-    except pl.exceptions.ComputeError as ce:
-        # Handle the error and determine retry strategy
-        infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)
-
-        # Retry loading with updated schema overrides
-        return load_stdin(
-            stdin_data,
-            file_format=file_format,
-            has_header=has_header,
-            infer_schema=infer_schema,
-            comment_prefix=comment_prefix,
-            skip_lines=skip_lines,
-            skip_rows_after_header=skip_rows_after_header,
-            schema_overrides=schema_overrides,
-            null_values=null_values,
-        )
-
-    return sources
-
-
 def load_file(
-    filename: str,
+    source: str | StringIO,
     first_sheet: bool = False,
     prefix_sheet: bool = False,
     file_format: str | None = None,
     has_header: bool = True,
     infer_schema: bool = True,
     comment_prefix: str | None = None,
+    quote_char: str | None = '"',
     skip_lines: int = 0,
     skip_rows_after_header: int = 0,
     schema_overrides: dict[str, pl.DataType] | None = None,
     null_values: list[str] | None = None,
-) -> list[tuple[pl.DataFrame, str, str]]:
+    ignore_errors: bool = False,
+) -> list[Source]:
     """Load a single file.

     For Excel files, when `first_sheet` is True, returns only the first sheet. Otherwise, returns one entry per sheet.
@@ -569,86 +601,92 @@ def load_file(
         has_header: Whether the input files have a header row. Defaults to True.
         infer_schema: Whether to infer data types for CSV/TSV files. Defaults to True.
         comment_prefix: Character(s) indicating comment lines in CSV/TSV files. Defaults to None.
+        quote_char: Quote character for reading CSV/TSV files. Defaults to '"'.
         skip_lines: Number of lines to skip when reading CSV/TSV files. The header will be parsed at this offset. Defaults to 0.
         skip_rows_after_header: Number of rows to skip after header when reading CSV/TSV files. Defaults to 0.
+        schema_overrides: Optional dictionary of column name to Polars data type to override inferred schema.
+        null_values: List of values to interpret as null when reading CSV/TSV files. Defaults to None.
+        ignore_errors: Whether to ignore errors when reading CSV/TSV files.

     Returns:
-        List of tuples of (DataFrame, filename, tabname) ready for display.
+        List of `Source` objects.
     """
-
-    if filename == "-":
-        return load_stdin(
-            file_format=file_format,
-            has_header=has_header,
-            infer_schema=infer_schema,
-            comment_prefix=comment_prefix,
-            skip_lines=skip_lines,
-            skip_rows_after_header=skip_rows_after_header,
-            schema_overrides=schema_overrides,
-            null_values=null_values,
-        )
+    data: list[Source] = []

+    filename = f"stdin.{file_format}" if isinstance(source, StringIO) else source
     filepath = Path(filename)

+    if not file_format:
+        ext = filepath.suffix.lower()
+        if ext == ".gz":
+            ext = Path(filename).with_suffix("").suffix.lower()
+        file_format = ext.removeprefix(".")
+
     # Load based on file format
-    if file_format in ("…
+    if file_format in ("csv", "tsv"):
         lf = pl.scan_csv(
-
+            source,
             separator="\t" if file_format == "tsv" else ",",
             has_header=has_header,
             infer_schema=infer_schema,
             comment_prefix=comment_prefix,
+            quote_char=quote_char,
             skip_lines=skip_lines,
             skip_rows_after_header=skip_rows_after_header,
             schema_overrides=schema_overrides,
             null_values=null_values,
+            ignore_errors=ignore_errors,
         )
-
+        data.append(Source(lf, filename, filepath.stem))
     elif file_format in ("xlsx", "xls", "excel"):
         if first_sheet:
             # Read only the first sheet for multiple files
-            lf = pl.read_excel(…
-
+            lf = pl.read_excel(source).lazy()
+            data.append(Source(lf, filename, filepath.stem))
         else:
             # For single file, expand all sheets
-            sheets = pl.read_excel(…
+            sheets = pl.read_excel(source, sheet_id=0)
             for sheet_name, df in sheets.items():
                 tabname = f"{filepath.stem}_{sheet_name}" if prefix_sheet else sheet_name
-
+                data.append(Source(df.lazy(), filename, tabname))
     elif file_format == "parquet":
-        lf = pl.scan_parquet(…
-
+        lf = pl.scan_parquet(source)
+        data.append(Source(lf, filename, filepath.stem))
     elif file_format == "json":
-        lf = pl.read_json(…
-
+        lf = pl.read_json(source).lazy()
+        data.append(Source(lf, filename, filepath.stem))
    elif file_format == "ndjson":
-        lf = pl.scan_ndjson(…
-
+        lf = pl.scan_ndjson(source, schema_overrides=schema_overrides)
+        data.append(Source(lf, filename, filepath.stem))
     else:
         raise ValueError(f"Unsupported file format: {file_format}. Supported formats are: {SUPPORTED_FORMATS}")

     # Attempt to collect, handling ComputeError for schema inference issues
     try:
-
+        data = [Source(src.frame.collect(), src.filename, src.tabname) for src in data]
     except pl.exceptions.ComputeError as ce:
         # Handle the error and determine retry strategy
         infer_schema, schema_overrides = handle_compute_error(str(ce), file_format, infer_schema, schema_overrides)

         # Retry loading with updated schema overrides
+        if isinstance(source, StringIO):
+            source.seek(0)
+
         return load_file(
-
+            source,
             file_format=file_format,
             has_header=has_header,
             infer_schema=infer_schema,
             comment_prefix=comment_prefix,
+            quote_char=quote_char,
             skip_lines=skip_lines,
             skip_rows_after_header=skip_rows_after_header,
             schema_overrides=schema_overrides,
             null_values=null_values,
+            ignore_errors=ignore_errors,
         )

-    return …
+    return data


 def now() -> str: