polars-bio 0.14.1__cp39-abi3-win_amd64.whl → 0.15.0__cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_bio/__init__.py +1 -1
- polars_bio/io.py +425 -184
- polars_bio/polars_bio.pyd +0 -0
- polars_bio/predicate_translator.py +464 -0
- polars_bio/sql_predicate_builder.py +293 -0
- polars_bio/utils.py +29 -4
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/METADATA +1 -1
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/RECORD +10 -8
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/WHEEL +0 -0
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/licenses/LICENSE +0 -0
polars_bio/io.py
CHANGED
@@ -316,6 +316,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         projection_pushdown: bool = False,
+        predicate_pushdown: bool = False,
         parallel: bool = False,
     ) -> pl.DataFrame:
         """
@@ -332,6 +333,7 @@ class IOOperations:
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+           predicate_pushdown: Enable predicate pushdown optimization to push filter conditions down to the DataFusion table provider level, reducing data processing and I/O.
            parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).

        !!! note
@@ -348,6 +350,7 @@ class IOOperations:
            timeout,
            compression_type,
            projection_pushdown,
+           predicate_pushdown,
            parallel,
        ).collect()

@@ -363,6 +366,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         projection_pushdown: bool = False,
+        predicate_pushdown: bool = False,
         parallel: bool = False,
     ) -> pl.LazyFrame:
         """
@@ -379,6 +383,7 @@ class IOOperations:
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+           predicate_pushdown: Enable predicate pushdown optimization to push filter conditions down to the DataFusion table provider level, reducing data processing and I/O.
            parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).

        !!! note
@@ -401,7 +406,9 @@ class IOOperations:
            parallel=parallel,
        )
        read_options = ReadOptions(gff_read_options=gff_read_options)
-       return _read_file(path, InputFormat.Gff, read_options, projection_pushdown)
+       return _read_file(
+           path, InputFormat.Gff, read_options, projection_pushdown, predicate_pushdown
+       )

    @staticmethod
    def read_bam(
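The four hunks above thread the new `predicate_pushdown` flag through `read_gff` and `scan_gff`. A minimal usage sketch (the file path is hypothetical; the flag names and semantics come from this diff):

```python
import polars as pl
import polars_bio as pb

# Hypothetical local file; with both flags on, the filter and the column list
# below are handed to the DataFusion table provider instead of being applied
# after a full scan.
lf = pb.scan_gff(
    "annotation.gff.gz",
    projection_pushdown=True,
    predicate_pushdown=True,
)
df = (
    lf.filter((pl.col("chrom") == "chr1") & (pl.col("start") > 10_000))
    .select(["chrom", "start", "end", "type"])
    .collect()
)
```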
@@ -760,13 +767,160 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
     return [x.strip() for x in t]


+def _apply_combined_pushdown_via_sql(
+    ctx,
+    table_name,
+    original_df,
+    predicate,
+    projected_columns,
+    predicate_pushdown,
+    projection_pushdown,
+):
+    """Apply both predicate and projection pushdown using SQL approach."""
+    from polars_bio.polars_bio import py_read_sql
+
+    # Build SQL query with combined optimizations
+    select_clause = "*"
+    if projection_pushdown and projected_columns:
+        select_clause = ", ".join([f'"{c}"' for c in projected_columns])
+
+    where_clause = ""
+    if predicate_pushdown and predicate is not None:
+        try:
+            # Use the proven regex-based predicate translation
+            where_clause = _build_sql_where_from_predicate_safe(predicate)
+        except Exception as e:
+            where_clause = ""
+
+            # No fallback - if we can't parse to SQL, just use projection only
+            # This keeps us in pure SQL mode for maximum performance
+
+    # Construct optimized SQL query
+    if where_clause:
+        sql = f"SELECT {select_clause} FROM {table_name} WHERE {where_clause}"
+    else:
+        sql = f"SELECT {select_clause} FROM {table_name}"
+
+    # Execute with DataFusion - this leverages the proven 4x+ optimization
+    return py_read_sql(ctx, sql)
+
+
+def _build_sql_where_from_predicate_safe(predicate):
+    """Build SQL WHERE clause by parsing all individual conditions and connecting with AND."""
+    import re
+
+    pred_str = str(predicate).strip("[]")
+
+    # Find all individual conditions in the nested structure
+    conditions = []
+
+    # String equality/inequality patterns (including empty strings)
+    # Accept both with and without surrounding parentheses in Polars repr
+    str_eq_patterns = [
+        r'\(col\("([^"]+)"\)\)\s*==\s*\("([^"]*)"\)',  # (col("x")) == ("v")
+        r'col\("([^"]+)"\)\s*==\s*"([^"]*)"',  # col("x") == "v"
+    ]
+    for pat in str_eq_patterns:
+        for column, value in re.findall(pat, pred_str):
+            conditions.append(f"\"{column}\" = '{value}'")
+
+    # Numeric comparison patterns (handle both formats: with and without "dyn int:")
+    numeric_patterns = [
+        (r'\(col\("([^"]+)"\)\)\s*>\s*\((?:dyn int:\s*)?(\d+)\)', ">"),
+        (r'\(col\("([^"]+)"\)\)\s*<\s*\((?:dyn int:\s*)?(\d+)\)', "<"),
+        (r'\(col\("([^"]+)"\)\)\s*>=\s*\((?:dyn int:\s*)?(\d+)\)', ">="),
+        (r'\(col\("([^"]+)"\)\)\s*<=\s*\((?:dyn int:\s*)?(\d+)\)', "<="),
+        (r'\(col\("([^"]+)"\)\)\s*!=\s*\((?:dyn int:\s*)?(\d+)\)', "!="),
+        (r'\(col\("([^"]+)"\)\)\s*==\s*\((?:dyn int:\s*)?(\d+)\)', "="),
+        (r'col\("([^"]+)"\)\s*>\s*(\d+)', ">"),
+        (r'col\("([^"]+)"\)\s*<\s*(\d+)', "<"),
+        (r'col\("([^"]+)"\)\s*>=\s*(\d+)', ">="),
+        (r'col\("([^"]+)"\)\s*<=\s*(\d+)', "<="),
+        (r'col\("([^"]+)"\)\s*!=\s*(\d+)', "!="),
+        (r'col\("([^"]+)"\)\s*==\s*(\d+)', "="),
+    ]
+
+    for pattern, op in numeric_patterns:
+        matches = re.findall(pattern, pred_str)
+        for column, value in matches:
+            conditions.append(f'"{column}" {op} {value}')
+
+    # Float comparison patterns (handle both formats: with and without "dyn float:")
+    float_patterns = [
+        (r'\(col\("([^"]+)"\)\)\s*>\s*\((?:dyn float:\s*)?([\d.]+)\)', ">"),
+        (r'\(col\("([^"]+)"\)\)\s*<\s*\((?:dyn float:\s*)?([\d.]+)\)', "<"),
+        (r'\(col\("([^"]+)"\)\)\s*>=\s*\((?:dyn float:\s*)?([\d.]+)\)', ">="),
+        (r'\(col\("([^"]+)"\)\)\s*<=\s*\((?:dyn float:\s*)?([\d.]+)\)', "<="),
+        (r'\(col\("([^"]+)"\)\)\s*!=\s*\((?:dyn float:\s*)?([\d.]+)\)', "!="),
+        (r'\(col\("([^"]+)"\)\)\s*==\s*\((?:dyn float:\s*)?([\d.]+)\)', "="),
+        (r'col\("([^"]+)"\)\s*>\s*([\d.]+)', ">"),
+        (r'col\("([^"]+)"\)\s*<\s*([\d.]+)', "<"),
+        (r'col\("([^"]+)"\)\s*>=\s*([\d.]+)', ">="),
+        (r'col\("([^"]+)"\)\s*<=\s*([\d.]+)', "<="),
+        (r'col\("([^"]+)"\)\s*!=\s*([\d.]+)', "!="),
+        (r'col\("([^"]+)"\)\s*==\s*([\d.]+)', "="),
+    ]
+
+    for pattern, op in float_patterns:
+        matches = re.findall(pattern, pred_str)
+        for column, value in matches:
+            conditions.append(f'"{column}" {op} {value}')
+
+    # IN list pattern: col("x").is_in([v1, v2, ...])
+    in_matches = re.findall(r'col\("([^"]+)"\)\.is_in\(\[(.*?)\]\)', pred_str)
+    for column, values_str in in_matches:
+        # Tokenize values: quoted strings or numbers
+        tokens = re.findall(r"'(?:[^']*)'|\"(?:[^\"]*)\"|\d+(?:\.\d+)?", values_str)
+        items = []
+        for t in tokens:
+            if t.startswith('"') and t.endswith('"'):
+                items.append("'" + t[1:-1] + "'")
+            else:
+                items.append(t)
+        if items:
+            conditions.append(f'"{column}" IN ({", ".join(items)})')
+
+    # Join all conditions with AND
+    if conditions:
+        where = " AND ".join(conditions)
+        # Clean up any residual bracketed list formatting from IN clause (defensive)
+        where = (
+            where.replace("IN ([", "IN (")
+            .replace("])", ")")
+            .replace("[ ", "")
+            .replace(" ]", "")
+        )
+        # Collapse simple >= and <= pairs into BETWEEN when possible
+        try:
+            import re as _re
+
+            where = _re.sub(
+                r'"([^"]+)"\s*>=\s*([\d.]+)\s*AND\s*"\1"\s*<=\s*([\d.]+)',
+                r'"\1" BETWEEN \2 AND \3',
+                where,
+            )
+            where = _re.sub(
+                r'"([^"]+)"\s*<=\s*([\d.]+)\s*AND\s*"\1"\s*>=\s*([\d.]+)',
+                r'"\1" BETWEEN \3 AND \2',
+                where,
+            )
+        except Exception:
+            pass
+        return where
+
+    return ""
+
+
 def _lazy_scan(
     df: Union[pl.DataFrame, pl.LazyFrame],
     projection_pushdown: bool = False,
+    predicate_pushdown: bool = False,
     table_name: str = None,
     input_format: InputFormat = None,
     file_path: str = None,
+    read_options: ReadOptions = None,
 ) -> pl.LazyFrame:
+
     df_lazy: DataFrame = df
     original_schema = df_lazy.schema()

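To see what `_build_sql_where_from_predicate_safe` is matching against, it helps to look at the string repr of a Polars expression, since that is all the regexes consume. A sketch (the exact repr differs across Polars versions, so the shapes shown are indicative only):

```python
import polars as pl

pred = (pl.col("chrom") == "chr1") & (pl.col("start") >= 100) & (pl.col("start") <= 200)
print(str(pred))
# The repr nests each condition in brackets/parens; numeric literals render
# in some versions as (col("start")) >= (dyn int: 100). The varying shapes
# are exactly why the translator tries several regex variants per operator.

# On a repr the patterns recognize, the expected WHERE fragment (after the
# final BETWEEN collapse) would be:
#   "chrom" = 'chr1' AND "start" BETWEEN 100 AND 200
```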
@@ -776,67 +930,160 @@ def _lazy_scan(
         n_rows: Union[int, None],
         _batch_size: Union[int, None],
     ) -> Iterator[pl.DataFrame]:
-        #
[old lines 780-785 removed; content not shown in the source view]
+        # If this is a GFF scan, perform pushdown by building a single SELECT ... WHERE ...
+        if input_format == InputFormat.Gff and file_path is not None:
+            from polars_bio.polars_bio import GffReadOptions, PyObjectStorageOptions
+            from polars_bio.polars_bio import ReadOptions as _ReadOptions
+            from polars_bio.polars_bio import (
+                py_read_sql,
+                py_read_table,
+                py_register_table,
+                py_register_view,
+            )

[old line 787 removed; content not shown in the source view]
-        datafusion_projection_applied = False
+            from .context import ctx

[old lines 790-794 removed; content not shown in the source view]
+            # Extract columns requested by Polars optimizer
+            requested_cols = (
+                _extract_column_names_from_expr(with_columns)
+                if with_columns is not None
+                else []
+            )

[old lines 796-797 removed; content not shown in the source view]
+            # Compute attribute fields to request based on selected columns
+            STATIC = {
+                "chrom",
+                "start",
+                "end",
+                "type",
+                "source",
+                "score",
+                "strand",
+                "phase",
+                "attributes",
+            }
+            attr_fields = [c for c in requested_cols if c not in STATIC]
+
+            # Derive thread/parallel from read_options when available
+            thread_num = 1
+            parallel = False
+            if read_options is not None:
+                try:
+                    gopt = getattr(read_options, "gff_read_options", None)
+                    if gopt is not None:
+                        tn = getattr(gopt, "thread_num", None)
+                        if tn is not None:
+                            thread_num = tn
+                        par = getattr(gopt, "parallel", None)
+                        if par is not None:
+                            parallel = par
+                except Exception:
+                    pass

[old lines 799-800 removed; content not shown in the source view]
+            # Build fresh read options (object storage options are not readable from Rust class; use safe defaults)
+            obj = PyObjectStorageOptions(
+                allow_anonymous=True,
+                enable_request_payer=False,
+                chunk_size=8,
+                concurrent_fetches=1,
+                max_retries=5,
+                timeout=300,
+                compression_type="auto",
+            )
+            # Determine attribute parsing behavior:
+            # - if user selected raw "attributes" column: keep provider defaults (None)
+            # - if user selected specific attribute columns: pass that list
+            # - otherwise: disable attribute parsing with empty list for performance
+            if "attributes" in requested_cols:
+                _attr = None
+            elif attr_fields:
+                _attr = attr_fields
+            else:
+                _attr = []

[old lines 802-803 removed; content not shown in the source view]
+            gff_opts = GffReadOptions(
+                attr_fields=_attr,
+                thread_num=thread_num,
+                object_storage_options=obj,
+                parallel=parallel,
+            )
+            ropts = _ReadOptions(gff_read_options=gff_opts)
+
+            # Determine which table to query: reuse original unless we must change attr_fields
+            table_name_use = table_name
+            if projection_pushdown and requested_cols:
+                # Only re-register when projection is active (we know column needs)
+                table_obj = py_register_table(
+                    ctx, file_path, None, InputFormat.Gff, ropts
                 )
[old lines 805-823 removed; content not shown in the source view]
+                table_name_use = table_obj.name
+
+            # Build SELECT clause respecting projection flag
+            if projection_pushdown and requested_cols:
+                select_clause = ", ".join([f'"{c}"' for c in requested_cols])
+            else:
+                select_clause = "*"
+
+            # Build WHERE clause respecting predicate flag
+            where_clause = ""
+            if predicate_pushdown and predicate is not None:
+                try:
+                    where_clause = _build_sql_where_from_predicate_safe(predicate)
+                except Exception:
+                    where_clause = ""
+
+            sql = f"SELECT {select_clause} FROM {table_name_use}"
+            if where_clause:
+                sql += f" WHERE {where_clause}"
+            if n_rows and n_rows > 0:
+                sql += f" LIMIT {int(n_rows)}"
+
+            query_df = py_read_sql(ctx, sql)
+
+            # Stream results, applying any non-pushed operations locally
+            df_stream = query_df.execute_stream()
+            progress_bar = tqdm(unit="rows")
+            for r in df_stream:
+                py_df = r.to_pyarrow()
+                out = pl.DataFrame(py_df)
+                # Apply local filter if we didn't push it down
+                if predicate is not None and (
+                    not predicate_pushdown or not where_clause
+                ):
+                    out = out.filter(predicate)
+                # Apply local projection if we didn't push it down
+                if with_columns is not None and (
+                    not projection_pushdown or not requested_cols
+                ):
+                    out = out.select(with_columns)
+                progress_bar.update(len(out))
+                yield out
             return

+        # Default path (non-GFF): stream and optionally apply local filter/projection
+        query_df = df_lazy
         df_stream = query_df.execute_stream()
         progress_bar = tqdm(unit="rows")
+        remaining = int(n_rows) if n_rows is not None else None
         for r in df_stream:
             py_df = r.to_pyarrow()
[old line 830 removed; content not shown in the source view]
+            out = pl.DataFrame(py_df)
             if predicate is not None:
[old lines 832-839 removed; content not shown in the source view]
+                out = out.filter(predicate)
+            if with_columns is not None:
+                out = out.select(with_columns)
+
+            if remaining is not None:
+                if remaining <= 0:
+                    break
+                if len(out) > remaining:
+                    out = out.head(remaining)
+                remaining -= len(out)
+
+            progress_bar.update(len(out))
+            yield out
+            if remaining is not None and remaining <= 0:
+                return

     return register_io_source(_overlap_source, schema=original_schema)

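The rewritten `_overlap_source` folds projection, predicate, and row limit into a single statement before calling `py_read_sql`. A sketch of the query assembly, mirroring the string-building code in the hunk above (table name and values are illustrative):

```python
# All values here are made up; only the assembly logic matches the diff.
select_clause = '"chrom", "start", "end"'
where_clause = "\"chrom\" = 'chr1' AND \"start\" > 10000"
table_name_use = "gff_table_1"
n_rows = 1000

sql = f"SELECT {select_clause} FROM {table_name_use}"
if where_clause:
    sql += f" WHERE {where_clause}"
if n_rows and n_rows > 0:
    sql += f" LIMIT {int(n_rows)}"

print(sql)
# SELECT "chrom", "start", "end" FROM gff_table_1
#   WHERE "chrom" = 'chr1' AND "start" > 10000 LIMIT 1000
```

Note the two local fallbacks after streaming starts: if the predicate could not be translated to SQL, it is re-applied with `out.filter(predicate)`, and if projection was not pushed down, `out.select(with_columns)` trims the columns, so results stay correct either way.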
@@ -877,21 +1124,36 @@ def _read_file(
     input_format: InputFormat,
     read_options: ReadOptions,
     projection_pushdown: bool = False,
+    predicate_pushdown: bool = False,
 ) -> pl.LazyFrame:
     table = py_register_table(ctx, path, None, input_format, read_options)
     df = py_read_table(ctx, table.name)

-    lf = _lazy_scan(df, projection_pushdown, table.name, input_format, path)
+    lf = _lazy_scan(
+        df,
+        projection_pushdown,
+        predicate_pushdown,
+        table.name,
+        input_format,
+        path,
+        read_options,
+    )

     # Wrap GFF LazyFrames with projection-aware wrapper for consistent attribute field handling
     if input_format == InputFormat.Gff:
-        return GffLazyFrameWrapper(lf, path, read_options, projection_pushdown)
+        return GffLazyFrameWrapper(
+            lf, path, read_options, projection_pushdown, predicate_pushdown
+        )

     return lf


 class GffLazyFrameWrapper:
-    """
+    """Thin wrapper that preserves type while delegating to the underlying LazyFrame.
+
+    Pushdown is decided exclusively inside the io_source callback based on
+    with_columns and predicate; this wrapper only keeps chain type stable.
+    """

     def __init__(
         self,
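Note that `_read_file` passes `_lazy_scan` its arguments positionally, so the new `predicate_pushdown` parameter had to be inserted between `projection_pushdown` and `table_name` in both the signature and every call site. For reference (names as they appear in the diff; the literal values are illustrative):

```python
# Positional order after this change:
# _lazy_scan(df, projection_pushdown, predicate_pushdown,
#            table_name, input_format, file_path, read_options)
lf = _lazy_scan(df, True, True, table.name, InputFormat.Gff, path, read_options)
```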
@@ -899,45 +1161,33 @@ class GffLazyFrameWrapper:
         file_path: str,
         read_options: ReadOptions,
         projection_pushdown: bool = True,
+        predicate_pushdown: bool = True,
     ):
         self._base_lf = base_lf
         self._file_path = file_path
         self._read_options = read_options
         self._projection_pushdown = projection_pushdown
+        self._predicate_pushdown = predicate_pushdown

     def select(self, exprs):
[old lines 909-917 removed; content not shown in the source view]
-        columns = []
-        for expr in exprs:
-            if isinstance(expr, str):
-                columns.append(expr)
-            elif hasattr(expr, "meta") and hasattr(expr.meta, "output_name"):
-                try:
-                    columns.append(expr.meta.output_name())
-                except:
-                    pass
-        else:
-            # Single expression
-            if isinstance(exprs, str):
-                columns = [exprs]
-            elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
-                try:
-                    columns = [exprs.meta.output_name()]
-                except:
-                    columns = []
+        # Extract requested column names
+        columns = []
+        try:
+            if isinstance(exprs, (list, tuple)):
+                for e in exprs:
+                    if isinstance(e, str):
+                        columns.append(e)
+                    elif hasattr(e, "meta") and hasattr(e.meta, "output_name"):
+                        columns.append(e.meta.output_name())
             else:
[old line 937 removed; content not shown in the source view]
+                if isinstance(exprs, str):
+                    columns = [exprs]
+                elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
+                    columns = [exprs.meta.output_name()]
+        except Exception:
+            columns = []

[old line 939 removed; content not shown in the source view]
-        GFF_STATIC_COLUMNS = {
+        STATIC = {
             "chrom",
             "start",
             "end",
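The rewritten `select` resolves expressions to plain column names through Polars' expression metadata instead of the old hand-rolled branching with bare `except:` clauses. A quick illustration of the API it relies on (standard Polars, not specific to this package):

```python
import polars as pl

# meta.output_name() reports the column name an expression will produce.
print(pl.col("start").meta.output_name())  # start
print((pl.col("end") - pl.col("start")).alias("length").meta.output_name())  # length

# For expressions whose output name cannot be determined, the call can raise;
# that is why the diff wraps the whole extraction in a single try/except and
# falls back to columns = [].
```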
@@ -948,119 +1198,110 @@ class GffLazyFrameWrapper:
             "phase",
             "attributes",
         }
[old lines 951-952 removed; content not shown in the source view]
+        attr_cols = [c for c in columns if c not in STATIC]
+
+        # If selecting attribute fields, run one-shot SQL projection with proper attr_fields
+        if columns and (attr_cols or "attributes" in columns):
+            from polars_bio.polars_bio import GffReadOptions
+            from polars_bio.polars_bio import InputFormat as _InputFormat
+            from polars_bio.polars_bio import PyObjectStorageOptions
+            from polars_bio.polars_bio import ReadOptions as _ReadOptions
+            from polars_bio.polars_bio import (
+                py_read_sql,
+                py_read_table,
+                py_register_table,
+                py_register_view,
+            )

-        # If 'attributes' is requested, ensure the registered table exposes it.
-        # Some parallel GFF providers omit the raw 'attributes' column; switch
-        # to a registration that includes it while keeping projection pushdown.
-        if "attributes" in static_cols:
             from .context import ctx

-            #
[old lines 961-966 removed; content not shown in the source view]
+            # Pull thread_num/parallel from original read options
+            thread_num = 1
+            parallel = False
+            try:
+                gopt = getattr(self._read_options, "gff_read_options", None)
+                if gopt is not None:
+                    tn = getattr(gopt, "thread_num", None)
+                    if tn is not None:
+                        thread_num = tn
+                    par = getattr(gopt, "parallel", None)
+                    if par is not None:
+                        parallel = par
+            except Exception:
+                pass
+
+            obj = PyObjectStorageOptions(
+                allow_anonymous=True,
+                enable_request_payer=False,
+                chunk_size=8,
+                concurrent_fetches=1,
+                max_retries=5,
+                timeout=300,
+                compression_type="auto",
             )
+            if "attributes" in columns:
+                _attr = None
+            elif attr_cols:
+                _attr = attr_cols
+            else:
+                _attr = []

[old lines 969-973 removed; content not shown in the source view]
-                    allow_anonymous=True,
-                    enable_request_payer=False,
-                    chunk_size=8,
-                    concurrent_fetches=1,
-                    max_retries=5,
-                    timeout=300,
-                    compression_type="auto",
-                ),
-                parallel=orig_parallel,
+            gff_opts = GffReadOptions(
+                attr_fields=_attr,
+                thread_num=thread_num,
+                object_storage_options=obj,
+                parallel=parallel,
             )
[old line 984 removed; content not shown in the source view]
+            ropts = _ReadOptions(gff_read_options=gff_opts)
             table = py_register_table(
-                ctx, self._file_path, None,
+                ctx, self._file_path, None, _InputFormat.Gff, ropts
             )
[old lines 988-991 removed; content not shown in the source view]
-            if self._projection_pushdown:
-                # Optimized path: when selecting specific unnested attribute fields, re-register
-                # GFF table with those fields so DataFusion can project them efficiently.
-
-                # Use optimized table re-registration (fast path)
-                from .context import ctx
-
-                gff_options = GffReadOptions(
-                    attr_fields=attribute_cols,
-                    thread_num=getattr(
-                        getattr(self._read_options, "gff_read_options", None),
-                        "thread_num",
-                        1,
-                    ),
-                    object_storage_options=PyObjectStorageOptions(
-                        allow_anonymous=True,
-                        enable_request_payer=False,
-                        chunk_size=8,
-                        concurrent_fetches=1,
-                        max_retries=5,
-                        timeout=300,
-                        compression_type="auto",
-                    ),
-                    # Keep parallel reading consistent with base options when possible
-                    parallel=getattr(
-                        getattr(self._read_options, "gff_read_options", None),
-                        "parallel",
-                        False,
-                    ),
+            select_clause = ", ".join([f'"{c}"' for c in columns])
+            view_name = f"{table.name}_proj"
+            py_register_view(
+                ctx, view_name, f"SELECT {select_clause} FROM {table.name}"
             )
[old lines 1022-1025 removed; content not shown in the source view]
+            df_view = py_read_table(ctx, view_name)
+
+            new_lf = _lazy_scan(
+                df_view,
+                False,
+                self._predicate_pushdown,
+                view_name,
+                _InputFormat.Gff,
+                self._file_path,
+                self._read_options,
+            )
+            return GffLazyFrameWrapper(
+                new_lf,
+                self._file_path,
+                self._read_options,
+                False,
+                self._predicate_pushdown,
             )
-            df = py_read_table(ctx, table.name)
-
-            # Create new LazyFrame with optimized schema
-            new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
-            return new_lf.select(exprs)
-
-        elif attribute_cols:
-            # Extract attribute fields from nested structure (compatibility path)
-            import polars as pl
-
-            # Build selection with attribute field extraction
-            selection_exprs = []
-
-            # Add static columns as-is
-            for col in static_cols:
-                selection_exprs.append(pl.col(col))
-
-            # Add attribute field extractions
-            for attr_col in attribute_cols:
-                attr_expr = (
-                    pl.col("attributes")
-                    .list.eval(
-                        pl.when(pl.element().struct.field("tag") == attr_col).then(
-                            pl.element().struct.field("value")
-                        )
-                    )
-                    .list.drop_nulls()
-                    .list.first()
-                    .alias(attr_col)
-                )
-                selection_exprs.append(attr_expr)

[old lines 1059-1062 removed; content not shown in the source view]
+        # Otherwise delegate to Polars
+        return GffLazyFrameWrapper(
+            self._base_lf.select(exprs),
+            self._file_path,
+            self._read_options,
+            self._projection_pushdown,
+            self._predicate_pushdown,
+        )
+
+    def filter(self, *predicates):
+        if not predicates:
+            return self
+        pred = predicates[0]
+        for p in predicates[1:]:
+            pred = pred & p
+        return GffLazyFrameWrapper(
+            self._base_lf.filter(pred),
+            self._file_path,
+            self._read_options,
+            self._projection_pushdown,
+            self._predicate_pushdown,
+        )

     def __getattr__(self, name):
-        """Delegate all other operations to base LazyFrame."""
         return getattr(self._base_lf, name)
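The new `filter` override AND-combines multiple predicates before delegating, and `__getattr__` forwards everything else to the wrapped LazyFrame, so the wrapper stays transparent in a method chain. A behavior sketch (the file path is hypothetical):

```python
import polars as pl
import polars_bio as pb

lf = pb.scan_gff("annotation.gff", predicate_pushdown=True)  # hypothetical file
filtered = lf.filter(pl.col("type") == "gene", pl.col("start") > 1_000)
# equivalent to: lf.filter((pl.col("type") == "gene") & (pl.col("start") > 1_000))

# Methods not defined on the wrapper (e.g. limit, collect) fall through to
# the underlying LazyFrame via __getattr__.
df = filtered.limit(5).collect()
```

Wrapping the result of `select` and `filter` back in `GffLazyFrameWrapper` is what keeps the chain type stable, as the new class docstring notes: the pushdown decisions themselves happen inside the io_source callback, not here.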