polars-bio 0.13.1-cp39-abi3-win_amd64.whl → 0.14.0-cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_bio/__init__.py +1 -1
- polars_bio/io.py +338 -34
- polars_bio/polars_bio.pyd +0 -0
- polars_bio/range_op.py +36 -3
- polars_bio/range_op_helpers.py +10 -1
- polars_bio/range_op_io.py +43 -10
- polars_bio/sql.py +27 -12
- polars_bio/utils.py +85 -7
- {polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/METADATA +2 -1
- polars_bio-0.14.0.dist-info/RECORD +19 -0
- {polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/WHEEL +1 -1
- polars_bio-0.13.1.dist-info/RECORD +0 -19
- {polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/licenses/LICENSE +0 -0
polars_bio/__init__.py
CHANGED
polars_bio/io.py
CHANGED
@@ -17,6 +17,7 @@ from polars_bio.polars_bio import (
     VcfReadOptions,
     py_describe_vcf,
     py_from_polars,
+    py_read_sql,
     py_read_table,
     py_register_table,
     py_scan_table,
@@ -80,6 +81,7 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """

@@ -94,6 +96,7 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
+            projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.

         !!! Example
             ```shell
@@ -124,6 +127,7 @@ class IOOperations:
             max_retries,
             timeout,
             compression_type,
+            projection_pushdown,
         ).collect()

     @staticmethod
@@ -136,6 +140,7 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """

@@ -150,6 +155,7 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! Example
             ```shell
@@ -184,12 +190,11 @@ class IOOperations:
             object_storage_options=object_storage_options
         )
         read_options = ReadOptions(fasta_read_options=fasta_read_options)
-        return _read_file(path, InputFormat.Fasta, read_options)
+        return _read_file(path, InputFormat.Fasta, read_options, projection_pushdown)

     @staticmethod
     def read_vcf(
         path: str,
-        info_fields: Union[list[str], None] = None,
         thread_num: int = 1,
         chunk_size: int = 8,
         concurrent_fetches: int = 1,
@@ -198,13 +203,13 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """
         Read a VCF file into a DataFrame.

         Parameters:
             path: The path to the VCF file.
-            info_fields: The fields to read from the INFO column.
             thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
             chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
             concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -212,14 +217,14 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically..
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! note
             VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
         """
         return IOOperations.scan_vcf(
             path,
-            info_fields,
             thread_num,
             chunk_size,
             concurrent_fetches,
@@ -228,12 +233,12 @@ class IOOperations:
             max_retries,
             timeout,
             compression_type,
+            projection_pushdown,
         ).collect()

     @staticmethod
     def scan_vcf(
         path: str,
-        info_fields: Union[list[str], None] = None,
         thread_num: int = 1,
         chunk_size: int = 8,
         concurrent_fetches: int = 1,
@@ -242,13 +247,13 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a VCF file into a LazyFrame.

         Parameters:
             path: The path to the VCF file.
-            info_fields: The fields to read from the INFO column.
             thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
             chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
             concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -256,7 +261,8 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically..
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! note
             VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
@@ -271,18 +277,36 @@ class IOOperations:
             compression_type=compression_type,
         )

+        # Get all info fields from VCF header for proper projection pushdown
+        all_info_fields = None
+        try:
+            vcf_schema_df = IOOperations.describe_vcf(
+                path,
+                allow_anonymous=allow_anonymous,
+                enable_request_payer=enable_request_payer,
+                compression_type=compression_type,
+            )
+            # Use column name 'name' not 'id' based on the schema output
+            all_info_fields = vcf_schema_df.select("name").to_series().to_list()
+        except Exception:
+            # Fallback to None if unable to get info fields
+            all_info_fields = None
+
+        # Always start with all info fields to establish full schema
+        # The callback will re-register with only requested info fields for optimization
+        initial_info_fields = all_info_fields
+
         vcf_read_options = VcfReadOptions(
-            info_fields=
+            info_fields=initial_info_fields,
             thread_num=thread_num,
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(vcf_read_options=vcf_read_options)
-        return _read_file(path, InputFormat.Vcf, read_options)
+        return _read_file(path, InputFormat.Vcf, read_options, projection_pushdown)

     @staticmethod
     def read_gff(
         path: str,
-        attr_fields: Union[list[str], None] = None,
         thread_num: int = 1,
         chunk_size: int = 8,
         concurrent_fetches: int = 1,
@@ -291,13 +315,14 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
+        parallel: bool = False,
     ) -> pl.DataFrame:
         """
         Read a GFF file into a DataFrame.

         Parameters:
             path: The path to the GFF file.
-            attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
             thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
             chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
             concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -305,14 +330,15 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically..
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+            parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).

         !!! note
             GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
         """
         return IOOperations.scan_gff(
             path,
-            attr_fields,
             thread_num,
             chunk_size,
             concurrent_fetches,
@@ -321,12 +347,13 @@ class IOOperations:
             max_retries,
             timeout,
             compression_type,
+            projection_pushdown,
+            parallel,
         ).collect()

     @staticmethod
     def scan_gff(
         path: str,
-        attr_fields: Union[list[str], None] = None,
         thread_num: int = 1,
         chunk_size: int = 8,
         concurrent_fetches: int = 1,
@@ -335,21 +362,24 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
+        parallel: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a GFF file into a LazyFrame.

         Parameters:
             path: The path to the GFF file.
-            attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
             thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
             chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
-            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
             allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+            parallel: Whether to use the parallel reader for BGZF-compressed local files (use BGZF chunk-level parallelism similar to FASTQ).

         !!! note
             GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
@@ -365,12 +395,13 @@ class IOOperations:
         )

         gff_read_options = GffReadOptions(
-            attr_fields=
+            attr_fields=None,
             thread_num=thread_num,
             object_storage_options=object_storage_options,
+            parallel=parallel,
         )
         read_options = ReadOptions(gff_read_options=gff_read_options)
-        return _read_file(path, InputFormat.Gff, read_options)
+        return _read_file(path, InputFormat.Gff, read_options, projection_pushdown)

     @staticmethod
     def read_bam(
@@ -382,6 +413,7 @@ class IOOperations:
         enable_request_payer: bool = False,
         max_retries: int = 5,
         timeout: int = 300,
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """
         Read a BAM file into a DataFrame.
@@ -389,12 +421,13 @@ class IOOperations:
         Parameters:
             path: The path to the BAM file.
             thread_num: The number of threads to use for reading the BAM file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
-            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large
-            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large
+            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large-scale operations, it is recommended to increase this value to 64.
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
             allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! note
             BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -408,6 +441,7 @@ class IOOperations:
             enable_request_payer,
             max_retries,
             timeout,
+            projection_pushdown,
         ).collect()

     @staticmethod
@@ -420,6 +454,7 @@ class IOOperations:
         enable_request_payer: bool = False,
         max_retries: int = 5,
         timeout: int = 300,
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a BAM file into a LazyFrame.
@@ -433,6 +468,7 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! note
             BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -452,7 +488,7 @@ class IOOperations:
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(bam_read_options=bam_read_options)
-        return _read_file(path, InputFormat.Bam, read_options)
+        return _read_file(path, InputFormat.Bam, read_options, projection_pushdown)

     @staticmethod
     def read_fastq(
@@ -465,6 +501,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         parallel: bool = False,
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """
         Read a FASTQ file into a DataFrame.
@@ -479,6 +516,7 @@ class IOOperations:
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
             parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
         """
         return IOOperations.scan_fastq(
             path,
@@ -490,6 +528,7 @@ class IOOperations:
             timeout,
             compression_type,
             parallel,
+            projection_pushdown,
         ).collect()

     @staticmethod
@@ -503,6 +542,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         parallel: bool = False,
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a FASTQ file into a LazyFrame.
@@ -517,6 +557,7 @@ class IOOperations:
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
             parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
         """
         object_storage_options = PyObjectStorageOptions(
             allow_anonymous=allow_anonymous,
@@ -532,7 +573,7 @@ class IOOperations:
             object_storage_options=object_storage_options, parallel=parallel
         )
         read_options = ReadOptions(fastq_read_options=fastq_read_options)
-        return _read_file(path, InputFormat.Fastq, read_options)
+        return _read_file(path, InputFormat.Fastq, read_options, projection_pushdown)

     @staticmethod
     def read_bed(
@@ -545,6 +586,7 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """
         Read a BED file into a DataFrame.
@@ -559,6 +601,7 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compressions is supported ('bgz').
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! Note
             Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
@@ -577,6 +620,7 @@ class IOOperations:
             max_retries,
             timeout,
             compression_type,
+            projection_pushdown,
         ).collect()

     @staticmethod
@@ -590,6 +634,7 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a BED file into a LazyFrame.
@@ -604,6 +649,7 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compressions is supported ('bgz').
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! Note
             Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
@@ -627,7 +673,7 @@ class IOOperations:
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(bed_read_options=bed_read_options)
-        return _read_file(path, InputFormat.Bed, read_options)
+        return _read_file(path, InputFormat.Bed, read_options, projection_pushdown)

     @staticmethod
     def read_table(path: str, schema: Dict = None, **kwargs) -> pl.DataFrame:
@@ -678,7 +724,7 @@ class IOOperations:
             path: The path to the VCF file.
             allow_anonymous: Whether to allow anonymous access to object storage (GCS and S3 supported).
             enable_request_payer: Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
-            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically..
         """
         object_storage_options = PyObjectStorageOptions(
             allow_anonymous=allow_anonymous,
@@ -714,9 +760,15 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
     return [x.strip() for x in t]


-def _lazy_scan(
+def _lazy_scan(
+    df: Union[pl.DataFrame, pl.LazyFrame],
+    projection_pushdown: bool = False,
+    table_name: str = None,
+    input_format: InputFormat = None,
+    file_path: str = None,
+) -> pl.LazyFrame:
     df_lazy: DataFrame = df
-
+    original_schema = df_lazy.schema()

     def _overlap_source(
         with_columns: Union[pl.Expr, None],
@@ -724,35 +776,287 @@ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
         n_rows: Union[int, None],
         _batch_size: Union[int, None],
     ) -> Iterator[pl.DataFrame]:
+        # Extract column names from with_columns if projection pushdown is enabled
+        projected_columns = None
+        if projection_pushdown and with_columns is not None:
+            projected_columns = _extract_column_names_from_expr(with_columns)
+
+        # Projection pushdown is handled natively by table providers
+        query_df = df_lazy
+
+        # Apply column projection to DataFusion query if enabled
+        datafusion_projection_applied = False
+
+        if projection_pushdown and projected_columns:
+            try:
+                # Apply projection at the DataFusion level using SQL
+                # This approach works reliably with the DataFusion Python API
+                columns_sql = ", ".join([f'"{c}"' for c in projected_columns])
+
+                # Use the table name passed from _read_file, fallback if not available
+                table_to_query = table_name if table_name else "temp_table"
+
+                # Use py_read_sql to execute SQL projection (same as pb.sql() does)
+                from .context import ctx
+
+                query_df = py_read_sql(
+                    ctx, f"SELECT {columns_sql} FROM {table_to_query}"
+                )
+                datafusion_projection_applied = True
+            except Exception as e:
+                # Fallback to original behavior if projection fails
+                print(f"DataFusion projection failed: {e}")
+                query_df = df_lazy
+                projected_columns = None
+                datafusion_projection_applied = False
+
         if n_rows and n_rows < 8192:  # 8192 is the default batch size in datafusion
-            df =
+            df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
             df = pl.DataFrame(df).limit(n_rows)
             if predicate is not None:
                 df = df.filter(predicate)
-            if
+            # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
+            if with_columns is not None and (
+                not projection_pushdown or not datafusion_projection_applied
+            ):
                 df = df.select(with_columns)
             yield df
             return
-
+
+        df_stream = query_df.execute_stream()
         progress_bar = tqdm(unit="rows")
         for r in df_stream:
             py_df = r.to_pyarrow()
             df = pl.DataFrame(py_df)
             if predicate is not None:
                 df = df.filter(predicate)
-            if
+            # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
+            if with_columns is not None and (
+                not projection_pushdown or not datafusion_projection_applied
+            ):
                 df = df.select(with_columns)
             progress_bar.update(len(df))
             yield df

-    return register_io_source(_overlap_source, schema=
+    return register_io_source(_overlap_source, schema=original_schema)
+
+
+def _extract_column_names_from_expr(with_columns: Union[pl.Expr, list]) -> list[str]:
+    """Extract column names from Polars expressions."""
+    if with_columns is None:
+        return []
+
+    # Handle different types of with_columns input
+    if hasattr(with_columns, "__iter__") and not isinstance(with_columns, str):
+        # It's a list of expressions or strings
+        column_names = []
+        for item in with_columns:
+            if isinstance(item, str):
+                column_names.append(item)
+            elif hasattr(item, "meta") and hasattr(item.meta, "output_name"):
+                # Polars expression with output name
+                try:
+                    column_names.append(item.meta.output_name())
+                except Exception:
+                    pass
+        return column_names
+    elif isinstance(with_columns, str):
+        return [with_columns]
+    elif hasattr(with_columns, "meta") and hasattr(with_columns.meta, "output_name"):
+        # Single Polars expression
+        try:
+            return [with_columns.meta.output_name()]
+        except Exception:
+            pass
+
+    return []


 def _read_file(
     path: str,
     input_format: InputFormat,
     read_options: ReadOptions,
+    projection_pushdown: bool = False,
 ) -> pl.LazyFrame:
     table = py_register_table(ctx, path, None, input_format, read_options)
     df = py_read_table(ctx, table.name)
-
+
+    lf = _lazy_scan(df, projection_pushdown, table.name, input_format, path)
+
+    # Wrap GFF LazyFrames with projection-aware wrapper for consistent attribute field handling
+    if input_format == InputFormat.Gff:
+        return GffLazyFrameWrapper(lf, path, read_options, projection_pushdown)
+
+    return lf
+
+
+class GffLazyFrameWrapper:
+    """Wrapper for GFF LazyFrames that handles attribute field detection in select operations."""
+
+    def __init__(
+        self,
+        base_lf: pl.LazyFrame,
+        file_path: str,
+        read_options: ReadOptions,
+        projection_pushdown: bool = True,
+    ):
+        self._base_lf = base_lf
+        self._file_path = file_path
+        self._read_options = read_options
+        self._projection_pushdown = projection_pushdown
+
+    def select(self, exprs):
+        """Override select to handle GFF attribute field detection.
+
+        Ensures queries requesting the raw `attributes` column use a registration
+        that exposes it, while preserving projection pushdown. For unnested
+        attribute fields (e.g., `gene_id`), re-registers with those fields to
+        enable efficient projection.
+        """
+        # Extract column names from expressions
+        if isinstance(exprs, (list, tuple)):
+            columns = []
+            for expr in exprs:
+                if isinstance(expr, str):
+                    columns.append(expr)
+                elif hasattr(expr, "meta") and hasattr(expr.meta, "output_name"):
+                    try:
+                        columns.append(expr.meta.output_name())
+                    except:
+                        pass
+        else:
+            # Single expression
+            if isinstance(exprs, str):
+                columns = [exprs]
+            elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
+                try:
+                    columns = [exprs.meta.output_name()]
+                except:
+                    columns = []
+            else:
+                columns = []
+
+        # Categorize columns
+        GFF_STATIC_COLUMNS = {
+            "chrom",
+            "start",
+            "end",
+            "type",
+            "source",
+            "score",
+            "strand",
+            "phase",
+            "attributes",
+        }
+        static_cols = [col for col in columns if col in GFF_STATIC_COLUMNS]
+        attribute_cols = [col for col in columns if col not in GFF_STATIC_COLUMNS]
+
+        # If 'attributes' is requested, ensure the registered table exposes it.
+        # Some parallel GFF providers omit the raw 'attributes' column; switch
+        # to a registration that includes it while keeping projection pushdown.
+        if "attributes" in static_cols:
+            from .context import ctx
+
+            # Preserve original parallelism and thread config when re-registering
+            orig_gff_opts = getattr(self._read_options, "gff_read_options", None)
+            orig_parallel = (
+                getattr(orig_gff_opts, "parallel", False) if orig_gff_opts else False
+            )
+            orig_thread = (
+                getattr(orig_gff_opts, "thread_num", None) if orig_gff_opts else None
+            )
+
+            # Build read options that ensure raw attributes are present
+            gff_options = GffReadOptions(
+                attr_fields=None,  # keep nested 'attributes' column
+                thread_num=orig_thread if orig_thread is not None else 1,
+                object_storage_options=PyObjectStorageOptions(
+                    allow_anonymous=True,
+                    enable_request_payer=False,
+                    chunk_size=8,
+                    concurrent_fetches=1,
+                    max_retries=5,
+                    timeout=300,
+                    compression_type="auto",
+                ),
+                parallel=orig_parallel,
+            )
+            read_options = ReadOptions(gff_read_options=gff_options)
+            table = py_register_table(
+                ctx, self._file_path, None, InputFormat.Gff, read_options
+            )
+            df = py_read_table(ctx, table.name)
+            new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
+            return new_lf.select(exprs)
+
+        if self._projection_pushdown:
+            # Optimized path: when selecting specific unnested attribute fields, re-register
+            # GFF table with those fields so DataFusion can project them efficiently.
+
+            # Use optimized table re-registration (fast path)
+            from .context import ctx
+
+            gff_options = GffReadOptions(
+                attr_fields=attribute_cols if attribute_cols else None,
+                thread_num=1,
+                object_storage_options=PyObjectStorageOptions(
+                    allow_anonymous=True,
+                    enable_request_payer=False,
+                    chunk_size=8,
+                    concurrent_fetches=1,
+                    max_retries=5,
+                    timeout=300,
+                    compression_type="auto",
+                ),
+                # Keep parallel reading consistent with base options when possible
+                parallel=getattr(
+                    getattr(self._read_options, "gff_read_options", None),
+                    "parallel",
+                    False,
+                ),
+            )
+
+            read_options = ReadOptions(gff_read_options=gff_options)
+            table = py_register_table(
+                ctx, self._file_path, None, InputFormat.Gff, read_options
+            )
+            df = py_read_table(ctx, table.name)
+
+            # Create new LazyFrame with optimized schema
+            new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
+            return new_lf.select(exprs)
+
+        elif attribute_cols:
+            # Extract attribute fields from nested structure (compatibility path)
+            import polars as pl
+
+            # Build selection with attribute field extraction
+            selection_exprs = []
+
+            # Add static columns as-is
+            for col in static_cols:
+                selection_exprs.append(pl.col(col))
+
+            # Add attribute field extractions
+            for attr_col in attribute_cols:
+                attr_expr = (
+                    pl.col("attributes")
+                    .list.eval(
+                        pl.when(pl.element().struct.field("tag") == attr_col).then(
+                            pl.element().struct.field("value")
+                        )
+                    )
+                    .list.drop_nulls()
+                    .list.first()
+                    .alias(attr_col)
+                )
+                selection_exprs.append(attr_expr)
+
+            return self._base_lf.select(selection_exprs)
+        else:
+            # Static columns only, use base LazyFrame
+            return self._base_lf.select(exprs)
+
+    def __getattr__(self, name):
+        """Delegate all other operations to base LazyFrame."""
+        return getattr(self._base_lf, name)
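The reader-facing effect of the io.py changes is easiest to see from a short usage sketch. This is a hedged illustration only: it assumes the `IOOperations` methods stay re-exported as top-level `pb.*` functions as in earlier releases, and the file paths and the `gene_id` attribute are placeholders.

```python
import polars_bio as pb

# Placeholder paths; any VCF/GFF readable by polars-bio would do.
vcf_path = "example.vcf.gz"
gff_path = "gencode.v38.annotation.gff3.bgz"

# With projection_pushdown=True the selected columns are requested at the
# DataFusion level instead of being dropped in Python after materialization.
variants = (
    pb.scan_vcf(vcf_path, projection_pushdown=True)
    .select(["chrom", "start", "end"])
    .collect()
)

# scan_gff now returns a GffLazyFrameWrapper; selecting an unnested attribute
# field (here the placeholder gene_id) re-registers the table with that field.
genes = (
    pb.scan_gff(gff_path, projection_pushdown=True, parallel=True)
    .select(["chrom", "start", "end", "gene_id"])
    .collect()
)
```

With `projection_pushdown=False` (the default) the same queries still run; unused columns are simply dropped in Python after full batches have been read.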
polars_bio/polars_bio.pyd
CHANGED
Binary file
polars_bio/range_op.py
CHANGED
@@ -48,6 +48,7 @@ class IntervalOperations:
         output_type: str = "polars.LazyFrame",
         read_options1: Union[ReadOptions, None] = None,
         read_options2: Union[ReadOptions, None] = None,
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Find pairs of overlapping genomic intervals.
@@ -67,6 +68,7 @@ class IntervalOperations:
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             read_options1: Additional options for reading the input files.
             read_options2: Additional options for reading the input files.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         Returns:
             **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
@@ -123,7 +125,14 @@ class IntervalOperations:
         )

         return range_operation(
-            df1,
+            df1,
+            df2,
+            range_options,
+            output_type,
+            ctx,
+            read_options1,
+            read_options2,
+            projection_pushdown,
         )

     @staticmethod
@@ -137,6 +146,7 @@ class IntervalOperations:
         cols2: Union[list[str], None] = ["chrom", "start", "end"],
         output_type: str = "polars.LazyFrame",
         read_options: Union[ReadOptions, None] = None,
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Find pairs of closest genomic intervals.
@@ -154,6 +164,7 @@ class IntervalOperations:
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             read_options: Additional options for reading the input files.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.


         Returns:
@@ -182,7 +193,15 @@ class IntervalOperations:
             columns_1=cols1,
             columns_2=cols2,
         )
-        return range_operation(
+        return range_operation(
+            df1,
+            df2,
+            range_options,
+            output_type,
+            ctx,
+            read_options,
+            projection_pushdown=projection_pushdown,
+        )

     @staticmethod
     def coverage(
@@ -195,6 +214,7 @@ class IntervalOperations:
         cols2: Union[list[str], None] = ["chrom", "start", "end"],
         output_type: str = "polars.LazyFrame",
         read_options: Union[ReadOptions, None] = None,
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Calculate intervals coverage.
@@ -212,6 +232,7 @@ class IntervalOperations:
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             read_options: Additional options for reading the input files.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.


         Returns:
@@ -245,7 +266,15 @@ class IntervalOperations:
             columns_1=cols1,
             columns_2=cols2,
         )
-        return range_operation(
+        return range_operation(
+            df2,
+            df1,
+            range_options,
+            output_type,
+            ctx,
+            read_options,
+            projection_pushdown=projection_pushdown,
+        )

     @staticmethod
     def count_overlaps(
@@ -258,6 +287,7 @@ class IntervalOperations:
         on_cols: Union[list[str], None] = None,
         output_type: str = "polars.LazyFrame",
         naive_query: bool = True,
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Count pairs of overlapping genomic intervals.
@@ -275,6 +305,7 @@ class IntervalOperations:
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             naive_query: If True, use naive query for counting overlaps based on overlaps.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
         Returns:
             **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

@@ -421,6 +452,7 @@ class IntervalOperations:
         cols: Union[list[str], None] = ["chrom", "start", "end"],
         on_cols: Union[list[str], None] = None,
         output_type: str = "polars.LazyFrame",
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Merge overlapping intervals. It is assumed that start < end.
@@ -433,6 +465,7 @@ class IntervalOperations:
                 genomic intervals, provided separately for each set.
             on_cols: List of additional column names for clustering. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         Returns:
             **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
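The same flag is threaded through the interval operations. A minimal sketch, assuming `overlap` is exposed as `pb.overlap` and that the output keeps default `_1`/`_2` column suffixes (both assumptions, not shown in this diff); the input paths are placeholders:

```python
import polars_bio as pb

# Placeholder inputs; paths or in-memory frames are both accepted by range_operation.
reads = "reads.parquet"
targets = "targets.parquet"

overlaps = pb.overlap(
    reads,
    targets,
    output_type="polars.LazyFrame",
    projection_pushdown=True,
)

# Only the columns selected here should need to be materialized by DataFusion.
result = overlaps.select(["chrom_1", "start_1", "end_1", "chrom_2"]).collect()
```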
polars_bio/range_op_helpers.py
CHANGED
@@ -31,6 +31,7 @@ def range_operation(
    ctx: BioSessionContext,
    read_options1: Union[ReadOptions, None] = None,
    read_options2: Union[ReadOptions, None] = None,
+    projection_pushdown: bool = False,
 ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame"]:
    ctx.sync_options()
    if isinstance(df1, str) and isinstance(df2, str):
@@ -67,6 +68,7 @@ def range_operation(
                ctx=ctx,
                read_options1=read_options1,
                read_options2=read_options2,
+                projection_pushdown=projection_pushdown,
            )
        elif output_type == "polars.DataFrame":
            return range_operation_scan(
@@ -100,7 +102,14 @@ def range_operation(
                **_rename_columns(df2, range_options.suffixes[1]).schema,
            }
        )
-        return range_lazy_scan(
+        return range_lazy_scan(
+            df1,
+            df2,
+            merged_schema,
+            range_options,
+            ctx,
+            projection_pushdown=projection_pushdown,
+        )
    else:
        df1 = _df_to_reader(df1, range_options.columns_1[0])
        df2 = _df_to_reader(df2, range_options.columns_2[0])
polars_bio/range_op_io.py
CHANGED
@@ -35,6 +35,7 @@ def range_lazy_scan(
    ctx: BioSessionContext,
    read_options1: Union[ReadOptions, None] = None,
    read_options2: Union[ReadOptions, None] = None,
+    projection_pushdown: bool = False,
 ) -> pl.LazyFrame:
    range_function = None
    if isinstance(df_1, str) and isinstance(df_2, str):
@@ -50,27 +51,59 @@ def range_lazy_scan(
        _n_rows: Union[int, None],
        _batch_size: Union[int, None],
    ) -> Iterator[pl.DataFrame]:
+        # Extract projected columns if projection pushdown is enabled
+        projected_columns = None
+        if projection_pushdown and with_columns is not None:
+            from .io import _extract_column_names_from_expr
+
+            projected_columns = _extract_column_names_from_expr(with_columns)
+
+        # Apply projection pushdown to range options if enabled
+        modified_range_options = range_options
+        if projection_pushdown and projected_columns:
+            # Create a copy of range options with projection information
+            # This is where we would modify the SQL generation in a full implementation
+            modified_range_options = range_options
+
        df_lazy: datafusion.DataFrame = (
            range_function(
-                ctx,
+                ctx,
+                df_1,
+                df_2,
+                modified_range_options,
+                read_options1,
+                read_options2,
+                _n_rows,
            )
            if isinstance(df_1, str) and isinstance(df_2, str)
-            else range_function(ctx, df_1, df_2,
+            else range_function(ctx, df_1, df_2, modified_range_options, _n_rows)
        )
+
+        # Apply DataFusion-level projection if enabled
+        datafusion_projection_applied = False
+        if projection_pushdown and projected_columns:
+            try:
+                # Try to select only the requested columns at the DataFusion level
+                df_lazy = df_lazy.select(projected_columns)
+                datafusion_projection_applied = True
+            except Exception:
+                # Fallback to Python-level selection if DataFusion selection fails
+                datafusion_projection_applied = False
+
        df_lazy.schema()
        df_stream = df_lazy.execute_stream()
        progress_bar = tqdm(unit="rows")
        for r in df_stream:
            py_df = r.to_pyarrow()
            df = pl.DataFrame(py_df)
-            #
-
-
-            #
-
-
-
-
+            # Handle predicate and column projection
+            if predicate is not None:
+                df = df.filter(predicate)
+            # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
+            if with_columns is not None and (
+                not projection_pushdown or not datafusion_projection_applied
+            ):
+                df = df.select(with_columns)
            progress_bar.update(len(df))
            yield df

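Both `range_lazy_scan` and `_lazy_scan` rely on the same trick to turn the `with_columns` argument Polars hands to an IO source into plain column names: strings pass through, expressions are asked for their name via the `meta` namespace. A standalone sketch of that idea (the helper name below is illustrative, not part of the package API):

```python
import polars as pl

def output_names(exprs) -> list[str]:
    # Strings pass through; expressions report their name via Expr.meta.output_name().
    names = []
    for e in exprs if isinstance(exprs, (list, tuple)) else [exprs]:
        if isinstance(e, str):
            names.append(e)
        elif hasattr(e, "meta"):
            try:
                names.append(e.meta.output_name())
            except Exception:
                pass
    return names

print(output_names([pl.col("chrom"), pl.col("start"), "end"]))
# ['chrom', 'start', 'end']
```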
polars_bio/sql.py
CHANGED
@@ -31,7 +31,6 @@ class SQL:
|
|
31
31
|
def register_vcf(
|
32
32
|
path: str,
|
33
33
|
name: Union[str, None] = None,
|
34
|
-
info_fields: Union[list[str], None] = None,
|
35
34
|
thread_num: int = 1,
|
36
35
|
chunk_size: int = 64,
|
37
36
|
concurrent_fetches: int = 8,
|
@@ -47,13 +46,12 @@ class SQL:
|
|
47
46
|
Parameters:
|
48
47
|
path: The path to the VCF file.
|
49
48
|
name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
|
50
|
-
info_fields: The fields to read from the INFO column.
|
51
49
|
thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
|
52
50
|
chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
|
53
51
|
concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
|
54
52
|
allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
|
55
53
|
enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
|
56
|
-
compression_type: The compression type of the VCF file. If not specified, it will be detected automatically
|
54
|
+
compression_type: The compression type of the VCF file. If not specified, it will be detected automatically..
|
57
55
|
max_retries: The maximum number of retries for reading the file from object storage.
|
58
56
|
timeout: The timeout in seconds for reading the file from object storage.
|
59
57
|
!!! note
|
@@ -81,8 +79,24 @@ class SQL:
|
|
81
79
|
compression_type=compression_type,
|
82
80
|
)
|
83
81
|
|
82
|
+
# Get all info fields from VCF header for automatic field detection
|
83
|
+
all_info_fields = None
|
84
|
+
try:
|
85
|
+
from .io import IOOperations
|
86
|
+
|
87
|
+
vcf_schema_df = IOOperations.describe_vcf(
|
88
|
+
path,
|
89
|
+
allow_anonymous=allow_anonymous,
|
90
|
+
enable_request_payer=enable_request_payer,
|
91
|
+
compression_type=compression_type,
|
92
|
+
)
|
93
|
+
all_info_fields = vcf_schema_df.select("name").to_series().to_list()
|
94
|
+
except Exception:
|
95
|
+
# Fallback to empty list if unable to get info fields
|
96
|
+
all_info_fields = []
|
97
|
+
|
84
98
|
vcf_read_options = VcfReadOptions(
|
85
|
-
info_fields=
|
99
|
+
info_fields=all_info_fields,
|
86
100
|
thread_num=thread_num,
|
87
101
|
object_storage_options=object_storage_options,
|
88
102
|
)
|
@@ -93,7 +107,6 @@ class SQL:
|
|
93
107
|
def register_gff(
|
94
108
|
path: str,
|
95
109
|
name: Union[str, None] = None,
|
96
|
-
attr_fields: Union[list[str], None] = None,
|
97
110
|
thread_num: int = 1,
|
98
111
|
chunk_size: int = 64,
|
99
112
|
concurrent_fetches: int = 8,
|
@@ -102,6 +115,7 @@ class SQL:
|
|
102
115
|
timeout: int = 300,
|
103
116
|
enable_request_payer: bool = False,
|
104
117
|
compression_type: str = "auto",
|
118
|
+
parallel: bool = False,
|
105
119
|
) -> None:
|
106
120
|
"""
|
107
121
|
Register a GFF file as a Datafusion table.
|
@@ -109,7 +123,6 @@ class SQL:
|
|
109
123
|
Parameters:
|
110
124
|
path: The path to the GFF file.
|
111
125
|
name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
|
112
|
-
attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
|
113
126
|
thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
|
114
127
|
chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
|
115
128
|
concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
|
@@ -118,6 +131,7 @@ class SQL:
|
|
118
131
|
compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
|
119
132
|
max_retries: The maximum number of retries for reading the file from object storage.
|
120
133
|
timeout: The timeout in seconds for reading the file from object storage.
|
134
|
+
parallel: Whether to use the parallel reader for BGZF-compressed local files. Default is False.
|
121
135
|
!!! note
|
122
136
|
GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
|
123
137
|
|
@@ -127,8 +141,8 @@ class SQL:
|
|
127
141
|
```
|
128
142
|
```python
|
129
143
|
import polars_bio as pb
|
130
|
-
pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz"
|
131
|
-
pb.sql("SELECT
|
144
|
+
pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz")
|
145
|
+
pb.sql("SELECT attributes, count(*) AS cnt FROM gencode_v38_annotation3_bgz GROUP BY attributes").limit(5).collect()
|
132
146
|
```
|
133
147
|
```shell
|
134
148
|
|
@@ -161,9 +175,10 @@ class SQL:
|
|
161
175
|
)
|
162
176
|
|
163
177
|
gff_read_options = GffReadOptions(
|
164
|
-
attr_fields=
|
178
|
+
attr_fields=None,
|
165
179
|
thread_num=thread_num,
|
166
180
|
object_storage_options=object_storage_options,
|
181
|
+
parallel=parallel,
|
167
182
|
)
|
168
183
|
read_options = ReadOptions(gff_read_options=gff_read_options)
|
169
184
|
py_register_table(ctx, path, name, InputFormat.Gff, read_options)
|
@@ -179,7 +194,7 @@ class SQL:
|
|
179
194
|
timeout: int = 300,
|
180
195
|
enable_request_payer: bool = False,
|
181
196
|
compression_type: str = "auto",
|
182
|
-
parallel: bool =
|
197
|
+
parallel: bool = False,
|
183
198
|
) -> None:
|
184
199
|
"""
|
185
200
|
Register a FASTQ file as a Datafusion table.
|
@@ -194,7 +209,7 @@ class SQL:
|
|
194
209
|
compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
|
195
210
|
max_retries: The maximum number of retries for reading the file from object storage.
|
196
211
|
timeout: The timeout in seconds for reading the file from object storage.
|
197
|
-
parallel: Whether to use the parallel reader for BGZF compressed files.
|
212
|
+
parallel: Whether to use the parallel reader for BGZF compressed files. Default is False. If a file ends with ".gz" but is actually BGZF, it will attempt the parallel path and fall back to standard if not BGZF.
|
198
213
|
|
199
214
|
!!! Example
|
200
215
|
```python
|
@@ -265,7 +280,7 @@ class SQL:
|
|
265
280
|
concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
|
266
281
|
allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
|
267
282
|
enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
|
268
|
-
compression_type: The compression type of the BED file. If not specified, it will be detected automatically
|
283
|
+
compression_type: The compression type of the BED file. If not specified, it will be detected automatically..
|
269
284
|
max_retries: The maximum number of retries for reading the file from object storage.
|
270
285
|
timeout: The timeout in seconds for reading the file from object storage.
|
271
286
|
|
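The BED registration documented above follows the same pattern. A minimal sketch, assuming the method is named `pb.register_bed` (the name is outside this hunk) and using an illustrative local path; with `compression_type` left unspecified, compression is detected automatically:

```python
import polars_bio as pb

# Hypothetical method name, path, and table name; compression is auto-detected.
pb.register_bed("/tmp/regions.bed.gz", "regions")
pb.sql("SELECT count(*) AS n FROM regions").collect()
```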
polars_bio/utils.py
CHANGED
@@ -12,9 +12,11 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
|
|
12
12
|
return [x.strip() for x in t]
|
13
13
|
|
14
14
|
|
15
|
-
def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
|
15
|
+
def _lazy_scan(
|
16
|
+
df: Union[pl.DataFrame, pl.LazyFrame], projection_pushdown: bool = False
|
17
|
+
) -> pl.LazyFrame:
|
16
18
|
df_lazy: DataFrame = df
|
17
|
-
|
19
|
+
original_schema = df_lazy.schema()
|
18
20
|
|
19
21
|
def _overlap_source(
|
20
22
|
with_columns: Union[pl.Expr, None],
|
@@ -22,25 +24,101 @@ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
|
|
22
24
|
n_rows: Union[int, None],
|
23
25
|
_batch_size: Union[int, None],
|
24
26
|
) -> Iterator[pl.DataFrame]:
|
27
|
+
# Extract column names from with_columns if projection pushdown is enabled
|
28
|
+
projected_columns = None
|
29
|
+
if projection_pushdown and with_columns is not None:
|
30
|
+
projected_columns = _extract_column_names_from_expr(with_columns)
|
31
|
+
|
32
|
+
# Apply column projection to DataFusion query if enabled
|
33
|
+
query_df = df_lazy
|
34
|
+
datafusion_projection_applied = False
|
35
|
+
if projection_pushdown and projected_columns:
|
36
|
+
try:
|
37
|
+
query_df = df_lazy.select(projected_columns)
|
38
|
+
datafusion_projection_applied = True
|
39
|
+
|
40
|
+
# For testing: allow inspection of the execution plan
|
41
|
+
if hasattr(df_lazy, "_test_projection_capture"):
|
42
|
+
df_lazy._test_projection_capture = {
|
43
|
+
"original_plan": str(df_lazy.optimized_logical_plan()),
|
44
|
+
"projected_plan": str(query_df.optimized_logical_plan()),
|
45
|
+
"projected_columns": projected_columns,
|
46
|
+
"datafusion_projection_applied": True,
|
47
|
+
}
|
48
|
+
|
49
|
+
except Exception as e:
|
50
|
+
# Fallback to original behavior if projection fails
|
51
|
+
query_df = df_lazy
|
52
|
+
projected_columns = None
|
53
|
+
datafusion_projection_applied = False
|
54
|
+
|
55
|
+
# For testing: capture the failure
|
56
|
+
if hasattr(df_lazy, "_test_projection_capture"):
|
57
|
+
df_lazy._test_projection_capture = {
|
58
|
+
"original_plan": str(df_lazy.optimized_logical_plan()),
|
59
|
+
"projected_plan": None,
|
60
|
+
"projected_columns": projected_columns,
|
61
|
+
"datafusion_projection_applied": False,
|
62
|
+
"error": str(e),
|
63
|
+
}
|
64
|
+
|
25
65
|
if n_rows and n_rows < 8192: # 8192 is the default batch size in datafusion
|
26
|
-
df =
|
66
|
+
df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
|
27
67
|
df = pl.DataFrame(df).limit(n_rows)
|
28
68
|
if predicate is not None:
|
29
69
|
df = df.filter(predicate)
|
30
|
-
if
|
70
|
+
# Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
|
71
|
+
if with_columns is not None and (
|
72
|
+
not projection_pushdown or not datafusion_projection_applied
|
73
|
+
):
|
31
74
|
df = df.select(with_columns)
|
32
75
|
yield df
|
33
76
|
return
|
34
|
-
|
77
|
+
|
78
|
+
df_stream = query_df.execute_stream()
|
35
79
|
progress_bar = tqdm(unit="rows")
|
36
80
|
for r in df_stream:
|
37
81
|
py_df = r.to_pyarrow()
|
38
82
|
df = pl.DataFrame(py_df)
|
39
83
|
if predicate is not None:
|
40
84
|
df = df.filter(predicate)
|
41
|
-
if
|
85
|
+
# Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
|
86
|
+
if with_columns is not None and (
|
87
|
+
not projection_pushdown or not datafusion_projection_applied
|
88
|
+
):
|
42
89
|
df = df.select(with_columns)
|
43
90
|
progress_bar.update(len(df))
|
44
91
|
yield df
|
45
92
|
|
46
|
-
return register_io_source(_overlap_source, schema=
|
93
|
+
return register_io_source(_overlap_source, schema=original_schema)
|
94
|
+
|
95
|
+
|
96
|
+
def _extract_column_names_from_expr(with_columns: Union[pl.Expr, list]) -> list[str]:
|
97
|
+
"""Extract column names from Polars expressions."""
|
98
|
+
if with_columns is None:
|
99
|
+
return []
|
100
|
+
|
101
|
+
# Handle different types of with_columns input
|
102
|
+
if hasattr(with_columns, "__iter__") and not isinstance(with_columns, str):
|
103
|
+
# It's a list of expressions or strings
|
104
|
+
column_names = []
|
105
|
+
for item in with_columns:
|
106
|
+
if isinstance(item, str):
|
107
|
+
column_names.append(item)
|
108
|
+
elif hasattr(item, "meta") and hasattr(item.meta, "output_name"):
|
109
|
+
# Polars expression with output name
|
110
|
+
try:
|
111
|
+
column_names.append(item.meta.output_name())
|
112
|
+
except Exception:
|
113
|
+
pass
|
114
|
+
return column_names
|
115
|
+
elif isinstance(with_columns, str):
|
116
|
+
return [with_columns]
|
117
|
+
elif hasattr(with_columns, "meta") and hasattr(with_columns.meta, "output_name"):
|
118
|
+
# Single Polars expression
|
119
|
+
try:
|
120
|
+
return [with_columns.meta.output_name()]
|
121
|
+
except Exception:
|
122
|
+
pass
|
123
|
+
|
124
|
+
return []
|
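The two additions above implement the projection pushdown introduced in this release: when `projection_pushdown=True`, `_lazy_scan` converts the `with_columns` projection requested by Polars into plain column names and applies them with `select(...)` on the DataFusion DataFrame, falling back to the original Python-level `df.select(with_columns)` whenever name extraction or the DataFusion projection fails. A small sanity-check sketch of the name-extraction helper (a private function, imported here only for illustration):

```python
import polars as pl

from polars_bio.utils import _extract_column_names_from_expr

# Plain column names (what Polars typically hands to an IO source) pass through.
assert _extract_column_names_from_expr(["chrom", "start", "end"]) == ["chrom", "start", "end"]

# A single string and expressions with a resolvable output name are handled too.
assert _extract_column_names_from_expr("attributes") == ["attributes"]
assert _extract_column_names_from_expr([pl.col("chrom"), pl.col("start")]) == ["chrom", "start"]

# Inputs without a recoverable name yield an empty list, in which case _lazy_scan
# keeps the projection on the Polars side instead of pushing it into DataFusion.
assert _extract_column_names_from_expr(None) == []
```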
{polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: polars-bio
|
3
|
-
Version: 0.13.1
|
3
|
+
Version: 0.14.0
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
6
6
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
@@ -9,6 +9,7 @@ Requires-Dist: pyarrow~=21.0.0
|
|
9
9
|
Requires-Dist: datafusion~=48.0.0
|
10
10
|
Requires-Dist: tqdm~=4.67.1
|
11
11
|
Requires-Dist: typing-extensions~=4.14.0
|
12
|
+
Requires-Dist: mkdocs-glightbox>=0.5.1,<0.6.0
|
12
13
|
Requires-Dist: pandas ; extra == 'pandas'
|
13
14
|
Requires-Dist: bioframe ; extra == 'viz'
|
14
15
|
Requires-Dist: matplotlib ; extra == 'viz'
|
polars_bio-0.14.0.dist-info/RECORD
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
polars_bio-0.14.0.dist-info/METADATA,sha256=f6XqkJ12QrWensCi-C5g5rGWRMz9oS5BIcKy0VAA2uI,729
|
2
|
+
polars_bio-0.14.0.dist-info/WHEEL,sha256=-M5O7l5EczTA8VFaBQsg2Fpg0dKz0WOuvpt3nEh86bo,94
|
3
|
+
polars_bio-0.14.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
4
|
+
polars_bio/__init__.py,sha256=IkWQcFLzSF66OCBJouo2urgOm-me7mo0fBGhFaDgN7Y,3097
|
5
|
+
polars_bio/constants.py,sha256=m9jMLB8PpbmWcsrCQZhRBGsBAE6X8frsSlgteVeEyo4,119
|
6
|
+
polars_bio/context.py,sha256=AP5EM2TsB9zcomlsPEz8xMwQnEXwqpRsgBTnZsHYQwA,1723
|
7
|
+
polars_bio/interval_op_helpers.py,sha256=DQIo4lUxzd-ySUbjfwNSk5zYcxpprwQe32kTPE28ypw,2930
|
8
|
+
polars_bio/io.py,sha256=fvToItTlOxR-nOCAXeYekxzdWJT_BHcjbuExCGhRQmw,52066
|
9
|
+
polars_bio/logging.py,sha256=Q25cv4qiwLmAiGJq6ZlqYJn2WJ_uN-c5_eopib2z8bc,1354
|
10
|
+
polars_bio/operations.py,sha256=amhaff8Ha3UuQmS8OCVFXRQWvQOW_4G2T5U8tF1f7mc,2272
|
11
|
+
polars_bio/polars_bio.pyd,sha256=fflNh2VhTw-2neThPmJE6oStpPxEap3qARZVYNAKjV4,275155968
|
12
|
+
polars_bio/polars_ext.py,sha256=lT8-cYAvSyhbzbpozjlF59VWTCYOzLafSZ-7bi9f49Y,9658
|
13
|
+
polars_bio/range_op.py,sha256=UbWKBf06rPf2GXAQT0TzXR6H0rVZeCcFCqxISMuzNpk,26289
|
14
|
+
polars_bio/range_op_helpers.py,sha256=RcvXc52cJVnK4fyCtwEcYvOB5TmKItGyiReiHBGHDng,6200
|
15
|
+
polars_bio/range_op_io.py,sha256=XTBTclFCCe4utMRAju9rOUzHvLkpKo5dCn-aCBwzRfY,7275
|
16
|
+
polars_bio/range_utils.py,sha256=Q0UPB7DV4mPjOlQ_xDVLN3vJaY9ZEr4IHFVfVBnPLDY,1446
|
17
|
+
polars_bio/sql.py,sha256=vWdZCyAXTPUHTko9al90JK8tgrChnB7Fn2hUiE0bw5c,24986
|
18
|
+
polars_bio/utils.py,sha256=a-PHpiggjFm5u_PkrswPFT4DgY1kq2Ks0XLkw3nMxAI,5096
|
19
|
+
polars_bio-0.14.0.dist-info/RECORD,,
|
polars_bio-0.13.1.dist-info/RECORD
REMOVED
@@ -1,19 +0,0 @@
|
|
1
|
-
polars_bio-0.13.1.dist-info/METADATA,sha256=wGl6-MrJR_DMgJstlr5b4R326hxH3K6-QPMq8zKmfsw,683
|
2
|
-
polars_bio-0.13.1.dist-info/WHEEL,sha256=2XatmAWXBfp_P6DUtFAtbdzzba6f_xbhEtpqsZt_zEg,94
|
3
|
-
polars_bio-0.13.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
4
|
-
polars_bio/__init__.py,sha256=2Nhz5w4jLDk7-5OSLQ3ieTz8KuogpggK6JsL2fTRp0U,3097
|
5
|
-
polars_bio/constants.py,sha256=m9jMLB8PpbmWcsrCQZhRBGsBAE6X8frsSlgteVeEyo4,119
|
6
|
-
polars_bio/context.py,sha256=AP5EM2TsB9zcomlsPEz8xMwQnEXwqpRsgBTnZsHYQwA,1723
|
7
|
-
polars_bio/interval_op_helpers.py,sha256=DQIo4lUxzd-ySUbjfwNSk5zYcxpprwQe32kTPE28ypw,2930
|
8
|
-
polars_bio/io.py,sha256=o63g1l33nRlwPp7pSQbkTrxiaXajE7ClA1_-xN9v5SI,38624
|
9
|
-
polars_bio/logging.py,sha256=Q25cv4qiwLmAiGJq6ZlqYJn2WJ_uN-c5_eopib2z8bc,1354
|
10
|
-
polars_bio/operations.py,sha256=amhaff8Ha3UuQmS8OCVFXRQWvQOW_4G2T5U8tF1f7mc,2272
|
11
|
-
polars_bio/polars_bio.pyd,sha256=E87sVQXw1xid0eNAXjHZAiHgFPvmIqyk_Oxn-Sz8UPk,274686976
|
12
|
-
polars_bio/polars_ext.py,sha256=lT8-cYAvSyhbzbpozjlF59VWTCYOzLafSZ-7bi9f49Y,9658
|
13
|
-
polars_bio/range_op.py,sha256=K6VyfgbXb4q8G7XAYSj1zrjuUHQGxIvNMt-y-6iPCyQ,24863
|
14
|
-
polars_bio/range_op_helpers.py,sha256=IoWQb-BpeDn67KkTl5x3nXdrftsd_mtjOJDLCqO7mrI,5943
|
15
|
-
polars_bio/range_op_io.py,sha256=MKl7Zg8Wd_mWLXhOJghrmpNsPOOMvq5pNcJvp9DtCG0,5883
|
16
|
-
polars_bio/range_utils.py,sha256=Q0UPB7DV4mPjOlQ_xDVLN3vJaY9ZEr4IHFVfVBnPLDY,1446
|
17
|
-
polars_bio/sql.py,sha256=m6P99rfnomFXB01AicOwx72tT79IO8wKQafYQTb78SI,24618
|
18
|
-
polars_bio/utils.py,sha256=RaAU5pMt0P6Ptt6LYBeK5-0WKAmuvhV7ifU05nfVGA8,1611
|
19
|
-
polars_bio-0.13.1.dist-info/RECORD,,
|
{polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/licenses/LICENSE
File without changes
|