polars-bio 0.11.0-cp39-abi3-macosx_10_12_x86_64.whl → 0.13.0-cp39-abi3-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polars_bio/__init__.py CHANGED
@@ -1,39 +1,53 @@
1
+ import os
2
+
3
+ # Set POLARS_FORCE_NEW_STREAMING to "1" by default if not already set
4
+ if "POLARS_FORCE_NEW_STREAMING" not in os.environ:
5
+ os.environ["POLARS_FORCE_NEW_STREAMING"] = "1"
6
+
1
7
  from polars_bio.polars_bio import GffReadOptions, InputFormat
2
8
  from polars_bio.polars_bio import PyObjectStorageOptions as ObjectStorageOptions
3
9
  from polars_bio.polars_bio import ReadOptions, VcfReadOptions
4
10
 
5
11
  from .context import ctx, set_option
6
- from .sql import SQL
7
-
8
- register_gff = SQL.register_gff
9
- register_vcf = SQL.register_vcf
10
- register_fastq = SQL.register_fastq
11
- register_bam = SQL.register_bam
12
- register_bed = SQL.register_bed
13
- register_view = SQL.register_view
14
-
15
- sql = SQL.sql
16
-
17
- from .io import IOOperations
18
-
19
- describe_vcf = IOOperations.describe_vcf
20
- from_polars = IOOperations.from_polars
21
- read_bam = IOOperations.read_bam
22
- read_fastq = IOOperations.read_fastq
23
- read_gff = IOOperations.read_gff
24
- read_table = IOOperations.read_table
25
- read_vcf = IOOperations.read_vcf
26
- read_fastq = IOOperations.read_fastq
27
- read_bed = IOOperations.read_bed
28
- read_fasta = IOOperations.read_fasta
29
-
30
- from .range_op import IntervalOperations
31
-
32
- overlap = IntervalOperations.overlap
33
- nearest = IntervalOperations.nearest
34
- count_overlaps = IntervalOperations.count_overlaps
35
- coverage = IntervalOperations.coverage
36
- merge = IntervalOperations.merge
12
+ from .sql import SQL as data_processing
13
+
14
+ register_gff = data_processing.register_gff
15
+ register_vcf = data_processing.register_vcf
16
+ register_fastq = data_processing.register_fastq
17
+ register_bam = data_processing.register_bam
18
+ register_bed = data_processing.register_bed
19
+ register_view = data_processing.register_view
20
+
21
+ sql = data_processing.sql
22
+
23
+ from .io import IOOperations as data_input
24
+
25
+ describe_vcf = data_input.describe_vcf
26
+ from_polars = data_input.from_polars
27
+ read_bam = data_input.read_bam
28
+ read_fastq = data_input.read_fastq
29
+ read_gff = data_input.read_gff
30
+ read_table = data_input.read_table
31
+ read_vcf = data_input.read_vcf
32
+ read_fastq = data_input.read_fastq
33
+ read_bed = data_input.read_bed
34
+ read_fasta = data_input.read_fasta
35
+ scan_bam = data_input.scan_bam
36
+ scan_bed = data_input.scan_bed
37
+ scan_fasta = data_input.scan_fasta
38
+ scan_fastq = data_input.scan_fastq
39
+ scan_gff = data_input.scan_gff
40
+ scan_table = data_input.scan_table
41
+ scan_vcf = data_input.scan_vcf
42
+
43
+
44
+ from .range_op import IntervalOperations as range_operations
45
+
46
+ overlap = range_operations.overlap
47
+ nearest = range_operations.nearest
48
+ count_overlaps = range_operations.count_overlaps
49
+ coverage = range_operations.coverage
50
+ merge = range_operations.merge
37
51
 
38
52
  try:
39
53
  from .range_utils import Utils
@@ -59,7 +73,7 @@ except ImportError:
59
73
  POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
60
74
 
61
75
 
62
- __version__ = "0.11.0"
76
+ __version__ = "0.13.0"
63
77
  __all__ = [
64
78
  "ctx",
65
79
  "FilterOp",
@@ -74,4 +88,33 @@ __all__ = [
74
88
  "ObjectStorageOptions",
75
89
  "set_option",
76
90
  "set_loglevel",
91
+ "describe_vcf",
92
+ "from_polars",
93
+ "read_bam",
94
+ "read_bed",
95
+ "read_fasta",
96
+ "read_fastq",
97
+ "read_gff",
98
+ "read_table",
99
+ "read_vcf",
100
+ "scan_bam",
101
+ "scan_bed",
102
+ "scan_fasta",
103
+ "scan_fastq",
104
+ "scan_gff",
105
+ "scan_table",
106
+ "scan_vcf",
107
+ "register_gff",
108
+ "register_vcf",
109
+ "register_fastq",
110
+ "register_bam",
111
+ "register_bed",
112
+ "register_view",
113
+ "sql",
114
+ "overlap",
115
+ "nearest",
116
+ "count_overlaps",
117
+ "coverage",
118
+ "merge",
119
+ "visualize_intervals",
77
120
  ]
@@ -81,12 +81,8 @@ def df_to_lazyframe(df: datafusion.DataFrame) -> pl.LazyFrame:
81
81
 
82
82
 
83
83
  def convert_result(
84
- df: datafusion.DataFrame, output_type: str, streaming: bool
84
+ df: datafusion.DataFrame, output_type: str
85
85
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame"]:
86
- # TODO: implement streaming
87
- if streaming:
88
- # raise NotImplementedError("streaming is not implemented")
89
- return df.to_polars().lazy()
90
86
  if output_type == "polars.DataFrame":
91
87
  return df.to_polars()
92
88
  elif output_type == "pandas.DataFrame":
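Taken together, the `__init__.py` changes do three things: default `POLARS_FORCE_NEW_STREAMING` to `"1"` at import time, expose a lazy `scan_*` counterpart for every eager `read_*` reader, and drop the `streaming` flag from `convert_result`. A minimal sketch of the new read/scan contract (the FASTQ path is a placeholder):

```python
import polars as pl
import polars_bio as pb  # importing sets POLARS_FORCE_NEW_STREAMING=1 unless already set

# scan_* is lazy: nothing is read until .collect()
lf: pl.LazyFrame = pb.scan_fastq("/tmp/example.fastq")

# read_* is now just scan_*(...).collect() and returns an eager DataFrame
df: pl.DataFrame = pb.read_fastq("/tmp/example.fastq")

assert df.equals(lf.collect())
```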
polars_bio/io.py CHANGED
@@ -70,29 +70,6 @@ SCHEMAS = {
70
70
 
71
71
 
72
72
  class IOOperations:
73
- # TODO handling reference
74
- # def read_cram(path: str) -> pl.LazyFrame:
75
- # """
76
- # Read a CRAM file into a LazyFrame.
77
- #
78
- # Parameters:
79
- # path: The path to the CRAM file.
80
- # """
81
- # return file_lazy_scan(path, InputFormat.Cram)
82
-
83
- # TODO passing of bam_region_filter
84
- # def read_indexed_bam(path: str) -> pl.LazyFrame:
85
- # """
86
- # Read an indexed BAM file into a LazyFrame.
87
- #
88
- # Parameters:
89
- # path: The path to the BAM file.
90
- #
91
- # !!! warning
92
- # Predicate pushdown is not supported yet. So no real benefit from using an indexed BAM file.
93
- # """
94
- # return file_lazy_scan(path, InputFormat.IndexedBam)
95
-
96
73
  @staticmethod
97
74
  def read_fasta(
98
75
  path: str,
@@ -103,11 +80,10 @@ class IOOperations:
103
80
  max_retries: int = 5,
104
81
  timeout: int = 300,
105
82
  compression_type: str = "auto",
106
- streaming: bool = False,
107
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
83
+ ) -> pl.DataFrame:
108
84
  """
109
85
 
110
- Read a FASTA file into a LazyFrame.
86
+ Read a FASTA file into a DataFrame.
111
87
 
112
88
  Parameters:
113
89
  path: The path to the FASTA file.
@@ -118,7 +94,6 @@ class IOOperations:
118
94
  max_retries: The maximum number of retries for reading the file from object storage.
119
95
  timeout: The timeout in seconds for reading the file from object storage.
120
96
  compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
121
- streaming: Whether to read the FASTA file in streaming mode.
122
97
 
123
98
  !!! Example
124
99
  ```shell
@@ -127,7 +102,63 @@ class IOOperations:
127
102
 
128
103
  ```python
129
104
  import polars_bio as pb
130
- pb.read_fasta("/tmp/test.fasta").limit(1).collect()
105
+ pb.read_fasta("/tmp/test.fasta").limit(1)
106
+ ```
107
+ ```shell
108
+ shape: (1, 3)
109
+ ┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
110
+ │ name ┆ description ┆ sequence │
111
+ │ --- ┆ --- ┆ --- │
112
+ │ str ┆ str ┆ str │
113
+ ╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
114
+ │ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
115
+ └─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
116
+ ```
117
+ """
118
+ return IOOperations.scan_fasta(
119
+ path,
120
+ chunk_size,
121
+ concurrent_fetches,
122
+ allow_anonymous,
123
+ enable_request_payer,
124
+ max_retries,
125
+ timeout,
126
+ compression_type,
127
+ ).collect()
128
+
129
+ @staticmethod
130
+ def scan_fasta(
131
+ path: str,
132
+ chunk_size: int = 8,
133
+ concurrent_fetches: int = 1,
134
+ allow_anonymous: bool = True,
135
+ enable_request_payer: bool = False,
136
+ max_retries: int = 5,
137
+ timeout: int = 300,
138
+ compression_type: str = "auto",
139
+ ) -> pl.LazyFrame:
140
+ """
141
+
142
+ Lazily read a FASTA file into a LazyFrame.
143
+
144
+ Parameters:
145
+ path: The path to the FASTA file.
146
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
147
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
148
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
149
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
150
+ max_retries: The maximum number of retries for reading the file from object storage.
151
+ timeout: The timeout in seconds for reading the file from object storage.
152
+ compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
153
+
154
+ !!! Example
155
+ ```shell
156
+ wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta
157
+ ```
158
+
159
+ ```python
160
+ import polars_bio as pb
161
+ pb.scan_fasta("/tmp/test.fasta").limit(1).collect()
131
162
  ```
132
163
  ```shell
133
164
  shape: (1, 3)
@@ -153,11 +184,7 @@ class IOOperations:
153
184
  object_storage_options=object_storage_options
154
185
  )
155
186
  read_options = ReadOptions(fasta_read_options=fasta_read_options)
156
- if streaming:
157
- return read_file(path, InputFormat.Fasta, read_options, streaming)
158
- else:
159
- df = read_file(path, InputFormat.Fasta, read_options)
160
- return lazy_scan(df)
187
+ return _read_file(path, InputFormat.Fasta, read_options)
161
188
 
162
189
  @staticmethod
163
190
  def read_vcf(
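Because `scan_fasta` now goes through the polars IO-plugin path, row limits and projections are applied per batch rather than after a full read. A small usage sketch, reusing the placeholder path from the docstring example above:

```python
import polars_bio as pb

# Lazily select two of the three FASTA columns and stop after one record.
first = (
    pb.scan_fasta("/tmp/test.fasta")
    .select(["name", "sequence"])
    .limit(1)
    .collect()
)
print(first)
```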
@@ -171,10 +198,53 @@ class IOOperations:
171
198
  max_retries: int = 5,
172
199
  timeout: int = 300,
173
200
  compression_type: str = "auto",
174
- streaming: bool = False,
175
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
201
+ ) -> pl.DataFrame:
176
202
  """
177
- Read a VCF file into a LazyFrame.
203
+ Read a VCF file into a DataFrame.
204
+
205
+ Parameters:
206
+ path: The path to the VCF file.
207
+ info_fields: The fields to read from the INFO column.
208
+ thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
209
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
210
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
211
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
212
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
213
+ max_retries: The maximum number of retries for reading the file from object storage.
214
+ timeout: The timeout in seconds for reading the file from object storage.
215
+ compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
216
+
217
+ !!! note
218
+ VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
219
+ """
220
+ return IOOperations.scan_vcf(
221
+ path,
222
+ info_fields,
223
+ thread_num,
224
+ chunk_size,
225
+ concurrent_fetches,
226
+ allow_anonymous,
227
+ enable_request_payer,
228
+ max_retries,
229
+ timeout,
230
+ compression_type,
231
+ ).collect()
232
+
233
+ @staticmethod
234
+ def scan_vcf(
235
+ path: str,
236
+ info_fields: Union[list[str], None] = None,
237
+ thread_num: int = 1,
238
+ chunk_size: int = 8,
239
+ concurrent_fetches: int = 1,
240
+ allow_anonymous: bool = True,
241
+ enable_request_payer: bool = False,
242
+ max_retries: int = 5,
243
+ timeout: int = 300,
244
+ compression_type: str = "auto",
245
+ ) -> pl.LazyFrame:
246
+ """
247
+ Lazily read a VCF file into a LazyFrame.
178
248
 
179
249
  Parameters:
180
250
  path: The path to the VCF file.
@@ -187,7 +257,6 @@ class IOOperations:
187
257
  max_retries: The maximum number of retries for reading the file from object storage.
188
258
  timeout: The timeout in seconds for reading the file from object storage.
189
259
  compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
190
- streaming: Whether to read the VCF file in streaming mode.
191
260
 
192
261
  !!! note
193
262
  VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
@@ -208,11 +277,7 @@ class IOOperations:
208
277
  object_storage_options=object_storage_options,
209
278
  )
210
279
  read_options = ReadOptions(vcf_read_options=vcf_read_options)
211
- if streaming:
212
- return read_file(path, InputFormat.Vcf, read_options, streaming)
213
- else:
214
- df = read_file(path, InputFormat.Vcf, read_options)
215
- return lazy_scan(df)
280
+ return _read_file(path, InputFormat.Vcf, read_options)
216
281
 
217
282
  @staticmethod
218
283
  def read_gff(
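For illustration, a hedged `scan_vcf` sketch; the path is a placeholder and the INFO field names (`AF`, `DP`) are assumptions that depend on the actual VCF header:

```python
import polars as pl
import polars_bio as pb

lf = pb.scan_vcf(
    "/tmp/example.vcf.bgz",    # placeholder path
    info_fields=["AF", "DP"],  # hypothetical INFO fields
    thread_num=4,              # parallel BGZF decompression, local files only
)
# start/end are 1-based, as the docstring notes
chr1 = lf.filter(pl.col("chrom") == "chr1").collect()
```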
@@ -226,10 +291,9 @@ class IOOperations:
226
291
  max_retries: int = 5,
227
292
  timeout: int = 300,
228
293
  compression_type: str = "auto",
229
- streaming: bool = False,
230
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
294
+ ) -> pl.DataFrame:
231
295
  """
232
- Read a GFF file into a LazyFrame.
296
+ Read a GFF file into a DataFrame.
233
297
 
234
298
  Parameters:
235
299
  path: The path to the GFF file.
@@ -242,58 +306,51 @@ class IOOperations:
242
306
  max_retries: The maximum number of retries for reading the file from object storage.
243
307
  timeout: The timeout in seconds for reading the file from object storage.
244
308
  compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
245
- streaming: Whether to read the GFF file in streaming mode.
246
309
 
310
+ !!! note
311
+ GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
312
+ """
313
+ return IOOperations.scan_gff(
314
+ path,
315
+ attr_fields,
316
+ thread_num,
317
+ chunk_size,
318
+ concurrent_fetches,
319
+ allow_anonymous,
320
+ enable_request_payer,
321
+ max_retries,
322
+ timeout,
323
+ compression_type,
324
+ ).collect()
247
325
 
248
- !!! Example
249
- ```shell
250
- wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gff3.gz -O /tmp/gencode.v38.annotation.gff3.gz
251
- ```
252
- Read a GFF file **without** unnesting attributes:
253
- ```python
254
- import polars_bio as pb
255
- gff_path = "/tmp/gencode.v38.annotation.gff3.gz"
256
- pb.read_gff(gff_path).limit(5).collect()
257
- ```
258
-
259
- ```shell
260
-
261
- shape: (5, 9)
262
- ┌───────┬───────┬───────┬────────────┬───┬───────┬────────┬───────┬─────────────────────────────────┐
263
- │ chrom ┆ start ┆ end ┆ type ┆ … ┆ score ┆ strand ┆ phase ┆ attributes │
264
- │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
265
- │ str ┆ u32 ┆ u32 ┆ str ┆ ┆ f32 ┆ str ┆ u32 ┆ list[struct[2]] │
266
- ╞═══════╪═══════╪═══════╪════════════╪═══╪═══════╪════════╪═══════╪═════════════════════════════════╡
267
- │ chr1 ┆ 11869 ┆ 14409 ┆ gene ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","ENSG00000223972.5"}, {… │
268
- │ chr1 ┆ 11869 ┆ 14409 ┆ transcript ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","ENST00000456328.2"}, {… │
269
- │ chr1 ┆ 11869 ┆ 12227 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
270
- │ chr1 ┆ 12613 ┆ 12721 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
271
- │ chr1 ┆ 13221 ┆ 14409 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
272
- └───────┴───────┴───────┴────────────┴───┴───────┴────────┴───────┴─────────────────────────────────┘
273
-
274
- ```
326
+ @staticmethod
327
+ def scan_gff(
328
+ path: str,
329
+ attr_fields: Union[list[str], None] = None,
330
+ thread_num: int = 1,
331
+ chunk_size: int = 8,
332
+ concurrent_fetches: int = 1,
333
+ allow_anonymous: bool = True,
334
+ enable_request_payer: bool = False,
335
+ max_retries: int = 5,
336
+ timeout: int = 300,
337
+ compression_type: str = "auto",
338
+ ) -> pl.LazyFrame:
339
+ """
340
+ Lazily read a GFF file into a LazyFrame.
275
341
 
276
- Read a GFF file **with** unnesting attributes:
277
- ```python
278
- import polars_bio as pb
279
- gff_path = "/tmp/gencode.v38.annotation.gff3.gz"
280
- pb.read_gff(gff_path, attr_fields=["ID", "havana_transcript"]).limit(5).collect()
281
- ```
282
- ```shell
342
+ Parameters:
343
+ path: The path to the GFF file.
344
+ attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields will be rendered as an `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
345
+ thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
346
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
347
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
348
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
349
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
350
+ max_retries: The maximum number of retries for reading the file from object storage.
351
+ timeout: The timeout in seconds for reading the file from object storage.
352
+ compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
283
353
 
284
- shape: (5, 10)
285
- ┌───────┬───────┬───────┬────────────┬───┬────────┬───────┬──────────────────────────┬──────────────────────┐
286
- │ chrom ┆ start ┆ end ┆ type ┆ … ┆ strand ┆ phase ┆ ID ┆ havana_transcript │
287
- │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
288
- │ str ┆ u32 ┆ u32 ┆ str ┆ ┆ str ┆ u32 ┆ str ┆ str │
289
- ╞═══════╪═══════╪═══════╪════════════╪═══╪════════╪═══════╪══════════════════════════╪══════════════════════╡
290
- │ chr1 ┆ 11869 ┆ 14409 ┆ gene ┆ … ┆ + ┆ null ┆ ENSG00000223972.5 ┆ null │
291
- │ chr1 ┆ 11869 ┆ 14409 ┆ transcript ┆ … ┆ + ┆ null ┆ ENST00000456328.2 ┆ OTTHUMT00000362751.1 │
292
- │ chr1 ┆ 11869 ┆ 12227 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:1 ┆ OTTHUMT00000362751.1 │
293
- │ chr1 ┆ 12613 ┆ 12721 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:2 ┆ OTTHUMT00000362751.1 │
294
- │ chr1 ┆ 13221 ┆ 14409 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:3 ┆ OTTHUMT00000362751.1 │
295
- └───────┴───────┴───────┴────────────┴───┴────────┴───────┴──────────────────────────┴──────────────────────┘
296
- ```
297
354
  !!! note
298
355
  GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
299
356
  """
@@ -313,11 +370,7 @@ class IOOperations:
313
370
  object_storage_options=object_storage_options,
314
371
  )
315
372
  read_options = ReadOptions(gff_read_options=gff_read_options)
316
- if streaming:
317
- return read_file(path, InputFormat.Gff, read_options, streaming)
318
- else:
319
- df = read_file(path, InputFormat.Gff, read_options)
320
- return lazy_scan(df)
373
+ return _read_file(path, InputFormat.Gff, read_options)
321
374
 
322
375
  @staticmethod
323
376
  def read_bam(
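The worked GFF examples removed from the docstring still apply, with collect-style usage moving to `scan_gff`. Reassembled from the deleted example (same gencode path):

```python
import polars_bio as pb

gff_path = "/tmp/gencode.v38.annotation.gff3.gz"

# Without unnesting: attributes stay as list[struct{tag, value}]
raw = pb.scan_gff(gff_path).limit(5).collect()

# With unnesting: the selected attribute tags become top-level columns
flat = (
    pb.scan_gff(gff_path, attr_fields=["ID", "havana_transcript"])
    .limit(5)
    .collect()
)
```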
@@ -329,10 +382,9 @@ class IOOperations:
329
382
  enable_request_payer: bool = False,
330
383
  max_retries: int = 5,
331
384
  timeout: int = 300,
332
- streaming: bool = False,
333
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
385
+ ) -> pl.DataFrame:
334
386
  """
335
- Read a BAM file into a LazyFrame.
387
+ Read a BAM file into a DataFrame.
336
388
 
337
389
  Parameters:
338
390
  path: The path to the BAM file.
@@ -343,33 +395,44 @@ class IOOperations:
343
395
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
344
396
  max_retries: The maximum number of retries for reading the file from object storage.
345
397
  timeout: The timeout in seconds for reading the file from object storage.
346
- streaming: Whether to read the BAM file in streaming mode.
347
398
 
348
- !!! Example
399
+ !!! note
400
+ BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
401
+ """
402
+ return IOOperations.scan_bam(
403
+ path,
404
+ thread_num,
405
+ chunk_size,
406
+ concurrent_fetches,
407
+ allow_anonymous,
408
+ enable_request_payer,
409
+ max_retries,
410
+ timeout,
411
+ ).collect()
349
412
 
350
- ```python
351
- import polars_bio as pb
352
- bam = pb.read_bam("gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam").limit(3)
353
- bam.collect()
354
- ```
355
- ```shell
356
- INFO:polars_bio:Table: hg00096_mapped_illumina_bwa_gbr_low_coverage_20120522 registered for path: gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam
357
- shape: (3, 11)
358
- ┌────────────────────┬───────┬───────┬───────┬───┬────────────┬────────────┬─────────────────────────────────┬─────────────────────────────────┐
359
- │ name ┆ chrom ┆ start ┆ end ┆ … ┆ mate_chrom ┆ mate_start ┆ sequence ┆ quality_scores │
360
- --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
361
- │ str ┆ str ┆ u32 ┆ u32 ┆ ┆ str ┆ u32 ┆ str ┆ str │
362
- ╞════════════════════╪═══════╪═══════╪═══════╪═══╪════════════╪════════════╪═════════════════════════════════╪═════════════════════════════════╡
363
- │ SRR062634.9882510 ┆ chr1 ┆ 10001 ┆ 10044 ┆ … ┆ chr1 ┆ 10069 ┆ TAACCCTAACCCTACCCTAACCCTAACCCT… ┆ 0<>=/0E:7;08FBDIF9;2%=<>+FCDDA… │
364
- │ SRR062641.21956756 ┆ chr1 ┆ 10001 ┆ 10049 ┆ … ┆ chr1 ┆ 10051 ┆ TAACCCTACCCTAACCCTAACCCTAACCCT… ┆ 0=MLOOPNNPPJHPOQQROQPQQRIQPRJB… │
365
- │ SRR062641.13613107 ┆ chr1 ┆ 10002 ┆ 10072 ┆ … ┆ chr1 ┆ 10110 ┆ AACCCTAACCCCTAACCCCTAACCCCTAAC… ┆ 0KKNPQOQOQIQRPQPRRRRPQPRRRRPRF… │
366
- └────────────────────┴───────┴───────┴───────┴───┴────────────┴────────────┴─────────────────────────────────┴─────────────────────────────────┘
367
- ```
413
+ @staticmethod
414
+ def scan_bam(
415
+ path: str,
416
+ thread_num: int = 1,
417
+ chunk_size: int = 8,
418
+ concurrent_fetches: int = 1,
419
+ allow_anonymous: bool = True,
420
+ enable_request_payer: bool = False,
421
+ max_retries: int = 5,
422
+ timeout: int = 300,
423
+ ) -> pl.LazyFrame:
424
+ """
425
+ Lazily read a BAM file into a LazyFrame.
368
426
 
369
- ```python
370
- bam.collect_schema()
371
- Schema({'name': String, 'chrom': String, 'start': UInt32, 'end': UInt32, 'flags': UInt32, 'cigar': String, 'mapping_quality': UInt32, 'mate_chrom': String, 'mate_start': UInt32, 'sequence': String, 'quality_scores': String})
372
- ```
427
+ Parameters:
428
+ path: The path to the BAM file.
429
+ thread_num: The number of threads to use for reading the BAM file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
430
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
431
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
432
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
433
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
434
+ max_retries: The maximum number of retries for reading the file from object storage.
435
+ timeout: The timeout in seconds for reading the file from object storage.
373
436
 
374
437
  !!! note
375
438
  BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -389,11 +452,7 @@ class IOOperations:
389
452
  object_storage_options=object_storage_options,
390
453
  )
391
454
  read_options = ReadOptions(bam_read_options=bam_read_options)
392
- if streaming:
393
- return read_file(path, InputFormat.Bam, read_options, streaming)
394
- else:
395
- df = read_file(path, InputFormat.Bam, read_options)
396
- return lazy_scan(df)
455
+ return _read_file(path, InputFormat.Bam, read_options)
397
456
 
398
457
  @staticmethod
399
458
  def read_fastq(
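Likewise, the removed BAM docstring example translates directly to `scan_bam`; the public 1000 Genomes path below is the one from the deleted example:

```python
import polars_bio as pb

bam = "gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam"

# allow_anonymous=True is the default, so no GCS credentials are needed here.
lf = pb.scan_bam(bam)
print(lf.limit(3).collect())
print(lf.collect_schema())
```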
@@ -405,11 +464,10 @@ class IOOperations:
405
464
  max_retries: int = 5,
406
465
  timeout: int = 300,
407
466
  compression_type: str = "auto",
408
- streaming: bool = False,
409
467
  parallel: bool = False,
410
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
468
+ ) -> pl.DataFrame:
411
469
  """
412
- Read a FASTQ file into a LazyFrame.
470
+ Read a FASTQ file into a DataFrame.
413
471
 
414
472
  Parameters:
415
473
  path: The path to the FASTQ file.
@@ -420,44 +478,46 @@ class IOOperations:
420
478
  max_retries: The maximum number of retries for reading the file from object storage.
421
479
  timeout: The timeout in seconds for reading the file from object storage.
422
480
  compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
423
- streaming: Whether to read the FASTQ file in streaming mode.
424
481
  parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
482
+ """
483
+ return IOOperations.scan_fastq(
484
+ path,
485
+ chunk_size,
486
+ concurrent_fetches,
487
+ allow_anonymous,
488
+ enable_request_payer,
489
+ max_retries,
490
+ timeout,
491
+ compression_type,
492
+ parallel,
493
+ ).collect()
425
494
 
426
- !!! Example
427
-
428
- ```python
429
- import polars_bio as pb
430
- pb.read_fastq("gs://genomics-public-data/platinum-genomes/fastq/ERR194146.fastq.gz").limit(1).collect()
431
- ```
432
- ```shell
433
- shape: (1, 4)
434
- ┌─────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
435
- │ name ┆ description ┆ sequence ┆ quality_scores │
436
- --- ┆ --- ┆ --- ┆ --- │
437
- │ str ┆ str ┆ str ┆ str │
438
- ╞═════════════════════╪═════════════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
439
- │ ERR194146.812444541 ┆ HSQ1008:141:D0CC8ACXX:2:1204:1… ┆ TGGAAGGTTCTCGAAAAAAATGGAATCGAA… ┆ ?@;DDBDDBHF??FFB@B)1:CD3*:?DFF… │
440
- └─────────────────────┴─────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
441
-
442
- ```
443
-
444
- Parallel reading of BZGF compressed FASTQ files stored locally:
445
- ```shell
446
- ls -1 /tmp/ERR194146.fastq.bgz*
447
- ERR194146.fastq.bgz
448
- ERR194146.fastq.bgz.gzi
449
- ```
450
-
451
- ```python
452
- import polars_bio as pb
453
- ## Set the number of target partitions (threads) to 2
454
- pb.set_option("datafusion.execution.target_partitions", "2")
455
- pb.read_fastq("/tmp/ERR194146.fastq.bgz", parallel=True).count().collect()
456
- ```
457
-
458
-
495
+ @staticmethod
496
+ def scan_fastq(
497
+ path: str,
498
+ chunk_size: int = 8,
499
+ concurrent_fetches: int = 1,
500
+ allow_anonymous: bool = True,
501
+ enable_request_payer: bool = False,
502
+ max_retries: int = 5,
503
+ timeout: int = 300,
504
+ compression_type: str = "auto",
505
+ parallel: bool = False,
506
+ ) -> pl.LazyFrame:
459
507
  """
508
+ Lazily read a FASTQ file into a LazyFrame.
460
509
 
510
+ Parameters:
511
+ path: The path to the FASTQ file.
512
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
513
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
514
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
515
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
516
+ max_retries: The maximum number of retries for reading the file from object storage.
517
+ timeout: The timeout in seconds for reading the file from object storage.
518
+ compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
519
+ parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
520
+ """
461
521
  object_storage_options = PyObjectStorageOptions(
462
522
  allow_anonymous=allow_anonymous,
463
523
  enable_request_payer=enable_request_payer,
@@ -472,11 +532,7 @@ class IOOperations:
472
532
  object_storage_options=object_storage_options, parallel=parallel
473
533
  )
474
534
  read_options = ReadOptions(fastq_read_options=fastq_read_options)
475
- if streaming:
476
- return read_file(path, InputFormat.Fastq, read_options, streaming)
477
- else:
478
- df = read_file(path, InputFormat.Fastq, read_options)
479
- return lazy_scan(df)
535
+ return _read_file(path, InputFormat.Fastq, read_options)
480
536
 
481
537
  @staticmethod
482
538
  def read_bed(
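The parallel-reader recipe dropped from the docstring carries over unchanged, assuming a local BGZF file plus its GZI index (`ERR194146.fastq.bgz` and `ERR194146.fastq.bgz.gzi`, as in the removed example):

```python
import polars_bio as pb

# Two DataFusion target partitions -> two parallel BGZF block readers.
pb.set_option("datafusion.execution.target_partitions", "2")

n_reads = (
    pb.scan_fastq("/tmp/ERR194146.fastq.bgz", parallel=True)
    .count()
    .collect()
)
```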
@@ -489,10 +545,9 @@ class IOOperations:
489
545
  max_retries: int = 5,
490
546
  timeout: int = 300,
491
547
  compression_type: str = "auto",
492
- streaming: bool = False,
493
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
548
+ ) -> pl.DataFrame:
494
549
  """
495
- Read a BED file into a LazyFrame.
550
+ Read a BED file into a DataFrame.
496
551
 
497
552
  Parameters:
498
553
  path: The path to the BED file.
@@ -504,44 +559,59 @@ class IOOperations:
504
559
  max_retries: The maximum number of retries for reading the file from object storage.
505
560
  timeout: The timeout in seconds for reading the file from object storage.
506
561
  compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
507
- streaming: Whether to read the BED file in streaming mode.
508
562
 
509
563
  !!! Note
510
564
  Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
511
565
  Also unlike other text formats, **GZIP** compression is not supported.
512
566
 
513
- !!! Example
514
- ```shell
567
+ !!! note
568
+ BED reader uses **1-based** coordinate system for the `start` and `end` columns.
569
+ """
570
+ return IOOperations.scan_bed(
571
+ path,
572
+ thread_num,
573
+ chunk_size,
574
+ concurrent_fetches,
575
+ allow_anonymous,
576
+ enable_request_payer,
577
+ max_retries,
578
+ timeout,
579
+ compression_type,
580
+ ).collect()
515
581
 
516
- cd /tmp
517
- wget https://webs.iiitd.edu.in/raghava/humcfs/fragile_site_bed.zip -O fragile_site_bed.zip
518
- unzip fragile_site_bed.zip -x "__MACOSX/*" "*/.DS_Store"
519
- ```
582
+ @staticmethod
583
+ def scan_bed(
584
+ path: str,
585
+ thread_num: int = 1,
586
+ chunk_size: int = 8,
587
+ concurrent_fetches: int = 1,
588
+ allow_anonymous: bool = True,
589
+ enable_request_payer: bool = False,
590
+ max_retries: int = 5,
591
+ timeout: int = 300,
592
+ compression_type: str = "auto",
593
+ ) -> pl.LazyFrame:
594
+ """
595
+ Lazily read a BED file into a LazyFrame.
520
596
 
521
- ```python
522
- import polars_bio as pb
523
- pb.read_bed("/tmp/fragile_site_bed/chr5_fragile_site.bed").limit(5).collect()
524
- ```
597
+ Parameters:
598
+ path: The path to the BED file.
599
+ thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
600
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
601
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
602
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
603
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
604
+ max_retries: The maximum number of retries for reading the file from object storage.
605
+ timeout: The timeout in seconds for reading the file from object storage.
606
+ compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
525
607
 
526
- ```shell
608
+ !!! Note
609
+ Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
610
+ Also unlike other text formats, **GZIP** compression is not supported.
527
611
 
528
- shape: (5, 4)
529
- ┌───────┬───────────┬───────────┬───────┐
530
- │ chrom ┆ start ┆ end ┆ name │
531
- │ --- ┆ --- ┆ --- ┆ --- │
532
- │ str ┆ u32 ┆ u32 ┆ str │
533
- ╞═══════╪═══════════╪═══════════╪═══════╡
534
- │ chr5 ┆ 28900001 ┆ 42500000 ┆ FRA5A │
535
- │ chr5 ┆ 92300001 ┆ 98200000 ┆ FRA5B │
536
- │ chr5 ┆ 130600001 ┆ 136200000 ┆ FRA5C │
537
- │ chr5 ┆ 92300001 ┆ 93916228 ┆ FRA5D │
538
- │ chr5 ┆ 18400001 ┆ 28900000 ┆ FRA5E │
539
- └───────┴───────────┴───────────┴───────┘
540
- ```
541
612
  !!! note
542
613
  BED reader uses **1-based** coordinate system for the `start` and `end` columns.
543
614
  """
544
-
545
615
  object_storage_options = PyObjectStorageOptions(
546
616
  allow_anonymous=allow_anonymous,
547
617
  enable_request_payer=enable_request_payer,
@@ -557,24 +627,31 @@ class IOOperations:
557
627
  object_storage_options=object_storage_options,
558
628
  )
559
629
  read_options = ReadOptions(bed_read_options=bed_read_options)
560
- if streaming:
561
- return read_file(path, InputFormat.Bed, read_options, streaming)
562
- else:
563
- df = read_file(path, InputFormat.Bed, read_options)
564
- return lazy_scan(df)
630
+ return _read_file(path, InputFormat.Bed, read_options)
565
631
 
566
632
  @staticmethod
567
- def read_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
633
+ def read_table(path: str, schema: Dict = None, **kwargs) -> pl.DataFrame:
568
634
  """
569
- Read a tab-delimited (i.e. BED) file into a Polars LazyFrame.
635
+ Read a tab-delimited (i.e. BED) file into a Polars DataFrame.
570
636
  Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
571
- but faster and lazy. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
637
+ but faster. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
572
638
 
573
639
  Parameters:
574
640
  path: The path to the file.
575
641
  schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
642
+ """
643
+ return IOOperations.scan_table(path, schema, **kwargs).collect()
576
644
 
645
+ @staticmethod
646
+ def scan_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
647
+ """
648
+ Lazily read a tab-delimited (i.e. BED) file into a Polars LazyFrame.
649
+ Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
650
+ but faster and lazy. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
577
651
 
652
+ Parameters:
653
+ path: The path to the file.
654
+ schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
578
655
  """
579
656
  df = pl.scan_csv(path, separator="\t", has_header=False, **kwargs)
580
657
  if schema is not None:
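A combined sketch for the two lazy readers above; the BED path comes from the removed docstring example, and the `schema="bed4"` string is an assumption about the Bioframe-style schema keys accepted by `scan_table`:

```python
import polars_bio as pb

# BED4: chrom, start, end, name (1-based start/end).
sites = pb.scan_bed("/tmp/fragile_site_bed/chr5_fragile_site.bed").limit(5).collect()

# Bioframe-compatible TSV reading; "bed4" is a hypothetical schema key here.
table = pb.scan_table(
    "/tmp/fragile_site_bed/chr5_fragile_site.bed", schema="bed4"
).collect()
```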
@@ -602,30 +679,6 @@ class IOOperations:
602
679
  allow_anonymous: Whether to allow anonymous access to object storage (GCS and S3 supported).
603
680
  enable_request_payer: Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
604
681
  compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
605
-
606
- !!! Example
607
- ```python
608
- import polars_bio as pb
609
- vcf_1 = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
610
- pb.describe_vcf(vcf_1, allow_anonymous=True).sort("name").limit(5)
611
- ```
612
-
613
- ```shell
614
- shape: (5, 3)
615
- ┌───────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────┐
616
- │ name ┆ type ┆ description │
617
- │ --- ┆ --- ┆ --- │
618
- │ str ┆ str ┆ str │
619
- ╞═══════════╪═════════╪══════════════════════════════════════════════════════════════════════════════╡
620
- │ AC ┆ Integer ┆ Number of non-reference alleles observed (biallelic sites only). │
621
- │ AC_XX ┆ Integer ┆ Number of non-reference XX alleles observed (biallelic sites only). │
622
- │ AC_XY ┆ Integer ┆ Number of non-reference XY alleles observed (biallelic sites only). │
623
- │ AC_afr ┆ Integer ┆ Number of non-reference African-American alleles observed (biallelic sites only). │
624
- │ AC_afr_XX ┆ Integer ┆ Number of non-reference African-American XX alleles observed (biallelic sites only). │
625
- └───────────┴─────────┴──────────────────────────────────────────────────────────────────────────────────────┘
626
-
627
-
628
- ```
629
682
  """
630
683
  object_storage_options = PyObjectStorageOptions(
631
684
  allow_anonymous=allow_anonymous,
@@ -646,30 +699,6 @@ class IOOperations:
646
699
  Parameters:
647
700
  name: The name of the table.
648
701
  df: The Polars DataFrame.
649
- !!! Example
650
- ```python
651
- import polars as pl
652
- import polars_bio as pb
653
- df = pl.DataFrame({
654
- "a": [1, 2, 3],
655
- "b": [4, 5, 6]
656
- })
657
- pb.from_polars("test_df", df)
658
- pb.sql("SELECT * FROM test_df").collect()
659
- ```
660
- ```shell
661
- 3rows [00:00, 2978.91rows/s]
662
- shape: (3, 2)
663
- ┌─────┬─────┐
664
- │ a ┆ b │
665
- │ --- ┆ --- │
666
- │ i64 ┆ i64 │
667
- ╞═════╪═════╡
668
- │ 1 ┆ 4 │
669
- │ 2 ┆ 5 │
670
- │ 3 ┆ 6 │
671
- └─────┴─────┘
672
- ```
673
702
  """
674
703
  reader = (
675
704
  df.to_arrow()
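The deleted `from_polars` example still describes the behavior: register a DataFrame under a SQL table name, then query it with `pb.sql`, which now always returns a LazyFrame:

```python
import polars as pl
import polars_bio as pb

df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
pb.from_polars("test_df", df)  # register under a SQL table name
out = pb.sql("SELECT * FROM test_df").collect()
```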
@@ -685,7 +714,7 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
685
714
  return [x.strip() for x in t]
686
715
 
687
716
 
688
- def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
717
+ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
689
718
  df_lazy: DataFrame = df
690
719
  arrow_schema = df_lazy.schema()
691
720
 
@@ -700,8 +729,6 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
700
729
  df = pl.DataFrame(df).limit(n_rows)
701
730
  if predicate is not None:
702
731
  df = df.filter(predicate)
703
- # TODO: We can push columns down to the DataFusion plan in the future,
704
- # but for now we'll do it here.
705
732
  if with_columns is not None:
706
733
  df = df.select(with_columns)
707
734
  yield df
@@ -713,8 +740,6 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
713
740
  df = pl.DataFrame(py_df)
714
741
  if predicate is not None:
715
742
  df = df.filter(predicate)
716
- # TODO: We can push columns down to the DataFusion plan in the future,
717
- # but for now we'll do it here.
718
743
  if with_columns is not None:
719
744
  df = df.select(with_columns)
720
745
  progress_bar.update(len(df))
@@ -723,31 +748,11 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
723
748
  return register_io_source(_overlap_source, schema=arrow_schema)
724
749
 
725
750
 
726
- def read_file(
751
+ def _read_file(
727
752
  path: str,
728
753
  input_format: InputFormat,
729
754
  read_options: ReadOptions,
730
- streaming: bool = False,
731
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
732
- """
733
- Read a file into a DataFrame.
734
-
735
- Parameters
736
- ----------
737
- path : str
738
- The path to the file.
739
- input_format : InputFormat
740
- The input format of the file.
741
- read_options : ReadOptions, e.g. VcfReadOptions
742
- streaming: Whether to read the file in streaming mode.
743
-
744
- Returns
745
- -------
746
- pl.DataFrame
747
- The DataFrame.
748
- """
755
+ ) -> pl.LazyFrame:
749
756
  table = py_register_table(ctx, path, None, input_format, read_options)
750
- if streaming:
751
- return stream_wrapper(py_scan_table(ctx, table.name))
752
- else:
753
- return py_read_table(ctx, table.name)
757
+ df = py_read_table(ctx, table.name)
758
+ return _lazy_scan(df)
Binary file
polars_bio/range_op.py CHANGED
@@ -46,7 +46,6 @@ class IntervalOperations:
46
46
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
47
47
  algorithm: str = "Coitrees",
48
48
  output_type: str = "polars.LazyFrame",
49
- streaming: bool = False,
50
49
  read_options1: Union[ReadOptions, None] = None,
51
50
  read_options2: Union[ReadOptions, None] = None,
52
51
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
@@ -64,9 +63,8 @@ class IntervalOperations:
64
63
  genomic intervals, provided separately for each set.
65
64
  suffixes: Suffixes for the columns of the two overlapped sets.
66
65
  on_cols: List of additional column names to join on. default is None.
67
- algorithm: The algorithm to use for the overlap operation. Available options: Coitrees, IntervalTree, ArrayIntervalTree, Lapper
66
+ algorithm: The algorithm to use for the overlap operation. Available options: Coitrees, IntervalTree, ArrayIntervalTree, Lapper, SuperIntervals
68
67
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
69
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
70
68
  read_options1: Additional options for reading the input files.
71
69
  read_options2: Additional options for reading the input files.
72
70
 
@@ -122,7 +120,6 @@ class IntervalOperations:
122
120
  columns_1=cols1,
123
121
  columns_2=cols2,
124
122
  overlap_alg=algorithm,
125
- streaming=streaming,
126
123
  )
127
124
 
128
125
  return range_operation(
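With the `streaming` flag gone, `overlap` is driven entirely by `output_type`. A small sketch passing in-memory frames (positional `df1`/`df2` arguments are assumed from the signature context); `SuperIntervals` is the algorithm newly listed above:

```python
import polars as pl
import polars_bio as pb

a = pl.DataFrame({"chrom": ["chr1", "chr1"], "start": [100, 400], "end": [200, 500]})
b = pl.DataFrame({"chrom": ["chr1"], "start": [150], "end": [450]})

# Both input intervals of `a` overlap the single interval of `b`.
hits = pb.overlap(a, b, output_type="polars.DataFrame", algorithm="SuperIntervals")
```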
@@ -139,7 +136,6 @@ class IntervalOperations:
139
136
  cols1: Union[list[str], None] = ["chrom", "start", "end"],
140
137
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
141
138
  output_type: str = "polars.LazyFrame",
142
- streaming: bool = False,
143
139
  read_options: Union[ReadOptions, None] = None,
144
140
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
145
141
  """
@@ -157,7 +153,6 @@ class IntervalOperations:
157
153
  suffixes: Suffixes for the columns of the two overlapped sets.
158
154
  on_cols: List of additional column names to join on. default is None.
159
155
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
160
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
161
156
  read_options: Additional options for reading the input files.
162
157
 
163
158
 
@@ -186,7 +181,6 @@ class IntervalOperations:
186
181
  suffixes=suffixes,
187
182
  columns_1=cols1,
188
183
  columns_2=cols2,
189
- streaming=streaming,
190
184
  )
191
185
  return range_operation(df1, df2, range_options, output_type, ctx, read_options)
192
186
 
@@ -200,7 +194,6 @@ class IntervalOperations:
200
194
  cols1: Union[list[str], None] = ["chrom", "start", "end"],
201
195
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
202
196
  output_type: str = "polars.LazyFrame",
203
- streaming: bool = False,
204
197
  read_options: Union[ReadOptions, None] = None,
205
198
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
206
199
  """
@@ -218,7 +211,6 @@ class IntervalOperations:
218
211
  suffixes: Suffixes for the columns of the two overlapped sets.
219
212
  on_cols: List of additional column names to join on. default is None.
220
213
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
221
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
222
214
  read_options: Additional options for reading the input files.
223
215
 
224
216
 
@@ -252,7 +244,6 @@ class IntervalOperations:
252
244
  suffixes=suffixes,
253
245
  columns_1=cols1,
254
246
  columns_2=cols2,
255
- streaming=streaming,
256
247
  )
257
248
  return range_operation(df2, df1, range_options, output_type, ctx, read_options)
258
249
 
@@ -266,7 +257,6 @@ class IntervalOperations:
266
257
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
267
258
  on_cols: Union[list[str], None] = None,
268
259
  output_type: str = "polars.LazyFrame",
269
- streaming: bool = False,
270
260
  naive_query: bool = True,
271
261
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
272
262
  """
@@ -285,7 +275,6 @@ class IntervalOperations:
285
275
  on_cols: List of additional column names to join on. default is None.
286
276
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
287
277
  naive_query: If True, use naive query for counting overlaps based on overlaps.
288
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
289
278
  Returns:
290
279
  **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
291
280
 
@@ -335,7 +324,6 @@ class IntervalOperations:
335
324
  suffixes=suffixes,
336
325
  columns_1=cols1,
337
326
  columns_2=cols2,
338
- streaming=streaming,
339
327
  )
340
328
  return range_operation(df2, df1, range_options, output_type, ctx)
341
329
  df1 = read_df_to_datafusion(my_ctx, df1)
@@ -423,7 +411,7 @@ class IntervalOperations:
423
411
  )
424
412
  )
425
413
 
426
- return convert_result(df, output_type, streaming)
414
+ return convert_result(df, output_type)
427
415
 
428
416
  @staticmethod
429
417
  def merge(
@@ -433,7 +421,6 @@ class IntervalOperations:
433
421
  cols: Union[list[str], None] = ["chrom", "start", "end"],
434
422
  on_cols: Union[list[str], None] = None,
435
423
  output_type: str = "polars.LazyFrame",
436
- streaming: bool = False,
437
424
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
438
425
  """
439
426
  Merge overlapping intervals. It is assumed that start < end.
@@ -446,7 +433,6 @@ class IntervalOperations:
446
433
  genomic intervals, provided separately for each set.
447
434
  on_cols: List of additional column names for clustering. default is None.
448
435
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
449
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
450
436
 
451
437
  Returns:
452
438
  **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
@@ -574,4 +560,4 @@ class IntervalOperations:
574
560
  )
575
561
  )
576
562
 
577
- return convert_result(result, output_type, streaming)
563
+ return convert_result(result, output_type)
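`merge` follows the same pattern. A hedged sketch; the positional frame argument is an assumption, since the hunk above only shows the signature from `cols` onward:

```python
import polars as pl
import polars_bio as pb

df = pl.DataFrame({
    "chrom": ["chr1", "chr1", "chr1"],
    "start": [100, 150, 400],
    "end":   [200, 300, 500],
})
# [100,200] and [150,300] overlap and collapse; [400,500] survives intact.
merged = pb.merge(df, output_type="polars.DataFrame")
```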
@@ -44,19 +44,6 @@ def range_operation(
44
44
  len(supported_exts.intersection(ext2)) > 0 or len(ext2) == 0
45
45
  ), "Dataframe2 must be a Parquet, a BED or CSV or VCF file"
46
46
  # use suffixes to avoid column name conflicts
47
- if range_options.streaming:
48
- # FIXME: Parallelism is not supported
49
- # FIXME: StringViews not supported yet see: https://datafusion.apache.org/blog/2024/12/14/datafusion-python-43.1.0/
50
-
51
- ctx.set_option("datafusion.execution.target_partitions", "1", False)
52
- ctx.set_option(
53
- "datafusion.execution.parquet.schema_force_view_types", "false", True
54
- )
55
- return stream_wrapper(
56
- stream_range_operation_scan(
57
- ctx, df1, df2, range_options, read_options1, read_options2
58
- )
59
- )
60
47
 
61
48
  if range_options.range_op == RangeOp.CountOverlapsNaive:
62
49
  ## add count column to the schema
polars_bio/sql.py CHANGED
@@ -22,7 +22,7 @@ from polars_bio.polars_bio import (
22
22
  )
23
23
 
24
24
  from .context import ctx
25
- from .io import _cleanse_fields, lazy_scan
25
+ from .io import _cleanse_fields, _lazy_scan
26
26
  from .range_op_helpers import stream_wrapper
27
27
 
28
28
 
@@ -436,13 +436,12 @@ class SQL:
436
436
  py_register_table(ctx, path, name, InputFormat.Bam, read_options)
437
437
 
438
438
  @staticmethod
439
- def sql(query: str, streaming: bool = False) -> pl.LazyFrame:
439
+ def sql(query: str) -> pl.LazyFrame:
440
440
  """
441
441
  Execute a SQL query on the registered tables.
442
442
 
443
443
  Parameters:
444
444
  query: The SQL query.
445
- streaming: Whether to execute the query in streaming mode.
446
445
 
447
446
  !!! Example
448
447
  ```python
@@ -451,8 +450,5 @@ class SQL:
451
450
  pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
452
451
  ```
453
452
  """
454
- if streaming:
455
- return stream_wrapper(py_scan_sql(ctx, query))
456
- else:
457
- df = py_read_sql(ctx, query)
458
- return lazy_scan(df)
453
+ df = py_read_sql(ctx, query)
454
+ return _lazy_scan(df)
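After this change `SQL.sql` always routes through `py_read_sql` and `_lazy_scan`. Usage stays as in the docstring; the `register_vcf(path, name)` argument order is an assumption based on `py_register_table(ctx, path, name, ...)`:

```python
import polars_bio as pb

pb.register_vcf(
    "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz",
    "gnomad_v4_1_sv",
)
pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
```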
polars_bio/utils.py ADDED
@@ -0,0 +1,46 @@
1
+ from typing import Iterator, Union
2
+
3
+ import polars as pl
4
+ from datafusion import DataFrame
5
+ from polars.io.plugins import register_io_source
6
+ from tqdm.auto import tqdm
7
+
8
+
9
+ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
10
+ if t is None:
11
+ return None
12
+ return [x.strip() for x in t]
13
+
14
+
15
+ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
16
+ df_lazy: DataFrame = df
17
+ arrow_schema = df_lazy.schema()
18
+
19
+ def _overlap_source(
20
+ with_columns: Union[pl.Expr, None],
21
+ predicate: Union[pl.Expr, None],
22
+ n_rows: Union[int, None],
23
+ _batch_size: Union[int, None],
24
+ ) -> Iterator[pl.DataFrame]:
25
+ if n_rows and n_rows < 8192: # 8192 is the default batch size in datafusion
26
+ df = df_lazy.limit(n_rows).execute_stream().next().to_pyarrow()
27
+ df = pl.DataFrame(df).limit(n_rows)
28
+ if predicate is not None:
29
+ df = df.filter(predicate)
30
+ if with_columns is not None:
31
+ df = df.select(with_columns)
32
+ yield df
33
+ return
34
+ df_stream = df_lazy.execute_stream()
35
+ progress_bar = tqdm(unit="rows")
36
+ for r in df_stream:
37
+ py_df = r.to_pyarrow()
38
+ df = pl.DataFrame(py_df)
39
+ if predicate is not None:
40
+ df = df.filter(predicate)
41
+ if with_columns is not None:
42
+ df = df.select(with_columns)
43
+ progress_bar.update(len(df))
44
+ yield df
45
+
46
+ return register_io_source(_overlap_source, schema=arrow_schema)
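`_lazy_scan` is an instance of the polars IO-plugin pattern: `register_io_source` wraps a generator that receives the pushed-down projection, predicate, and row limit. A self-contained toy source with the same shape (everything below is illustrative, not part of polars-bio):

```python
from typing import Iterator, Optional

import polars as pl
from polars.io.plugins import register_io_source


def constant_frames(value: int, total_rows: int) -> pl.LazyFrame:
    schema = pl.Schema({"x": pl.Int64})

    def source(
        with_columns: Optional[list],
        predicate: Optional[pl.Expr],
        n_rows: Optional[int],
        batch_size: Optional[int],
    ) -> Iterator[pl.DataFrame]:
        rows = min(total_rows, n_rows) if n_rows is not None else total_rows
        df = pl.DataFrame({"x": [value] * rows})
        if predicate is not None:
            df = df.filter(predicate)      # predicate pushdown handled in-source
        if with_columns is not None:
            df = df.select(with_columns)   # projection pushdown
        yield df

    return register_io_source(source, schema=schema)


lazy = constant_frames(7, 100).limit(3)  # nothing materializes until .collect()
```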
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polars-bio
3
- Version: 0.11.0
3
+ Version: 0.13.0
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
7
- Requires-Dist: polars~=1.21.0
7
+ Requires-Dist: polars~=1.29.0
8
8
  Requires-Dist: pyarrow~=21.0.0
9
9
  Requires-Dist: datafusion~=48.0.0
10
10
  Requires-Dist: tqdm~=4.67.1
@@ -0,0 +1,19 @@
1
+ polars_bio-0.13.0.dist-info/METADATA,sha256=kRbYeTHKR-qtdAq4pD5bf8k1iUyHSriIAbUq3IUOO9o,683
2
+ polars_bio-0.13.0.dist-info/WHEEL,sha256=TiMJekJwYXi-5FCpHPqncJXv9UVKDzSHt4YRv5UDSSg,104
3
+ polars_bio-0.13.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
+ polars_bio/__init__.py,sha256=-4QHwzijNcm99A372_-pLCK-F3YrmO2my-HZgVjYEr8,2977
5
+ polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
6
+ polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
7
+ polars_bio/interval_op_helpers.py,sha256=xMWxu2y3jIwt0KCtzIPF_cvbUMdhrb8Mif74MbHU1qY,2834
8
+ polars_bio/io.py,sha256=YtcNqS0pzeTRZ78ckov4nfNekvWCyz5JGSHVl7LxfFQ,37866
9
+ polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
10
+ polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
11
+ polars_bio/polars_bio.abi3.so,sha256=x8hfvc1l_jRS8o0h0-fbmrb5nexx5URRCfsNSIE7b-I,277132364
12
+ polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
13
+ polars_bio/range_op.py,sha256=3LAYTmbJhv7WY8eB7_OJfPLLoR9eonbZSFKkZi_Dp30,24300
14
+ polars_bio/range_op_helpers.py,sha256=RQw6ZgIGhDh-3-pUTIQ56Vypuy9XQhpFGKQYGd_vrzY,5792
15
+ polars_bio/range_op_io.py,sha256=Kcxv9ebQtM25OWNITol1AXpDDMDzIi9uHeI1FhhP0Lk,5717
16
+ polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
17
+ polars_bio/sql.py,sha256=ORvSleiwUpkpewvgcFA3GeuyZhQXToq9RZ_XrO6iGxw,24164
18
+ polars_bio/utils.py,sha256=KAq8tbIf6yBFhRwzrRLBUfM6zbbdCqK_NYK5bUy1qfA,1565
19
+ polars_bio-0.13.0.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- polars_bio-0.11.0.dist-info/METADATA,sha256=ZCC8mNSP1aoNzqRMBnmaC5AcbLCiU4wZfe0-5dcEdAg,683
2
- polars_bio-0.11.0.dist-info/WHEEL,sha256=TiMJekJwYXi-5FCpHPqncJXv9UVKDzSHt4YRv5UDSSg,104
3
- polars_bio-0.11.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
- polars_bio/__init__.py,sha256=C3l2s2bJf1kTGHu5BHWHd2oyCFvZIuLO_vrP1e9oSiY,1939
5
- polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
6
- polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
7
- polars_bio/interval_op_helpers.py,sha256=3xg6IEpfyTPb3y1QzkSVGFLhtFVypGQfDhbJEGdHpgo,3006
8
- polars_bio/io.py,sha256=9y9fYO_xZN1Efc7JJ0_G1SrbQ7xqT5HEfip9h1WyrrI,41285
9
- polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
10
- polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
11
- polars_bio/polars_bio.abi3.so,sha256=OXYsznpzbNRv8uyEN8F2VAK9rV_Y3QLgLrSaVtTGKfg,268517156
12
- polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
13
- polars_bio/range_op.py,sha256=k6UzhSZIWQIj61zhOOkRcGD7ucFo8fwGaOUnyF6REIw,25122
14
- polars_bio/range_op_helpers.py,sha256=9MRGKhGmx_HnZEWP50tWQ4rdsdhoMf8m-08E0f_YxMs,6407
15
- polars_bio/range_op_io.py,sha256=Kcxv9ebQtM25OWNITol1AXpDDMDzIi9uHeI1FhhP0Lk,5717
16
- polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
17
- polars_bio/sql.py,sha256=PFTHeRyVouPyjSL26w60ByyAKZMLCsaeZ0wiJY_KH2k,24361
18
- polars_bio-0.11.0.dist-info/RECORD,,