polars-bio 0.11.0-cp39-abi3-macosx_10_12_x86_64.whl → 0.13.0-cp39-abi3-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polars_bio/__init__.py CHANGED
@@ -1,39 +1,53 @@
1
+ import os
2
+
3
+ # Set POLARS_FORCE_NEW_STREAMING to "1" by default if not already set
4
+ if "POLARS_FORCE_NEW_STREAMING" not in os.environ:
5
+ os.environ["POLARS_FORCE_NEW_STREAMING"] = "1"
6
+
1
7
  from polars_bio.polars_bio import GffReadOptions, InputFormat
2
8
  from polars_bio.polars_bio import PyObjectStorageOptions as ObjectStorageOptions
3
9
  from polars_bio.polars_bio import ReadOptions, VcfReadOptions
4
10
 
5
11
  from .context import ctx, set_option
6
- from .sql import SQL
7
-
8
- register_gff = SQL.register_gff
9
- register_vcf = SQL.register_vcf
10
- register_fastq = SQL.register_fastq
11
- register_bam = SQL.register_bam
12
- register_bed = SQL.register_bed
13
- register_view = SQL.register_view
14
-
15
- sql = SQL.sql
16
-
17
- from .io import IOOperations
18
-
19
- describe_vcf = IOOperations.describe_vcf
20
- from_polars = IOOperations.from_polars
21
- read_bam = IOOperations.read_bam
22
- read_fastq = IOOperations.read_fastq
23
- read_gff = IOOperations.read_gff
24
- read_table = IOOperations.read_table
25
- read_vcf = IOOperations.read_vcf
26
- read_fastq = IOOperations.read_fastq
27
- read_bed = IOOperations.read_bed
28
- read_fasta = IOOperations.read_fasta
29
-
30
- from .range_op import IntervalOperations
31
-
32
- overlap = IntervalOperations.overlap
33
- nearest = IntervalOperations.nearest
34
- count_overlaps = IntervalOperations.count_overlaps
35
- coverage = IntervalOperations.coverage
36
- merge = IntervalOperations.merge
12
+ from .sql import SQL as data_processing
13
+
14
+ register_gff = data_processing.register_gff
15
+ register_vcf = data_processing.register_vcf
16
+ register_fastq = data_processing.register_fastq
17
+ register_bam = data_processing.register_bam
18
+ register_bed = data_processing.register_bed
19
+ register_view = data_processing.register_view
20
+
21
+ sql = data_processing.sql
22
+
23
+ from .io import IOOperations as data_input
24
+
25
+ describe_vcf = data_input.describe_vcf
26
+ from_polars = data_input.from_polars
27
+ read_bam = data_input.read_bam
28
+ read_fastq = data_input.read_fastq
29
+ read_gff = data_input.read_gff
30
+ read_table = data_input.read_table
31
+ read_vcf = data_input.read_vcf
32
+ read_fastq = data_input.read_fastq
33
+ read_bed = data_input.read_bed
34
+ read_fasta = data_input.read_fasta
35
+ scan_bam = data_input.scan_bam
36
+ scan_bed = data_input.scan_bed
37
+ scan_fasta = data_input.scan_fasta
38
+ scan_fastq = data_input.scan_fastq
39
+ scan_gff = data_input.scan_gff
40
+ scan_table = data_input.scan_table
41
+ scan_vcf = data_input.scan_vcf
42
+
43
+
44
+ from .range_op import IntervalOperations as range_operations
45
+
46
+ overlap = range_operations.overlap
47
+ nearest = range_operations.nearest
48
+ count_overlaps = range_operations.count_overlaps
49
+ coverage = range_operations.coverage
50
+ merge = range_operations.merge
37
51
 
38
52
  try:
39
53
  from .range_utils import Utils
@@ -59,7 +73,7 @@ except ImportError:
59
73
  POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
60
74
 
61
75
 
62
- __version__ = "0.11.0"
76
+ __version__ = "0.13.0"
63
77
  __all__ = [
64
78
  "ctx",
65
79
  "FilterOp",
@@ -74,4 +88,33 @@ __all__ = [
74
88
  "ObjectStorageOptions",
75
89
  "set_option",
76
90
  "set_loglevel",
91
+ "describe_vcf",
92
+ "from_polars",
93
+ "read_bam",
94
+ "read_bed",
95
+ "read_fasta",
96
+ "read_fastq",
97
+ "read_gff",
98
+ "read_table",
99
+ "read_vcf",
100
+ "scan_bam",
101
+ "scan_bed",
102
+ "scan_fasta",
103
+ "scan_fastq",
104
+ "scan_gff",
105
+ "scan_table",
106
+ "scan_vcf",
107
+ "register_gff",
108
+ "register_vcf",
109
+ "register_fastq",
110
+ "register_bam",
111
+ "register_bed",
112
+ "register_view",
113
+ "sql",
114
+ "overlap",
115
+ "nearest",
116
+ "count_overlaps",
117
+ "coverage",
118
+ "merge",
119
+ "visualize_intervals",
77
120
  ]
@@ -81,12 +81,8 @@ def df_to_lazyframe(df: datafusion.DataFrame) -> pl.LazyFrame:
81
81
 
82
82
 
83
83
  def convert_result(
84
- df: datafusion.DataFrame, output_type: str, streaming: bool
84
+ df: datafusion.DataFrame, output_type: str
85
85
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame"]:
86
- # TODO: implement streaming
87
- if streaming:
88
- # raise NotImplementedError("streaming is not implemented")
89
- return df.to_polars().lazy()
90
86
  if output_type == "polars.DataFrame":
91
87
  return df.to_polars()
92
88
  elif output_type == "pandas.DataFrame":
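Taken together, the `__init__.py` changes do three things: default `POLARS_FORCE_NEW_STREAMING` to `"1"` at import time, expose a lazy `scan_*` counterpart for every eager `read_*` reader, and drop the `streaming` flag from `convert_result`. A minimal sketch of the new read/scan contract (the FASTQ path is a placeholder):

```python
import polars as pl
import polars_bio as pb  # importing sets POLARS_FORCE_NEW_STREAMING=1 unless already set

# scan_* is lazy: nothing is read until .collect()
lf: pl.LazyFrame = pb.scan_fastq("/tmp/example.fastq")

# read_* is now just scan_*(...).collect() and returns an eager DataFrame
df: pl.DataFrame = pb.read_fastq("/tmp/example.fastq")

assert df.equals(lf.collect())
```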
polars_bio/io.py CHANGED
@@ -70,29 +70,6 @@ SCHEMAS = {
70
70
 
71
71
 
72
72
  class IOOperations:
73
- # TODO handling reference
74
- # def read_cram(path: str) -> pl.LazyFrame:
75
- # """
76
- # Read a CRAM file into a LazyFrame.
77
- #
78
- # Parameters:
79
- # path: The path to the CRAM file.
80
- # """
81
- # return file_lazy_scan(path, InputFormat.Cram)
82
-
83
- # TODO passing of bam_region_filter
84
- # def read_indexed_bam(path: str) -> pl.LazyFrame:
85
- # """
86
- # Read an indexed BAM file into a LazyFrame.
87
- #
88
- # Parameters:
89
- # path: The path to the BAM file.
90
- #
91
- # !!! warning
92
- # Predicate pushdown is not supported yet. So no real benefit from using an indexed BAM file.
93
- # """
94
- # return file_lazy_scan(path, InputFormat.IndexedBam)
95
-
96
73
  @staticmethod
97
74
  def read_fasta(
98
75
  path: str,
@@ -103,11 +80,10 @@ class IOOperations:
103
80
  max_retries: int = 5,
104
81
  timeout: int = 300,
105
82
  compression_type: str = "auto",
106
- streaming: bool = False,
107
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
83
+ ) -> pl.DataFrame:
108
84
  """
109
85
 
110
- Read a FASTA file into a LazyFrame.
86
+ Read a FASTA file into a DataFrame.
111
87
 
112
88
  Parameters:
113
89
  path: The path to the FASTA file.
@@ -118,7 +94,6 @@ class IOOperations:
118
94
  max_retries: The maximum number of retries for reading the file from object storage.
119
95
  timeout: The timeout in seconds for reading the file from object storage.
120
96
  compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
121
- streaming: Whether to read the FASTA file in streaming mode.
122
97
 
123
98
  !!! Example
124
99
  ```shell
@@ -127,7 +102,63 @@ class IOOperations:
127
102
 
128
103
  ```python
129
104
  import polars_bio as pb
130
- pb.read_fasta("/tmp/test.fasta").limit(1).collect()
105
+ pb.read_fasta("/tmp/test.fasta").limit(1)
106
+ ```
107
+ ```shell
108
+ shape: (1, 3)
109
+ ┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
110
+ │ name ┆ description ┆ sequence │
111
+ │ --- ┆ --- ┆ --- │
112
+ │ str ┆ str ┆ str │
113
+ ╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
114
+ │ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
115
+ └─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
116
+ ```
117
+ """
118
+ return IOOperations.scan_fasta(
119
+ path,
120
+ chunk_size,
121
+ concurrent_fetches,
122
+ allow_anonymous,
123
+ enable_request_payer,
124
+ max_retries,
125
+ timeout,
126
+ compression_type,
127
+ ).collect()
128
+
129
+ @staticmethod
130
+ def scan_fasta(
131
+ path: str,
132
+ chunk_size: int = 8,
133
+ concurrent_fetches: int = 1,
134
+ allow_anonymous: bool = True,
135
+ enable_request_payer: bool = False,
136
+ max_retries: int = 5,
137
+ timeout: int = 300,
138
+ compression_type: str = "auto",
139
+ ) -> pl.LazyFrame:
140
+ """
141
+
142
+ Lazily read a FASTA file into a LazyFrame.
143
+
144
+ Parameters:
145
+ path: The path to the FASTA file.
146
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
147
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
148
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
149
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
150
+ max_retries: The maximum number of retries for reading the file from object storage.
151
+ timeout: The timeout in seconds for reading the file from object storage.
152
+ compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
153
+
154
+ !!! Example
155
+ ```shell
156
+ wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta
157
+ ```
158
+
159
+ ```python
160
+ import polars_bio as pb
161
+ pb.scan_fasta("/tmp/test.fasta").limit(1).collect()
131
162
  ```
132
163
  ```shell
133
164
  shape: (1, 3)
@@ -153,11 +184,7 @@ class IOOperations:
153
184
  object_storage_options=object_storage_options
154
185
  )
155
186
  read_options = ReadOptions(fasta_read_options=fasta_read_options)
156
- if streaming:
157
- return read_file(path, InputFormat.Fasta, read_options, streaming)
158
- else:
159
- df = read_file(path, InputFormat.Fasta, read_options)
160
- return lazy_scan(df)
187
+ return _read_file(path, InputFormat.Fasta, read_options)
161
188
 
162
189
  @staticmethod
163
190
  def read_vcf(
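Because `scan_fasta` now goes through the polars IO-plugin path, row limits and projections are applied per batch rather than after a full read. A small usage sketch, reusing the placeholder path from the docstring example above:

```python
import polars_bio as pb

# Lazily select two of the three FASTA columns and stop after one record.
first = (
    pb.scan_fasta("/tmp/test.fasta")
    .select(["name", "sequence"])
    .limit(1)
    .collect()
)
print(first)
```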
@@ -171,10 +198,53 @@ class IOOperations:
171
198
  max_retries: int = 5,
172
199
  timeout: int = 300,
173
200
  compression_type: str = "auto",
174
- streaming: bool = False,
175
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
201
+ ) -> pl.DataFrame:
176
202
  """
177
- Read a VCF file into a LazyFrame.
203
+ Read a VCF file into a DataFrame.
204
+
205
+ Parameters:
206
+ path: The path to the VCF file.
207
+ info_fields: The fields to read from the INFO column.
208
+ thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
209
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
210
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
211
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
212
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
213
+ max_retries: The maximum number of retries for reading the file from object storage.
214
+ timeout: The timeout in seconds for reading the file from object storage.
215
+ compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
216
+
217
+ !!! note
218
+ VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
219
+ """
220
+ return IOOperations.scan_vcf(
221
+ path,
222
+ info_fields,
223
+ thread_num,
224
+ chunk_size,
225
+ concurrent_fetches,
226
+ allow_anonymous,
227
+ enable_request_payer,
228
+ max_retries,
229
+ timeout,
230
+ compression_type,
231
+ ).collect()
232
+
233
+ @staticmethod
234
+ def scan_vcf(
235
+ path: str,
236
+ info_fields: Union[list[str], None] = None,
237
+ thread_num: int = 1,
238
+ chunk_size: int = 8,
239
+ concurrent_fetches: int = 1,
240
+ allow_anonymous: bool = True,
241
+ enable_request_payer: bool = False,
242
+ max_retries: int = 5,
243
+ timeout: int = 300,
244
+ compression_type: str = "auto",
245
+ ) -> pl.LazyFrame:
246
+ """
247
+ Lazily read a VCF file into a LazyFrame.
178
248
 
179
249
  Parameters:
180
250
  path: The path to the VCF file.
@@ -187,7 +257,6 @@ class IOOperations:
187
257
  max_retries: The maximum number of retries for reading the file from object storage.
188
258
  timeout: The timeout in seconds for reading the file from object storage.
189
259
  compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
190
- streaming: Whether to read the VCF file in streaming mode.
191
260
 
192
261
  !!! note
193
262
  VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
@@ -208,11 +277,7 @@ class IOOperations:
208
277
  object_storage_options=object_storage_options,
209
278
  )
210
279
  read_options = ReadOptions(vcf_read_options=vcf_read_options)
211
- if streaming:
212
- return read_file(path, InputFormat.Vcf, read_options, streaming)
213
- else:
214
- df = read_file(path, InputFormat.Vcf, read_options)
215
- return lazy_scan(df)
280
+ return _read_file(path, InputFormat.Vcf, read_options)
216
281
 
217
282
  @staticmethod
218
283
  def read_gff(
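For illustration, a hedged `scan_vcf` sketch; the path is a placeholder and the INFO field names (`AF`, `DP`) are assumptions that depend on the actual VCF header:

```python
import polars as pl
import polars_bio as pb

lf = pb.scan_vcf(
    "/tmp/example.vcf.bgz",    # placeholder path
    info_fields=["AF", "DP"],  # hypothetical INFO fields
    thread_num=4,              # parallel BGZF decompression, local files only
)
# start/end are 1-based, as the docstring notes
chr1 = lf.filter(pl.col("chrom") == "chr1").collect()
```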
@@ -226,10 +291,9 @@ class IOOperations:
226
291
  max_retries: int = 5,
227
292
  timeout: int = 300,
228
293
  compression_type: str = "auto",
229
- streaming: bool = False,
230
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
294
+ ) -> pl.DataFrame:
231
295
  """
232
- Read a GFF file into a LazyFrame.
296
+ Read a GFF file into a DataFrame.
233
297
 
234
298
  Parameters:
235
299
  path: The path to the GFF file.
@@ -242,58 +306,51 @@ class IOOperations:
242
306
  max_retries: The maximum number of retries for reading the file from object storage.
243
307
  timeout: The timeout in seconds for reading the file from object storage.
244
308
  compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
245
- streaming: Whether to read the GFF file in streaming mode.
246
309
 
310
+ !!! note
311
+ GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
312
+ """
313
+ return IOOperations.scan_gff(
314
+ path,
315
+ attr_fields,
316
+ thread_num,
317
+ chunk_size,
318
+ concurrent_fetches,
319
+ allow_anonymous,
320
+ enable_request_payer,
321
+ max_retries,
322
+ timeout,
323
+ compression_type,
324
+ ).collect()
247
325
 
248
- !!! Example
249
- ```shell
250
- wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gff3.gz -O /tmp/gencode.v38.annotation.gff3.gz
251
- ```
252
- Read a GFF file **without** unnesting attributes:
253
- ```python
254
- import polars_bio as pb
255
- gff_path = "/tmp/gencode.v38.annotation.gff3.gz"
256
- pb.read_gff(gff_path).limit(5).collect()
257
- ```
258
-
259
- ```shell
260
-
261
- shape: (5, 9)
262
- ┌───────┬───────┬───────┬────────────┬───┬───────┬────────┬───────┬─────────────────────────────────┐
263
- │ chrom ┆ start ┆ end ┆ type ┆ … ┆ score ┆ strand ┆ phase ┆ attributes │
264
- │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
265
- │ str ┆ u32 ┆ u32 ┆ str ┆ ┆ f32 ┆ str ┆ u32 ┆ list[struct[2]] │
266
- ╞═══════╪═══════╪═══════╪════════════╪═══╪═══════╪════════╪═══════╪═════════════════════════════════╡
267
- │ chr1 ┆ 11869 ┆ 14409 ┆ gene ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","ENSG00000223972.5"}, {… │
268
- │ chr1 ┆ 11869 ┆ 14409 ┆ transcript ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","ENST00000456328.2"}, {… │
269
- │ chr1 ┆ 11869 ┆ 12227 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
270
- │ chr1 ┆ 12613 ┆ 12721 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
271
- │ chr1 ┆ 13221 ┆ 14409 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
272
- └───────┴───────┴───────┴────────────┴───┴───────┴────────┴───────┴─────────────────────────────────┘
273
-
274
- ```
326
+ @staticmethod
327
+ def scan_gff(
328
+ path: str,
329
+ attr_fields: Union[list[str], None] = None,
330
+ thread_num: int = 1,
331
+ chunk_size: int = 8,
332
+ concurrent_fetches: int = 1,
333
+ allow_anonymous: bool = True,
334
+ enable_request_payer: bool = False,
335
+ max_retries: int = 5,
336
+ timeout: int = 300,
337
+ compression_type: str = "auto",
338
+ ) -> pl.LazyFrame:
339
+ """
340
+ Lazily read a GFF file into a LazyFrame.
275
341
 
276
- Read a GFF file **with** unnesting attributes:
277
- ```python
278
- import polars_bio as pb
279
- gff_path = "/tmp/gencode.v38.annotation.gff3.gz"
280
- pb.read_gff(gff_path, attr_fields=["ID", "havana_transcript"]).limit(5).collect()
281
- ```
282
- ```shell
342
+ Parameters:
343
+ path: The path to the GFF file.
344
+ attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields will be rendered as an `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
345
+ thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
346
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
347
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
348
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
349
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
350
+ max_retries: The maximum number of retries for reading the file from object storage.
351
+ timeout: The timeout in seconds for reading the file from object storage.
352
+ compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
283
353
 
284
- shape: (5, 10)
285
- ┌───────┬───────┬───────┬────────────┬───┬────────┬───────┬──────────────────────────┬──────────────────────┐
286
- │ chrom ┆ start ┆ end ┆ type ┆ … ┆ strand ┆ phase ┆ ID ┆ havana_transcript │
287
- │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
288
- │ str ┆ u32 ┆ u32 ┆ str ┆ ┆ str ┆ u32 ┆ str ┆ str │
289
- ╞═══════╪═══════╪═══════╪════════════╪═══╪════════╪═══════╪══════════════════════════╪══════════════════════╡
290
- │ chr1 ┆ 11869 ┆ 14409 ┆ gene ┆ … ┆ + ┆ null ┆ ENSG00000223972.5 ┆ null │
291
- │ chr1 ┆ 11869 ┆ 14409 ┆ transcript ┆ … ┆ + ┆ null ┆ ENST00000456328.2 ┆ OTTHUMT00000362751.1 │
292
- │ chr1 ┆ 11869 ┆ 12227 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:1 ┆ OTTHUMT00000362751.1 │
293
- │ chr1 ┆ 12613 ┆ 12721 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:2 ┆ OTTHUMT00000362751.1 │
294
- │ chr1 ┆ 13221 ┆ 14409 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:3 ┆ OTTHUMT00000362751.1 │
295
- └───────┴───────┴───────┴────────────┴───┴────────┴───────┴──────────────────────────┴──────────────────────┘
296
- ```
297
354
  !!! note
298
355
  GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
299
356
  """
@@ -313,11 +370,7 @@ class IOOperations:
313
370
  object_storage_options=object_storage_options,
314
371
  )
315
372
  read_options = ReadOptions(gff_read_options=gff_read_options)
316
- if streaming:
317
- return read_file(path, InputFormat.Gff, read_options, streaming)
318
- else:
319
- df = read_file(path, InputFormat.Gff, read_options)
320
- return lazy_scan(df)
373
+ return _read_file(path, InputFormat.Gff, read_options)
321
374
 
322
375
  @staticmethod
323
376
  def read_bam(
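The worked GFF examples removed from the docstring still apply, with collect-style usage moving to `scan_gff`. Reassembled from the deleted example (same gencode path):

```python
import polars_bio as pb

gff_path = "/tmp/gencode.v38.annotation.gff3.gz"

# Without unnesting: attributes stay as list[struct{tag, value}]
raw = pb.scan_gff(gff_path).limit(5).collect()

# With unnesting: the selected attribute tags become top-level columns
flat = (
    pb.scan_gff(gff_path, attr_fields=["ID", "havana_transcript"])
    .limit(5)
    .collect()
)
```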
@@ -329,10 +382,9 @@ class IOOperations:
329
382
  enable_request_payer: bool = False,
330
383
  max_retries: int = 5,
331
384
  timeout: int = 300,
332
- streaming: bool = False,
333
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
385
+ ) -> pl.DataFrame:
334
386
  """
335
- Read a BAM file into a LazyFrame.
387
+ Read a BAM file into a DataFrame.
336
388
 
337
389
  Parameters:
338
390
  path: The path to the BAM file.
@@ -343,33 +395,44 @@ class IOOperations:
343
395
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
344
396
  max_retries: The maximum number of retries for reading the file from object storage.
345
397
  timeout: The timeout in seconds for reading the file from object storage.
346
- streaming: Whether to read the BAM file in streaming mode.
347
398
 
348
- !!! Example
399
+ !!! note
400
+ BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
401
+ """
402
+ return IOOperations.scan_bam(
403
+ path,
404
+ thread_num,
405
+ chunk_size,
406
+ concurrent_fetches,
407
+ allow_anonymous,
408
+ enable_request_payer,
409
+ max_retries,
410
+ timeout,
411
+ ).collect()
349
412
 
350
- ```python
351
- import polars_bio as pb
352
- bam = pb.read_bam("gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam").limit(3)
353
- bam.collect()
354
- ```
355
- ```shell
356
- INFO:polars_bio:Table: hg00096_mapped_illumina_bwa_gbr_low_coverage_20120522 registered for path: gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam
357
- shape: (3, 11)
358
- ┌────────────────────┬───────┬───────┬───────┬───┬────────────┬────────────┬─────────────────────────────────┬─────────────────────────────────┐
359
- │ name ┆ chrom ┆ start ┆ end ┆ … ┆ mate_chrom ┆ mate_start ┆ sequence ┆ quality_scores │
360
- --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
361
- │ str ┆ str ┆ u32 ┆ u32 ┆ ┆ str ┆ u32 ┆ str ┆ str │
362
- ╞════════════════════╪═══════╪═══════╪═══════╪═══╪════════════╪════════════╪═════════════════════════════════╪═════════════════════════════════╡
363
- │ SRR062634.9882510 ┆ chr1 ┆ 10001 ┆ 10044 ┆ … ┆ chr1 ┆ 10069 ┆ TAACCCTAACCCTACCCTAACCCTAACCCT… ┆ 0<>=/0E:7;08FBDIF9;2%=<>+FCDDA… │
364
- │ SRR062641.21956756 ┆ chr1 ┆ 10001 ┆ 10049 ┆ … ┆ chr1 ┆ 10051 ┆ TAACCCTACCCTAACCCTAACCCTAACCCT… ┆ 0=MLOOPNNPPJHPOQQROQPQQRIQPRJB… │
365
- │ SRR062641.13613107 ┆ chr1 ┆ 10002 ┆ 10072 ┆ … ┆ chr1 ┆ 10110 ┆ AACCCTAACCCCTAACCCCTAACCCCTAAC… ┆ 0KKNPQOQOQIQRPQPRRRRPQPRRRRPRF… │
366
- └────────────────────┴───────┴───────┴───────┴───┴────────────┴────────────┴─────────────────────────────────┴─────────────────────────────────┘
367
- ```
413
+ @staticmethod
414
+ def scan_bam(
415
+ path: str,
416
+ thread_num: int = 1,
417
+ chunk_size: int = 8,
418
+ concurrent_fetches: int = 1,
419
+ allow_anonymous: bool = True,
420
+ enable_request_payer: bool = False,
421
+ max_retries: int = 5,
422
+ timeout: int = 300,
423
+ ) -> pl.LazyFrame:
424
+ """
425
+ Lazily read a BAM file into a LazyFrame.
368
426
 
369
- ```python
370
- bam.collect_schema()
371
- Schema({'name': String, 'chrom': String, 'start': UInt32, 'end': UInt32, 'flags': UInt32, 'cigar': String, 'mapping_quality': UInt32, 'mate_chrom': String, 'mate_start': UInt32, 'sequence': String, 'quality_scores': String})
372
- ```
427
+ Parameters:
428
+ path: The path to the BAM file.
429
+ thread_num: The number of threads to use for reading the BAM file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
430
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
431
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
432
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
433
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
434
+ max_retries: The maximum number of retries for reading the file from object storage.
435
+ timeout: The timeout in seconds for reading the file from object storage.
373
436
 
374
437
  !!! note
375
438
  BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -389,11 +452,7 @@ class IOOperations:
389
452
  object_storage_options=object_storage_options,
390
453
  )
391
454
  read_options = ReadOptions(bam_read_options=bam_read_options)
392
- if streaming:
393
- return read_file(path, InputFormat.Bam, read_options, streaming)
394
- else:
395
- df = read_file(path, InputFormat.Bam, read_options)
396
- return lazy_scan(df)
455
+ return _read_file(path, InputFormat.Bam, read_options)
397
456
 
398
457
  @staticmethod
399
458
  def read_fastq(
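Likewise, the removed BAM docstring example translates directly to `scan_bam`; the public 1000 Genomes path below is the one from the deleted example:

```python
import polars_bio as pb

bam = "gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam"

# allow_anonymous=True is the default, so no GCS credentials are needed here.
lf = pb.scan_bam(bam)
print(lf.limit(3).collect())
print(lf.collect_schema())
```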
@@ -405,11 +464,10 @@ class IOOperations:
405
464
  max_retries: int = 5,
406
465
  timeout: int = 300,
407
466
  compression_type: str = "auto",
408
- streaming: bool = False,
409
467
  parallel: bool = False,
410
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
468
+ ) -> pl.DataFrame:
411
469
  """
412
- Read a FASTQ file into a LazyFrame.
470
+ Read a FASTQ file into a DataFrame.
413
471
 
414
472
  Parameters:
415
473
  path: The path to the FASTQ file.
@@ -420,44 +478,46 @@ class IOOperations:
420
478
  max_retries: The maximum number of retries for reading the file from object storage.
421
479
  timeout: The timeout in seconds for reading the file from object storage.
422
480
  compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
423
- streaming: Whether to read the FASTQ file in streaming mode.
424
481
  parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
482
+ """
483
+ return IOOperations.scan_fastq(
484
+ path,
485
+ chunk_size,
486
+ concurrent_fetches,
487
+ allow_anonymous,
488
+ enable_request_payer,
489
+ max_retries,
490
+ timeout,
491
+ compression_type,
492
+ parallel,
493
+ ).collect()
425
494
 
426
- !!! Example
427
-
428
- ```python
429
- import polars_bio as pb
430
- pb.read_fastq("gs://genomics-public-data/platinum-genomes/fastq/ERR194146.fastq.gz").limit(1).collect()
431
- ```
432
- ```shell
433
- shape: (1, 4)
434
- ┌─────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
435
- │ name ┆ description ┆ sequence ┆ quality_scores │
436
- --- ┆ --- ┆ --- ┆ --- │
437
- │ str ┆ str ┆ str ┆ str │
438
- ╞═════════════════════╪═════════════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
439
- │ ERR194146.812444541 ┆ HSQ1008:141:D0CC8ACXX:2:1204:1… ┆ TGGAAGGTTCTCGAAAAAAATGGAATCGAA… ┆ ?@;DDBDDBHF??FFB@B)1:CD3*:?DFF… │
440
- └─────────────────────┴─────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
441
-
442
- ```
443
-
444
- Parallel reading of BZGF compressed FASTQ files stored locally:
445
- ```shell
446
- ls -1 /tmp/ERR194146.fastq.bgz*
447
- ERR194146.fastq.bgz
448
- ERR194146.fastq.bgz.gzi
449
- ```
450
-
451
- ```python
452
- import polars_bio as pb
453
- ## Set the number of target partitions (threads) to 2
454
- pb.set_option("datafusion.execution.target_partitions", "2")
455
- pb.read_fastq("/tmp/ERR194146.fastq.bgz", parallel=True).count().collect()
456
- ```
457
-
458
-
495
+ @staticmethod
496
+ def scan_fastq(
497
+ path: str,
498
+ chunk_size: int = 8,
499
+ concurrent_fetches: int = 1,
500
+ allow_anonymous: bool = True,
501
+ enable_request_payer: bool = False,
502
+ max_retries: int = 5,
503
+ timeout: int = 300,
504
+ compression_type: str = "auto",
505
+ parallel: bool = False,
506
+ ) -> pl.LazyFrame:
459
507
  """
508
+ Lazily read a FASTQ file into a LazyFrame.
460
509
 
510
+ Parameters:
511
+ path: The path to the FASTQ file.
512
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
513
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
514
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
515
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
516
+ max_retries: The maximum number of retries for reading the file from object storage.
517
+ timeout: The timeout in seconds for reading the file from object storage.
518
+ compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
519
+ parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
520
+ """
461
521
  object_storage_options = PyObjectStorageOptions(
462
522
  allow_anonymous=allow_anonymous,
463
523
  enable_request_payer=enable_request_payer,
@@ -472,11 +532,7 @@ class IOOperations:
472
532
  object_storage_options=object_storage_options, parallel=parallel
473
533
  )
474
534
  read_options = ReadOptions(fastq_read_options=fastq_read_options)
475
- if streaming:
476
- return read_file(path, InputFormat.Fastq, read_options, streaming)
477
- else:
478
- df = read_file(path, InputFormat.Fastq, read_options)
479
- return lazy_scan(df)
535
+ return _read_file(path, InputFormat.Fastq, read_options)
480
536
 
481
537
  @staticmethod
482
538
  def read_bed(
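The parallel-reader recipe dropped from the docstring carries over unchanged, assuming a local BGZF file plus its GZI index (`ERR194146.fastq.bgz` and `ERR194146.fastq.bgz.gzi`, as in the removed example):

```python
import polars_bio as pb

# Two DataFusion target partitions -> two parallel BGZF block readers.
pb.set_option("datafusion.execution.target_partitions", "2")

n_reads = (
    pb.scan_fastq("/tmp/ERR194146.fastq.bgz", parallel=True)
    .count()
    .collect()
)
```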
@@ -489,10 +545,9 @@ class IOOperations:
489
545
  max_retries: int = 5,
490
546
  timeout: int = 300,
491
547
  compression_type: str = "auto",
492
- streaming: bool = False,
493
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
548
+ ) -> pl.DataFrame:
494
549
  """
495
- Read a BED file into a LazyFrame.
550
+ Read a BED file into a DataFrame.
496
551
 
497
552
  Parameters:
498
553
  path: The path to the BED file.
@@ -504,44 +559,59 @@ class IOOperations:
504
559
  max_retries: The maximum number of retries for reading the file from object storage.
505
560
  timeout: The timeout in seconds for reading the file from object storage.
506
561
  compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
507
- streaming: Whether to read the BED file in streaming mode.
508
562
 
509
563
  !!! Note
510
564
  Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
511
565
  Also unlike other text formats, **GZIP** compression is not supported.
512
566
 
513
- !!! Example
514
- ```shell
567
+ !!! note
568
+ BED reader uses **1-based** coordinate system for the `start` and `end` columns.
569
+ """
570
+ return IOOperations.scan_bed(
571
+ path,
572
+ thread_num,
573
+ chunk_size,
574
+ concurrent_fetches,
575
+ allow_anonymous,
576
+ enable_request_payer,
577
+ max_retries,
578
+ timeout,
579
+ compression_type,
580
+ ).collect()
515
581
 
516
- cd /tmp
517
- wget https://webs.iiitd.edu.in/raghava/humcfs/fragile_site_bed.zip -O fragile_site_bed.zip
518
- unzip fragile_site_bed.zip -x "__MACOSX/*" "*/.DS_Store"
519
- ```
582
+ @staticmethod
583
+ def scan_bed(
584
+ path: str,
585
+ thread_num: int = 1,
586
+ chunk_size: int = 8,
587
+ concurrent_fetches: int = 1,
588
+ allow_anonymous: bool = True,
589
+ enable_request_payer: bool = False,
590
+ max_retries: int = 5,
591
+ timeout: int = 300,
592
+ compression_type: str = "auto",
593
+ ) -> pl.LazyFrame:
594
+ """
595
+ Lazily read a BED file into a LazyFrame.
520
596
 
521
- ```python
522
- import polars_bio as pb
523
- pb.read_bed("/tmp/fragile_site_bed/chr5_fragile_site.bed").limit(5).collect()
524
- ```
597
+ Parameters:
598
+ path: The path to the BED file.
599
+ thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
600
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
601
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
602
+ allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
603
+ enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
604
+ max_retries: The maximum number of retries for reading the file from object storage.
605
+ timeout: The timeout in seconds for reading the file from object storage.
606
+ compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
525
607
 
526
- ```shell
608
+ !!! Note
609
+ Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
610
+ Also unlike other text formats, **GZIP** compression is not supported.
527
611
 
528
- shape: (5, 4)
529
- ┌───────┬───────────┬───────────┬───────┐
530
- │ chrom ┆ start ┆ end ┆ name │
531
- │ --- ┆ --- ┆ --- ┆ --- │
532
- │ str ┆ u32 ┆ u32 ┆ str │
533
- ╞═══════╪═══════════╪═══════════╪═══════╡
534
- │ chr5 ┆ 28900001 ┆ 42500000 ┆ FRA5A │
535
- │ chr5 ┆ 92300001 ┆ 98200000 ┆ FRA5B │
536
- │ chr5 ┆ 130600001 ┆ 136200000 ┆ FRA5C │
537
- │ chr5 ┆ 92300001 ┆ 93916228 ┆ FRA5D │
538
- │ chr5 ┆ 18400001 ┆ 28900000 ┆ FRA5E │
539
- └───────┴───────────┴───────────┴───────┘
540
- ```
541
612
  !!! note
542
613
  BED reader uses **1-based** coordinate system for the `start` and `end` columns.
543
614
  """
544
-
545
615
  object_storage_options = PyObjectStorageOptions(
546
616
  allow_anonymous=allow_anonymous,
547
617
  enable_request_payer=enable_request_payer,
@@ -557,24 +627,31 @@ class IOOperations:
557
627
  object_storage_options=object_storage_options,
558
628
  )
559
629
  read_options = ReadOptions(bed_read_options=bed_read_options)
560
- if streaming:
561
- return read_file(path, InputFormat.Bed, read_options, streaming)
562
- else:
563
- df = read_file(path, InputFormat.Bed, read_options)
564
- return lazy_scan(df)
630
+ return _read_file(path, InputFormat.Bed, read_options)
565
631
 
566
632
  @staticmethod
567
- def read_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
633
+ def read_table(path: str, schema: Dict = None, **kwargs) -> pl.DataFrame:
568
634
  """
569
- Read a tab-delimited (i.e. BED) file into a Polars LazyFrame.
635
+ Read a tab-delimited (i.e. BED) file into a Polars DataFrame.
570
636
  Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
571
- but faster and lazy. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
637
+ but faster. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
572
638
 
573
639
  Parameters:
574
640
  path: The path to the file.
575
641
  schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
642
+ """
643
+ return IOOperations.scan_table(path, schema, **kwargs).collect()
576
644
 
645
+ @staticmethod
646
+ def scan_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
647
+ """
648
+ Lazily read a tab-delimited (i.e. BED) file into a Polars LazyFrame.
649
+ Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
650
+ but faster and lazy. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
577
651
 
652
+ Parameters:
653
+ path: The path to the file.
654
+ schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
578
655
  """
579
656
  df = pl.scan_csv(path, separator="\t", has_header=False, **kwargs)
580
657
  if schema is not None:
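A combined sketch for the two lazy readers above; the BED path comes from the removed docstring example, and the `schema="bed4"` string is an assumption about the Bioframe-style schema keys accepted by `scan_table`:

```python
import polars_bio as pb

# BED4: chrom, start, end, name (1-based start/end).
sites = pb.scan_bed("/tmp/fragile_site_bed/chr5_fragile_site.bed").limit(5).collect()

# Bioframe-compatible TSV reading; "bed4" is a hypothetical schema key here.
table = pb.scan_table(
    "/tmp/fragile_site_bed/chr5_fragile_site.bed", schema="bed4"
).collect()
```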
@@ -602,30 +679,6 @@ class IOOperations:
602
679
  allow_anonymous: Whether to allow anonymous access to object storage (GCS and S3 supported).
603
680
  enable_request_payer: Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
604
681
  compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
605
-
606
- !!! Example
607
- ```python
608
- import polars_bio as pb
609
- vcf_1 = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
610
- pb.describe_vcf(vcf_1, allow_anonymous=True).sort("name").limit(5)
611
- ```
612
-
613
- ```shell
614
- shape: (5, 3)
615
- ┌───────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────┐
616
- │ name ┆ type ┆ description │
617
- │ --- ┆ --- ┆ --- │
618
- │ str ┆ str ┆ str │
619
- ╞═══════════╪═════════╪══════════════════════════════════════════════════════════════════════════════╡
620
- │ AC ┆ Integer ┆ Number of non-reference alleles observed (biallelic sites only). │
621
- │ AC_XX ┆ Integer ┆ Number of non-reference XX alleles observed (biallelic sites only). │
622
- │ AC_XY ┆ Integer ┆ Number of non-reference XY alleles observed (biallelic sites only). │
623
- │ AC_afr ┆ Integer ┆ Number of non-reference African-American alleles observed (biallelic sites only). │
624
- │ AC_afr_XX ┆ Integer ┆ Number of non-reference African-American XX alleles observed (biallelic sites only). │
625
- └───────────┴─────────┴──────────────────────────────────────────────────────────────────────────────────────┘
626
-
627
-
628
- ```
629
682
  """
630
683
  object_storage_options = PyObjectStorageOptions(
631
684
  allow_anonymous=allow_anonymous,
@@ -646,30 +699,6 @@ class IOOperations:
646
699
  Parameters:
647
700
  name: The name of the table.
648
701
  df: The Polars DataFrame.
649
- !!! Example
650
- ```python
651
- import polars as pl
652
- import polars_bio as pb
653
- df = pl.DataFrame({
654
- "a": [1, 2, 3],
655
- "b": [4, 5, 6]
656
- })
657
- pb.from_polars("test_df", df)
658
- pb.sql("SELECT * FROM test_df").collect()
659
- ```
660
- ```shell
661
- 3rows [00:00, 2978.91rows/s]
662
- shape: (3, 2)
663
- ┌─────┬─────┐
664
- │ a ┆ b │
665
- │ --- ┆ --- │
666
- │ i64 ┆ i64 │
667
- ╞═════╪═════╡
668
- │ 1 ┆ 4 │
669
- │ 2 ┆ 5 │
670
- │ 3 ┆ 6 │
671
- └─────┴─────┘
672
- ```
673
702
  """
674
703
  reader = (
675
704
  df.to_arrow()
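The deleted `from_polars` example still describes the behavior: register a DataFrame under a SQL table name, then query it with `pb.sql`, which now always returns a LazyFrame:

```python
import polars as pl
import polars_bio as pb

df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
pb.from_polars("test_df", df)  # register under a SQL table name
out = pb.sql("SELECT * FROM test_df").collect()
```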
@@ -685,7 +714,7 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
685
714
  return [x.strip() for x in t]
686
715
 
687
716
 
688
- def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
717
+ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
689
718
  df_lazy: DataFrame = df
690
719
  arrow_schema = df_lazy.schema()
691
720
 
@@ -700,8 +729,6 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
700
729
  df = pl.DataFrame(df).limit(n_rows)
701
730
  if predicate is not None:
702
731
  df = df.filter(predicate)
703
- # TODO: We can push columns down to the DataFusion plan in the future,
704
- # but for now we'll do it here.
705
732
  if with_columns is not None:
706
733
  df = df.select(with_columns)
707
734
  yield df
@@ -713,8 +740,6 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
713
740
  df = pl.DataFrame(py_df)
714
741
  if predicate is not None:
715
742
  df = df.filter(predicate)
716
- # TODO: We can push columns down to the DataFusion plan in the future,
717
- # but for now we'll do it here.
718
743
  if with_columns is not None:
719
744
  df = df.select(with_columns)
720
745
  progress_bar.update(len(df))
@@ -723,31 +748,11 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
723
748
  return register_io_source(_overlap_source, schema=arrow_schema)
724
749
 
725
750
 
726
- def read_file(
751
+ def _read_file(
727
752
  path: str,
728
753
  input_format: InputFormat,
729
754
  read_options: ReadOptions,
730
- streaming: bool = False,
731
- ) -> Union[pl.LazyFrame, pl.DataFrame]:
732
- """
733
- Read a file into a DataFrame.
734
-
735
- Parameters
736
- ----------
737
- path : str
738
- The path to the file.
739
- input_format : InputFormat
740
- The input format of the file.
741
- read_options : ReadOptions, e.g. VcfReadOptions
742
- streaming: Whether to read the file in streaming mode.
743
-
744
- Returns
745
- -------
746
- pl.DataFrame
747
- The DataFrame.
748
- """
755
+ ) -> pl.LazyFrame:
749
756
  table = py_register_table(ctx, path, None, input_format, read_options)
750
- if streaming:
751
- return stream_wrapper(py_scan_table(ctx, table.name))
752
- else:
753
- return py_read_table(ctx, table.name)
757
+ df = py_read_table(ctx, table.name)
758
+ return _lazy_scan(df)
Binary file
polars_bio/range_op.py CHANGED
@@ -46,7 +46,6 @@ class IntervalOperations:
46
46
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
47
47
  algorithm: str = "Coitrees",
48
48
  output_type: str = "polars.LazyFrame",
49
- streaming: bool = False,
50
49
  read_options1: Union[ReadOptions, None] = None,
51
50
  read_options2: Union[ReadOptions, None] = None,
52
51
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
@@ -64,9 +63,8 @@ class IntervalOperations:
64
63
  genomic intervals, provided separately for each set.
65
64
  suffixes: Suffixes for the columns of the two overlapped sets.
66
65
  on_cols: List of additional column names to join on. default is None.
67
- algorithm: The algorithm to use for the overlap operation. Available options: Coitrees, IntervalTree, ArrayIntervalTree, Lapper
66
+ algorithm: The algorithm to use for the overlap operation. Available options: Coitrees, IntervalTree, ArrayIntervalTree, Lapper, SuperIntervals
68
67
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
69
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
70
68
  read_options1: Additional options for reading the input files.
71
69
  read_options2: Additional options for reading the input files.
72
70
 
@@ -122,7 +120,6 @@ class IntervalOperations:
122
120
  columns_1=cols1,
123
121
  columns_2=cols2,
124
122
  overlap_alg=algorithm,
125
- streaming=streaming,
126
123
  )
127
124
 
128
125
  return range_operation(
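With the `streaming` flag gone, `overlap` is driven entirely by `output_type`. A small sketch passing in-memory frames (positional `df1`/`df2` arguments are assumed from the signature context); `SuperIntervals` is the algorithm newly listed above:

```python
import polars as pl
import polars_bio as pb

a = pl.DataFrame({"chrom": ["chr1", "chr1"], "start": [100, 400], "end": [200, 500]})
b = pl.DataFrame({"chrom": ["chr1"], "start": [150], "end": [450]})

# Both input intervals of `a` overlap the single interval of `b`.
hits = pb.overlap(a, b, output_type="polars.DataFrame", algorithm="SuperIntervals")
```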
@@ -139,7 +136,6 @@ class IntervalOperations:
139
136
  cols1: Union[list[str], None] = ["chrom", "start", "end"],
140
137
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
141
138
  output_type: str = "polars.LazyFrame",
142
- streaming: bool = False,
143
139
  read_options: Union[ReadOptions, None] = None,
144
140
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
145
141
  """
@@ -157,7 +153,6 @@ class IntervalOperations:
157
153
  suffixes: Suffixes for the columns of the two overlapped sets.
158
154
  on_cols: List of additional column names to join on. default is None.
159
155
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
160
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
161
156
  read_options: Additional options for reading the input files.
162
157
 
163
158
 
@@ -186,7 +181,6 @@ class IntervalOperations:
186
181
  suffixes=suffixes,
187
182
  columns_1=cols1,
188
183
  columns_2=cols2,
189
- streaming=streaming,
190
184
  )
191
185
  return range_operation(df1, df2, range_options, output_type, ctx, read_options)
192
186
 
@@ -200,7 +194,6 @@ class IntervalOperations:
200
194
  cols1: Union[list[str], None] = ["chrom", "start", "end"],
201
195
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
202
196
  output_type: str = "polars.LazyFrame",
203
- streaming: bool = False,
204
197
  read_options: Union[ReadOptions, None] = None,
205
198
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
206
199
  """
@@ -218,7 +211,6 @@ class IntervalOperations:
218
211
  suffixes: Suffixes for the columns of the two overlapped sets.
219
212
  on_cols: List of additional column names to join on. default is None.
220
213
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
221
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
222
214
  read_options: Additional options for reading the input files.
223
215
 
224
216
 
@@ -252,7 +244,6 @@ class IntervalOperations:
252
244
  suffixes=suffixes,
253
245
  columns_1=cols1,
254
246
  columns_2=cols2,
255
- streaming=streaming,
256
247
  )
257
248
  return range_operation(df2, df1, range_options, output_type, ctx, read_options)
258
249
 
@@ -266,7 +257,6 @@ class IntervalOperations:
266
257
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
267
258
  on_cols: Union[list[str], None] = None,
268
259
  output_type: str = "polars.LazyFrame",
269
- streaming: bool = False,
270
260
  naive_query: bool = True,
271
261
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
272
262
  """
@@ -285,7 +275,6 @@ class IntervalOperations:
285
275
  on_cols: List of additional column names to join on. default is None.
286
276
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
287
277
  naive_query: If True, use naive query for counting overlaps based on overlaps.
288
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
289
278
  Returns:
290
279
  **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
291
280
 
@@ -335,7 +324,6 @@ class IntervalOperations:
335
324
  suffixes=suffixes,
336
325
  columns_1=cols1,
337
326
  columns_2=cols2,
338
- streaming=streaming,
339
327
  )
340
328
  return range_operation(df2, df1, range_options, output_type, ctx)
341
329
  df1 = read_df_to_datafusion(my_ctx, df1)
@@ -423,7 +411,7 @@ class IntervalOperations:
423
411
  )
424
412
  )
425
413
 
426
- return convert_result(df, output_type, streaming)
414
+ return convert_result(df, output_type)
427
415
 
428
416
  @staticmethod
429
417
  def merge(
@@ -433,7 +421,6 @@ class IntervalOperations:
433
421
  cols: Union[list[str], None] = ["chrom", "start", "end"],
434
422
  on_cols: Union[list[str], None] = None,
435
423
  output_type: str = "polars.LazyFrame",
436
- streaming: bool = False,
437
424
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
438
425
  """
439
426
  Merge overlapping intervals. It is assumed that start < end.
@@ -446,7 +433,6 @@ class IntervalOperations:
446
433
  genomic intervals, provided separately for each set.
447
434
  on_cols: List of additional column names for clustering. default is None.
448
435
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
449
- streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
450
436
 
451
437
  Returns:
452
438
  **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
@@ -574,4 +560,4 @@ class IntervalOperations:
574
560
  )
575
561
  )
576
562
 
577
- return convert_result(result, output_type, streaming)
563
+ return convert_result(result, output_type)
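`merge` follows the same pattern. A hedged sketch; the positional frame argument is an assumption, since the hunk above only shows the signature from `cols` onward:

```python
import polars as pl
import polars_bio as pb

df = pl.DataFrame({
    "chrom": ["chr1", "chr1", "chr1"],
    "start": [100, 150, 400],
    "end":   [200, 300, 500],
})
# [100,200] and [150,300] overlap and collapse; [400,500] survives intact.
merged = pb.merge(df, output_type="polars.DataFrame")
```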
@@ -44,19 +44,6 @@ def range_operation(
44
44
  len(supported_exts.intersection(ext2)) > 0 or len(ext2) == 0
45
45
  ), "Dataframe2 must be a Parquet, a BED or CSV or VCF file"
46
46
  # use suffixes to avoid column name conflicts
47
- if range_options.streaming:
48
- # FIXME: Parallelism is not supported
49
- # FIXME: StringViews not supported yet see: https://datafusion.apache.org/blog/2024/12/14/datafusion-python-43.1.0/
50
-
51
- ctx.set_option("datafusion.execution.target_partitions", "1", False)
52
- ctx.set_option(
53
- "datafusion.execution.parquet.schema_force_view_types", "false", True
54
- )
55
- return stream_wrapper(
56
- stream_range_operation_scan(
57
- ctx, df1, df2, range_options, read_options1, read_options2
58
- )
59
- )
60
47
 
61
48
  if range_options.range_op == RangeOp.CountOverlapsNaive:
62
49
  ## add count column to the schema
polars_bio/sql.py CHANGED
@@ -22,7 +22,7 @@ from polars_bio.polars_bio import (
22
22
  )
23
23
 
24
24
  from .context import ctx
25
- from .io import _cleanse_fields, lazy_scan
25
+ from .io import _cleanse_fields, _lazy_scan
26
26
  from .range_op_helpers import stream_wrapper
27
27
 
28
28
 
@@ -436,13 +436,12 @@ class SQL:
436
436
  py_register_table(ctx, path, name, InputFormat.Bam, read_options)
437
437
 
438
438
  @staticmethod
439
- def sql(query: str, streaming: bool = False) -> pl.LazyFrame:
439
+ def sql(query: str) -> pl.LazyFrame:
440
440
  """
441
441
  Execute a SQL query on the registered tables.
442
442
 
443
443
  Parameters:
444
444
  query: The SQL query.
445
- streaming: Whether to execute the query in streaming mode.
446
445
 
447
446
  !!! Example
448
447
  ```python
@@ -451,8 +450,5 @@ class SQL:
451
450
  pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
452
451
  ```
453
452
  """
454
- if streaming:
455
- return stream_wrapper(py_scan_sql(ctx, query))
456
- else:
457
- df = py_read_sql(ctx, query)
458
- return lazy_scan(df)
453
+ df = py_read_sql(ctx, query)
454
+ return _lazy_scan(df)
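After this change `SQL.sql` always routes through `py_read_sql` and `_lazy_scan`. Usage stays as in the docstring; the `register_vcf(path, name)` argument order is an assumption based on `py_register_table(ctx, path, name, ...)`:

```python
import polars_bio as pb

pb.register_vcf(
    "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz",
    "gnomad_v4_1_sv",
)
pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
```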
polars_bio/utils.py ADDED
@@ -0,0 +1,46 @@
1
+ from typing import Iterator, Union
2
+
3
+ import polars as pl
4
+ from datafusion import DataFrame
5
+ from polars.io.plugins import register_io_source
6
+ from tqdm.auto import tqdm
7
+
8
+
9
+ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
10
+ if t is None:
11
+ return None
12
+ return [x.strip() for x in t]
13
+
14
+
15
+ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
16
+ df_lazy: DataFrame = df
17
+ arrow_schema = df_lazy.schema()
18
+
19
+ def _overlap_source(
20
+ with_columns: Union[pl.Expr, None],
21
+ predicate: Union[pl.Expr, None],
22
+ n_rows: Union[int, None],
23
+ _batch_size: Union[int, None],
24
+ ) -> Iterator[pl.DataFrame]:
25
+ if n_rows and n_rows < 8192: # 8192 is the default batch size in datafusion
26
+ df = df_lazy.limit(n_rows).execute_stream().next().to_pyarrow()
27
+ df = pl.DataFrame(df).limit(n_rows)
28
+ if predicate is not None:
29
+ df = df.filter(predicate)
30
+ if with_columns is not None:
31
+ df = df.select(with_columns)
32
+ yield df
33
+ return
34
+ df_stream = df_lazy.execute_stream()
35
+ progress_bar = tqdm(unit="rows")
36
+ for r in df_stream:
37
+ py_df = r.to_pyarrow()
38
+ df = pl.DataFrame(py_df)
39
+ if predicate is not None:
40
+ df = df.filter(predicate)
41
+ if with_columns is not None:
42
+ df = df.select(with_columns)
43
+ progress_bar.update(len(df))
44
+ yield df
45
+
46
+ return register_io_source(_overlap_source, schema=arrow_schema)
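`_lazy_scan` is an instance of the polars IO-plugin pattern: `register_io_source` wraps a generator that receives the pushed-down projection, predicate, and row limit. A self-contained toy source with the same shape (everything below is illustrative, not part of polars-bio):

```python
from typing import Iterator, Optional

import polars as pl
from polars.io.plugins import register_io_source


def constant_frames(value: int, total_rows: int) -> pl.LazyFrame:
    schema = pl.Schema({"x": pl.Int64})

    def source(
        with_columns: Optional[list],
        predicate: Optional[pl.Expr],
        n_rows: Optional[int],
        batch_size: Optional[int],
    ) -> Iterator[pl.DataFrame]:
        rows = min(total_rows, n_rows) if n_rows is not None else total_rows
        df = pl.DataFrame({"x": [value] * rows})
        if predicate is not None:
            df = df.filter(predicate)      # predicate pushdown handled in-source
        if with_columns is not None:
            df = df.select(with_columns)   # projection pushdown
        yield df

    return register_io_source(source, schema=schema)


lazy = constant_frames(7, 100).limit(3)  # nothing materializes until .collect()
```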
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polars-bio
3
- Version: 0.11.0
3
+ Version: 0.13.0
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
7
- Requires-Dist: polars~=1.21.0
7
+ Requires-Dist: polars~=1.29.0
8
8
  Requires-Dist: pyarrow~=21.0.0
9
9
  Requires-Dist: datafusion~=48.0.0
10
10
  Requires-Dist: tqdm~=4.67.1
@@ -0,0 +1,19 @@
1
+ polars_bio-0.13.0.dist-info/METADATA,sha256=kRbYeTHKR-qtdAq4pD5bf8k1iUyHSriIAbUq3IUOO9o,683
2
+ polars_bio-0.13.0.dist-info/WHEEL,sha256=TiMJekJwYXi-5FCpHPqncJXv9UVKDzSHt4YRv5UDSSg,104
3
+ polars_bio-0.13.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
+ polars_bio/__init__.py,sha256=-4QHwzijNcm99A372_-pLCK-F3YrmO2my-HZgVjYEr8,2977
5
+ polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
6
+ polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
7
+ polars_bio/interval_op_helpers.py,sha256=xMWxu2y3jIwt0KCtzIPF_cvbUMdhrb8Mif74MbHU1qY,2834
8
+ polars_bio/io.py,sha256=YtcNqS0pzeTRZ78ckov4nfNekvWCyz5JGSHVl7LxfFQ,37866
9
+ polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
10
+ polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
11
+ polars_bio/polars_bio.abi3.so,sha256=x8hfvc1l_jRS8o0h0-fbmrb5nexx5URRCfsNSIE7b-I,277132364
12
+ polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
13
+ polars_bio/range_op.py,sha256=3LAYTmbJhv7WY8eB7_OJfPLLoR9eonbZSFKkZi_Dp30,24300
14
+ polars_bio/range_op_helpers.py,sha256=RQw6ZgIGhDh-3-pUTIQ56Vypuy9XQhpFGKQYGd_vrzY,5792
15
+ polars_bio/range_op_io.py,sha256=Kcxv9ebQtM25OWNITol1AXpDDMDzIi9uHeI1FhhP0Lk,5717
16
+ polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
17
+ polars_bio/sql.py,sha256=ORvSleiwUpkpewvgcFA3GeuyZhQXToq9RZ_XrO6iGxw,24164
18
+ polars_bio/utils.py,sha256=KAq8tbIf6yBFhRwzrRLBUfM6zbbdCqK_NYK5bUy1qfA,1565
19
+ polars_bio-0.13.0.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- polars_bio-0.11.0.dist-info/METADATA,sha256=ZCC8mNSP1aoNzqRMBnmaC5AcbLCiU4wZfe0-5dcEdAg,683
2
- polars_bio-0.11.0.dist-info/WHEEL,sha256=TiMJekJwYXi-5FCpHPqncJXv9UVKDzSHt4YRv5UDSSg,104
3
- polars_bio-0.11.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
- polars_bio/__init__.py,sha256=C3l2s2bJf1kTGHu5BHWHd2oyCFvZIuLO_vrP1e9oSiY,1939
5
- polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
6
- polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
7
- polars_bio/interval_op_helpers.py,sha256=3xg6IEpfyTPb3y1QzkSVGFLhtFVypGQfDhbJEGdHpgo,3006
8
- polars_bio/io.py,sha256=9y9fYO_xZN1Efc7JJ0_G1SrbQ7xqT5HEfip9h1WyrrI,41285
9
- polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
10
- polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
11
- polars_bio/polars_bio.abi3.so,sha256=OXYsznpzbNRv8uyEN8F2VAK9rV_Y3QLgLrSaVtTGKfg,268517156
12
- polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
13
- polars_bio/range_op.py,sha256=k6UzhSZIWQIj61zhOOkRcGD7ucFo8fwGaOUnyF6REIw,25122
14
- polars_bio/range_op_helpers.py,sha256=9MRGKhGmx_HnZEWP50tWQ4rdsdhoMf8m-08E0f_YxMs,6407
15
- polars_bio/range_op_io.py,sha256=Kcxv9ebQtM25OWNITol1AXpDDMDzIi9uHeI1FhhP0Lk,5717
16
- polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
17
- polars_bio/sql.py,sha256=PFTHeRyVouPyjSL26w60ByyAKZMLCsaeZ0wiJY_KH2k,24361
18
- polars_bio-0.11.0.dist-info/RECORD,,