polars-bio 0.13.1-cp39-abi3-win_amd64.whl → 0.14.0-cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_bio/__init__.py +1 -1
- polars_bio/io.py +338 -34
- polars_bio/polars_bio.pyd +0 -0
- polars_bio/range_op.py +36 -3
- polars_bio/range_op_helpers.py +10 -1
- polars_bio/range_op_io.py +43 -10
- polars_bio/sql.py +27 -12
- polars_bio/utils.py +85 -7
- {polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/METADATA +2 -1
- polars_bio-0.14.0.dist-info/RECORD +19 -0
- {polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/WHEEL +1 -1
- polars_bio-0.13.1.dist-info/RECORD +0 -19
- {polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/licenses/LICENSE +0 -0
polars_bio/__init__.py
CHANGED
polars_bio/io.py
CHANGED
@@ -17,6 +17,7 @@ from polars_bio.polars_bio import (
     VcfReadOptions,
     py_describe_vcf,
     py_from_polars,
+    py_read_sql,
     py_read_table,
     py_register_table,
     py_scan_table,
@@ -80,6 +81,7 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """

@@ -94,6 +96,7 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
+            projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.

         !!! Example
             ```shell
@@ -124,6 +127,7 @@ class IOOperations:
             max_retries,
             timeout,
             compression_type,
+            projection_pushdown,
         ).collect()

     @staticmethod
@@ -136,6 +140,7 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """

@@ -150,6 +155,7 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! Example
             ```shell
@@ -184,12 +190,11 @@ class IOOperations:
             object_storage_options=object_storage_options
         )
         read_options = ReadOptions(fasta_read_options=fasta_read_options)
-        return _read_file(path, InputFormat.Fasta, read_options)
+        return _read_file(path, InputFormat.Fasta, read_options, projection_pushdown)

     @staticmethod
     def read_vcf(
         path: str,
-        info_fields: Union[list[str], None] = None,
         thread_num: int = 1,
         chunk_size: int = 8,
         concurrent_fetches: int = 1,
@@ -198,13 +203,13 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """
         Read a VCF file into a DataFrame.

         Parameters:
             path: The path to the VCF file.
-            info_fields: The fields to read from the INFO column.
             thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
             chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
             concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -212,14 +217,14 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically..
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! note
             VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
         """
         return IOOperations.scan_vcf(
             path,
-            info_fields,
             thread_num,
             chunk_size,
             concurrent_fetches,
@@ -228,12 +233,12 @@ class IOOperations:
             max_retries,
             timeout,
             compression_type,
+            projection_pushdown,
         ).collect()

     @staticmethod
     def scan_vcf(
         path: str,
-        info_fields: Union[list[str], None] = None,
         thread_num: int = 1,
         chunk_size: int = 8,
         concurrent_fetches: int = 1,
@@ -242,13 +247,13 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a VCF file into a LazyFrame.

         Parameters:
             path: The path to the VCF file.
-            info_fields: The fields to read from the INFO column.
             thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
             chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
             concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -256,7 +261,8 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically..
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! note
             VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
@@ -271,18 +277,36 @@ class IOOperations:
             compression_type=compression_type,
         )

+        # Get all info fields from VCF header for proper projection pushdown
+        all_info_fields = None
+        try:
+            vcf_schema_df = IOOperations.describe_vcf(
+                path,
+                allow_anonymous=allow_anonymous,
+                enable_request_payer=enable_request_payer,
+                compression_type=compression_type,
+            )
+            # Use column name 'name' not 'id' based on the schema output
+            all_info_fields = vcf_schema_df.select("name").to_series().to_list()
+        except Exception:
+            # Fallback to None if unable to get info fields
+            all_info_fields = None
+
+        # Always start with all info fields to establish full schema
+        # The callback will re-register with only requested info fields for optimization
+        initial_info_fields = all_info_fields
+
         vcf_read_options = VcfReadOptions(
-            info_fields=
+            info_fields=initial_info_fields,
             thread_num=thread_num,
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(vcf_read_options=vcf_read_options)
-        return _read_file(path, InputFormat.Vcf, read_options)
+        return _read_file(path, InputFormat.Vcf, read_options, projection_pushdown)

     @staticmethod
     def read_gff(
         path: str,
-        attr_fields: Union[list[str], None] = None,
         thread_num: int = 1,
         chunk_size: int = 8,
         concurrent_fetches: int = 1,
@@ -291,13 +315,14 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
+        parallel: bool = False,
     ) -> pl.DataFrame:
         """
         Read a GFF file into a DataFrame.

         Parameters:
             path: The path to the GFF file.
-            attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
             thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
             chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
             concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -305,14 +330,15 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically..
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+            parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).

         !!! note
             GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
         """
         return IOOperations.scan_gff(
             path,
-            attr_fields,
             thread_num,
             chunk_size,
             concurrent_fetches,
@@ -321,12 +347,13 @@ class IOOperations:
             max_retries,
             timeout,
             compression_type,
+            projection_pushdown,
+            parallel,
         ).collect()

     @staticmethod
     def scan_gff(
         path: str,
-        attr_fields: Union[list[str], None] = None,
         thread_num: int = 1,
         chunk_size: int = 8,
         concurrent_fetches: int = 1,
@@ -335,21 +362,24 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
+        parallel: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a GFF file into a LazyFrame.

         Parameters:
             path: The path to the GFF file.
-            attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
             thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
             chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
-            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
             allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+            parallel: Whether to use the parallel reader for BGZF-compressed local files (use BGZF chunk-level parallelism similar to FASTQ).

         !!! note
             GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
@@ -365,12 +395,13 @@ class IOOperations:
         )

         gff_read_options = GffReadOptions(
-            attr_fields=
+            attr_fields=None,
             thread_num=thread_num,
             object_storage_options=object_storage_options,
+            parallel=parallel,
         )
         read_options = ReadOptions(gff_read_options=gff_read_options)
-        return _read_file(path, InputFormat.Gff, read_options)
+        return _read_file(path, InputFormat.Gff, read_options, projection_pushdown)

     @staticmethod
     def read_bam(
@@ -382,6 +413,7 @@ class IOOperations:
         enable_request_payer: bool = False,
         max_retries: int = 5,
         timeout: int = 300,
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """
         Read a BAM file into a DataFrame.
@@ -389,12 +421,13 @@ class IOOperations:
         Parameters:
             path: The path to the BAM file.
             thread_num: The number of threads to use for reading the BAM file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
-            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large
-            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large
+            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large-scale operations, it is recommended to increase this value to 64.
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
             allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! note
             BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -408,6 +441,7 @@ class IOOperations:
             enable_request_payer,
             max_retries,
             timeout,
+            projection_pushdown,
         ).collect()

     @staticmethod
@@ -420,6 +454,7 @@ class IOOperations:
         enable_request_payer: bool = False,
         max_retries: int = 5,
         timeout: int = 300,
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a BAM file into a LazyFrame.
@@ -433,6 +468,7 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! note
             BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -452,7 +488,7 @@ class IOOperations:
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(bam_read_options=bam_read_options)
-        return _read_file(path, InputFormat.Bam, read_options)
+        return _read_file(path, InputFormat.Bam, read_options, projection_pushdown)

     @staticmethod
     def read_fastq(
@@ -465,6 +501,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         parallel: bool = False,
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """
         Read a FASTQ file into a DataFrame.
@@ -479,6 +516,7 @@ class IOOperations:
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
             parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
         """
         return IOOperations.scan_fastq(
             path,
@@ -490,6 +528,7 @@ class IOOperations:
             timeout,
             compression_type,
             parallel,
+            projection_pushdown,
         ).collect()

     @staticmethod
@@ -503,6 +542,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         parallel: bool = False,
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a FASTQ file into a LazyFrame.
@@ -517,6 +557,7 @@ class IOOperations:
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
             parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
         """
         object_storage_options = PyObjectStorageOptions(
             allow_anonymous=allow_anonymous,
@@ -532,7 +573,7 @@ class IOOperations:
             object_storage_options=object_storage_options, parallel=parallel
         )
         read_options = ReadOptions(fastq_read_options=fastq_read_options)
-        return _read_file(path, InputFormat.Fastq, read_options)
+        return _read_file(path, InputFormat.Fastq, read_options, projection_pushdown)

     @staticmethod
     def read_bed(
@@ -545,6 +586,7 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.DataFrame:
         """
         Read a BED file into a DataFrame.
@@ -559,6 +601,7 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compressions is supported ('bgz').
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! Note
             Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
@@ -577,6 +620,7 @@ class IOOperations:
             max_retries,
             timeout,
             compression_type,
+            projection_pushdown,
         ).collect()

     @staticmethod
@@ -590,6 +634,7 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
+        projection_pushdown: bool = False,
     ) -> pl.LazyFrame:
         """
         Lazily read a BED file into a LazyFrame.
@@ -604,6 +649,7 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compressions is supported ('bgz').
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         !!! Note
             Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
@@ -627,7 +673,7 @@ class IOOperations:
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(bed_read_options=bed_read_options)
-        return _read_file(path, InputFormat.Bed, read_options)
+        return _read_file(path, InputFormat.Bed, read_options, projection_pushdown)

     @staticmethod
     def read_table(path: str, schema: Dict = None, **kwargs) -> pl.DataFrame:
@@ -678,7 +724,7 @@ class IOOperations:
             path: The path to the VCF file.
             allow_anonymous: Whether to allow anonymous access to object storage (GCS and S3 supported).
             enable_request_payer: Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
-            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically
+            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically..
         """
         object_storage_options = PyObjectStorageOptions(
             allow_anonymous=allow_anonymous,
@@ -714,9 +760,15 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
     return [x.strip() for x in t]


-def _lazy_scan(
+def _lazy_scan(
+    df: Union[pl.DataFrame, pl.LazyFrame],
+    projection_pushdown: bool = False,
+    table_name: str = None,
+    input_format: InputFormat = None,
+    file_path: str = None,
+) -> pl.LazyFrame:
     df_lazy: DataFrame = df
-
+    original_schema = df_lazy.schema()

     def _overlap_source(
         with_columns: Union[pl.Expr, None],
@@ -724,35 +776,287 @@ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
         n_rows: Union[int, None],
         _batch_size: Union[int, None],
     ) -> Iterator[pl.DataFrame]:
+        # Extract column names from with_columns if projection pushdown is enabled
+        projected_columns = None
+        if projection_pushdown and with_columns is not None:
+            projected_columns = _extract_column_names_from_expr(with_columns)
+
+        # Projection pushdown is handled natively by table providers
+        query_df = df_lazy
+
+        # Apply column projection to DataFusion query if enabled
+        datafusion_projection_applied = False
+
+        if projection_pushdown and projected_columns:
+            try:
+                # Apply projection at the DataFusion level using SQL
+                # This approach works reliably with the DataFusion Python API
+                columns_sql = ", ".join([f'"{c}"' for c in projected_columns])
+
+                # Use the table name passed from _read_file, fallback if not available
+                table_to_query = table_name if table_name else "temp_table"
+
+                # Use py_read_sql to execute SQL projection (same as pb.sql() does)
+                from .context import ctx
+
+                query_df = py_read_sql(
+                    ctx, f"SELECT {columns_sql} FROM {table_to_query}"
+                )
+                datafusion_projection_applied = True
+            except Exception as e:
+                # Fallback to original behavior if projection fails
+                print(f"DataFusion projection failed: {e}")
+                query_df = df_lazy
+                projected_columns = None
+                datafusion_projection_applied = False
+
         if n_rows and n_rows < 8192:  # 8192 is the default batch size in datafusion
-            df =
+            df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
             df = pl.DataFrame(df).limit(n_rows)
             if predicate is not None:
                 df = df.filter(predicate)
-            if
+            # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
+            if with_columns is not None and (
+                not projection_pushdown or not datafusion_projection_applied
+            ):
                 df = df.select(with_columns)
             yield df
             return
-
+
+        df_stream = query_df.execute_stream()
         progress_bar = tqdm(unit="rows")
         for r in df_stream:
             py_df = r.to_pyarrow()
             df = pl.DataFrame(py_df)
             if predicate is not None:
                 df = df.filter(predicate)
-            if
+            # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
+            if with_columns is not None and (
+                not projection_pushdown or not datafusion_projection_applied
+            ):
                 df = df.select(with_columns)
             progress_bar.update(len(df))
             yield df

-    return register_io_source(_overlap_source, schema=
+    return register_io_source(_overlap_source, schema=original_schema)
+
+
+def _extract_column_names_from_expr(with_columns: Union[pl.Expr, list]) -> list[str]:
+    """Extract column names from Polars expressions."""
+    if with_columns is None:
+        return []
+
+    # Handle different types of with_columns input
+    if hasattr(with_columns, "__iter__") and not isinstance(with_columns, str):
+        # It's a list of expressions or strings
+        column_names = []
+        for item in with_columns:
+            if isinstance(item, str):
+                column_names.append(item)
+            elif hasattr(item, "meta") and hasattr(item.meta, "output_name"):
+                # Polars expression with output name
+                try:
+                    column_names.append(item.meta.output_name())
+                except Exception:
+                    pass
+        return column_names
+    elif isinstance(with_columns, str):
+        return [with_columns]
+    elif hasattr(with_columns, "meta") and hasattr(with_columns.meta, "output_name"):
+        # Single Polars expression
+        try:
+            return [with_columns.meta.output_name()]
+        except Exception:
+            pass
+
+    return []


 def _read_file(
     path: str,
     input_format: InputFormat,
     read_options: ReadOptions,
+    projection_pushdown: bool = False,
 ) -> pl.LazyFrame:
     table = py_register_table(ctx, path, None, input_format, read_options)
     df = py_read_table(ctx, table.name)
-
+
+    lf = _lazy_scan(df, projection_pushdown, table.name, input_format, path)
+
+    # Wrap GFF LazyFrames with projection-aware wrapper for consistent attribute field handling
+    if input_format == InputFormat.Gff:
+        return GffLazyFrameWrapper(lf, path, read_options, projection_pushdown)
+
+    return lf
+
+
+class GffLazyFrameWrapper:
+    """Wrapper for GFF LazyFrames that handles attribute field detection in select operations."""
+
+    def __init__(
+        self,
+        base_lf: pl.LazyFrame,
+        file_path: str,
+        read_options: ReadOptions,
+        projection_pushdown: bool = True,
+    ):
+        self._base_lf = base_lf
+        self._file_path = file_path
+        self._read_options = read_options
+        self._projection_pushdown = projection_pushdown
+
+    def select(self, exprs):
+        """Override select to handle GFF attribute field detection.
+
+        Ensures queries requesting the raw `attributes` column use a registration
+        that exposes it, while preserving projection pushdown. For unnested
+        attribute fields (e.g., `gene_id`), re-registers with those fields to
+        enable efficient projection.
+        """
+        # Extract column names from expressions
+        if isinstance(exprs, (list, tuple)):
+            columns = []
+            for expr in exprs:
+                if isinstance(expr, str):
+                    columns.append(expr)
+                elif hasattr(expr, "meta") and hasattr(expr.meta, "output_name"):
+                    try:
+                        columns.append(expr.meta.output_name())
+                    except:
+                        pass
+        else:
+            # Single expression
+            if isinstance(exprs, str):
+                columns = [exprs]
+            elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
+                try:
+                    columns = [exprs.meta.output_name()]
+                except:
+                    columns = []
+            else:
+                columns = []
+
+        # Categorize columns
+        GFF_STATIC_COLUMNS = {
+            "chrom",
+            "start",
+            "end",
+            "type",
+            "source",
+            "score",
+            "strand",
+            "phase",
+            "attributes",
+        }
+        static_cols = [col for col in columns if col in GFF_STATIC_COLUMNS]
+        attribute_cols = [col for col in columns if col not in GFF_STATIC_COLUMNS]
+
+        # If 'attributes' is requested, ensure the registered table exposes it.
+        # Some parallel GFF providers omit the raw 'attributes' column; switch
+        # to a registration that includes it while keeping projection pushdown.
+        if "attributes" in static_cols:
+            from .context import ctx
+
+            # Preserve original parallelism and thread config when re-registering
+            orig_gff_opts = getattr(self._read_options, "gff_read_options", None)
+            orig_parallel = (
+                getattr(orig_gff_opts, "parallel", False) if orig_gff_opts else False
+            )
+            orig_thread = (
+                getattr(orig_gff_opts, "thread_num", None) if orig_gff_opts else None
+            )
+
+            # Build read options that ensure raw attributes are present
+            gff_options = GffReadOptions(
+                attr_fields=None,  # keep nested 'attributes' column
+                thread_num=orig_thread if orig_thread is not None else 1,
+                object_storage_options=PyObjectStorageOptions(
+                    allow_anonymous=True,
+                    enable_request_payer=False,
+                    chunk_size=8,
+                    concurrent_fetches=1,
+                    max_retries=5,
+                    timeout=300,
+                    compression_type="auto",
+                ),
+                parallel=orig_parallel,
+            )
+            read_options = ReadOptions(gff_read_options=gff_options)
+            table = py_register_table(
+                ctx, self._file_path, None, InputFormat.Gff, read_options
+            )
+            df = py_read_table(ctx, table.name)
+            new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
+            return new_lf.select(exprs)
+
+        if self._projection_pushdown:
+            # Optimized path: when selecting specific unnested attribute fields, re-register
+            # GFF table with those fields so DataFusion can project them efficiently.
+
+            # Use optimized table re-registration (fast path)
+            from .context import ctx
+
+            gff_options = GffReadOptions(
+                attr_fields=attribute_cols if attribute_cols else None,
+                thread_num=1,
+                object_storage_options=PyObjectStorageOptions(
+                    allow_anonymous=True,
+                    enable_request_payer=False,
+                    chunk_size=8,
+                    concurrent_fetches=1,
+                    max_retries=5,
+                    timeout=300,
+                    compression_type="auto",
+                ),
+                # Keep parallel reading consistent with base options when possible
+                parallel=getattr(
+                    getattr(self._read_options, "gff_read_options", None),
+                    "parallel",
+                    False,
+                ),
+            )
+
+            read_options = ReadOptions(gff_read_options=gff_options)
+            table = py_register_table(
+                ctx, self._file_path, None, InputFormat.Gff, read_options
+            )
+            df = py_read_table(ctx, table.name)
+
+            # Create new LazyFrame with optimized schema
+            new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
+            return new_lf.select(exprs)
+
+        elif attribute_cols:
+            # Extract attribute fields from nested structure (compatibility path)
+            import polars as pl
+
+            # Build selection with attribute field extraction
+            selection_exprs = []
+
+            # Add static columns as-is
+            for col in static_cols:
+                selection_exprs.append(pl.col(col))
+
+            # Add attribute field extractions
+            for attr_col in attribute_cols:
+                attr_expr = (
+                    pl.col("attributes")
+                    .list.eval(
+                        pl.when(pl.element().struct.field("tag") == attr_col).then(
+                            pl.element().struct.field("value")
+                        )
+                    )
+                    .list.drop_nulls()
+                    .list.first()
+                    .alias(attr_col)
+                )
+                selection_exprs.append(attr_expr)
+
+            return self._base_lf.select(selection_exprs)
+        else:
+            # Static columns only, use base LazyFrame
+            return self._base_lf.select(exprs)
+
+    def __getattr__(self, name):
+        """Delegate all other operations to base LazyFrame."""
+        return getattr(self._base_lf, name)
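The reader-facing effect of the io.py changes is easiest to see from a short usage sketch. This is a hedged illustration only: it assumes the `IOOperations` methods stay re-exported as top-level `pb.*` functions as in earlier releases, and the file paths and the `gene_id` attribute are placeholders.

```python
import polars_bio as pb

# Placeholder paths; any VCF/GFF readable by polars-bio would do.
vcf_path = "example.vcf.gz"
gff_path = "gencode.v38.annotation.gff3.bgz"

# With projection_pushdown=True the selected columns are requested at the
# DataFusion level instead of being dropped in Python after materialization.
variants = (
    pb.scan_vcf(vcf_path, projection_pushdown=True)
    .select(["chrom", "start", "end"])
    .collect()
)

# scan_gff now returns a GffLazyFrameWrapper; selecting an unnested attribute
# field (here the placeholder gene_id) re-registers the table with that field.
genes = (
    pb.scan_gff(gff_path, projection_pushdown=True, parallel=True)
    .select(["chrom", "start", "end", "gene_id"])
    .collect()
)
```

With `projection_pushdown=False` (the default) the same queries still run; unused columns are simply dropped in Python after full batches have been read.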
polars_bio/polars_bio.pyd
CHANGED
Binary file
polars_bio/range_op.py
CHANGED
@@ -48,6 +48,7 @@ class IntervalOperations:
         output_type: str = "polars.LazyFrame",
         read_options1: Union[ReadOptions, None] = None,
         read_options2: Union[ReadOptions, None] = None,
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Find pairs of overlapping genomic intervals.
@@ -67,6 +68,7 @@ class IntervalOperations:
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             read_options1: Additional options for reading the input files.
             read_options2: Additional options for reading the input files.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         Returns:
             **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
@@ -123,7 +125,14 @@ class IntervalOperations:
         )

         return range_operation(
-            df1,
+            df1,
+            df2,
+            range_options,
+            output_type,
+            ctx,
+            read_options1,
+            read_options2,
+            projection_pushdown,
         )

     @staticmethod
@@ -137,6 +146,7 @@ class IntervalOperations:
         cols2: Union[list[str], None] = ["chrom", "start", "end"],
         output_type: str = "polars.LazyFrame",
         read_options: Union[ReadOptions, None] = None,
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Find pairs of closest genomic intervals.
@@ -154,6 +164,7 @@ class IntervalOperations:
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             read_options: Additional options for reading the input files.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.


         Returns:
@@ -182,7 +193,15 @@ class IntervalOperations:
             columns_1=cols1,
             columns_2=cols2,
         )
-        return range_operation(
+        return range_operation(
+            df1,
+            df2,
+            range_options,
+            output_type,
+            ctx,
+            read_options,
+            projection_pushdown=projection_pushdown,
+        )

     @staticmethod
     def coverage(
@@ -195,6 +214,7 @@ class IntervalOperations:
         cols2: Union[list[str], None] = ["chrom", "start", "end"],
         output_type: str = "polars.LazyFrame",
         read_options: Union[ReadOptions, None] = None,
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Calculate intervals coverage.
@@ -212,6 +232,7 @@ class IntervalOperations:
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             read_options: Additional options for reading the input files.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.


         Returns:
@@ -245,7 +266,15 @@ class IntervalOperations:
             columns_1=cols1,
             columns_2=cols2,
         )
-        return range_operation(
+        return range_operation(
+            df2,
+            df1,
+            range_options,
+            output_type,
+            ctx,
+            read_options,
+            projection_pushdown=projection_pushdown,
+        )

     @staticmethod
     def count_overlaps(
@@ -258,6 +287,7 @@ class IntervalOperations:
         on_cols: Union[list[str], None] = None,
         output_type: str = "polars.LazyFrame",
         naive_query: bool = True,
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Count pairs of overlapping genomic intervals.
@@ -275,6 +305,7 @@ class IntervalOperations:
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             naive_query: If True, use naive query for counting overlaps based on overlaps.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
         Returns:
             **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

@@ -421,6 +452,7 @@ class IntervalOperations:
         cols: Union[list[str], None] = ["chrom", "start", "end"],
         on_cols: Union[list[str], None] = None,
         output_type: str = "polars.LazyFrame",
+        projection_pushdown: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Merge overlapping intervals. It is assumed that start < end.
@@ -433,6 +465,7 @@ class IntervalOperations:
                 genomic intervals, provided separately for each set.
             on_cols: List of additional column names for clustering. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
+            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

         Returns:
             **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
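The same flag is threaded through the interval operations. A minimal sketch, assuming `overlap` is exposed as `pb.overlap` and that the output keeps default `_1`/`_2` column suffixes (both assumptions, not shown in this diff); the input paths are placeholders:

```python
import polars_bio as pb

# Placeholder inputs; paths or in-memory frames are both accepted by range_operation.
reads = "reads.parquet"
targets = "targets.parquet"

overlaps = pb.overlap(
    reads,
    targets,
    output_type="polars.LazyFrame",
    projection_pushdown=True,
)

# Only the columns selected here should need to be materialized by DataFusion.
result = overlaps.select(["chrom_1", "start_1", "end_1", "chrom_2"]).collect()
```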
polars_bio/range_op_helpers.py
CHANGED
@@ -31,6 +31,7 @@ def range_operation(
    ctx: BioSessionContext,
    read_options1: Union[ReadOptions, None] = None,
    read_options2: Union[ReadOptions, None] = None,
+    projection_pushdown: bool = False,
 ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame"]:
    ctx.sync_options()
    if isinstance(df1, str) and isinstance(df2, str):
@@ -67,6 +68,7 @@ def range_operation(
                ctx=ctx,
                read_options1=read_options1,
                read_options2=read_options2,
+                projection_pushdown=projection_pushdown,
            )
        elif output_type == "polars.DataFrame":
            return range_operation_scan(
@@ -100,7 +102,14 @@ def range_operation(
                **_rename_columns(df2, range_options.suffixes[1]).schema,
            }
        )
-        return range_lazy_scan(
+        return range_lazy_scan(
+            df1,
+            df2,
+            merged_schema,
+            range_options,
+            ctx,
+            projection_pushdown=projection_pushdown,
+        )
    else:
        df1 = _df_to_reader(df1, range_options.columns_1[0])
        df2 = _df_to_reader(df2, range_options.columns_2[0])
polars_bio/range_op_io.py
CHANGED
@@ -35,6 +35,7 @@ def range_lazy_scan(
    ctx: BioSessionContext,
    read_options1: Union[ReadOptions, None] = None,
    read_options2: Union[ReadOptions, None] = None,
+    projection_pushdown: bool = False,
 ) -> pl.LazyFrame:
    range_function = None
    if isinstance(df_1, str) and isinstance(df_2, str):
@@ -50,27 +51,59 @@ def range_lazy_scan(
        _n_rows: Union[int, None],
        _batch_size: Union[int, None],
    ) -> Iterator[pl.DataFrame]:
+        # Extract projected columns if projection pushdown is enabled
+        projected_columns = None
+        if projection_pushdown and with_columns is not None:
+            from .io import _extract_column_names_from_expr
+
+            projected_columns = _extract_column_names_from_expr(with_columns)
+
+        # Apply projection pushdown to range options if enabled
+        modified_range_options = range_options
+        if projection_pushdown and projected_columns:
+            # Create a copy of range options with projection information
+            # This is where we would modify the SQL generation in a full implementation
+            modified_range_options = range_options
+
        df_lazy: datafusion.DataFrame = (
            range_function(
-                ctx,
+                ctx,
+                df_1,
+                df_2,
+                modified_range_options,
+                read_options1,
+                read_options2,
+                _n_rows,
            )
            if isinstance(df_1, str) and isinstance(df_2, str)
-            else range_function(ctx, df_1, df_2,
+            else range_function(ctx, df_1, df_2, modified_range_options, _n_rows)
        )
+
+        # Apply DataFusion-level projection if enabled
+        datafusion_projection_applied = False
+        if projection_pushdown and projected_columns:
+            try:
+                # Try to select only the requested columns at the DataFusion level
+                df_lazy = df_lazy.select(projected_columns)
+                datafusion_projection_applied = True
+            except Exception:
+                # Fallback to Python-level selection if DataFusion selection fails
+                datafusion_projection_applied = False
+
        df_lazy.schema()
        df_stream = df_lazy.execute_stream()
        progress_bar = tqdm(unit="rows")
        for r in df_stream:
            py_df = r.to_pyarrow()
            df = pl.DataFrame(py_df)
-            #
-
-
-            #
-
-
-
-
+            # Handle predicate and column projection
+            if predicate is not None:
+                df = df.filter(predicate)
+            # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
+            if with_columns is not None and (
+                not projection_pushdown or not datafusion_projection_applied
+            ):
+                df = df.select(with_columns)
            progress_bar.update(len(df))
            yield df

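Both `range_lazy_scan` and `_lazy_scan` rely on the same trick to turn the `with_columns` argument Polars hands to an IO source into plain column names: strings pass through, expressions are asked for their name via the `meta` namespace. A standalone sketch of that idea (the helper name below is illustrative, not part of the package API):

```python
import polars as pl

def output_names(exprs) -> list[str]:
    # Strings pass through; expressions report their name via Expr.meta.output_name().
    names = []
    for e in exprs if isinstance(exprs, (list, tuple)) else [exprs]:
        if isinstance(e, str):
            names.append(e)
        elif hasattr(e, "meta"):
            try:
                names.append(e.meta.output_name())
            except Exception:
                pass
    return names

print(output_names([pl.col("chrom"), pl.col("start"), "end"]))
# ['chrom', 'start', 'end']
```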
polars_bio/sql.py
CHANGED
@@ -31,7 +31,6 @@ class SQL:
|
|
31
31
|
def register_vcf(
|
32
32
|
path: str,
|
33
33
|
name: Union[str, None] = None,
|
34
|
-
info_fields: Union[list[str], None] = None,
|
35
34
|
thread_num: int = 1,
|
36
35
|
chunk_size: int = 64,
|
37
36
|
concurrent_fetches: int = 8,
|
@@ -47,13 +46,12 @@ class SQL:
|
|
47
46
|
Parameters:
|
48
47
|
path: The path to the VCF file.
|
49
48
|
name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
|
50
|
-
info_fields: The fields to read from the INFO column.
|
51
49
|
thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
|
52
50
|
chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
|
53
51
|
concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
|
54
52
|
allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
|
55
53
|
enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
|
56
|
-
compression_type: The compression type of the VCF file. If not specified, it will be detected automatically
|
54
|
+
compression_type: The compression type of the VCF file. If not specified, it will be detected automatically..
|
57
55
|
max_retries: The maximum number of retries for reading the file from object storage.
|
58
56
|
timeout: The timeout in seconds for reading the file from object storage.
|
59
57
|
!!! note
|
@@ -81,8 +79,24 @@ class SQL:
|
|
81
79
|
compression_type=compression_type,
|
82
80
|
)
|
83
81
|
|
82
|
+
# Get all info fields from VCF header for automatic field detection
|
83
|
+
all_info_fields = None
|
84
|
+
try:
|
85
|
+
from .io import IOOperations
|
86
|
+
|
87
|
+
vcf_schema_df = IOOperations.describe_vcf(
|
88
|
+
path,
|
89
|
+
allow_anonymous=allow_anonymous,
|
90
|
+
enable_request_payer=enable_request_payer,
|
91
|
+
compression_type=compression_type,
|
92
|
+
)
|
93
|
+
all_info_fields = vcf_schema_df.select("name").to_series().to_list()
|
94
|
+
except Exception:
|
95
|
+
# Fallback to empty list if unable to get info fields
|
96
|
+
all_info_fields = []
|
97
|
+
|
84
98
|
vcf_read_options = VcfReadOptions(
|
85
|
-
info_fields=
|
99
|
+
info_fields=all_info_fields,
|
86
100
|
thread_num=thread_num,
|
87
101
|
object_storage_options=object_storage_options,
|
88
102
|
)
|
@@ -93,7 +107,6 @@ class SQL:
|
|
93
107
|
def register_gff(
|
94
108
|
path: str,
|
95
109
|
name: Union[str, None] = None,
|
96
|
-
attr_fields: Union[list[str], None] = None,
|
97
110
|
thread_num: int = 1,
|
98
111
|
chunk_size: int = 64,
|
99
112
|
concurrent_fetches: int = 8,
|
@@ -102,6 +115,7 @@ class SQL:
|
|
102
115
|
timeout: int = 300,
|
103
116
|
enable_request_payer: bool = False,
|
104
117
|
compression_type: str = "auto",
|
118
|
+
parallel: bool = False,
|
105
119
|
) -> None:
|
106
120
|
"""
|
107
121
|
Register a GFF file as a Datafusion table.
|
@@ -109,7 +123,6 @@ class SQL:
|
|
109
123
|
Parameters:
|
110
124
|
path: The path to the GFF file.
|
111
125
|
name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
|
112
|
-
attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
|
113
126
|
thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
|
114
127
|
chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
|
115
128
|
concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
|
@@ -118,6 +131,7 @@ class SQL:
|
|
118
131
|
compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
|
119
132
|
max_retries: The maximum number of retries for reading the file from object storage.
|
120
133
|
timeout: The timeout in seconds for reading the file from object storage.
|
134
|
+
parallel: Whether to use the parallel reader for BGZF-compressed local files. Default is False.
|
121
135
|
!!! note
|
122
136
|
GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
|
123
137
|
|
@@ -127,8 +141,8 @@ class SQL:
|
|
127
141
|
```
|
128
142
|
```python
|
129
143
|
import polars_bio as pb
|
130
|
-
pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz"
|
131
|
-
pb.sql("SELECT
|
144
|
+
pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz")
|
145
|
+
pb.sql("SELECT attributes, count(*) AS cnt FROM gencode_v38_annotation3_bgz GROUP BY attributes").limit(5).collect()
|
132
146
|
```
|
133
147
|
```shell
|
134
148
|
|
@@ -161,9 +175,10 @@ class SQL:
|
|
161
175
|
)
|
162
176
|
|
163
177
|
gff_read_options = GffReadOptions(
|
164
|
-
attr_fields=
|
178
|
+
attr_fields=None,
|
165
179
|
thread_num=thread_num,
|
166
180
|
object_storage_options=object_storage_options,
|
181
|
+
parallel=parallel,
|
167
182
|
)
|
168
183
|
read_options = ReadOptions(gff_read_options=gff_read_options)
|
169
184
|
py_register_table(ctx, path, name, InputFormat.Gff, read_options)
|
@@ -179,7 +194,7 @@ class SQL:
|
|
179
194
|
timeout: int = 300,
|
180
195
|
enable_request_payer: bool = False,
|
181
196
|
compression_type: str = "auto",
|
182
|
-
parallel: bool =
|
197
|
+
parallel: bool = False,
|
183
198
|
) -> None:
|
184
199
|
"""
|
185
200
|
Register a FASTQ file as a Datafusion table.
|
@@ -194,7 +209,7 @@ class SQL:
|
|
194
209
|
compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
|
195
210
|
max_retries: The maximum number of retries for reading the file from object storage.
|
196
211
|
timeout: The timeout in seconds for reading the file from object storage.
|
197
|
-
parallel: Whether to use the parallel reader for BGZF compressed files.
|
212
|
+
parallel: Whether to use the parallel reader for BGZF compressed files. Default is False. If a file ends with ".gz" but is actually BGZF, it will attempt the parallel path and fall back to standard if not BGZF.
|
198
213
|
|
199
214
|
!!! Example
|
200
215
|
```python
|
@@ -265,7 +280,7 @@ class SQL:
|
|
265
280
|
concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
|
266
281
|
allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
|
267
282
|
enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
|
268
|
-
compression_type: The compression type of the BED file. If not specified, it will be detected automatically
|
283
|
+
compression_type: The compression type of the BED file. If not specified, it will be detected automatically..
|
269
284
|
max_retries: The maximum number of retries for reading the file from object storage.
|
270
285
|
timeout: The timeout in seconds for reading the file from object storage.
|
271
286
|
|
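The BED registration documented above follows the same pattern. A minimal sketch, assuming the method is named `pb.register_bed` (the name is outside this hunk) and using an illustrative local path; with `compression_type` left unspecified, compression is detected automatically:

```python
import polars_bio as pb

# Hypothetical method name, path, and table name; compression is auto-detected.
pb.register_bed("/tmp/regions.bed.gz", "regions")
pb.sql("SELECT count(*) AS n FROM regions").collect()
```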
polars_bio/utils.py
CHANGED
@@ -12,9 +12,11 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
|
|
12
12
|
return [x.strip() for x in t]
|
13
13
|
|
14
14
|
|
15
|
-
def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
|
15
|
+
def _lazy_scan(
|
16
|
+
df: Union[pl.DataFrame, pl.LazyFrame], projection_pushdown: bool = False
|
17
|
+
) -> pl.LazyFrame:
|
16
18
|
df_lazy: DataFrame = df
|
17
|
-
|
19
|
+
original_schema = df_lazy.schema()
|
18
20
|
|
19
21
|
def _overlap_source(
|
20
22
|
with_columns: Union[pl.Expr, None],
|
@@ -22,25 +24,101 @@ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
|
|
22
24
|
n_rows: Union[int, None],
|
23
25
|
_batch_size: Union[int, None],
|
24
26
|
) -> Iterator[pl.DataFrame]:
|
27
|
+
# Extract column names from with_columns if projection pushdown is enabled
|
28
|
+
projected_columns = None
|
29
|
+
if projection_pushdown and with_columns is not None:
|
30
|
+
projected_columns = _extract_column_names_from_expr(with_columns)
|
31
|
+
|
32
|
+
# Apply column projection to DataFusion query if enabled
|
33
|
+
query_df = df_lazy
|
34
|
+
datafusion_projection_applied = False
|
35
|
+
if projection_pushdown and projected_columns:
|
36
|
+
try:
|
37
|
+
query_df = df_lazy.select(projected_columns)
|
38
|
+
datafusion_projection_applied = True
|
39
|
+
|
40
|
+
# For testing: allow inspection of the execution plan
|
41
|
+
if hasattr(df_lazy, "_test_projection_capture"):
|
42
|
+
df_lazy._test_projection_capture = {
|
43
|
+
"original_plan": str(df_lazy.optimized_logical_plan()),
|
44
|
+
"projected_plan": str(query_df.optimized_logical_plan()),
|
45
|
+
"projected_columns": projected_columns,
|
46
|
+
"datafusion_projection_applied": True,
|
47
|
+
}
|
48
|
+
|
49
|
+
except Exception as e:
|
50
|
+
# Fallback to original behavior if projection fails
|
51
|
+
query_df = df_lazy
|
52
|
+
projected_columns = None
|
53
|
+
datafusion_projection_applied = False
|
54
|
+
|
55
|
+
# For testing: capture the failure
|
56
|
+
if hasattr(df_lazy, "_test_projection_capture"):
|
57
|
+
df_lazy._test_projection_capture = {
|
58
|
+
"original_plan": str(df_lazy.optimized_logical_plan()),
|
59
|
+
"projected_plan": None,
|
60
|
+
"projected_columns": projected_columns,
|
61
|
+
"datafusion_projection_applied": False,
|
62
|
+
"error": str(e),
|
63
|
+
}
|
64
|
+
|
25
65
|
if n_rows and n_rows < 8192: # 8192 is the default batch size in datafusion
|
26
|
-
df =
|
66
|
+
df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
|
27
67
|
df = pl.DataFrame(df).limit(n_rows)
|
28
68
|
if predicate is not None:
|
29
69
|
df = df.filter(predicate)
|
30
|
-
if
|
70
|
+
# Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
|
71
|
+
if with_columns is not None and (
|
72
|
+
not projection_pushdown or not datafusion_projection_applied
|
73
|
+
):
|
31
74
|
df = df.select(with_columns)
|
32
75
|
yield df
|
33
76
|
return
|
34
|
-
|
77
|
+
|
78
|
+
df_stream = query_df.execute_stream()
|
35
79
|
progress_bar = tqdm(unit="rows")
|
36
80
|
for r in df_stream:
|
37
81
|
py_df = r.to_pyarrow()
|
38
82
|
df = pl.DataFrame(py_df)
|
39
83
|
if predicate is not None:
|
40
84
|
df = df.filter(predicate)
|
41
|
-
if
|
85
|
+
# Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
|
86
|
+
if with_columns is not None and (
|
87
|
+
not projection_pushdown or not datafusion_projection_applied
|
88
|
+
):
|
42
89
|
df = df.select(with_columns)
|
43
90
|
progress_bar.update(len(df))
|
44
91
|
yield df
|
45
92
|
|
46
|
-
return register_io_source(_overlap_source, schema=
|
93
|
+
return register_io_source(_overlap_source, schema=original_schema)
|
94
|
+
|
95
|
+
|
96
|
+
def _extract_column_names_from_expr(with_columns: Union[pl.Expr, list]) -> list[str]:
|
97
|
+
"""Extract column names from Polars expressions."""
|
98
|
+
if with_columns is None:
|
99
|
+
return []
|
100
|
+
|
101
|
+
# Handle different types of with_columns input
|
102
|
+
if hasattr(with_columns, "__iter__") and not isinstance(with_columns, str):
|
103
|
+
# It's a list of expressions or strings
|
104
|
+
column_names = []
|
105
|
+
for item in with_columns:
|
106
|
+
if isinstance(item, str):
|
107
|
+
column_names.append(item)
|
108
|
+
elif hasattr(item, "meta") and hasattr(item.meta, "output_name"):
|
109
|
+
# Polars expression with output name
|
110
|
+
try:
|
111
|
+
column_names.append(item.meta.output_name())
|
112
|
+
except Exception:
|
113
|
+
pass
|
114
|
+
return column_names
|
115
|
+
elif isinstance(with_columns, str):
|
116
|
+
return [with_columns]
|
117
|
+
elif hasattr(with_columns, "meta") and hasattr(with_columns.meta, "output_name"):
|
118
|
+
# Single Polars expression
|
119
|
+
try:
|
120
|
+
return [with_columns.meta.output_name()]
|
121
|
+
except Exception:
|
122
|
+
pass
|
123
|
+
|
124
|
+
return []
|
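The two additions above implement the projection pushdown introduced in this release: when `projection_pushdown=True`, `_lazy_scan` converts the `with_columns` projection requested by Polars into plain column names and applies them with `select(...)` on the DataFusion DataFrame, falling back to the original Python-level `df.select(with_columns)` whenever name extraction or the DataFusion projection fails. A small sanity-check sketch of the name-extraction helper (a private function, imported here only for illustration):

```python
import polars as pl

from polars_bio.utils import _extract_column_names_from_expr

# Plain column names (what Polars typically hands to an IO source) pass through.
assert _extract_column_names_from_expr(["chrom", "start", "end"]) == ["chrom", "start", "end"]

# A single string and expressions with a resolvable output name are handled too.
assert _extract_column_names_from_expr("attributes") == ["attributes"]
assert _extract_column_names_from_expr([pl.col("chrom"), pl.col("start")]) == ["chrom", "start"]

# Inputs without a recoverable name yield an empty list, in which case _lazy_scan
# keeps the projection on the Polars side instead of pushing it into DataFusion.
assert _extract_column_names_from_expr(None) == []
```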
{polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: polars-bio
|
3
|
-
Version: 0.13.1
|
3
|
+
Version: 0.14.0
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
6
6
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
@@ -9,6 +9,7 @@ Requires-Dist: pyarrow~=21.0.0
|
|
9
9
|
Requires-Dist: datafusion~=48.0.0
|
10
10
|
Requires-Dist: tqdm~=4.67.1
|
11
11
|
Requires-Dist: typing-extensions~=4.14.0
|
12
|
+
Requires-Dist: mkdocs-glightbox>=0.5.1,<0.6.0
|
12
13
|
Requires-Dist: pandas ; extra == 'pandas'
|
13
14
|
Requires-Dist: bioframe ; extra == 'viz'
|
14
15
|
Requires-Dist: matplotlib ; extra == 'viz'
|
polars_bio-0.14.0.dist-info/RECORD
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
polars_bio-0.14.0.dist-info/METADATA,sha256=f6XqkJ12QrWensCi-C5g5rGWRMz9oS5BIcKy0VAA2uI,729
|
2
|
+
polars_bio-0.14.0.dist-info/WHEEL,sha256=-M5O7l5EczTA8VFaBQsg2Fpg0dKz0WOuvpt3nEh86bo,94
|
3
|
+
polars_bio-0.14.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
4
|
+
polars_bio/__init__.py,sha256=IkWQcFLzSF66OCBJouo2urgOm-me7mo0fBGhFaDgN7Y,3097
|
5
|
+
polars_bio/constants.py,sha256=m9jMLB8PpbmWcsrCQZhRBGsBAE6X8frsSlgteVeEyo4,119
|
6
|
+
polars_bio/context.py,sha256=AP5EM2TsB9zcomlsPEz8xMwQnEXwqpRsgBTnZsHYQwA,1723
|
7
|
+
polars_bio/interval_op_helpers.py,sha256=DQIo4lUxzd-ySUbjfwNSk5zYcxpprwQe32kTPE28ypw,2930
|
8
|
+
polars_bio/io.py,sha256=fvToItTlOxR-nOCAXeYekxzdWJT_BHcjbuExCGhRQmw,52066
|
9
|
+
polars_bio/logging.py,sha256=Q25cv4qiwLmAiGJq6ZlqYJn2WJ_uN-c5_eopib2z8bc,1354
|
10
|
+
polars_bio/operations.py,sha256=amhaff8Ha3UuQmS8OCVFXRQWvQOW_4G2T5U8tF1f7mc,2272
|
11
|
+
polars_bio/polars_bio.pyd,sha256=fflNh2VhTw-2neThPmJE6oStpPxEap3qARZVYNAKjV4,275155968
|
12
|
+
polars_bio/polars_ext.py,sha256=lT8-cYAvSyhbzbpozjlF59VWTCYOzLafSZ-7bi9f49Y,9658
|
13
|
+
polars_bio/range_op.py,sha256=UbWKBf06rPf2GXAQT0TzXR6H0rVZeCcFCqxISMuzNpk,26289
|
14
|
+
polars_bio/range_op_helpers.py,sha256=RcvXc52cJVnK4fyCtwEcYvOB5TmKItGyiReiHBGHDng,6200
|
15
|
+
polars_bio/range_op_io.py,sha256=XTBTclFCCe4utMRAju9rOUzHvLkpKo5dCn-aCBwzRfY,7275
|
16
|
+
polars_bio/range_utils.py,sha256=Q0UPB7DV4mPjOlQ_xDVLN3vJaY9ZEr4IHFVfVBnPLDY,1446
|
17
|
+
polars_bio/sql.py,sha256=vWdZCyAXTPUHTko9al90JK8tgrChnB7Fn2hUiE0bw5c,24986
|
18
|
+
polars_bio/utils.py,sha256=a-PHpiggjFm5u_PkrswPFT4DgY1kq2Ks0XLkw3nMxAI,5096
|
19
|
+
polars_bio-0.14.0.dist-info/RECORD,,
|
polars_bio-0.13.1.dist-info/RECORD
REMOVED
@@ -1,19 +0,0 @@
|
|
1
|
-
polars_bio-0.13.1.dist-info/METADATA,sha256=wGl6-MrJR_DMgJstlr5b4R326hxH3K6-QPMq8zKmfsw,683
|
2
|
-
polars_bio-0.13.1.dist-info/WHEEL,sha256=2XatmAWXBfp_P6DUtFAtbdzzba6f_xbhEtpqsZt_zEg,94
|
3
|
-
polars_bio-0.13.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
4
|
-
polars_bio/__init__.py,sha256=2Nhz5w4jLDk7-5OSLQ3ieTz8KuogpggK6JsL2fTRp0U,3097
|
5
|
-
polars_bio/constants.py,sha256=m9jMLB8PpbmWcsrCQZhRBGsBAE6X8frsSlgteVeEyo4,119
|
6
|
-
polars_bio/context.py,sha256=AP5EM2TsB9zcomlsPEz8xMwQnEXwqpRsgBTnZsHYQwA,1723
|
7
|
-
polars_bio/interval_op_helpers.py,sha256=DQIo4lUxzd-ySUbjfwNSk5zYcxpprwQe32kTPE28ypw,2930
|
8
|
-
polars_bio/io.py,sha256=o63g1l33nRlwPp7pSQbkTrxiaXajE7ClA1_-xN9v5SI,38624
|
9
|
-
polars_bio/logging.py,sha256=Q25cv4qiwLmAiGJq6ZlqYJn2WJ_uN-c5_eopib2z8bc,1354
|
10
|
-
polars_bio/operations.py,sha256=amhaff8Ha3UuQmS8OCVFXRQWvQOW_4G2T5U8tF1f7mc,2272
|
11
|
-
polars_bio/polars_bio.pyd,sha256=E87sVQXw1xid0eNAXjHZAiHgFPvmIqyk_Oxn-Sz8UPk,274686976
|
12
|
-
polars_bio/polars_ext.py,sha256=lT8-cYAvSyhbzbpozjlF59VWTCYOzLafSZ-7bi9f49Y,9658
|
13
|
-
polars_bio/range_op.py,sha256=K6VyfgbXb4q8G7XAYSj1zrjuUHQGxIvNMt-y-6iPCyQ,24863
|
14
|
-
polars_bio/range_op_helpers.py,sha256=IoWQb-BpeDn67KkTl5x3nXdrftsd_mtjOJDLCqO7mrI,5943
|
15
|
-
polars_bio/range_op_io.py,sha256=MKl7Zg8Wd_mWLXhOJghrmpNsPOOMvq5pNcJvp9DtCG0,5883
|
16
|
-
polars_bio/range_utils.py,sha256=Q0UPB7DV4mPjOlQ_xDVLN3vJaY9ZEr4IHFVfVBnPLDY,1446
|
17
|
-
polars_bio/sql.py,sha256=m6P99rfnomFXB01AicOwx72tT79IO8wKQafYQTb78SI,24618
|
18
|
-
polars_bio/utils.py,sha256=RaAU5pMt0P6Ptt6LYBeK5-0WKAmuvhV7ifU05nfVGA8,1611
|
19
|
-
polars_bio-0.13.1.dist-info/RECORD,,
|
{polars_bio-0.13.1.dist-info → polars_bio-0.14.0.dist-info}/licenses/LICENSE
File without changes
|