polars-bio 0.13.0__cp39-abi3-macosx_11_0_arm64.whl → 0.14.0__cp39-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polars_bio/__init__.py CHANGED
@@ -2,7 +2,7 @@ import os
2
2
 
3
3
  # Set POLARS_FORCE_NEW_STREAMING to "1" by default if not already set
4
4
  if "POLARS_FORCE_NEW_STREAMING" not in os.environ:
5
- os.environ["POLARS_FORCE_NEW_STREAMING"] = "1"
5
+ os.environ["POLARS_FORCE_NEW_STREAMING"] = "0"
6
6
 
7
7
  from polars_bio.polars_bio import GffReadOptions, InputFormat
8
8
  from polars_bio.polars_bio import PyObjectStorageOptions as ObjectStorageOptions
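The first change flips the default for Polars' new streaming engine: 0.14.0 no longer forces it on. Because the guard only fills in a default when the variable is absent, the previous behaviour can still be opted into before the import. A minimal sketch, using only the standard library and the package itself:

```python
import os

# The guard in __init__.py only sets a default when the variable is missing,
# so exporting it before the import keeps the new streaming engine enabled.
os.environ["POLARS_FORCE_NEW_STREAMING"] = "1"

import polars_bio as pb  # noqa: E402 - import deliberately after the env setup
```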
@@ -73,7 +73,7 @@ except ImportError:
73
73
  POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
74
74
 
75
75
 
76
- __version__ = "0.13.0"
76
+ __version__ = "0.14.0"
77
77
  __all__ = [
78
78
  "ctx",
79
79
  "FilterOp",
polars_bio/io.py CHANGED
@@ -17,6 +17,7 @@ from polars_bio.polars_bio import (
17
17
  VcfReadOptions,
18
18
  py_describe_vcf,
19
19
  py_from_polars,
20
+ py_read_sql,
20
21
  py_read_table,
21
22
  py_register_table,
22
23
  py_scan_table,
@@ -80,6 +81,7 @@ class IOOperations:
80
81
  max_retries: int = 5,
81
82
  timeout: int = 300,
82
83
  compression_type: str = "auto",
84
+ projection_pushdown: bool = False,
83
85
  ) -> pl.DataFrame:
84
86
  """
85
87
 
@@ -94,6 +96,7 @@ class IOOperations:
94
96
  max_retries: The maximum number of retries for reading the file from object storage.
95
97
  timeout: The timeout in seconds for reading the file from object storage.
96
98
  compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
99
+ projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.
97
100
 
98
101
  !!! Example
99
102
  ```shell
@@ -124,6 +127,7 @@ class IOOperations:
124
127
  max_retries,
125
128
  timeout,
126
129
  compression_type,
130
+ projection_pushdown,
127
131
  ).collect()
128
132
 
129
133
  @staticmethod
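`read_fasta`/`scan_fasta` gain a `projection_pushdown` flag that is forwarded to `_read_file`. A hedged usage sketch, assuming the module-level wrappers (`pb.scan_fasta`, etc.) re-exported in earlier releases; the path is a placeholder and `name` is an assumed FASTA column:

```python
import polars_bio as pb

# Hypothetical local file; with projection_pushdown=True the select() below
# is applied at the DataFusion scan rather than after each batch is built.
lf = pb.scan_fasta("example.fasta", projection_pushdown=True)
ids = lf.select("name").collect()
```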
@@ -136,6 +140,7 @@ class IOOperations:
136
140
  max_retries: int = 5,
137
141
  timeout: int = 300,
138
142
  compression_type: str = "auto",
143
+ projection_pushdown: bool = False,
139
144
  ) -> pl.LazyFrame:
140
145
  """
141
146
 
@@ -150,6 +155,7 @@ class IOOperations:
150
155
  max_retries: The maximum number of retries for reading the file from object storage.
151
156
  timeout: The timeout in seconds for reading the file from object storage.
152
157
  compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
158
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
153
159
 
154
160
  !!! Example
155
161
  ```shell
@@ -184,12 +190,11 @@ class IOOperations:
184
190
  object_storage_options=object_storage_options
185
191
  )
186
192
  read_options = ReadOptions(fasta_read_options=fasta_read_options)
187
- return _read_file(path, InputFormat.Fasta, read_options)
193
+ return _read_file(path, InputFormat.Fasta, read_options, projection_pushdown)
188
194
 
189
195
  @staticmethod
190
196
  def read_vcf(
191
197
  path: str,
192
- info_fields: Union[list[str], None] = None,
193
198
  thread_num: int = 1,
194
199
  chunk_size: int = 8,
195
200
  concurrent_fetches: int = 1,
@@ -198,13 +203,13 @@ class IOOperations:
198
203
  max_retries: int = 5,
199
204
  timeout: int = 300,
200
205
  compression_type: str = "auto",
206
+ projection_pushdown: bool = False,
201
207
  ) -> pl.DataFrame:
202
208
  """
203
209
  Read a VCF file into a DataFrame.
204
210
 
205
211
  Parameters:
206
212
  path: The path to the VCF file.
207
- info_fields: The fields to read from the INFO column.
208
213
  thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
209
214
  chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
210
215
  concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -212,14 +217,14 @@ class IOOperations:
212
217
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
213
218
  max_retries: The maximum number of retries for reading the file from object storage.
214
219
  timeout: The timeout in seconds for reading the file from object storage.
215
- compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
220
+ compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
221
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
216
222
 
217
223
  !!! note
218
224
  VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
219
225
  """
220
226
  return IOOperations.scan_vcf(
221
227
  path,
222
- info_fields,
223
228
  thread_num,
224
229
  chunk_size,
225
230
  concurrent_fetches,
@@ -228,12 +233,12 @@ class IOOperations:
228
233
  max_retries,
229
234
  timeout,
230
235
  compression_type,
236
+ projection_pushdown,
231
237
  ).collect()
232
238
 
233
239
  @staticmethod
234
240
  def scan_vcf(
235
241
  path: str,
236
- info_fields: Union[list[str], None] = None,
237
242
  thread_num: int = 1,
238
243
  chunk_size: int = 8,
239
244
  concurrent_fetches: int = 1,
@@ -242,13 +247,13 @@ class IOOperations:
242
247
  max_retries: int = 5,
243
248
  timeout: int = 300,
244
249
  compression_type: str = "auto",
250
+ projection_pushdown: bool = False,
245
251
  ) -> pl.LazyFrame:
246
252
  """
247
253
  Lazily read a VCF file into a LazyFrame.
248
254
 
249
255
  Parameters:
250
256
  path: The path to the VCF file.
251
- info_fields: The fields to read from the INFO column.
252
257
  thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
253
258
  chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
254
259
  concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -256,7 +261,8 @@ class IOOperations:
256
261
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
257
262
  max_retries: The maximum number of retries for reading the file from object storage.
258
263
  timeout: The timeout in seconds for reading the file from object storage.
259
- compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
264
+ compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
265
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
260
266
 
261
267
  !!! note
262
268
  VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
@@ -271,18 +277,36 @@ class IOOperations:
271
277
  compression_type=compression_type,
272
278
  )
273
279
 
280
+ # Get all info fields from VCF header for proper projection pushdown
281
+ all_info_fields = None
282
+ try:
283
+ vcf_schema_df = IOOperations.describe_vcf(
284
+ path,
285
+ allow_anonymous=allow_anonymous,
286
+ enable_request_payer=enable_request_payer,
287
+ compression_type=compression_type,
288
+ )
289
+ # Use column name 'name' not 'id' based on the schema output
290
+ all_info_fields = vcf_schema_df.select("name").to_series().to_list()
291
+ except Exception:
292
+ # Fallback to None if unable to get info fields
293
+ all_info_fields = None
294
+
295
+ # Always start with all info fields to establish full schema
296
+ # The callback will re-register with only requested info fields for optimization
297
+ initial_info_fields = all_info_fields
298
+
274
299
  vcf_read_options = VcfReadOptions(
275
- info_fields=_cleanse_fields(info_fields),
300
+ info_fields=initial_info_fields,
276
301
  thread_num=thread_num,
277
302
  object_storage_options=object_storage_options,
278
303
  )
279
304
  read_options = ReadOptions(vcf_read_options=vcf_read_options)
280
- return _read_file(path, InputFormat.Vcf, read_options)
305
+ return _read_file(path, InputFormat.Vcf, read_options, projection_pushdown)
281
306
 
282
307
  @staticmethod
283
308
  def read_gff(
284
309
  path: str,
285
- attr_fields: Union[list[str], None] = None,
286
310
  thread_num: int = 1,
287
311
  chunk_size: int = 8,
288
312
  concurrent_fetches: int = 1,
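With `info_fields` removed, `scan_vcf` now asks `describe_vcf` for every INFO field in the header and registers the full schema, relying on projection pushdown to narrow what is actually materialised. A sketch of the resulting workflow, assuming the module-level `pb.describe_vcf`/`pb.scan_vcf` wrappers; the path is a placeholder and `AF` is a hypothetical INFO field:

```python
import polars_bio as pb

vcf = "example.vcf.gz"  # placeholder path

# The INFO fields that get auto-registered come from the header, exposed via
# the 'name' column of describe_vcf (the same call the scan now makes).
info_fields = pb.describe_vcf(vcf).select("name").to_series().to_list()
print(info_fields)

# Instead of passing info_fields, project the columns you need and let the
# pushdown trim the scan ('AF' stands in for whatever your header declares).
lf = pb.scan_vcf(vcf, projection_pushdown=True)
subset = lf.select(["chrom", "start", "ref", "alt", "AF"]).collect()
```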
@@ -291,13 +315,14 @@ class IOOperations:
291
315
  max_retries: int = 5,
292
316
  timeout: int = 300,
293
317
  compression_type: str = "auto",
318
+ projection_pushdown: bool = False,
319
+ parallel: bool = False,
294
320
  ) -> pl.DataFrame:
295
321
  """
296
322
  Read a GFF file into a DataFrame.
297
323
 
298
324
  Parameters:
299
325
  path: The path to the GFF file.
300
- attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
301
326
  thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
302
327
  chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
303
328
  concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
@@ -305,14 +330,15 @@ class IOOperations:
305
330
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
306
331
  max_retries: The maximum number of retries for reading the file from object storage.
307
332
  timeout: The timeout in seconds for reading the file from object storage.
308
- compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
333
+ compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
334
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
335
+ parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).
309
336
 
310
337
  !!! note
311
338
  GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
312
339
  """
313
340
  return IOOperations.scan_gff(
314
341
  path,
315
- attr_fields,
316
342
  thread_num,
317
343
  chunk_size,
318
344
  concurrent_fetches,
@@ -321,12 +347,13 @@ class IOOperations:
321
347
  max_retries,
322
348
  timeout,
323
349
  compression_type,
350
+ projection_pushdown,
351
+ parallel,
324
352
  ).collect()
325
353
 
326
354
  @staticmethod
327
355
  def scan_gff(
328
356
  path: str,
329
- attr_fields: Union[list[str], None] = None,
330
357
  thread_num: int = 1,
331
358
  chunk_size: int = 8,
332
359
  concurrent_fetches: int = 1,
@@ -335,21 +362,24 @@ class IOOperations:
335
362
  max_retries: int = 5,
336
363
  timeout: int = 300,
337
364
  compression_type: str = "auto",
365
+ projection_pushdown: bool = False,
366
+ parallel: bool = False,
338
367
  ) -> pl.LazyFrame:
339
368
  """
340
369
  Lazily read a GFF file into a LazyFrame.
341
370
 
342
371
  Parameters:
343
372
  path: The path to the GFF file.
344
- attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
345
373
  thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
346
374
  chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
347
- concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
375
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
348
376
  allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
349
377
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
350
378
  max_retries: The maximum number of retries for reading the file from object storage.
351
379
  timeout: The timeout in seconds for reading the file from object storage.
352
- compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
380
+ compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
381
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
382
+ parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).
353
383
 
354
384
  !!! note
355
385
  GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
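`scan_gff` drops `attr_fields` in favour of selecting attribute tags directly from the returned frame (handled by the `GffLazyFrameWrapper` added later in this file), and gains `parallel` for BGZF chunk-level reading of local files. A sketch; the path is a placeholder and `ID`/`Parent` are tags found in GENCODE-style annotations:

```python
import polars_bio as pb

lf = pb.scan_gff(
    "gencode.v38.annotation.gff3.bgz",  # placeholder local BGZF file
    projection_pushdown=True,
    parallel=True,
)

# Static columns plus unnested attribute tags, no attr_fields needed.
features = lf.select(["chrom", "start", "end", "type", "ID", "Parent"]).collect()

# Requesting the raw nested column still works; the wrapper re-registers the
# table so that 'attributes' is exposed.
raw = lf.select(["chrom", "attributes"]).collect()
```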
@@ -365,12 +395,13 @@ class IOOperations:
365
395
  )
366
396
 
367
397
  gff_read_options = GffReadOptions(
368
- attr_fields=_cleanse_fields(attr_fields),
398
+ attr_fields=None,
369
399
  thread_num=thread_num,
370
400
  object_storage_options=object_storage_options,
401
+ parallel=parallel,
371
402
  )
372
403
  read_options = ReadOptions(gff_read_options=gff_read_options)
373
- return _read_file(path, InputFormat.Gff, read_options)
404
+ return _read_file(path, InputFormat.Gff, read_options, projection_pushdown)
374
405
 
375
406
  @staticmethod
376
407
  def read_bam(
@@ -382,6 +413,7 @@ class IOOperations:
382
413
  enable_request_payer: bool = False,
383
414
  max_retries: int = 5,
384
415
  timeout: int = 300,
416
+ projection_pushdown: bool = False,
385
417
  ) -> pl.DataFrame:
386
418
  """
387
419
  Read a BAM file into a DataFrame.
@@ -389,12 +421,13 @@ class IOOperations:
389
421
  Parameters:
390
422
  path: The path to the BAM file.
391
423
  thread_num: The number of threads to use for reading the BAM file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
392
- chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
393
- concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
424
+ chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large-scale operations, it is recommended to increase this value to 64.
425
+ concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
394
426
  allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
395
427
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
396
428
  max_retries: The maximum number of retries for reading the file from object storage.
397
429
  timeout: The timeout in seconds for reading the file from object storage.
430
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
398
431
 
399
432
  !!! note
400
433
  BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -408,6 +441,7 @@ class IOOperations:
408
441
  enable_request_payer,
409
442
  max_retries,
410
443
  timeout,
444
+ projection_pushdown,
411
445
  ).collect()
412
446
 
413
447
  @staticmethod
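`read_bam`/`scan_bam` gain only the `projection_pushdown` flag. A brief sketch; the path is a placeholder and only the coordinate columns documented above (plus `chrom`) are assumed:

```python
import polars_bio as pb

# Projecting three columns lets the pushdown skip the remaining alignment fields.
lf = pb.scan_bam("example.bam", projection_pushdown=True)
intervals = lf.select(["chrom", "start", "end"]).collect()
```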
@@ -420,6 +454,7 @@ class IOOperations:
420
454
  enable_request_payer: bool = False,
421
455
  max_retries: int = 5,
422
456
  timeout: int = 300,
457
+ projection_pushdown: bool = False,
423
458
  ) -> pl.LazyFrame:
424
459
  """
425
460
  Lazily read a BAM file into a LazyFrame.
@@ -433,6 +468,7 @@ class IOOperations:
433
468
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
434
469
  max_retries: The maximum number of retries for reading the file from object storage.
435
470
  timeout: The timeout in seconds for reading the file from object storage.
471
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
436
472
 
437
473
  !!! note
438
474
  BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -452,7 +488,7 @@ class IOOperations:
452
488
  object_storage_options=object_storage_options,
453
489
  )
454
490
  read_options = ReadOptions(bam_read_options=bam_read_options)
455
- return _read_file(path, InputFormat.Bam, read_options)
491
+ return _read_file(path, InputFormat.Bam, read_options, projection_pushdown)
456
492
 
457
493
  @staticmethod
458
494
  def read_fastq(
@@ -465,6 +501,7 @@ class IOOperations:
465
501
  timeout: int = 300,
466
502
  compression_type: str = "auto",
467
503
  parallel: bool = False,
504
+ projection_pushdown: bool = False,
468
505
  ) -> pl.DataFrame:
469
506
  """
470
507
  Read a FASTQ file into a DataFrame.
@@ -479,6 +516,7 @@ class IOOperations:
479
516
  timeout: The timeout in seconds for reading the file from object storage.
480
517
  compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
481
518
  parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
519
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
482
520
  """
483
521
  return IOOperations.scan_fastq(
484
522
  path,
@@ -490,6 +528,7 @@ class IOOperations:
490
528
  timeout,
491
529
  compression_type,
492
530
  parallel,
531
+ projection_pushdown,
493
532
  ).collect()
494
533
 
495
534
  @staticmethod
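FASTQ readers keep the existing `parallel` option and add `projection_pushdown`. A sketch with a placeholder path; `name` is an assumed column of the FASTQ schema:

```python
import polars_bio as pb

lf = pb.scan_fastq(
    "reads.fastq.bgz",         # placeholder BGZF file with a GZI index alongside
    parallel=True,             # BGZF chunk-level parallel reader (local files)
    projection_pushdown=True,  # only decode the projected columns
)
read_names = lf.select("name").collect()
```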
@@ -503,6 +542,7 @@ class IOOperations:
503
542
  timeout: int = 300,
504
543
  compression_type: str = "auto",
505
544
  parallel: bool = False,
545
+ projection_pushdown: bool = False,
506
546
  ) -> pl.LazyFrame:
507
547
  """
508
548
  Lazily read a FASTQ file into a LazyFrame.
@@ -517,6 +557,7 @@ class IOOperations:
517
557
  timeout: The timeout in seconds for reading the file from object storage.
518
558
  compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
519
559
  parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
560
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
520
561
  """
521
562
  object_storage_options = PyObjectStorageOptions(
522
563
  allow_anonymous=allow_anonymous,
@@ -532,7 +573,7 @@ class IOOperations:
532
573
  object_storage_options=object_storage_options, parallel=parallel
533
574
  )
534
575
  read_options = ReadOptions(fastq_read_options=fastq_read_options)
535
- return _read_file(path, InputFormat.Fastq, read_options)
576
+ return _read_file(path, InputFormat.Fastq, read_options, projection_pushdown)
536
577
 
537
578
  @staticmethod
538
579
  def read_bed(
@@ -545,6 +586,7 @@ class IOOperations:
545
586
  max_retries: int = 5,
546
587
  timeout: int = 300,
547
588
  compression_type: str = "auto",
589
+ projection_pushdown: bool = False,
548
590
  ) -> pl.DataFrame:
549
591
  """
550
592
  Read a BED file into a DataFrame.
@@ -559,6 +601,7 @@ class IOOperations:
559
601
  max_retries: The maximum number of retries for reading the file from object storage.
560
602
  timeout: The timeout in seconds for reading the file from object storage.
561
603
  compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
604
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
562
605
 
563
606
  !!! Note
564
607
  Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
@@ -577,6 +620,7 @@ class IOOperations:
577
620
  max_retries,
578
621
  timeout,
579
622
  compression_type,
623
+ projection_pushdown,
580
624
  ).collect()
581
625
 
582
626
  @staticmethod
@@ -590,6 +634,7 @@ class IOOperations:
590
634
  max_retries: int = 5,
591
635
  timeout: int = 300,
592
636
  compression_type: str = "auto",
637
+ projection_pushdown: bool = False,
593
638
  ) -> pl.LazyFrame:
594
639
  """
595
640
  Lazily read a BED file into a LazyFrame.
@@ -604,6 +649,7 @@ class IOOperations:
604
649
  max_retries: The maximum number of retries for reading the file from object storage.
605
650
  timeout: The timeout in seconds for reading the file from object storage.
606
651
  compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
652
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
607
653
 
608
654
  !!! Note
609
655
  Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
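BED readers follow the same pattern. A sketch with a placeholder path, assuming the BED4 columns described above are exposed as `chrom`, `start`, `end` and `name`:

```python
import polars_bio as pb

lf = pb.scan_bed("regions.bed.bgz", projection_pushdown=True)
regions = lf.select(["chrom", "start", "end"]).collect()
```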
@@ -627,7 +673,7 @@ class IOOperations:
627
673
  object_storage_options=object_storage_options,
628
674
  )
629
675
  read_options = ReadOptions(bed_read_options=bed_read_options)
630
- return _read_file(path, InputFormat.Bed, read_options)
676
+ return _read_file(path, InputFormat.Bed, read_options, projection_pushdown)
631
677
 
632
678
  @staticmethod
633
679
  def read_table(path: str, schema: Dict = None, **kwargs) -> pl.DataFrame:
@@ -678,7 +724,7 @@ class IOOperations:
678
724
  path: The path to the VCF file.
679
725
  allow_anonymous: Whether to allow anonymous access to object storage (GCS and S3 supported).
680
726
  enable_request_payer: Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
681
- compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
727
+ compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
682
728
  """
683
729
  object_storage_options = PyObjectStorageOptions(
684
730
  allow_anonymous=allow_anonymous,
@@ -714,9 +760,15 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
714
760
  return [x.strip() for x in t]
715
761
 
716
762
 
717
- def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
763
+ def _lazy_scan(
764
+ df: Union[pl.DataFrame, pl.LazyFrame],
765
+ projection_pushdown: bool = False,
766
+ table_name: str = None,
767
+ input_format: InputFormat = None,
768
+ file_path: str = None,
769
+ ) -> pl.LazyFrame:
718
770
  df_lazy: DataFrame = df
719
- arrow_schema = df_lazy.schema()
771
+ original_schema = df_lazy.schema()
720
772
 
721
773
  def _overlap_source(
722
774
  with_columns: Union[pl.Expr, None],
@@ -724,35 +776,287 @@ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
724
776
  n_rows: Union[int, None],
725
777
  _batch_size: Union[int, None],
726
778
  ) -> Iterator[pl.DataFrame]:
779
+ # Extract column names from with_columns if projection pushdown is enabled
780
+ projected_columns = None
781
+ if projection_pushdown and with_columns is not None:
782
+ projected_columns = _extract_column_names_from_expr(with_columns)
783
+
784
+ # Projection pushdown is handled natively by table providers
785
+ query_df = df_lazy
786
+
787
+ # Apply column projection to DataFusion query if enabled
788
+ datafusion_projection_applied = False
789
+
790
+ if projection_pushdown and projected_columns:
791
+ try:
792
+ # Apply projection at the DataFusion level using SQL
793
+ # This approach works reliably with the DataFusion Python API
794
+ columns_sql = ", ".join([f'"{c}"' for c in projected_columns])
795
+
796
+ # Use the table name passed from _read_file, fallback if not available
797
+ table_to_query = table_name if table_name else "temp_table"
798
+
799
+ # Use py_read_sql to execute SQL projection (same as pb.sql() does)
800
+ from .context import ctx
801
+
802
+ query_df = py_read_sql(
803
+ ctx, f"SELECT {columns_sql} FROM {table_to_query}"
804
+ )
805
+ datafusion_projection_applied = True
806
+ except Exception as e:
807
+ # Fallback to original behavior if projection fails
808
+ print(f"DataFusion projection failed: {e}")
809
+ query_df = df_lazy
810
+ projected_columns = None
811
+ datafusion_projection_applied = False
812
+
727
813
  if n_rows and n_rows < 8192: # 8192 is the default batch size in datafusion
728
- df = df_lazy.limit(n_rows).execute_stream().next().to_pyarrow()
814
+ df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
729
815
  df = pl.DataFrame(df).limit(n_rows)
730
816
  if predicate is not None:
731
817
  df = df.filter(predicate)
732
- if with_columns is not None:
818
+ # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
819
+ if with_columns is not None and (
820
+ not projection_pushdown or not datafusion_projection_applied
821
+ ):
733
822
  df = df.select(with_columns)
734
823
  yield df
735
824
  return
736
- df_stream = df_lazy.execute_stream()
825
+
826
+ df_stream = query_df.execute_stream()
737
827
  progress_bar = tqdm(unit="rows")
738
828
  for r in df_stream:
739
829
  py_df = r.to_pyarrow()
740
830
  df = pl.DataFrame(py_df)
741
831
  if predicate is not None:
742
832
  df = df.filter(predicate)
743
- if with_columns is not None:
833
+ # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
834
+ if with_columns is not None and (
835
+ not projection_pushdown or not datafusion_projection_applied
836
+ ):
744
837
  df = df.select(with_columns)
745
838
  progress_bar.update(len(df))
746
839
  yield df
747
840
 
748
- return register_io_source(_overlap_source, schema=arrow_schema)
841
+ return register_io_source(_overlap_source, schema=original_schema)
842
+
843
+
844
+ def _extract_column_names_from_expr(with_columns: Union[pl.Expr, list]) -> list[str]:
845
+ """Extract column names from Polars expressions."""
846
+ if with_columns is None:
847
+ return []
848
+
849
+ # Handle different types of with_columns input
850
+ if hasattr(with_columns, "__iter__") and not isinstance(with_columns, str):
851
+ # It's a list of expressions or strings
852
+ column_names = []
853
+ for item in with_columns:
854
+ if isinstance(item, str):
855
+ column_names.append(item)
856
+ elif hasattr(item, "meta") and hasattr(item.meta, "output_name"):
857
+ # Polars expression with output name
858
+ try:
859
+ column_names.append(item.meta.output_name())
860
+ except Exception:
861
+ pass
862
+ return column_names
863
+ elif isinstance(with_columns, str):
864
+ return [with_columns]
865
+ elif hasattr(with_columns, "meta") and hasattr(with_columns.meta, "output_name"):
866
+ # Single Polars expression
867
+ try:
868
+ return [with_columns.meta.output_name()]
869
+ except Exception:
870
+ pass
871
+
872
+ return []
749
873
 
750
874
 
751
875
  def _read_file(
752
876
  path: str,
753
877
  input_format: InputFormat,
754
878
  read_options: ReadOptions,
879
+ projection_pushdown: bool = False,
755
880
  ) -> pl.LazyFrame:
756
881
  table = py_register_table(ctx, path, None, input_format, read_options)
757
882
  df = py_read_table(ctx, table.name)
758
- return _lazy_scan(df)
883
+
884
+ lf = _lazy_scan(df, projection_pushdown, table.name, input_format, path)
885
+
886
+ # Wrap GFF LazyFrames with projection-aware wrapper for consistent attribute field handling
887
+ if input_format == InputFormat.Gff:
888
+ return GffLazyFrameWrapper(lf, path, read_options, projection_pushdown)
889
+
890
+ return lf
891
+
892
+
893
+ class GffLazyFrameWrapper:
894
+ """Wrapper for GFF LazyFrames that handles attribute field detection in select operations."""
895
+
896
+ def __init__(
897
+ self,
898
+ base_lf: pl.LazyFrame,
899
+ file_path: str,
900
+ read_options: ReadOptions,
901
+ projection_pushdown: bool = True,
902
+ ):
903
+ self._base_lf = base_lf
904
+ self._file_path = file_path
905
+ self._read_options = read_options
906
+ self._projection_pushdown = projection_pushdown
907
+
908
+ def select(self, exprs):
909
+ """Override select to handle GFF attribute field detection.
910
+
911
+ Ensures queries requesting the raw `attributes` column use a registration
912
+ that exposes it, while preserving projection pushdown. For unnested
913
+ attribute fields (e.g., `gene_id`), re-registers with those fields to
914
+ enable efficient projection.
915
+ """
916
+ # Extract column names from expressions
917
+ if isinstance(exprs, (list, tuple)):
918
+ columns = []
919
+ for expr in exprs:
920
+ if isinstance(expr, str):
921
+ columns.append(expr)
922
+ elif hasattr(expr, "meta") and hasattr(expr.meta, "output_name"):
923
+ try:
924
+ columns.append(expr.meta.output_name())
925
+ except:
926
+ pass
927
+ else:
928
+ # Single expression
929
+ if isinstance(exprs, str):
930
+ columns = [exprs]
931
+ elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
932
+ try:
933
+ columns = [exprs.meta.output_name()]
934
+ except:
935
+ columns = []
936
+ else:
937
+ columns = []
938
+
939
+ # Categorize columns
940
+ GFF_STATIC_COLUMNS = {
941
+ "chrom",
942
+ "start",
943
+ "end",
944
+ "type",
945
+ "source",
946
+ "score",
947
+ "strand",
948
+ "phase",
949
+ "attributes",
950
+ }
951
+ static_cols = [col for col in columns if col in GFF_STATIC_COLUMNS]
952
+ attribute_cols = [col for col in columns if col not in GFF_STATIC_COLUMNS]
953
+
954
+ # If 'attributes' is requested, ensure the registered table exposes it.
955
+ # Some parallel GFF providers omit the raw 'attributes' column; switch
956
+ # to a registration that includes it while keeping projection pushdown.
957
+ if "attributes" in static_cols:
958
+ from .context import ctx
959
+
960
+ # Preserve original parallelism and thread config when re-registering
961
+ orig_gff_opts = getattr(self._read_options, "gff_read_options", None)
962
+ orig_parallel = (
963
+ getattr(orig_gff_opts, "parallel", False) if orig_gff_opts else False
964
+ )
965
+ orig_thread = (
966
+ getattr(orig_gff_opts, "thread_num", None) if orig_gff_opts else None
967
+ )
968
+
969
+ # Build read options that ensure raw attributes are present
970
+ gff_options = GffReadOptions(
971
+ attr_fields=None, # keep nested 'attributes' column
972
+ thread_num=orig_thread if orig_thread is not None else 1,
973
+ object_storage_options=PyObjectStorageOptions(
974
+ allow_anonymous=True,
975
+ enable_request_payer=False,
976
+ chunk_size=8,
977
+ concurrent_fetches=1,
978
+ max_retries=5,
979
+ timeout=300,
980
+ compression_type="auto",
981
+ ),
982
+ parallel=orig_parallel,
983
+ )
984
+ read_options = ReadOptions(gff_read_options=gff_options)
985
+ table = py_register_table(
986
+ ctx, self._file_path, None, InputFormat.Gff, read_options
987
+ )
988
+ df = py_read_table(ctx, table.name)
989
+ new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
990
+ return new_lf.select(exprs)
991
+
992
+ if self._projection_pushdown:
993
+ # Optimized path: when selecting specific unnested attribute fields, re-register
994
+ # GFF table with those fields so DataFusion can project them efficiently.
995
+
996
+ # Use optimized table re-registration (fast path)
997
+ from .context import ctx
998
+
999
+ gff_options = GffReadOptions(
1000
+ attr_fields=attribute_cols if attribute_cols else None,
1001
+ thread_num=1,
1002
+ object_storage_options=PyObjectStorageOptions(
1003
+ allow_anonymous=True,
1004
+ enable_request_payer=False,
1005
+ chunk_size=8,
1006
+ concurrent_fetches=1,
1007
+ max_retries=5,
1008
+ timeout=300,
1009
+ compression_type="auto",
1010
+ ),
1011
+ # Keep parallel reading consistent with base options when possible
1012
+ parallel=getattr(
1013
+ getattr(self._read_options, "gff_read_options", None),
1014
+ "parallel",
1015
+ False,
1016
+ ),
1017
+ )
1018
+
1019
+ read_options = ReadOptions(gff_read_options=gff_options)
1020
+ table = py_register_table(
1021
+ ctx, self._file_path, None, InputFormat.Gff, read_options
1022
+ )
1023
+ df = py_read_table(ctx, table.name)
1024
+
1025
+ # Create new LazyFrame with optimized schema
1026
+ new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
1027
+ return new_lf.select(exprs)
1028
+
1029
+ elif attribute_cols:
1030
+ # Extract attribute fields from nested structure (compatibility path)
1031
+ import polars as pl
1032
+
1033
+ # Build selection with attribute field extraction
1034
+ selection_exprs = []
1035
+
1036
+ # Add static columns as-is
1037
+ for col in static_cols:
1038
+ selection_exprs.append(pl.col(col))
1039
+
1040
+ # Add attribute field extractions
1041
+ for attr_col in attribute_cols:
1042
+ attr_expr = (
1043
+ pl.col("attributes")
1044
+ .list.eval(
1045
+ pl.when(pl.element().struct.field("tag") == attr_col).then(
1046
+ pl.element().struct.field("value")
1047
+ )
1048
+ )
1049
+ .list.drop_nulls()
1050
+ .list.first()
1051
+ .alias(attr_col)
1052
+ )
1053
+ selection_exprs.append(attr_expr)
1054
+
1055
+ return self._base_lf.select(selection_exprs)
1056
+ else:
1057
+ # Static columns only, use base LazyFrame
1058
+ return self._base_lf.select(exprs)
1059
+
1060
+ def __getattr__(self, name):
1061
+ """Delegate all other operations to base LazyFrame."""
1062
+ return getattr(self._base_lf, name)
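One consequence of the wrapper worth noting: `scan_gff` no longer returns a bare `pl.LazyFrame` but a `GffLazyFrameWrapper` that intercepts `select` and delegates everything else through `__getattr__`. A small sketch of that behaviour (placeholder path; the `isinstance` result follows from the wrapper being a plain class in this diff):

```python
import polars as pl
import polars_bio as pb

lf = pb.scan_gff("annotation.gff3.bgz", projection_pushdown=True)  # placeholder

print(isinstance(lf, pl.LazyFrame))          # False: it is the wrapper class
genes = lf.filter(pl.col("type") == "gene")  # delegated, returns a plain LazyFrame
print(genes.limit(5).collect())
```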
Binary file
polars_bio/range_op.py CHANGED
@@ -48,6 +48,7 @@ class IntervalOperations:
48
48
  output_type: str = "polars.LazyFrame",
49
49
  read_options1: Union[ReadOptions, None] = None,
50
50
  read_options2: Union[ReadOptions, None] = None,
51
+ projection_pushdown: bool = False,
51
52
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
52
53
  """
53
54
  Find pairs of overlapping genomic intervals.
@@ -67,6 +68,7 @@ class IntervalOperations:
67
68
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
68
69
  read_options1: Additional options for reading the input files.
69
70
  read_options2: Additional options for reading the input files.
71
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
70
72
 
71
73
  Returns:
72
74
  **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
@@ -123,7 +125,14 @@ class IntervalOperations:
123
125
  )
124
126
 
125
127
  return range_operation(
126
- df1, df2, range_options, output_type, ctx, read_options1, read_options2
128
+ df1,
129
+ df2,
130
+ range_options,
131
+ output_type,
132
+ ctx,
133
+ read_options1,
134
+ read_options2,
135
+ projection_pushdown,
127
136
  )
128
137
 
129
138
  @staticmethod
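`overlap` threads the new flag through to `range_operation`. A sketch with placeholder Parquet inputs, assuming the default `_1`/`_2` output suffixes:

```python
import polars_bio as pb

lf = pb.overlap(
    "intervals_a.parquet",  # placeholder paths; in-memory frames also work
    "intervals_b.parquet",
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
    output_type="polars.LazyFrame",
    projection_pushdown=True,
)
# Projecting a handful of output columns lets the plan skip the rest.
hits = lf.select(["chrom_1", "start_1", "end_1", "start_2"]).collect()
```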
@@ -137,6 +146,7 @@ class IntervalOperations:
137
146
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
138
147
  output_type: str = "polars.LazyFrame",
139
148
  read_options: Union[ReadOptions, None] = None,
149
+ projection_pushdown: bool = False,
140
150
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
141
151
  """
142
152
  Find pairs of closest genomic intervals.
@@ -154,6 +164,7 @@ class IntervalOperations:
154
164
  on_cols: List of additional column names to join on. default is None.
155
165
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
156
166
  read_options: Additional options for reading the input files.
167
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
157
168
 
158
169
 
159
170
  Returns:
@@ -182,7 +193,15 @@ class IntervalOperations:
182
193
  columns_1=cols1,
183
194
  columns_2=cols2,
184
195
  )
185
- return range_operation(df1, df2, range_options, output_type, ctx, read_options)
196
+ return range_operation(
197
+ df1,
198
+ df2,
199
+ range_options,
200
+ output_type,
201
+ ctx,
202
+ read_options,
203
+ projection_pushdown=projection_pushdown,
204
+ )
186
205
 
187
206
  @staticmethod
188
207
  def coverage(
@@ -195,6 +214,7 @@ class IntervalOperations:
195
214
  cols2: Union[list[str], None] = ["chrom", "start", "end"],
196
215
  output_type: str = "polars.LazyFrame",
197
216
  read_options: Union[ReadOptions, None] = None,
217
+ projection_pushdown: bool = False,
198
218
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
199
219
  """
200
220
  Calculate intervals coverage.
@@ -212,6 +232,7 @@ class IntervalOperations:
212
232
  on_cols: List of additional column names to join on. default is None.
213
233
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
214
234
  read_options: Additional options for reading the input files.
235
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
215
236
 
216
237
 
217
238
  Returns:
@@ -245,7 +266,15 @@ class IntervalOperations:
245
266
  columns_1=cols1,
246
267
  columns_2=cols2,
247
268
  )
248
- return range_operation(df2, df1, range_options, output_type, ctx, read_options)
269
+ return range_operation(
270
+ df2,
271
+ df1,
272
+ range_options,
273
+ output_type,
274
+ ctx,
275
+ read_options,
276
+ projection_pushdown=projection_pushdown,
277
+ )
249
278
 
250
279
  @staticmethod
251
280
  def count_overlaps(
@@ -258,6 +287,7 @@ class IntervalOperations:
258
287
  on_cols: Union[list[str], None] = None,
259
288
  output_type: str = "polars.LazyFrame",
260
289
  naive_query: bool = True,
290
+ projection_pushdown: bool = False,
261
291
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
262
292
  """
263
293
  Count pairs of overlapping genomic intervals.
@@ -275,6 +305,7 @@ class IntervalOperations:
275
305
  on_cols: List of additional column names to join on. default is None.
276
306
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
277
307
  naive_query: If True, use a naive overlap-based query for counting overlaps.
308
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
278
309
  Returns:
279
310
  **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
280
311
 
@@ -421,6 +452,7 @@ class IntervalOperations:
421
452
  cols: Union[list[str], None] = ["chrom", "start", "end"],
422
453
  on_cols: Union[list[str], None] = None,
423
454
  output_type: str = "polars.LazyFrame",
455
+ projection_pushdown: bool = False,
424
456
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
425
457
  """
426
458
  Merge overlapping intervals. It is assumed that start < end.
@@ -433,6 +465,7 @@ class IntervalOperations:
433
465
  genomic intervals, provided separately for each set.
434
466
  on_cols: List of additional column names for clustering. default is None.
435
467
  output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
468
+ projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
436
469
 
437
470
  Returns:
438
471
  **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
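`nearest`, `coverage`, `count_overlaps` and `merge` accept the same flag. A small in-memory sketch for `nearest`, again assuming the default `_1`/`_2` suffixes and the standard `chrom`/`start`/`end` column names:

```python
import polars as pl
import polars_bio as pb

a = pl.DataFrame({"chrom": ["chr1", "chr1"], "start": [100, 400], "end": [200, 500]})
b = pl.DataFrame({"chrom": ["chr1"], "start": [150], "end": [300]})

near = pb.nearest(a, b, output_type="polars.LazyFrame", projection_pushdown=True)
print(near.select(["chrom_1", "start_1", "end_1"]).collect())
```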
@@ -31,6 +31,7 @@ def range_operation(
31
31
  ctx: BioSessionContext,
32
32
  read_options1: Union[ReadOptions, None] = None,
33
33
  read_options2: Union[ReadOptions, None] = None,
34
+ projection_pushdown: bool = False,
34
35
  ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame"]:
35
36
  ctx.sync_options()
36
37
  if isinstance(df1, str) and isinstance(df2, str):
@@ -67,6 +68,7 @@ def range_operation(
67
68
  ctx=ctx,
68
69
  read_options1=read_options1,
69
70
  read_options2=read_options2,
71
+ projection_pushdown=projection_pushdown,
70
72
  )
71
73
  elif output_type == "polars.DataFrame":
72
74
  return range_operation_scan(
@@ -100,7 +102,14 @@ def range_operation(
100
102
  **_rename_columns(df2, range_options.suffixes[1]).schema,
101
103
  }
102
104
  )
103
- return range_lazy_scan(df1, df2, merged_schema, range_options, ctx)
105
+ return range_lazy_scan(
106
+ df1,
107
+ df2,
108
+ merged_schema,
109
+ range_options,
110
+ ctx,
111
+ projection_pushdown=projection_pushdown,
112
+ )
104
113
  else:
105
114
  df1 = _df_to_reader(df1, range_options.columns_1[0])
106
115
  df2 = _df_to_reader(df2, range_options.columns_2[0])
polars_bio/range_op_io.py CHANGED
@@ -35,6 +35,7 @@ def range_lazy_scan(
35
35
  ctx: BioSessionContext,
36
36
  read_options1: Union[ReadOptions, None] = None,
37
37
  read_options2: Union[ReadOptions, None] = None,
38
+ projection_pushdown: bool = False,
38
39
  ) -> pl.LazyFrame:
39
40
  range_function = None
40
41
  if isinstance(df_1, str) and isinstance(df_2, str):
@@ -50,27 +51,59 @@ def range_lazy_scan(
50
51
  _n_rows: Union[int, None],
51
52
  _batch_size: Union[int, None],
52
53
  ) -> Iterator[pl.DataFrame]:
54
+ # Extract projected columns if projection pushdown is enabled
55
+ projected_columns = None
56
+ if projection_pushdown and with_columns is not None:
57
+ from .io import _extract_column_names_from_expr
58
+
59
+ projected_columns = _extract_column_names_from_expr(with_columns)
60
+
61
+ # Apply projection pushdown to range options if enabled
62
+ modified_range_options = range_options
63
+ if projection_pushdown and projected_columns:
64
+ # Create a copy of range options with projection information
65
+ # This is where we would modify the SQL generation in a full implementation
66
+ modified_range_options = range_options
67
+
53
68
  df_lazy: datafusion.DataFrame = (
54
69
  range_function(
55
- ctx, df_1, df_2, range_options, read_options1, read_options2, _n_rows
70
+ ctx,
71
+ df_1,
72
+ df_2,
73
+ modified_range_options,
74
+ read_options1,
75
+ read_options2,
76
+ _n_rows,
56
77
  )
57
78
  if isinstance(df_1, str) and isinstance(df_2, str)
58
- else range_function(ctx, df_1, df_2, range_options, _n_rows)
79
+ else range_function(ctx, df_1, df_2, modified_range_options, _n_rows)
59
80
  )
81
+
82
+ # Apply DataFusion-level projection if enabled
83
+ datafusion_projection_applied = False
84
+ if projection_pushdown and projected_columns:
85
+ try:
86
+ # Try to select only the requested columns at the DataFusion level
87
+ df_lazy = df_lazy.select(projected_columns)
88
+ datafusion_projection_applied = True
89
+ except Exception:
90
+ # Fallback to Python-level selection if DataFusion selection fails
91
+ datafusion_projection_applied = False
92
+
60
93
  df_lazy.schema()
61
94
  df_stream = df_lazy.execute_stream()
62
95
  progress_bar = tqdm(unit="rows")
63
96
  for r in df_stream:
64
97
  py_df = r.to_pyarrow()
65
98
  df = pl.DataFrame(py_df)
66
- # # TODO: We can push predicates down to the DataFusion plan in the future,
67
- # # but for now we'll do it here.
68
- # if predicate is not None:
69
- # df = df.filter(predicate)
70
- # # TODO: We can push columns down to the DataFusion plan in the future,
71
- # # but for now we'll do it here.
72
- # if with_columns is not None:
73
- # df = df.select(with_columns)
99
+ # Handle predicate and column projection
100
+ if predicate is not None:
101
+ df = df.filter(predicate)
102
+ # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
103
+ if with_columns is not None and (
104
+ not projection_pushdown or not datafusion_projection_applied
105
+ ):
106
+ df = df.select(with_columns)
74
107
  progress_bar.update(len(df))
75
108
  yield df
76
109
 
polars_bio/sql.py CHANGED
@@ -31,7 +31,6 @@ class SQL:
31
31
  def register_vcf(
32
32
  path: str,
33
33
  name: Union[str, None] = None,
34
- info_fields: Union[list[str], None] = None,
35
34
  thread_num: int = 1,
36
35
  chunk_size: int = 64,
37
36
  concurrent_fetches: int = 8,
@@ -47,13 +46,12 @@ class SQL:
47
46
  Parameters:
48
47
  path: The path to the VCF file.
49
48
  name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
50
- info_fields: The fields to read from the INFO column.
51
49
  thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
52
50
  chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
53
51
  concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
54
52
  allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
55
53
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
56
- compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
54
+ compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
57
55
  max_retries: The maximum number of retries for reading the file from object storage.
58
56
  timeout: The timeout in seconds for reading the file from object storage.
59
57
  !!! note
@@ -81,8 +79,24 @@ class SQL:
81
79
  compression_type=compression_type,
82
80
  )
83
81
 
82
+ # Get all info fields from VCF header for automatic field detection
83
+ all_info_fields = None
84
+ try:
85
+ from .io import IOOperations
86
+
87
+ vcf_schema_df = IOOperations.describe_vcf(
88
+ path,
89
+ allow_anonymous=allow_anonymous,
90
+ enable_request_payer=enable_request_payer,
91
+ compression_type=compression_type,
92
+ )
93
+ all_info_fields = vcf_schema_df.select("name").to_series().to_list()
94
+ except Exception:
95
+ # Fallback to empty list if unable to get info fields
96
+ all_info_fields = []
97
+
84
98
  vcf_read_options = VcfReadOptions(
85
- info_fields=_cleanse_fields(info_fields),
99
+ info_fields=all_info_fields,
86
100
  thread_num=thread_num,
87
101
  object_storage_options=object_storage_options,
88
102
  )
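`register_vcf` follows the same pattern as `scan_vcf`: the `info_fields` parameter is gone and the INFO fields are pulled from the header before registration, so they can be referenced directly in SQL. A sketch with a placeholder path; `AF` is a hypothetical INFO field:

```python
import polars_bio as pb

pb.register_vcf("example.vcf.gz", "variants")  # placeholder path

# All header-declared INFO fields are available as columns.
pb.sql("SELECT chrom, start, ref, alt, AF FROM variants").limit(5).collect()
```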
@@ -93,7 +107,6 @@ class SQL:
93
107
  def register_gff(
94
108
  path: str,
95
109
  name: Union[str, None] = None,
96
- attr_fields: Union[list[str], None] = None,
97
110
  thread_num: int = 1,
98
111
  chunk_size: int = 64,
99
112
  concurrent_fetches: int = 8,
@@ -102,6 +115,7 @@ class SQL:
102
115
  timeout: int = 300,
103
116
  enable_request_payer: bool = False,
104
117
  compression_type: str = "auto",
118
+ parallel: bool = False,
105
119
  ) -> None:
106
120
  """
107
121
  Register a GFF file as a Datafusion table.
@@ -109,7 +123,6 @@ class SQL:
109
123
  Parameters:
110
124
  path: The path to the GFF file.
111
125
  name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
112
- attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
113
126
  thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
114
127
  chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
115
128
  concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
@@ -118,6 +131,7 @@ class SQL:
118
131
  compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
119
132
  max_retries: The maximum number of retries for reading the file from object storage.
120
133
  timeout: The timeout in seconds for reading the file from object storage.
134
+ parallel: Whether to use the parallel reader for BGZF-compressed local files. Default is False.
121
135
  !!! note
122
136
  GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
123
137
 
@@ -127,8 +141,8 @@ class SQL:
127
141
  ```
128
142
  ```python
129
143
  import polars_bio as pb
130
- pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz", attr_fields=["ID", "Parent"])
131
- pb.sql("SELECT `Parent`, count(*) AS cnt FROM gencode_v38_annotation3_bgz GROUP BY `Parent`").limit(5).collect()
144
+ pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz")
145
+ pb.sql("SELECT attributes, count(*) AS cnt FROM gencode_v38_annotation3_bgz GROUP BY attributes").limit(5).collect()
132
146
  ```
133
147
  ```shell
134
148
 
@@ -161,9 +175,10 @@ class SQL:
161
175
  )
162
176
 
163
177
  gff_read_options = GffReadOptions(
164
- attr_fields=_cleanse_fields(attr_fields),
178
+ attr_fields=None,
165
179
  thread_num=thread_num,
166
180
  object_storage_options=object_storage_options,
181
+ parallel=parallel,
167
182
  )
168
183
  read_options = ReadOptions(gff_read_options=gff_read_options)
169
184
  py_register_table(ctx, path, name, InputFormat.Gff, read_options)
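`register_gff` likewise drops `attr_fields` (the `attributes` column stays nested) and gains `parallel`. A sketch in the spirit of the updated docstring example, with a placeholder path:

```python
import polars_bio as pb

pb.register_gff(
    "/tmp/gencode.v38.annotation.gff3.bgz",  # placeholder local BGZF file
    "gencode_v38",
    parallel=True,  # BGZF chunk-level parallel reader for local files
)
pb.sql("SELECT chrom, start, source FROM gencode_v38").limit(5).collect()
```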
@@ -179,7 +194,7 @@ class SQL:
179
194
  timeout: int = 300,
180
195
  enable_request_payer: bool = False,
181
196
  compression_type: str = "auto",
182
- parallel: bool = True,
197
+ parallel: bool = False,
183
198
  ) -> None:
184
199
  """
185
200
  Register a FASTQ file as a Datafusion table.
@@ -194,7 +209,7 @@ class SQL:
194
209
  compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
195
210
  max_retries: The maximum number of retries for reading the file from object storage.
196
211
  timeout: The timeout in seconds for reading the file from object storage.
197
- parallel: Whether to use the parallel reader for BGZF compressed files.
212
+ parallel: Whether to use the parallel reader for BGZF-compressed files. Default is False. If a file ends with ".gz" but is actually BGZF-compressed, the parallel path is attempted and the reader falls back to the standard path otherwise.
198
213
 
199
214
  !!! Example
200
215
  ```python
@@ -265,7 +280,7 @@ class SQL:
265
280
  concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
266
281
  allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
267
282
  enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
268
- compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
283
+ compression_type: The compression type of the BED file. If not specified, it will be detected automatically.
269
284
  max_retries: The maximum number of retries for reading the file from object storage.
270
285
  timeout: The timeout in seconds for reading the file from object storage.
271
286
 
polars_bio/utils.py CHANGED
@@ -12,9 +12,11 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
12
12
  return [x.strip() for x in t]
13
13
 
14
14
 
15
- def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
15
+ def _lazy_scan(
16
+ df: Union[pl.DataFrame, pl.LazyFrame], projection_pushdown: bool = False
17
+ ) -> pl.LazyFrame:
16
18
  df_lazy: DataFrame = df
17
- arrow_schema = df_lazy.schema()
19
+ original_schema = df_lazy.schema()
18
20
 
19
21
  def _overlap_source(
20
22
  with_columns: Union[pl.Expr, None],
@@ -22,25 +24,101 @@ def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
22
24
  n_rows: Union[int, None],
23
25
  _batch_size: Union[int, None],
24
26
  ) -> Iterator[pl.DataFrame]:
27
+ # Extract column names from with_columns if projection pushdown is enabled
28
+ projected_columns = None
29
+ if projection_pushdown and with_columns is not None:
30
+ projected_columns = _extract_column_names_from_expr(with_columns)
31
+
32
+ # Apply column projection to DataFusion query if enabled
33
+ query_df = df_lazy
34
+ datafusion_projection_applied = False
35
+ if projection_pushdown and projected_columns:
36
+ try:
37
+ query_df = df_lazy.select(projected_columns)
38
+ datafusion_projection_applied = True
39
+
40
+ # For testing: allow inspection of the execution plan
41
+ if hasattr(df_lazy, "_test_projection_capture"):
42
+ df_lazy._test_projection_capture = {
43
+ "original_plan": str(df_lazy.optimized_logical_plan()),
44
+ "projected_plan": str(query_df.optimized_logical_plan()),
45
+ "projected_columns": projected_columns,
46
+ "datafusion_projection_applied": True,
47
+ }
48
+
49
+ except Exception as e:
50
+ # Fallback to original behavior if projection fails
51
+ query_df = df_lazy
52
+ projected_columns = None
53
+ datafusion_projection_applied = False
54
+
55
+ # For testing: capture the failure
56
+ if hasattr(df_lazy, "_test_projection_capture"):
57
+ df_lazy._test_projection_capture = {
58
+ "original_plan": str(df_lazy.optimized_logical_plan()),
59
+ "projected_plan": None,
60
+ "projected_columns": projected_columns,
61
+ "datafusion_projection_applied": False,
62
+ "error": str(e),
63
+ }
64
+
25
65
  if n_rows and n_rows < 8192: # 8192 is the default batch size in datafusion
26
- df = df_lazy.limit(n_rows).execute_stream().next().to_pyarrow()
66
+ df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
27
67
  df = pl.DataFrame(df).limit(n_rows)
28
68
  if predicate is not None:
29
69
  df = df.filter(predicate)
30
- if with_columns is not None:
70
+ # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
71
+ if with_columns is not None and (
72
+ not projection_pushdown or not datafusion_projection_applied
73
+ ):
31
74
  df = df.select(with_columns)
32
75
  yield df
33
76
  return
34
- df_stream = df_lazy.execute_stream()
77
+
78
+ df_stream = query_df.execute_stream()
35
79
  progress_bar = tqdm(unit="rows")
36
80
  for r in df_stream:
37
81
  py_df = r.to_pyarrow()
38
82
  df = pl.DataFrame(py_df)
39
83
  if predicate is not None:
40
84
  df = df.filter(predicate)
41
- if with_columns is not None:
85
+ # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
86
+ if with_columns is not None and (
87
+ not projection_pushdown or not datafusion_projection_applied
88
+ ):
42
89
  df = df.select(with_columns)
43
90
  progress_bar.update(len(df))
44
91
  yield df
45
92
 
46
- return register_io_source(_overlap_source, schema=arrow_schema)
93
+ return register_io_source(_overlap_source, schema=original_schema)
94
+
95
+
96
+ def _extract_column_names_from_expr(with_columns: Union[pl.Expr, list]) -> list[str]:
97
+ """Extract column names from Polars expressions."""
98
+ if with_columns is None:
99
+ return []
100
+
101
+ # Handle different types of with_columns input
102
+ if hasattr(with_columns, "__iter__") and not isinstance(with_columns, str):
103
+ # It's a list of expressions or strings
104
+ column_names = []
105
+ for item in with_columns:
106
+ if isinstance(item, str):
107
+ column_names.append(item)
108
+ elif hasattr(item, "meta") and hasattr(item.meta, "output_name"):
109
+ # Polars expression with output name
110
+ try:
111
+ column_names.append(item.meta.output_name())
112
+ except Exception:
113
+ pass
114
+ return column_names
115
+ elif isinstance(with_columns, str):
116
+ return [with_columns]
117
+ elif hasattr(with_columns, "meta") and hasattr(with_columns.meta, "output_name"):
118
+ # Single Polars expression
119
+ try:
120
+ return [with_columns.meta.output_name()]
121
+ except Exception:
122
+ pass
123
+
124
+ return []
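The new `_lazy_scan` path above implements projection pushdown in two stages: it derives a plain list of column names from whatever `with_columns` the Polars IO plugin hands it (via `_extract_column_names_from_expr`), tries to apply that list with a DataFusion-level `select`, and only re-applies the Python-level `df.select(with_columns)` when the pushdown could not be applied. The sketch below exercises the name-extraction helper directly; it imports a private function, so it is illustrative only and assumes the 0.14.0 wheel is installed.

```python
import polars as pl

from polars_bio.utils import _extract_column_names_from_expr

# A list of plain column names passes through unchanged.
assert _extract_column_names_from_expr(["chrom", "start", "end"]) == ["chrom", "start", "end"]

# A single string is wrapped into a one-element list.
assert _extract_column_names_from_expr("chrom") == ["chrom"]

# Polars expressions inside a list are resolved through expr.meta.output_name().
exprs = [pl.col("chrom"), pl.col("start")]
assert _extract_column_names_from_expr(exprs) == ["chrom", "start"]

# None means "no projection requested", so no pushdown is attempted downstream.
assert _extract_column_names_from_expr(None) == []
```

Because the fallback branch still applies `with_columns` at the Python level, a failed or skipped pushdown only affects performance, never the result set.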
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polars-bio
3
- Version: 0.13.0
3
+ Version: 0.14.0
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -9,6 +9,7 @@ Requires-Dist: pyarrow~=21.0.0
9
9
  Requires-Dist: datafusion~=48.0.0
10
10
  Requires-Dist: tqdm~=4.67.1
11
11
  Requires-Dist: typing-extensions~=4.14.0
12
+ Requires-Dist: mkdocs-glightbox>=0.5.1,<0.6.0
12
13
  Requires-Dist: pandas ; extra == 'pandas'
13
14
  Requires-Dist: bioframe ; extra == 'viz'
14
15
  Requires-Dist: matplotlib ; extra == 'viz'
@@ -0,0 +1,19 @@
1
+ polars_bio-0.14.0.dist-info/METADATA,sha256=f6XqkJ12QrWensCi-C5g5rGWRMz9oS5BIcKy0VAA2uI,729
2
+ polars_bio-0.14.0.dist-info/WHEEL,sha256=DLqF2HZq4W_umZdP6RnfAuqhmtX_UrV4mkqrSIMhipE,102
3
+ polars_bio-0.14.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
+ polars_bio/__init__.py,sha256=OuGpBDjpIjypj2obhiawe0PQemFhnqZULYn2sFRksoc,2977
5
+ polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
6
+ polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
7
+ polars_bio/interval_op_helpers.py,sha256=xMWxu2y3jIwt0KCtzIPF_cvbUMdhrb8Mif74MbHU1qY,2834
8
+ polars_bio/io.py,sha256=81HSyif7bJUtkkAOuoOGy8aKGXbolzje4pdAstwhODg,51004
9
+ polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
10
+ polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
11
+ polars_bio/polars_bio.abi3.so,sha256=fWWXTSQM2srM-Wx-sSVqLf5xWxq5pbhtSJvCFOScAe4,263702800
12
+ polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
13
+ polars_bio/range_op.py,sha256=awmzuCfsy19osJ-M3UwTCr2zT2oSsJtzwl5I3KcB5aI,25693
14
+ polars_bio/range_op_helpers.py,sha256=pgia2ewu9IzZMMcNvxoeHdaJmBdxVyhSxpHPM6Vc7lw,6040
15
+ polars_bio/range_op_io.py,sha256=Cs30bagbiJvmjebDaD1go9WIFlSlXeLgmmr5tHvZTII,7076
16
+ polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
17
+ polars_bio/sql.py,sha256=L3uwHEOT6BNmKmJVBD-8Mm0iWqrDyKLVkOwFzV4UlBw,24517
18
+ polars_bio/utils.py,sha256=AFrVVGpTwrhwhbVApbra2fH7wqo2IaPNMIwi796P-hQ,4972
19
+ polars_bio-0.14.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: maturin (1.9.3)
2
+ Generator: maturin (1.9.4)
3
3
  Root-Is-Purelib: false
4
4
  Tag: cp39-abi3-macosx_11_0_arm64
@@ -1,19 +0,0 @@
1
- polars_bio-0.13.0.dist-info/METADATA,sha256=kRbYeTHKR-qtdAq4pD5bf8k1iUyHSriIAbUq3IUOO9o,683
2
- polars_bio-0.13.0.dist-info/WHEEL,sha256=Ch1JWYj7gYwqKz7d9H_Lp0Bb9PFEomQpnode7eVvsxU,102
3
- polars_bio-0.13.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
- polars_bio/__init__.py,sha256=-4QHwzijNcm99A372_-pLCK-F3YrmO2my-HZgVjYEr8,2977
5
- polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
6
- polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
7
- polars_bio/interval_op_helpers.py,sha256=xMWxu2y3jIwt0KCtzIPF_cvbUMdhrb8Mif74MbHU1qY,2834
8
- polars_bio/io.py,sha256=YtcNqS0pzeTRZ78ckov4nfNekvWCyz5JGSHVl7LxfFQ,37866
9
- polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
10
- polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
11
- polars_bio/polars_bio.abi3.so,sha256=zhziZVyfe1LJfFURg_E6e2X1t4WDetvLIna6NGTe1Nk,263400864
12
- polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
13
- polars_bio/range_op.py,sha256=3LAYTmbJhv7WY8eB7_OJfPLLoR9eonbZSFKkZi_Dp30,24300
14
- polars_bio/range_op_helpers.py,sha256=RQw6ZgIGhDh-3-pUTIQ56Vypuy9XQhpFGKQYGd_vrzY,5792
15
- polars_bio/range_op_io.py,sha256=Kcxv9ebQtM25OWNITol1AXpDDMDzIi9uHeI1FhhP0Lk,5717
16
- polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
17
- polars_bio/sql.py,sha256=ORvSleiwUpkpewvgcFA3GeuyZhQXToq9RZ_XrO6iGxw,24164
18
- polars_bio/utils.py,sha256=KAq8tbIf6yBFhRwzrRLBUfM6zbbdCqK_NYK5bUy1qfA,1565
19
- polars_bio-0.13.0.dist-info/RECORD,,