polars-bio 0.14.0__tar.gz → 0.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. {polars_bio-0.14.0 → polars_bio-0.15.0}/Cargo.lock +26 -8
  2. {polars_bio-0.14.0 → polars_bio-0.15.0}/Cargo.toml +8 -8
  3. {polars_bio-0.14.0 → polars_bio-0.15.0}/PKG-INFO +1 -1
  4. polars_bio-0.15.0/benchmarks/01_general_performance.py +311 -0
  5. polars_bio-0.15.0/benchmarks/02_memory_profiling.py +489 -0
  6. polars_bio-0.15.0/benchmarks/03_thread_scalability.py +474 -0
  7. polars_bio-0.15.0/benchmarks/04_projection_pruning.py +244 -0
  8. polars_bio-0.15.0/benchmarks/05_predicate_pushdown.py +359 -0
  9. polars_bio-0.15.0/benchmarks/06_combined_optimizations.py +360 -0
  10. polars_bio-0.15.0/benchmarks/gff_parsers.py +119 -0
  11. polars_bio-0.15.0/benchmarks/run_all_benchmarks.py +139 -0
  12. polars_bio-0.15.0/benchmarks/visualize_results.py +573 -0
  13. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/__init__.py +1 -1
  14. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/io.py +425 -180
  15. polars_bio-0.15.0/polars_bio/predicate_translator.py +464 -0
  16. polars_bio-0.15.0/polars_bio/sql_predicate_builder.py +293 -0
  17. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/utils.py +29 -4
  18. {polars_bio-0.14.0 → polars_bio-0.15.0}/pyproject.toml +1 -1
  19. polars_bio-0.15.0/tests/conftest.py +66 -0
  20. polars_bio-0.15.0/tests/data/io/gff/chrY_test_subset.gff3.bgz +0 -0
  21. polars_bio-0.15.0/tests/test_optimization_bug_fix.py +363 -0
  22. polars_bio-0.15.0/tests/test_predicate_in_between.py +66 -0
  23. polars_bio-0.15.0/tests/test_predicate_pushdown.py +666 -0
  24. polars_bio-0.15.0/tests/test_predicate_pushdown_chrY_start.py +84 -0
  25. {polars_bio-0.14.0 → polars_bio-0.15.0}/.github/workflows/publish_documentation.yml +0 -0
  26. {polars_bio-0.14.0 → polars_bio-0.15.0}/.github/workflows/publish_to_pypi.yml +0 -0
  27. {polars_bio-0.14.0 → polars_bio-0.15.0}/.github/workflows/release.yml +0 -0
  28. {polars_bio-0.14.0 → polars_bio-0.15.0}/.gitignore +0 -0
  29. {polars_bio-0.14.0 → polars_bio-0.15.0}/.pre-commit-config.yaml +0 -0
  30. {polars_bio-0.14.0 → polars_bio-0.15.0}/.readthedocs.yaml +0 -0
  31. {polars_bio-0.14.0 → polars_bio-0.15.0}/LICENSE +0 -0
  32. {polars_bio-0.14.0 → polars_bio-0.15.0}/Makefile +0 -0
  33. {polars_bio-0.14.0 → polars_bio-0.15.0}/README.md +0 -0
  34. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/api.md +0 -0
  35. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/count-overlaps-parallel.png +0 -0
  36. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/count-overlaps-single.png +0 -0
  37. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/coverage-parallel.png +0 -0
  38. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/coverage-single.png +0 -0
  39. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/logo-large.png +0 -0
  40. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/logo.png +0 -0
  41. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/bioframe.png +0 -0
  42. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/bioframe_sink.png +0 -0
  43. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/dataframes/polars-bio-overlap-mem.png +0 -0
  44. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/dataframes/polars-bio-overlap-pd-mem.png +0 -0
  45. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/dataframes/polars-bio-overlap-pl-mem.png +0 -0
  46. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/polars-bio.png +0 -0
  47. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/polars-bio_sink.png +0 -0
  48. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/polars-bio_stream_sink.png +0 -0
  49. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/pyranges0.png +0 -0
  50. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/pyranges0_sink.png +0 -0
  51. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/pyranges1.png +0 -0
  52. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/memory/pyranges1_sink.png +0 -0
  53. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/nearest-parallel.png +0 -0
  54. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/nearest-single.png +0 -0
  55. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/overlap-parallel.png +0 -0
  56. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/overlap-single.png +0 -0
  57. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/results-nearest-0.1.1.png +0 -0
  58. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/assets/results-overlap-0.1.1.png +0 -0
  59. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/index.md +0 -0
  60. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/posts/benchmark-operations-2025-09.md +0 -0
  61. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/posts/figures/benchmark-sep-2025/all_operations_walltime_comparison.png +0 -0
  62. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/posts/figures/benchmark-sep-2025/bench-20250-all_operations_speedup_comparison.png +0 -0
  63. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/posts/figures/benchmark-sep-2025/benchmark_comparison_genomicranges_vs_polars_bio.png +0 -0
  64. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/posts/figures/benchmark-sep-2025/benchmark_speedup_comparison_genomicranges_vs_polars_bio.png +0 -0
  65. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/posts/figures/benchmark-sep-2025/combined_benchmark_visualization.png +0 -0
  66. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/posts/figures/benchmark-sep-2025/combined_multi_testcase.png +0 -0
  67. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/blog/posts/figures/benchmark-sep-2025/star-history-202595.png +0 -0
  68. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/contact.md +0 -0
  69. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/faq.md +0 -0
  70. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/features.md +0 -0
  71. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/index.md +0 -0
  72. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/example.bam +0 -0
  73. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/example.bed.bgz +0 -0
  74. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/example.fastq.gz +0 -0
  75. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/example.gff3.bgz +0 -0
  76. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/example.vcf +0 -0
  77. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/exons/.part-00000-47fafbb5-1cab-410c-9461-d10effacf760-c000.snappy.parquet.crc +0 -0
  78. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/exons/.part-00001-47fafbb5-1cab-410c-9461-d10effacf760-c000.snappy.parquet.crc +0 -0
  79. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/exons/_SUCCESS +0 -0
  80. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/exons/part-00000-47fafbb5-1cab-410c-9461-d10effacf760-c000.snappy.parquet +0 -0
  81. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/exons/part-00001-47fafbb5-1cab-410c-9461-d10effacf760-c000.snappy.parquet +0 -0
  82. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/fBrain-DS14718/.part-00000-a0d75244-2d87-41eb-a3eb-a18847c7cb87-c000.snappy.parquet.crc +0 -0
  83. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/fBrain-DS14718/.part-00001-a0d75244-2d87-41eb-a3eb-a18847c7cb87-c000.snappy.parquet.crc +0 -0
  84. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/fBrain-DS14718/_SUCCESS +0 -0
  85. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/fBrain-DS14718/part-00000-a0d75244-2d87-41eb-a3eb-a18847c7cb87-c000.snappy.parquet +0 -0
  86. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/data/fBrain-DS14718/part-00001-a0d75244-2d87-41eb-a3eb-a18847c7cb87-c000.snappy.parquet +0 -0
  87. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/notebooks/tutorial.ipynb +0 -0
  88. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/performance.md +0 -0
  89. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/quickstart.md +0 -0
  90. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/requirements.txt +0 -0
  91. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/supplement.md +0 -0
  92. {polars_bio-0.14.0 → polars_bio-0.15.0}/docs/versions.json +0 -0
  93. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/README.md +0 -0
  94. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/bin/.env +0 -0
  95. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/bin/start.sh +0 -0
  96. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/bin/stop.sh +0 -0
  97. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/data/policy-anonymous.json +0 -0
  98. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/data/policy-priv.json +0 -0
  99. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/data/test.fasta +0 -0
  100. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/data/vep.vcf +0 -0
  101. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/data/vep.vcf.bgz +0 -0
  102. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/docker-compose.yml +0 -0
  103. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/it_ensembl_vcf_bgz.py +0 -0
  104. {polars_bio-0.14.0 → polars_bio-0.15.0}/it/it_object_storage_io.py +0 -0
  105. {polars_bio-0.14.0 → polars_bio-0.15.0}/mkdocs.yml +0 -0
  106. {polars_bio-0.14.0 → polars_bio-0.15.0}/poetry.lock +0 -0
  107. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars-bio.iml +0 -0
  108. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/constants.py +0 -0
  109. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/context.py +0 -0
  110. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/interval_op_helpers.py +0 -0
  111. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/logging.py +0 -0
  112. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/operations.py +0 -0
  113. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/polars_ext.py +0 -0
  114. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/range_op.py +0 -0
  115. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/range_op_helpers.py +0 -0
  116. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/range_op_io.py +0 -0
  117. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/range_utils.py +0 -0
  118. {polars_bio-0.14.0 → polars_bio-0.15.0}/polars_bio/sql.py +0 -0
  119. {polars_bio-0.14.0 → polars_bio-0.15.0}/requirements.txt +0 -0
  120. {polars_bio-0.14.0 → polars_bio-0.15.0}/rust-toolchain.toml +0 -0
  121. {polars_bio-0.14.0 → polars_bio-0.15.0}/rustfmt.toml +0 -0
  122. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/context.rs +0 -0
  123. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/lib.rs +0 -0
  124. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/operation.rs +0 -0
  125. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/option.rs +0 -0
  126. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/query.rs +0 -0
  127. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/scan.rs +0 -0
  128. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/streaming.rs +0 -0
  129. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/udtf.rs +0 -0
  130. {polars_bio-0.14.0 → polars_bio-0.15.0}/src/utils.rs +0 -0
  131. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/_expected.py +0 -0
  132. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/count_overlaps/reads.csv +0 -0
  133. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/count_overlaps/targets.csv +0 -0
  134. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/coverage/reads.csv +0 -0
  135. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/coverage/targets.csv +0 -0
  136. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/exons/.part-00000-47fafbb5-1cab-410c-9461-d10effacf760-c000.snappy.parquet.crc +0 -0
  137. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/exons/.part-00001-47fafbb5-1cab-410c-9461-d10effacf760-c000.snappy.parquet.crc +0 -0
  138. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/exons/_SUCCESS +0 -0
  139. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/exons/part-00000-47fafbb5-1cab-410c-9461-d10effacf760-c000.snappy.parquet +0 -0
  140. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/exons/part-00001-47fafbb5-1cab-410c-9461-d10effacf760-c000.snappy.parquet +0 -0
  141. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/fBrain-DS14718/.part-00000-a0d75244-2d87-41eb-a3eb-a18847c7cb87-c000.snappy.parquet.crc +0 -0
  142. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/fBrain-DS14718/.part-00001-a0d75244-2d87-41eb-a3eb-a18847c7cb87-c000.snappy.parquet.crc +0 -0
  143. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/fBrain-DS14718/_SUCCESS +0 -0
  144. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/fBrain-DS14718/part-00000-a0d75244-2d87-41eb-a3eb-a18847c7cb87-c000.snappy.parquet +0 -0
  145. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/fBrain-DS14718/part-00001-a0d75244-2d87-41eb-a3eb-a18847c7cb87-c000.snappy.parquet +0 -0
  146. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/bam/test.bam +0 -0
  147. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/bam/test.bam.bai +0 -0
  148. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/bed/ENCFF001XKR.bed.gz +0 -0
  149. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/bed/chr16_fragile_site.bed +0 -0
  150. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/bed/chr16_fragile_site.bed.bgz +0 -0
  151. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/bed/test.bed +0 -0
  152. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fasta/test.fasta +0 -0
  153. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fastq/example.fastq +0 -0
  154. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fastq/example.fastq.bgz +0 -0
  155. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fastq/example.fastq.bgz.gzi +0 -0
  156. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fastq/example.fastq.gz +0 -0
  157. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fastq/sample_parallel.fastq.bgz +0 -0
  158. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fastq/sample_parallel.fastq.bgz.gzi +0 -0
  159. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fastq/test.fastq +0 -0
  160. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/fastq/wrong_extension.fastq.gz +0 -0
  161. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/gff/gencode.v38.annotation.gff3 +0 -0
  162. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/gff/gencode.v38.annotation.gff3.bgz +0 -0
  163. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/gff/gencode.v38.annotation.gff3.gz +0 -0
  164. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/gff/wrong_extension.gff3.gz +0 -0
  165. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/vcf/ensembl-2.vcf +0 -0
  166. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/vcf/ensembl.vcf +0 -0
  167. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/vcf/vep.vcf +0 -0
  168. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/vcf/vep.vcf.bgz +0 -0
  169. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/vcf/vep.vcf.gz +0 -0
  170. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/vcf/wrong_extension.vcf.bgz +0 -0
  171. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/io/vcf/wrong_extension.vcf.gz +0 -0
  172. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/merge/input.csv +0 -0
  173. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/nearest/reads.csv +0 -0
  174. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/nearest/targets.csv +0 -0
  175. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/overlap/reads.csv +0 -0
  176. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/data/overlap/targets.csv +0 -0
  177. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_bioframe.py +0 -0
  178. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_ensembl_vcf_parsing.py +0 -0
  179. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_execution_plan_validation.py +0 -0
  180. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_io.py +0 -0
  181. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_native.py +0 -0
  182. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_overlap_algorithms.py +0 -0
  183. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_pandas.py +0 -0
  184. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_parallel_io.py +0 -0
  185. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_polars.py +0 -0
  186. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_polars_bio_projection_validation.py +0 -0
  187. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_polars_ext.py +0 -0
  188. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_projection_performance.py +0 -0
  189. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_projection_pushdown.py +0 -0
  190. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_streaming.py +0 -0
  191. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_vcf_projection_pushdown.py +0 -0
  192. {polars_bio-0.14.0 → polars_bio-0.15.0}/tests/test_warnings.py +0 -0
@@ -1404,7 +1404,7 @@ dependencies = [
1404
1404
  [[package]]
1405
1405
  name = "datafusion-bio-format-bam"
1406
1406
  version = "0.1.0"
1407
- source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=aecd7b316c1c498bff14c2cbb4ac0bd04d21612f#aecd7b316c1c498bff14c2cbb4ac0bd04d21612f"
1407
+ source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=99e811d1eeb5f73b67b0a7ce5ea300f0bf557913#99e811d1eeb5f73b67b0a7ce5ea300f0bf557913"
1408
1408
  dependencies = [
1409
1409
  "async-stream",
1410
1410
  "async-trait",
@@ -1425,7 +1425,7 @@ dependencies = [
1425
1425
  [[package]]
1426
1426
  name = "datafusion-bio-format-bed"
1427
1427
  version = "0.1.0"
1428
- source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=aecd7b316c1c498bff14c2cbb4ac0bd04d21612f#aecd7b316c1c498bff14c2cbb4ac0bd04d21612f"
1428
+ source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=99e811d1eeb5f73b67b0a7ce5ea300f0bf557913#99e811d1eeb5f73b67b0a7ce5ea300f0bf557913"
1429
1429
  dependencies = [
1430
1430
  "async-compression",
1431
1431
  "async-stream",
@@ -1449,7 +1449,7 @@ dependencies = [
1449
1449
  [[package]]
1450
1450
  name = "datafusion-bio-format-core"
1451
1451
  version = "0.1.0"
1452
- source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=aecd7b316c1c498bff14c2cbb4ac0bd04d21612f#aecd7b316c1c498bff14c2cbb4ac0bd04d21612f"
1452
+ source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=99e811d1eeb5f73b67b0a7ce5ea300f0bf557913#99e811d1eeb5f73b67b0a7ce5ea300f0bf557913"
1453
1453
  dependencies = [
1454
1454
  "async-compression",
1455
1455
  "bytes",
@@ -1468,7 +1468,7 @@ dependencies = [
1468
1468
  [[package]]
1469
1469
  name = "datafusion-bio-format-fasta"
1470
1470
  version = "0.1.0"
1471
- source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=aecd7b316c1c498bff14c2cbb4ac0bd04d21612f#aecd7b316c1c498bff14c2cbb4ac0bd04d21612f"
1471
+ source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=99e811d1eeb5f73b67b0a7ce5ea300f0bf557913#99e811d1eeb5f73b67b0a7ce5ea300f0bf557913"
1472
1472
  dependencies = [
1473
1473
  "async-compression",
1474
1474
  "async-stream",
@@ -1490,7 +1490,7 @@ dependencies = [
1490
1490
  [[package]]
1491
1491
  name = "datafusion-bio-format-fastq"
1492
1492
  version = "0.1.0"
1493
- source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=aecd7b316c1c498bff14c2cbb4ac0bd04d21612f#aecd7b316c1c498bff14c2cbb4ac0bd04d21612f"
1493
+ source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=99e811d1eeb5f73b67b0a7ce5ea300f0bf557913#99e811d1eeb5f73b67b0a7ce5ea300f0bf557913"
1494
1494
  dependencies = [
1495
1495
  "async-compression",
1496
1496
  "async-stream",
@@ -1514,7 +1514,7 @@ dependencies = [
1514
1514
  [[package]]
1515
1515
  name = "datafusion-bio-format-gff"
1516
1516
  version = "0.1.0"
1517
- source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=aecd7b316c1c498bff14c2cbb4ac0bd04d21612f#aecd7b316c1c498bff14c2cbb4ac0bd04d21612f"
1517
+ source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=99e811d1eeb5f73b67b0a7ce5ea300f0bf557913#99e811d1eeb5f73b67b0a7ce5ea300f0bf557913"
1518
1518
  dependencies = [
1519
1519
  "async-compression",
1520
1520
  "async-stream",
@@ -1539,7 +1539,7 @@ dependencies = [
1539
1539
  [[package]]
1540
1540
  name = "datafusion-bio-format-vcf"
1541
1541
  version = "0.1.0"
1542
- source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=aecd7b316c1c498bff14c2cbb4ac0bd04d21612f#aecd7b316c1c498bff14c2cbb4ac0bd04d21612f"
1542
+ source = "git+https://github.com/biodatageeks/datafusion-bio-formats.git?rev=99e811d1eeb5f73b67b0a7ce5ea300f0bf557913#99e811d1eeb5f73b67b0a7ce5ea300f0bf557913"
1543
1543
  dependencies = [
1544
1544
  "async-compression",
1545
1545
  "async-stream",
@@ -1554,9 +1554,11 @@ dependencies = [
1554
1554
  "noodles 0.100.0",
1555
1555
  "noodles-bgzf 0.36.0",
1556
1556
  "noodles-vcf 0.80.0",
1557
+ "num_cpus",
1557
1558
  "opendal",
1558
1559
  "tokio",
1559
1560
  "tokio-util",
1561
+ "tracing",
1560
1562
  ]
1561
1563
 
1562
1564
  [[package]]
@@ -2777,6 +2779,12 @@ version = "0.5.0"
2777
2779
  source = "registry+https://github.com/rust-lang/crates.io-index"
2778
2780
  checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
2779
2781
 
2782
+ [[package]]
2783
+ name = "hermit-abi"
2784
+ version = "0.5.2"
2785
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2786
+ checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
2787
+
2780
2788
  [[package]]
2781
2789
  name = "hex"
2782
2790
  version = "0.4.3"
@@ -4184,6 +4192,16 @@ dependencies = [
4184
4192
  "libm",
4185
4193
  ]
4186
4194
 
4195
+ [[package]]
4196
+ name = "num_cpus"
4197
+ version = "1.17.0"
4198
+ source = "registry+https://github.com/rust-lang/crates.io-index"
4199
+ checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
4200
+ dependencies = [
4201
+ "hermit-abi",
4202
+ "libc",
4203
+ ]
4204
+
4187
4205
  [[package]]
4188
4206
  name = "numpy"
4189
4207
  version = "0.24.0"
@@ -5225,7 +5243,7 @@ dependencies = [
5225
5243
 
5226
5244
  [[package]]
5227
5245
  name = "polars_bio"
5228
- version = "0.14.0"
5246
+ version = "0.15.0"
5229
5247
  dependencies = [
5230
5248
  "arrow",
5231
5249
  "arrow-array",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "polars_bio"
3
- version = "0.14.0"
3
+ version = "0.15.0"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -36,13 +36,13 @@ polars-arrow = { git = "https://github.com/mwiewior/polars.git" , rev = "da42ae
36
36
  polars-python = { git = "https://github.com/mwiewior/polars.git" , rev = "da42ae21ca9c25bc14562e36e07cf02eafd620ee"}
37
37
 
38
38
 
39
- datafusion-bio-format-vcf = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "aecd7b316c1c498bff14c2cbb4ac0bd04d21612f" }
40
- datafusion-bio-format-core = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "aecd7b316c1c498bff14c2cbb4ac0bd04d21612f" }
41
- datafusion-bio-format-gff = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "aecd7b316c1c498bff14c2cbb4ac0bd04d21612f" }
42
- datafusion-bio-format-fastq = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "aecd7b316c1c498bff14c2cbb4ac0bd04d21612f" }
43
- datafusion-bio-format-bam = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "aecd7b316c1c498bff14c2cbb4ac0bd04d21612f" }
44
- datafusion-bio-format-bed = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "aecd7b316c1c498bff14c2cbb4ac0bd04d21612f" }
45
- datafusion-bio-format-fasta = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "aecd7b316c1c498bff14c2cbb4ac0bd04d21612f" }
39
+ datafusion-bio-format-vcf = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "99e811d1eeb5f73b67b0a7ce5ea300f0bf557913" }
40
+ datafusion-bio-format-core = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "99e811d1eeb5f73b67b0a7ce5ea300f0bf557913" }
41
+ datafusion-bio-format-gff = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "99e811d1eeb5f73b67b0a7ce5ea300f0bf557913" }
42
+ datafusion-bio-format-fastq = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "99e811d1eeb5f73b67b0a7ce5ea300f0bf557913" }
43
+ datafusion-bio-format-bam = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "99e811d1eeb5f73b67b0a7ce5ea300f0bf557913" }
44
+ datafusion-bio-format-bed = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "99e811d1eeb5f73b67b0a7ce5ea300f0bf557913" }
45
+ datafusion-bio-format-fasta = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "99e811d1eeb5f73b67b0a7ce5ea300f0bf557913" }
46
46
 
47
47
 
48
48
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polars-bio
3
- Version: 0.14.0
3
+ Version: 0.15.0
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -0,0 +1,311 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Benchmark script for general performance comparison between Pandas, Polars, and polars-bio.
4
+ Tests separate read-only and read+filter operations on compressed GFF files.
5
+
6
+ Features:
7
+ - Includes GFF attributes parsing for fair comparison (all libraries use List[Struct{key, value}])
8
+ - Configurable number of runs per benchmark (NUM_RUNS variable)
9
+ - Single-threaded execution for fair comparison
10
+
11
+ Split into two benchmark types:
12
+ 1. Reading only (no filtering) - measures raw I/O performance (single config per library)
13
+ 2. Reading with filtering applied - measures combined I/O + query performance
14
+
15
+ Notes:
16
+ - Polars has projection/predicate pushdown optimizations enabled by default
17
+ - polars-bio explicitly enables both optimizations for best performance comparison
18
+ - pandas doesn't have equivalent optimization concepts (eager evaluation)
19
+ - Detailed optimization testing is available in separate dedicated benchmark scripts
20
+ """
21
+
22
+ import csv
23
+ import os
24
+ import time
25
+ from pathlib import Path
26
+ from typing import Union
27
+
28
+ import polars as pl
29
+ from gff_parsers import pandas_read_gff, polars_scan_gff
30
+
31
+ import polars_bio as pb
32
+
33
# Configuration
# Both values can be overridden from the environment so the benchmark can be
# pointed at another file or repeated more times without editing the script.
# The defaults are the values that were previously hard-coded.
GFF_FILE = os.environ.get(
    "POLARS_BIO_BENCH_GFF_FILE", "/tmp/gencode.v49.annotation.gff3.bgz"
)
# Number of runs per benchmark configuration
NUM_RUNS = int(os.environ.get("POLARS_BIO_BENCH_NUM_RUNS", "1"))
36
+
37
+
38
def benchmark_pandas_read_only():
    """Benchmark pandas reading only (no filtering).

    Returns:
        A ``(elapsed_seconds, row_count)`` tuple, where ``row_count`` is
        ``len()`` of whatever ``pandas_read_gff`` returns for ``GFF_FILE``.
    """
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted mid-benchmark.
    start_time = time.perf_counter()
    df = pandas_read_gff(GFF_FILE)
    read_time = time.perf_counter() - start_time
    return read_time, len(df)
44
+
45
+
46
def benchmark_pandas_read_with_filter():
    """Benchmark pandas reading with filtering applied.

    The timed region covers the read, the row filter and the column
    selection, so the figure is the combined I/O + query cost.

    Returns:
        A ``(elapsed_seconds, filtered_row_count)`` tuple.
    """
    # Monotonic timer: immune to wall-clock adjustments during the run.
    start_time = time.perf_counter()
    df = pandas_read_gff(GFF_FILE)
    filtered = df[
        (df["seqid"] == "chrY") & (df["start"] < 500000) & (df["end"] > 510000)
    ]
    result = filtered[["seqid", "start", "end", "type"]]
    total_time = time.perf_counter() - start_time
    return total_time, len(result)
56
+
57
+
58
def benchmark_polars_read_only():
    """Benchmark vanilla Polars reading only (no filtering).

    Builds the lazy scan via ``polars_scan_gff`` and collects it; both steps
    are inside the timed region.

    Returns:
        A ``(elapsed_seconds, row_count)`` tuple.
    """
    # Monotonic timer: immune to wall-clock adjustments during the run.
    start_time = time.perf_counter()
    lf = polars_scan_gff(GFF_FILE)
    result = lf.collect()
    read_time = time.perf_counter() - start_time
    return read_time, len(result)
65
+
66
+
67
def benchmark_polars_read_with_filter():
    """Benchmark vanilla Polars reading with filtering applied.

    Applies the same chrY / start / end predicate and four-column selection
    as the pandas variant, expressed on the lazy frame before ``collect()``.

    Returns:
        A ``(elapsed_seconds, filtered_row_count)`` tuple.
    """
    # Monotonic timer: immune to wall-clock adjustments during the run.
    start_time = time.perf_counter()
    lf = polars_scan_gff(GFF_FILE)
    result = (
        lf.filter(
            (pl.col("seqid") == "chrY")
            & (pl.col("start") < 500000)
            & (pl.col("end") > 510000)
        )
        .select(["seqid", "start", "end", "type"])
        .collect()
    )
    total_time = time.perf_counter() - start_time
    return total_time, len(result)
82
+
83
+
84
def benchmark_polars_bio_read_only(
    projection_pushdown: bool = False, predicate_pushdown: bool = False
):
    """Benchmark polars-bio reading only (no filtering).

    Args:
        projection_pushdown: Forwarded to ``pb.scan_gff``.
        predicate_pushdown: Forwarded to ``pb.scan_gff``.

    Returns:
        A ``(elapsed_seconds, row_count)`` tuple.
    """
    # Configure the engine before starting the clock so that option setup is
    # not counted as read time.
    pb.set_option("datafusion.execution.target_partitions", "1")
    # Monotonic timer: immune to wall-clock adjustments during the run.
    start_time = time.perf_counter()
    lf = pb.scan_gff(
        GFF_FILE,
        projection_pushdown=projection_pushdown,
        predicate_pushdown=predicate_pushdown,
    )
    result = lf.collect()
    read_time = time.perf_counter() - start_time
    return read_time, len(result)
98
+
99
+
100
def benchmark_polars_bio_read_with_filter(
    projection_pushdown: bool = False, predicate_pushdown: bool = False
):
    """Benchmark polars-bio reading with filtering applied.

    Args:
        projection_pushdown: Forwarded to ``pb.scan_gff``.
        predicate_pushdown: Forwarded to ``pb.scan_gff``.

    Returns:
        A ``(elapsed_seconds, filtered_row_count)`` tuple.
    """
    # NOTE(review): polars is already imported at this point; Polars appears to
    # read POLARS_MAX_THREADS only when its thread pool is first created, so
    # this assignment may have no effect -- confirm and, if so, export the
    # variable before launching the script.
    os.environ["POLARS_MAX_THREADS"] = "1"
    pb.set_option("datafusion.execution.target_partitions", "1")
    # Monotonic timer: immune to wall-clock adjustments during the run.
    start_time = time.perf_counter()
    # Use select().filter() order (not filter().select()) to avoid optimization bug
    lf = pb.scan_gff(
        GFF_FILE,
        projection_pushdown=projection_pushdown,
        predicate_pushdown=predicate_pushdown,
    )
    result = (
        lf.select(["chrom", "start", "end", "type"])
        .filter(
            (pl.col("chrom") == "chrY")
            & (pl.col("start") < 500000)
            & (pl.col("end") > 510000)
        )
        .collect()
    )
    total_time = time.perf_counter() - start_time
    return total_time, len(result)
124
+
125
+
126
# Column order of the CSV written by main().
_RESULT_FIELDS = [
    "library",
    "test_type",
    "projection_pushdown",
    "predicate_pushdown",
    "run",
    "total_time",
    "result_count",
    "threads",
]


def _run_benchmark(library, test_type, runner, pushdown, row_label):
    """Call ``runner`` NUM_RUNS times, print each timing, return result rows."""
    rows = []
    for run in range(1, NUM_RUNS + 1):
        total_time, result_count = runner()
        rows.append(
            {
                "library": library,
                "test_type": test_type,
                "projection_pushdown": pushdown,
                "predicate_pushdown": pushdown,
                "run": run,
                "total_time": total_time,
                "result_count": result_count,
                "threads": 1,
            }
        )
        print(f" Run {run}: {total_time:.3f}s ({result_count} {row_label})")
    return rows


def _write_results(results):
    """Write all result rows to results/general_performance.csv."""
    Path("results").mkdir(exist_ok=True)
    with open(
        "results/general_performance.csv", "w", newline="", encoding="utf-8"
    ) as f:
        writer = csv.DictWriter(f, fieldnames=_RESULT_FIELDS)
        writer.writeheader()
        writer.writerows(results)


def _print_summary(results, test_types):
    """Print the average time per (test type, library) as a tab-aligned table."""
    print("\n=== Summary Statistics ===")
    print("Library\t\t\tTest Type\t\tProj PD\tPred PD\tAvg Time")
    print("-" * 75)

    for test_type in test_types:
        filtered_case = test_type == "read_with_filter"
        for library in ["pandas", "polars", "polars-bio"]:
            times = [
                r["total_time"]
                for r in results
                if r["library"] == library and r["test_type"] == test_type
            ]
            if not times:
                continue
            avg_time = sum(times) / len(times)
            # Pushdown columns only apply to the lazy engines on the filter case.
            pushdown = (
                "Yes\tYes" if filtered_case and library != "pandas" else "N/A\tN/A"
            )
            # The longer "polars-bio" name needs one tab fewer on the filter row.
            gap = "\t\t" if filtered_case and library == "polars-bio" else "\t\t\t"
            # Vanilla Polars applies its pushdowns by default; flag that in the row.
            suffix = " (default)" if filtered_case and library == "polars" else ""
            print(
                f"{library}{gap}{test_type}\t\t{pushdown}\t{avg_time:.3f}s{suffix}"
            )


def main():
    """Run benchmarks and save results.

    Runs every library for both test cases (read only, read with filter),
    writes the per-run rows to ``results/general_performance.csv`` and prints
    a summary of average times.
    """
    # Set single thread for fair comparison
    # NOTE(review): polars is imported before this line runs; the variable may
    # need to be exported before the process starts to take effect -- confirm.
    os.environ["POLARS_MAX_THREADS"] = "1"
    pb.set_option("datafusion.execution.target_partitions", "1")

    print("Running general performance benchmarks...")

    # Test cases: read only and read with filter
    test_cases = [
        ("read_only", "Reading only (no filtering)"),
        ("read_with_filter", "Reading with filtering applied"),
    ]

    # Per test case: (library, banner, zero-argument runner, pushdown flag
    # recorded in the CSV). polars-bio gets a single read-only configuration
    # because the optimizations do not apply without a query.
    plans = {
        "read_only": [
            ("pandas", "Benchmarking Pandas (read only)...",
             benchmark_pandas_read_only, False),
            ("polars", "Benchmarking Polars (read only)...",
             benchmark_polars_read_only, False),
            ("polars-bio",
             "Benchmarking polars-bio (read only, no optimizations needed)...",
             lambda: benchmark_polars_bio_read_only(False, False), False),
        ],
        "read_with_filter": [
            ("pandas", "Benchmarking Pandas (read with filter)...",
             benchmark_pandas_read_with_filter, False),
            ("polars", "Benchmarking Polars (read with filter)...",
             benchmark_polars_read_with_filter, False),
            ("polars-bio",
             "Benchmarking polars-bio (read with filter, both optimizations)...",
             lambda: benchmark_polars_bio_read_with_filter(True, True), True),
        ],
    }

    results = []
    for test_type, description in test_cases:
        print(f"\n=== {description} ===")
        row_label = "rows" if test_type == "read_only" else "filtered rows"
        for library, banner, runner, pushdown in plans[test_type]:
            print(banner)
            results.extend(
                _run_benchmark(library, test_type, runner, pushdown, row_label)
            )

    # Save results
    _write_results(results)
    print("\nResults saved to results/general_performance.csv")

    # Print summary statistics
    _print_summary(results, [test_type for test_type, _ in test_cases])


if __name__ == "__main__":
    main()