polars-bio 0.12.0-cp39-abi3-macosx_10_12_x86_64.whl → 0.13.0-cp39-abi3-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_bio/__init__.py +75 -32
- polars_bio/interval_op_helpers.py +1 -5
- polars_bio/io.py +291 -286
- polars_bio/polars_bio.abi3.so +0 -0
- polars_bio/range_op.py +2 -16
- polars_bio/range_op_helpers.py +0 -13
- polars_bio/sql.py +4 -8
- polars_bio/utils.py +46 -0
- {polars_bio-0.12.0.dist-info → polars_bio-0.13.0.dist-info}/METADATA +2 -2
- polars_bio-0.13.0.dist-info/RECORD +19 -0
- polars_bio-0.12.0.dist-info/RECORD +0 -18
- {polars_bio-0.12.0.dist-info → polars_bio-0.13.0.dist-info}/WHEEL +0 -0
- {polars_bio-0.12.0.dist-info → polars_bio-0.13.0.dist-info}/licenses/LICENSE +0 -0
polars_bio/__init__.py
CHANGED
@@ -1,39 +1,53 @@
+import os
+
+# Set POLARS_FORCE_NEW_STREAMING to "1" by default if not already set
+if "POLARS_FORCE_NEW_STREAMING" not in os.environ:
+    os.environ["POLARS_FORCE_NEW_STREAMING"] = "1"
+
 from polars_bio.polars_bio import GffReadOptions, InputFormat
 from polars_bio.polars_bio import PyObjectStorageOptions as ObjectStorageOptions
 from polars_bio.polars_bio import ReadOptions, VcfReadOptions
 
 from .context import ctx, set_option
-from .sql import SQL
-
-register_gff =
-register_vcf =
-register_fastq =
-register_bam =
-register_bed =
-register_view =
-
-sql =
-
-from .io import IOOperations
-
-describe_vcf =
-from_polars =
-read_bam =
-read_fastq =
-read_gff =
-read_table =
-read_vcf =
-read_fastq =
-read_bed =
-read_fasta =
-
-
-
-
-
-
-
-
+from .sql import SQL as data_processing
+
+register_gff = data_processing.register_gff
+register_vcf = data_processing.register_vcf
+register_fastq = data_processing.register_fastq
+register_bam = data_processing.register_bam
+register_bed = data_processing.register_bed
+register_view = data_processing.register_view
+
+sql = data_processing.sql
+
+from .io import IOOperations as data_input
+
+describe_vcf = data_input.describe_vcf
+from_polars = data_input.from_polars
+read_bam = data_input.read_bam
+read_fastq = data_input.read_fastq
+read_gff = data_input.read_gff
+read_table = data_input.read_table
+read_vcf = data_input.read_vcf
+read_fastq = data_input.read_fastq
+read_bed = data_input.read_bed
+read_fasta = data_input.read_fasta
+scan_bam = data_input.scan_bam
+scan_bed = data_input.scan_bed
+scan_fasta = data_input.scan_fasta
+scan_fastq = data_input.scan_fastq
+scan_gff = data_input.scan_gff
+scan_table = data_input.scan_table
+scan_vcf = data_input.scan_vcf
+
+
+from .range_op import IntervalOperations as range_operations
+
+overlap = range_operations.overlap
+nearest = range_operations.nearest
+count_overlaps = range_operations.count_overlaps
+coverage = range_operations.coverage
+merge = range_operations.merge
 
 try:
     from .range_utils import Utils
@@ -59,7 +73,7 @@ except ImportError:
 POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
 
 
-__version__ = "0.
+__version__ = "0.13.0"
 __all__ = [
     "ctx",
     "FilterOp",
@@ -74,4 +88,33 @@ __all__ = [
     "ObjectStorageOptions",
     "set_option",
     "set_loglevel",
+    "describe_vcf",
+    "from_polars",
+    "read_bam",
+    "read_bed",
+    "read_fasta",
+    "read_fastq",
+    "read_gff",
+    "read_table",
+    "read_vcf",
+    "scan_bam",
+    "scan_bed",
+    "scan_fasta",
+    "scan_fastq",
+    "scan_gff",
+    "scan_table",
+    "scan_vcf",
+    "register_gff",
+    "register_vcf",
+    "register_fastq",
+    "register_bam",
+    "register_bed",
+    "register_view",
+    "sql",
+    "overlap",
+    "nearest",
+    "count_overlaps",
+    "coverage",
+    "merge",
+    "visualize_intervals",
 ]
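The re-exports above flatten the previous class-based API into module-level functions. A minimal usage sketch of the 0.13.0 surface; the file paths are placeholders, not files from this diff, and passing in-memory frames to the interval operations is an assumption:

```python
import polars_bio as pb

# Lazy readers: scan_* returns a polars LazyFrame; no data is read until .collect().
lf = pb.scan_vcf("/tmp/example.vcf.gz")                     # placeholder path
head = lf.select(["chrom", "start", "end"]).limit(5).collect()

# Eager readers: per io.py below, read_* is the matching scan_* followed by .collect().
df = pb.read_bed("/tmp/example.bed")                        # placeholder path

# Interval operations are re-exported at module level as well (overlap, nearest, ...).
hits = pb.overlap(df, df, output_type="polars.DataFrame")   # illustrative call
```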
polars_bio/interval_op_helpers.py
CHANGED
@@ -81,12 +81,8 @@ def df_to_lazyframe(df: datafusion.DataFrame) -> pl.LazyFrame:
 
 
 def convert_result(
-    df: datafusion.DataFrame, output_type: str
+    df: datafusion.DataFrame, output_type: str
 ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame"]:
-    # TODO: implement streaming
-    if streaming:
-        # raise NotImplementedError("streaming is not implemented")
-        return df.to_polars().lazy()
     if output_type == "polars.DataFrame":
         return df.to_polars()
     elif output_type == "pandas.DataFrame":
polars_bio/io.py
CHANGED
@@ -70,29 +70,6 @@ SCHEMAS = {
 
 
 class IOOperations:
-    # TODO handling reference
-    # def read_cram(path: str) -> pl.LazyFrame:
-    #     """
-    #     Read a CRAM file into a LazyFrame.
-    #
-    #     Parameters:
-    #         path: The path to the CRAM file.
-    #     """
-    #     return file_lazy_scan(path, InputFormat.Cram)
-
-    # TODO passing of bam_region_filter
-    # def read_indexed_bam(path: str) -> pl.LazyFrame:
-    #     """
-    #     Read an indexed BAM file into a LazyFrame.
-    #
-    #     Parameters:
-    #         path: The path to the BAM file.
-    #
-    #     !!! warning
-    #         Predicate pushdown is not supported yet. So no real benefit from using an indexed BAM file.
-    #     """
-    #     return file_lazy_scan(path, InputFormat.IndexedBam)
-
     @staticmethod
     def read_fasta(
         path: str,
@@ -103,11 +80,10 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
-
-    ) -> Union[pl.LazyFrame, pl.DataFrame]:
+    ) -> pl.DataFrame:
         """
 
-        Read a FASTA file into a
+        Read a FASTA file into a DataFrame.
 
         Parameters:
             path: The path to the FASTA file.
@@ -118,7 +94,6 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
-            streaming: Whether to read the FASTA file in streaming mode.
 
         !!! Example
             ```shell
@@ -127,7 +102,63 @@ class IOOperations:
 
             ```python
             import polars_bio as pb
-            pb.read_fasta("/tmp/test.fasta").limit(1)
+            pb.read_fasta("/tmp/test.fasta").limit(1)
+            ```
+            ```shell
+            shape: (1, 3)
+            ┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
+            │ name ┆ description ┆ sequence │
+            │ --- ┆ --- ┆ --- │
+            │ str ┆ str ┆ str │
+            ╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
+            │ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
+            └─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
+            ```
+        """
+        return IOOperations.scan_fasta(
+            path,
+            chunk_size,
+            concurrent_fetches,
+            allow_anonymous,
+            enable_request_payer,
+            max_retries,
+            timeout,
+            compression_type,
+        ).collect()
+
+    @staticmethod
+    def scan_fasta(
+        path: str,
+        chunk_size: int = 8,
+        concurrent_fetches: int = 1,
+        allow_anonymous: bool = True,
+        enable_request_payer: bool = False,
+        max_retries: int = 5,
+        timeout: int = 300,
+        compression_type: str = "auto",
+    ) -> pl.LazyFrame:
+        """
+
+        Lazily read a FASTA file into a LazyFrame.
+
+        Parameters:
+            path: The path to the FASTA file.
+            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
+            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
+            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
+            max_retries: The maximum number of retries for reading the file from object storage.
+            timeout: The timeout in seconds for reading the file from object storage.
+            compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
+
+        !!! Example
+            ```shell
+            wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta
+            ```
+
+            ```python
+            import polars_bio as pb
+            pb.scan_fasta("/tmp/test.fasta").limit(1).collect()
             ```
             ```shell
             shape: (1, 3)
@@ -153,11 +184,7 @@ class IOOperations:
             object_storage_options=object_storage_options
         )
         read_options = ReadOptions(fasta_read_options=fasta_read_options)
-
-        return read_file(path, InputFormat.Fasta, read_options, streaming)
-        else:
-            df = read_file(path, InputFormat.Fasta, read_options)
-            return lazy_scan(df)
+        return _read_file(path, InputFormat.Fasta, read_options)
 
     @staticmethod
     def read_vcf(
@@ -171,10 +198,53 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
-
-    ) -> Union[pl.LazyFrame, pl.DataFrame]:
+    ) -> pl.DataFrame:
         """
-        Read a VCF file into a
+        Read a VCF file into a DataFrame.
+
+        Parameters:
+            path: The path to the VCF file.
+            info_fields: The fields to read from the INFO column.
+            thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
+            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
+            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
+            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
+            max_retries: The maximum number of retries for reading the file from object storage.
+            timeout: The timeout in seconds for reading the file from object storage.
+            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
+
+        !!! note
+            VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
+        """
+        return IOOperations.scan_vcf(
+            path,
+            info_fields,
+            thread_num,
+            chunk_size,
+            concurrent_fetches,
+            allow_anonymous,
+            enable_request_payer,
+            max_retries,
+            timeout,
+            compression_type,
+        ).collect()
+
+    @staticmethod
+    def scan_vcf(
+        path: str,
+        info_fields: Union[list[str], None] = None,
+        thread_num: int = 1,
+        chunk_size: int = 8,
+        concurrent_fetches: int = 1,
+        allow_anonymous: bool = True,
+        enable_request_payer: bool = False,
+        max_retries: int = 5,
+        timeout: int = 300,
+        compression_type: str = "auto",
+    ) -> pl.LazyFrame:
+        """
+        Lazily read a VCF file into a LazyFrame.
 
         Parameters:
             path: The path to the VCF file.
@@ -187,7 +257,6 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
-            streaming: Whether to read the VCF file in streaming mode.
 
         !!! note
             VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
@@ -208,11 +277,7 @@ class IOOperations:
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(vcf_read_options=vcf_read_options)
-
-        return read_file(path, InputFormat.Vcf, read_options, streaming)
-        else:
-            df = read_file(path, InputFormat.Vcf, read_options)
-            return lazy_scan(df)
+        return _read_file(path, InputFormat.Vcf, read_options)
 
     @staticmethod
     def read_gff(
@@ -226,10 +291,9 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
-
-    ) -> Union[pl.LazyFrame, pl.DataFrame]:
+    ) -> pl.DataFrame:
         """
-        Read a GFF file into a
+        Read a GFF file into a DataFrame.
 
         Parameters:
             path: The path to the GFF file.
@@ -242,58 +306,51 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
-            streaming: Whether to read the GFF file in streaming mode.
 
+        !!! note
+            GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
+        """
+        return IOOperations.scan_gff(
+            path,
+            attr_fields,
+            thread_num,
+            chunk_size,
+            concurrent_fetches,
+            allow_anonymous,
+            enable_request_payer,
+            max_retries,
+            timeout,
+            compression_type,
+        ).collect()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            │ chrom ┆ start ┆ end ┆ type ┆ … ┆ score ┆ strand ┆ phase ┆ attributes │
-            │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
-            │ str ┆ u32 ┆ u32 ┆ str ┆ ┆ f32 ┆ str ┆ u32 ┆ list[struct[2]] │
-            ╞═══════╪═══════╪═══════╪════════════╪═══╪═══════╪════════╪═══════╪═════════════════════════════════╡
-            │ chr1 ┆ 11869 ┆ 14409 ┆ gene ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","ENSG00000223972.5"}, {… │
-            │ chr1 ┆ 11869 ┆ 14409 ┆ transcript ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","ENST00000456328.2"}, {… │
-            │ chr1 ┆ 11869 ┆ 12227 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
-            │ chr1 ┆ 12613 ┆ 12721 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
-            │ chr1 ┆ 13221 ┆ 14409 ┆ exon ┆ … ┆ null ┆ + ┆ null ┆ [{"ID","exon:ENST00000456328.2… │
-            └───────┴───────┴───────┴────────────┴───┴───────┴────────┴───────┴─────────────────────────────────┘
-
-            ```
+    @staticmethod
+    def scan_gff(
+        path: str,
+        attr_fields: Union[list[str], None] = None,
+        thread_num: int = 1,
+        chunk_size: int = 8,
+        concurrent_fetches: int = 1,
+        allow_anonymous: bool = True,
+        enable_request_payer: bool = False,
+        max_retries: int = 5,
+        timeout: int = 300,
+        compression_type: str = "auto",
+    ) -> pl.LazyFrame:
+        """
+        Lazily read a GFF file into a LazyFrame.
 
-
-
-
-
-
-
-
+        Parameters:
+            path: The path to the GFF file.
+            attr_fields: The fields to unnest from the `attributes` column. If not specified, all fields swill be rendered as `attributes` column containing an array of structures `{'tag':'xxx', 'value':'yyy'}`.
+            thread_num: The number of threads to use for reading the GFF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
+            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
+            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
+            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
+            max_retries: The maximum number of retries for reading the file from object storage.
+            timeout: The timeout in seconds for reading the file from object storage.
+            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
 
-            shape: (5, 10)
-            ┌───────┬───────┬───────┬────────────┬───┬────────┬───────┬──────────────────────────┬──────────────────────┐
-            │ chrom ┆ start ┆ end ┆ type ┆ … ┆ strand ┆ phase ┆ ID ┆ havana_transcript │
-            │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
-            │ str ┆ u32 ┆ u32 ┆ str ┆ ┆ str ┆ u32 ┆ str ┆ str │
-            ╞═══════╪═══════╪═══════╪════════════╪═══╪════════╪═══════╪══════════════════════════╪══════════════════════╡
-            │ chr1 ┆ 11869 ┆ 14409 ┆ gene ┆ … ┆ + ┆ null ┆ ENSG00000223972.5 ┆ null │
-            │ chr1 ┆ 11869 ┆ 14409 ┆ transcript ┆ … ┆ + ┆ null ┆ ENST00000456328.2 ┆ OTTHUMT00000362751.1 │
-            │ chr1 ┆ 11869 ┆ 12227 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:1 ┆ OTTHUMT00000362751.1 │
-            │ chr1 ┆ 12613 ┆ 12721 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:2 ┆ OTTHUMT00000362751.1 │
-            │ chr1 ┆ 13221 ┆ 14409 ┆ exon ┆ … ┆ + ┆ null ┆ exon:ENST00000456328.2:3 ┆ OTTHUMT00000362751.1 │
-            └───────┴───────┴───────┴────────────┴───┴────────┴───────┴──────────────────────────┴──────────────────────┘
-            ```
         !!! note
             GFF reader uses **1-based** coordinate system for the `start` and `end` columns.
         """
@@ -313,11 +370,7 @@ class IOOperations:
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(gff_read_options=gff_read_options)
-
-        return read_file(path, InputFormat.Gff, read_options, streaming)
-        else:
-            df = read_file(path, InputFormat.Gff, read_options)
-            return lazy_scan(df)
+        return _read_file(path, InputFormat.Gff, read_options)
 
     @staticmethod
     def read_bam(
@@ -329,10 +382,9 @@ class IOOperations:
         enable_request_payer: bool = False,
         max_retries: int = 5,
         timeout: int = 300,
-
-    ) -> Union[pl.LazyFrame, pl.DataFrame]:
+    ) -> pl.DataFrame:
         """
-        Read a BAM file into a
+        Read a BAM file into a DataFrame.
 
         Parameters:
             path: The path to the BAM file.
@@ -343,33 +395,44 @@ class IOOperations:
             enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
-            streaming: Whether to read the BAM file in streaming mode.
 
-        !!!
+        !!! note
+            BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
+        """
+        return IOOperations.scan_bam(
+            path,
+            thread_num,
+            chunk_size,
+            concurrent_fetches,
+            allow_anonymous,
+            enable_request_payer,
+            max_retries,
+            timeout,
+        ).collect()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            │ SRR062634.9882510 ┆ chr1 ┆ 10001 ┆ 10044 ┆ … ┆ chr1 ┆ 10069 ┆ TAACCCTAACCCTACCCTAACCCTAACCCT… ┆ 0<>=/0E:7;08FBDIF9;2%=<>+FCDDA… │
-            │ SRR062641.21956756 ┆ chr1 ┆ 10001 ┆ 10049 ┆ … ┆ chr1 ┆ 10051 ┆ TAACCCTACCCTAACCCTAACCCTAACCCT… ┆ 0=MLOOPNNPPJHPOQQROQPQQRIQPRJB… │
-            │ SRR062641.13613107 ┆ chr1 ┆ 10002 ┆ 10072 ┆ … ┆ chr1 ┆ 10110 ┆ AACCCTAACCCCTAACCCCTAACCCCTAAC… ┆ 0KKNPQOQOQIQRPQPRRRRPQPRRRRPRF… │
-            └────────────────────┴───────┴───────┴───────┴───┴────────────┴────────────┴─────────────────────────────────┴─────────────────────────────────┘
-            ```
+    @staticmethod
+    def scan_bam(
+        path: str,
+        thread_num: int = 1,
+        chunk_size: int = 8,
+        concurrent_fetches: int = 1,
+        allow_anonymous: bool = True,
+        enable_request_payer: bool = False,
+        max_retries: int = 5,
+        timeout: int = 300,
+    ) -> pl.LazyFrame:
+        """
+        Lazily read a BAM file into a LazyFrame.
 
-
-
-
-
+        Parameters:
+            path: The path to the BAM file.
+            thread_num: The number of threads to use for reading the BAM file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
+            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
+            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
+            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
+            max_retries: The maximum number of retries for reading the file from object storage.
+            timeout: The timeout in seconds for reading the file from object storage.
 
         !!! note
             BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.
@@ -389,11 +452,7 @@ class IOOperations:
             object_storage_options=object_storage_options,
         )
         read_options = ReadOptions(bam_read_options=bam_read_options)
-
-        return read_file(path, InputFormat.Bam, read_options, streaming)
-        else:
-            df = read_file(path, InputFormat.Bam, read_options)
-            return lazy_scan(df)
+        return _read_file(path, InputFormat.Bam, read_options)
 
     @staticmethod
     def read_fastq(
@@ -405,11 +464,10 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
-        streaming: bool = False,
         parallel: bool = False,
-    ) ->
+    ) -> pl.DataFrame:
         """
-        Read a FASTQ file into a
+        Read a FASTQ file into a DataFrame.
 
         Parameters:
             path: The path to the FASTQ file.
@@ -420,44 +478,46 @@ class IOOperations:
             max_retries: The maximum number of retries for reading the file from object storage.
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
-            streaming: Whether to read the FASTQ file in streaming mode.
             parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
+        """
+        return IOOperations.scan_fastq(
+            path,
+            chunk_size,
+            concurrent_fetches,
+            allow_anonymous,
+            enable_request_payer,
+            max_retries,
+            timeout,
+            compression_type,
+            parallel,
+        ).collect()
 
-
-
-
-
-
-
-
-
-
-
-
-
-            ╞═════════════════════╪═════════════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
-            │ ERR194146.812444541 ┆ HSQ1008:141:D0CC8ACXX:2:1204:1… ┆ TGGAAGGTTCTCGAAAAAAATGGAATCGAA… ┆ ?@;DDBDDBHF??FFB@B)1:CD3*:?DFF… │
-            └─────────────────────┴─────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
-
-            ```
-
-            Parallel reading of BZGF compressed FASTQ files stored locally:
-            ```shell
-            ls -1 /tmp/ERR194146.fastq.bgz*
-            ERR194146.fastq.bgz
-            ERR194146.fastq.bgz.gzi
-            ```
-
-            ```python
-            import polars_bio as pb
-            ## Set the number of target partitions (threads) to 2
-            pb.set_option("datafusion.execution.target_partitions", "2")
-            pb.read_fastq("/tmp/ERR194146.fastq.bgz", parallel=True).count().collect()
-            ```
-
-
+    @staticmethod
+    def scan_fastq(
+        path: str,
+        chunk_size: int = 8,
+        concurrent_fetches: int = 1,
+        allow_anonymous: bool = True,
+        enable_request_payer: bool = False,
+        max_retries: int = 5,
+        timeout: int = 300,
+        compression_type: str = "auto",
+        parallel: bool = False,
+    ) -> pl.LazyFrame:
         """
+        Lazily read a FASTQ file into a LazyFrame.
 
+        Parameters:
+            path: The path to the FASTQ file.
+            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
+            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
+            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
+            max_retries: The maximum number of retries for reading the file from object storage.
+            timeout: The timeout in seconds for reading the file from object storage.
+            compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
+            parallel: Whether to use the parallel reader for BGZF compressed files stored **locally**. GZI index is **required**.
+        """
         object_storage_options = PyObjectStorageOptions(
             allow_anonymous=allow_anonymous,
             enable_request_payer=enable_request_payer,
@@ -472,11 +532,7 @@ class IOOperations:
             object_storage_options=object_storage_options, parallel=parallel
         )
         read_options = ReadOptions(fastq_read_options=fastq_read_options)
-
-        return read_file(path, InputFormat.Fastq, read_options, streaming)
-        else:
-            df = read_file(path, InputFormat.Fastq, read_options)
-            return lazy_scan(df)
+        return _read_file(path, InputFormat.Fastq, read_options)
 
     @staticmethod
     def read_bed(
@@ -489,10 +545,9 @@ class IOOperations:
         max_retries: int = 5,
         timeout: int = 300,
         compression_type: str = "auto",
-
-    ) -> Union[pl.LazyFrame, pl.DataFrame]:
+    ) -> pl.DataFrame:
         """
-        Read a BED file into a
+        Read a BED file into a DataFrame.
 
         Parameters:
             path: The path to the BED file.
@@ -504,44 +559,59 @@ class IOOperations:
            max_retries: The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compressions is supported ('bgz').
-           streaming: Whether to read the BED file in streaming mode.
 
        !!! Note
            Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
            Also unlike other text formats, **GZIP** compression is not supported.
 
-       !!!
-
+       !!! note
+           BED reader uses **1-based** coordinate system for the `start`, `end`.
+       """
+       return IOOperations.scan_bed(
+           path,
+           thread_num,
+           chunk_size,
+           concurrent_fetches,
+           allow_anonymous,
+           enable_request_payer,
+           max_retries,
+           timeout,
+           compression_type,
+       ).collect()
 
-
-
-
-
+    @staticmethod
+    def scan_bed(
+        path: str,
+        thread_num: int = 1,
+        chunk_size: int = 8,
+        concurrent_fetches: int = 1,
+        allow_anonymous: bool = True,
+        enable_request_payer: bool = False,
+        max_retries: int = 5,
+        timeout: int = 300,
+        compression_type: str = "auto",
+    ) -> pl.LazyFrame:
+        """
+        Lazily read a BED file into a LazyFrame.
 
-
-
-
-
+        Parameters:
+            path: The path to the BED file.
+            thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
+            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
+            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
+            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
+            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
+            max_retries: The maximum number of retries for reading the file from object storage.
+            timeout: The timeout in seconds for reading the file from object storage.
+            compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compressions is supported ('bgz').
 
-
+        !!! Note
+            Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
+            Also unlike other text formats, **GZIP** compression is not supported.
 
-            shape: (5, 4)
-            ┌───────┬───────────┬───────────┬───────┐
-            │ chrom ┆ start ┆ end ┆ name │
-            │ --- ┆ --- ┆ --- ┆ --- │
-            │ str ┆ u32 ┆ u32 ┆ str │
-            ╞═══════╪═══════════╪═══════════╪═══════╡
-            │ chr5 ┆ 28900001 ┆ 42500000 ┆ FRA5A │
-            │ chr5 ┆ 92300001 ┆ 98200000 ┆ FRA5B │
-            │ chr5 ┆ 130600001 ┆ 136200000 ┆ FRA5C │
-            │ chr5 ┆ 92300001 ┆ 93916228 ┆ FRA5D │
-            │ chr5 ┆ 18400001 ┆ 28900000 ┆ FRA5E │
-            └───────┴───────────┴───────────┴───────┘
-            ```
        !!! note
            BED reader uses **1-based** coordinate system for the `start`, `end`.
        """
-
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
@@ -557,24 +627,31 @@ class IOOperations:
            object_storage_options=object_storage_options,
        )
        read_options = ReadOptions(bed_read_options=bed_read_options)
-
-       return read_file(path, InputFormat.Bed, read_options, streaming)
-       else:
-           df = read_file(path, InputFormat.Bed, read_options)
-           return lazy_scan(df)
+       return _read_file(path, InputFormat.Bed, read_options)
 
    @staticmethod
-   def read_table(path: str, schema: Dict = None, **kwargs) -> pl.
+   def read_table(path: str, schema: Dict = None, **kwargs) -> pl.DataFrame:
        """
-       Read a tab-delimited (i.e. BED) file into a Polars
+       Read a tab-delimited (i.e. BED) file into a Polars DataFrame.
        Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
-       but faster
+       but faster. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
 
        Parameters:
            path: The path to the file.
            schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
+       """
+       return IOOperations.scan_table(path, schema, **kwargs).collect()
 
+   @staticmethod
+   def scan_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
+       """
+       Lazily read a tab-delimited (i.e. BED) file into a Polars LazyFrame.
+       Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
+       but faster and lazy. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
 
+       Parameters:
+           path: The path to the file.
+           schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
        """
        df = pl.scan_csv(path, separator="\t", has_header=False, **kwargs)
        if schema is not None:
@@ -602,30 +679,6 @@ class IOOperations:
            allow_anonymous: Whether to allow anonymous access to object storage (GCS and S3 supported).
            enable_request_payer: Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
-
-       !!! Example
-           ```python
-           import polars_bio as pb
-           vcf_1 = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
-           pb.describe_vcf(vcf_1, allow_anonymous=True).sort("name").limit(5)
-           ```
-
-           ```shell
-           shape: (5, 3)
-           ┌───────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────┐
-           │ name ┆ type ┆ description │
-           │ --- ┆ --- ┆ --- │
-           │ str ┆ str ┆ str │
-           ╞═══════════╪═════════╪══════════════════════════════════════════════════════════════════════════════╡
-           │ AC ┆ Integer ┆ Number of non-reference alleles observed (biallelic sites only). │
-           │ AC_XX ┆ Integer ┆ Number of non-reference XX alleles observed (biallelic sites only). │
-           │ AC_XY ┆ Integer ┆ Number of non-reference XY alleles observed (biallelic sites only). │
-           │ AC_afr ┆ Integer ┆ Number of non-reference African-American alleles observed (biallelic sites only). │
-           │ AC_afr_XX ┆ Integer ┆ Number of non-reference African-American XX alleles observed (biallelic sites only). │
-           └───────────┴─────────┴──────────────────────────────────────────────────────────────────────────────────────┘
-
-
-           ```
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
@@ -646,30 +699,6 @@ class IOOperations:
        Parameters:
            name: The name of the table.
            df: The Polars DataFrame.
-       !!! Example
-           ```python
-           import polars as pl
-           import polars_bio as pb
-           df = pl.DataFrame({
-               "a": [1, 2, 3],
-               "b": [4, 5, 6]
-           })
-           pb.from_polars("test_df", df)
-           pb.sql("SELECT * FROM test_df").collect()
-           ```
-           ```shell
-           3rows [00:00, 2978.91rows/s]
-           shape: (3, 2)
-           ┌─────┬─────┐
-           │ a ┆ b │
-           │ --- ┆ --- │
-           │ i64 ┆ i64 │
-           ╞═════╪═════╡
-           │ 1 ┆ 4 │
-           │ 2 ┆ 5 │
-           │ 3 ┆ 6 │
-           └─────┴─────┘
-           ```
        """
        reader = (
            df.to_arrow()
@@ -685,7 +714,7 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
     return [x.strip() for x in t]
 
 
-def
+def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
     df_lazy: DataFrame = df
     arrow_schema = df_lazy.schema()
 
@@ -700,8 +729,6 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
             df = pl.DataFrame(df).limit(n_rows)
             if predicate is not None:
                 df = df.filter(predicate)
-            # TODO: We can push columns down to the DataFusion plan in the future,
-            # but for now we'll do it here.
             if with_columns is not None:
                 df = df.select(with_columns)
             yield df
@@ -713,8 +740,6 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
             df = pl.DataFrame(py_df)
             if predicate is not None:
                 df = df.filter(predicate)
-            # TODO: We can push columns down to the DataFusion plan in the future,
-            # but for now we'll do it here.
             if with_columns is not None:
                 df = df.select(with_columns)
             progress_bar.update(len(df))
@@ -723,31 +748,11 @@ def lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
     return register_io_source(_overlap_source, schema=arrow_schema)
 
 
-def
+def _read_file(
     path: str,
     input_format: InputFormat,
     read_options: ReadOptions,
-
-) -> Union[pl.LazyFrame, pl.DataFrame]:
-    """
-    Read a file into a DataFrame.
-
-    Parameters
-    ----------
-    path : str
-        The path to the file.
-    input_format : InputFormat
-        The input format of the file.
-    read_options : ReadOptions, e.g. VcfReadOptions
-    streaming: Whether to read the file in streaming mode.
-
-    Returns
-    -------
-    pl.DataFrame
-        The DataFrame.
-    """
+) -> pl.LazyFrame:
     table = py_register_table(ctx, path, None, input_format, read_options)
-
-
-    else:
-        return py_read_table(ctx, table.name)
+    df = py_read_table(ctx, table.name)
+    return _lazy_scan(df)
polars_bio/polars_bio.abi3.so
CHANGED
Binary file
polars_bio/range_op.py
CHANGED
@@ -46,7 +46,6 @@ class IntervalOperations:
         cols2: Union[list[str], None] = ["chrom", "start", "end"],
         algorithm: str = "Coitrees",
         output_type: str = "polars.LazyFrame",
-        streaming: bool = False,
         read_options1: Union[ReadOptions, None] = None,
         read_options2: Union[ReadOptions, None] = None,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
@@ -66,7 +65,6 @@ class IntervalOperations:
             on_cols: List of additional column names to join on. default is None.
             algorithm: The algorithm to use for the overlap operation. Available options: Coitrees, IntervalTree, ArrayIntervalTree, Lapper, SuperIntervals
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
-            streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
             read_options1: Additional options for reading the input files.
             read_options2: Additional options for reading the input files.
 
@@ -122,7 +120,6 @@ class IntervalOperations:
             columns_1=cols1,
             columns_2=cols2,
             overlap_alg=algorithm,
-            streaming=streaming,
         )
 
         return range_operation(
@@ -139,7 +136,6 @@ class IntervalOperations:
         cols1: Union[list[str], None] = ["chrom", "start", "end"],
         cols2: Union[list[str], None] = ["chrom", "start", "end"],
         output_type: str = "polars.LazyFrame",
-        streaming: bool = False,
         read_options: Union[ReadOptions, None] = None,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
@@ -157,7 +153,6 @@ class IntervalOperations:
             suffixes: Suffixes for the columns of the two overlapped sets.
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
-            streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
             read_options: Additional options for reading the input files.
 
 
@@ -186,7 +181,6 @@ class IntervalOperations:
             suffixes=suffixes,
             columns_1=cols1,
             columns_2=cols2,
-            streaming=streaming,
         )
         return range_operation(df1, df2, range_options, output_type, ctx, read_options)
 
@@ -200,7 +194,6 @@ class IntervalOperations:
         cols1: Union[list[str], None] = ["chrom", "start", "end"],
         cols2: Union[list[str], None] = ["chrom", "start", "end"],
         output_type: str = "polars.LazyFrame",
-        streaming: bool = False,
         read_options: Union[ReadOptions, None] = None,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
@@ -218,7 +211,6 @@ class IntervalOperations:
             suffixes: Suffixes for the columns of the two overlapped sets.
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
-            streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
             read_options: Additional options for reading the input files.
 
 
@@ -252,7 +244,6 @@ class IntervalOperations:
             suffixes=suffixes,
             columns_1=cols1,
             columns_2=cols2,
-            streaming=streaming,
         )
         return range_operation(df2, df1, range_options, output_type, ctx, read_options)
 
@@ -266,7 +257,6 @@ class IntervalOperations:
         cols2: Union[list[str], None] = ["chrom", "start", "end"],
         on_cols: Union[list[str], None] = None,
         output_type: str = "polars.LazyFrame",
-        streaming: bool = False,
         naive_query: bool = True,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
@@ -285,7 +275,6 @@ class IntervalOperations:
             on_cols: List of additional column names to join on. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
             naive_query: If True, use naive query for counting overlaps based on overlaps.
-            streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
         Returns:
             **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
 
@@ -335,7 +324,6 @@ class IntervalOperations:
             suffixes=suffixes,
             columns_1=cols1,
             columns_2=cols2,
-            streaming=streaming,
         )
         return range_operation(df2, df1, range_options, output_type, ctx)
         df1 = read_df_to_datafusion(my_ctx, df1)
@@ -423,7 +411,7 @@ class IntervalOperations:
             )
         )
 
-        return convert_result(df, output_type
+        return convert_result(df, output_type)
 
     @staticmethod
     def merge(
@@ -433,7 +421,6 @@ class IntervalOperations:
         cols: Union[list[str], None] = ["chrom", "start", "end"],
         on_cols: Union[list[str], None] = None,
         output_type: str = "polars.LazyFrame",
-        streaming: bool = False,
     ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
         """
         Merge overlapping intervals. It is assumed that start < end.
@@ -446,7 +433,6 @@ class IntervalOperations:
             genomic intervals, provided separately for each set.
             on_cols: List of additional column names for clustering. default is None.
             output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
-            streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
 
         Returns:
             **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.
@@ -574,4 +560,4 @@ class IntervalOperations:
             )
         )
 
-        return convert_result(result, output_type
+        return convert_result(result, output_type)
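Every streaming keyword argument is dropped from the interval operations; laziness is now governed solely by output_type. A hedged calling sketch with small in-memory frames (column names follow the cols1/cols2 defaults in the signatures above; passing DataFrames directly is assumed here rather than taken from the diff):

```python
import polars as pl
import polars_bio as pb

a = pl.DataFrame({"chrom": ["chr1"], "start": [100], "end": [200]})
b = pl.DataFrame({"chrom": ["chr1"], "start": [150], "end": [250]})

# Default output_type="polars.LazyFrame": nothing runs until .collect().
overlaps = pb.overlap(a, b).collect()

# Other output types remain available, e.g. an eager Polars DataFrame.
counts = pb.count_overlaps(a, b, output_type="polars.DataFrame")
```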
polars_bio/range_op_helpers.py
CHANGED
@@ -44,19 +44,6 @@ def range_operation(
         len(supported_exts.intersection(ext2)) > 0 or len(ext2) == 0
     ), "Dataframe2 must be a Parquet, a BED or CSV or VCF file"
     # use suffixes to avoid column name conflicts
-    if range_options.streaming:
-        # FIXME: Parallelism is not supported
-        # FIXME: StringViews not supported yet see: https://datafusion.apache.org/blog/2024/12/14/datafusion-python-43.1.0/
-
-        ctx.set_option("datafusion.execution.target_partitions", "1", False)
-        ctx.set_option(
-            "datafusion.execution.parquet.schema_force_view_types", "false", True
-        )
-        return stream_wrapper(
-            stream_range_operation_scan(
-                ctx, df1, df2, range_options, read_options1, read_options2
-            )
-        )
 
     if range_options.range_op == RangeOp.CountOverlapsNaive:
         ## add count column to the schema
polars_bio/sql.py
CHANGED
@@ -22,7 +22,7 @@ from polars_bio.polars_bio import (
 )
 
 from .context import ctx
-from .io import _cleanse_fields,
+from .io import _cleanse_fields, _lazy_scan
 from .range_op_helpers import stream_wrapper
 
 
@@ -436,13 +436,12 @@ class SQL:
         py_register_table(ctx, path, name, InputFormat.Bam, read_options)
 
     @staticmethod
-    def sql(query: str
+    def sql(query: str) -> pl.LazyFrame:
         """
         Execute a SQL query on the registered tables.
 
         Parameters:
             query: The SQL query.
-            streaming: Whether to execute the query in streaming mode.
 
         !!! Example
            ```python
@@ -451,8 +450,5 @@ class SQL:
            pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
            ```
        """
-
-
-        else:
-            df = py_read_sql(ctx, query)
-            return lazy_scan(df)
+        df = py_read_sql(ctx, query)
+        return _lazy_scan(df)
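SQL.sql now always builds its result through _lazy_scan and returns a LazyFrame. The example removed from the from_polars docstring in io.py still illustrates the flow: register an in-memory table, query it, collect.

```python
import polars as pl
import polars_bio as pb

df = pl.DataFrame({
    "a": [1, 2, 3],
    "b": [4, 5, 6]
})
pb.from_polars("test_df", df)               # register the DataFrame as a table
pb.sql("SELECT * FROM test_df").collect()   # sql() is lazy; collect() executes it
```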
polars_bio/utils.py
ADDED
@@ -0,0 +1,46 @@
+from typing import Iterator, Union
+
+import polars as pl
+from datafusion import DataFrame
+from polars.io.plugins import register_io_source
+from tqdm.auto import tqdm
+
+
+def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
+    if t is None:
+        return None
+    return [x.strip() for x in t]
+
+
+def _lazy_scan(df: Union[pl.DataFrame, pl.LazyFrame]) -> pl.LazyFrame:
+    df_lazy: DataFrame = df
+    arrow_schema = df_lazy.schema()
+
+    def _overlap_source(
+        with_columns: Union[pl.Expr, None],
+        predicate: Union[pl.Expr, None],
+        n_rows: Union[int, None],
+        _batch_size: Union[int, None],
+    ) -> Iterator[pl.DataFrame]:
+        if n_rows and n_rows < 8192:  # 8192 is the default batch size in datafusion
+            df = df_lazy.limit(n_rows).execute_stream().next().to_pyarrow()
+            df = pl.DataFrame(df).limit(n_rows)
+            if predicate is not None:
+                df = df.filter(predicate)
+            if with_columns is not None:
+                df = df.select(with_columns)
+            yield df
+            return
+        df_stream = df_lazy.execute_stream()
+        progress_bar = tqdm(unit="rows")
+        for r in df_stream:
+            py_df = r.to_pyarrow()
+            df = pl.DataFrame(py_df)
+            if predicate is not None:
+                df = df.filter(predicate)
+            if with_columns is not None:
+                df = df.select(with_columns)
+            progress_bar.update(len(df))
+            yield df
+
+    return register_io_source(_overlap_source, schema=arrow_schema)
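Because _lazy_scan registers a Polars IO plugin around the DataFusion frame, small limits are served from a single batch (the n_rows < 8192 branch) while full scans stream batch by batch behind a tqdm progress bar, applying predicate and projection per batch. A consumer-side sketch with a placeholder path:

```python
import polars as pl
import polars_bio as pb

lf = pb.scan_gff("/tmp/example.gff.bgz")     # placeholder path

# Served from the first DataFusion batch when fewer than 8192 rows are requested.
preview = lf.limit(5).collect()

# Streams every batch, filtering and projecting per batch while updating the bar.
chr1 = lf.filter(pl.col("chrom") == "chr1").select(["chrom", "start", "end"]).collect()
```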
{polars_bio-0.12.0.dist-info → polars_bio-0.13.0.dist-info}/METADATA
CHANGED
@@ -1,10 +1,10 @@
 Metadata-Version: 2.4
 Name: polars-bio
-Version: 0.
+Version: 0.13.0
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
-Requires-Dist: polars~=1.
+Requires-Dist: polars~=1.29.0
 Requires-Dist: pyarrow~=21.0.0
 Requires-Dist: datafusion~=48.0.0
 Requires-Dist: tqdm~=4.67.1
polars_bio-0.13.0.dist-info/RECORD
ADDED
@@ -0,0 +1,19 @@
+polars_bio-0.13.0.dist-info/METADATA,sha256=kRbYeTHKR-qtdAq4pD5bf8k1iUyHSriIAbUq3IUOO9o,683
+polars_bio-0.13.0.dist-info/WHEEL,sha256=TiMJekJwYXi-5FCpHPqncJXv9UVKDzSHt4YRv5UDSSg,104
+polars_bio-0.13.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+polars_bio/__init__.py,sha256=-4QHwzijNcm99A372_-pLCK-F3YrmO2my-HZgVjYEr8,2977
+polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
+polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
+polars_bio/interval_op_helpers.py,sha256=xMWxu2y3jIwt0KCtzIPF_cvbUMdhrb8Mif74MbHU1qY,2834
+polars_bio/io.py,sha256=YtcNqS0pzeTRZ78ckov4nfNekvWCyz5JGSHVl7LxfFQ,37866
+polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
+polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
+polars_bio/polars_bio.abi3.so,sha256=x8hfvc1l_jRS8o0h0-fbmrb5nexx5URRCfsNSIE7b-I,277132364
+polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
+polars_bio/range_op.py,sha256=3LAYTmbJhv7WY8eB7_OJfPLLoR9eonbZSFKkZi_Dp30,24300
+polars_bio/range_op_helpers.py,sha256=RQw6ZgIGhDh-3-pUTIQ56Vypuy9XQhpFGKQYGd_vrzY,5792
+polars_bio/range_op_io.py,sha256=Kcxv9ebQtM25OWNITol1AXpDDMDzIi9uHeI1FhhP0Lk,5717
+polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
+polars_bio/sql.py,sha256=ORvSleiwUpkpewvgcFA3GeuyZhQXToq9RZ_XrO6iGxw,24164
+polars_bio/utils.py,sha256=KAq8tbIf6yBFhRwzrRLBUfM6zbbdCqK_NYK5bUy1qfA,1565
+polars_bio-0.13.0.dist-info/RECORD,,
polars_bio-0.12.0.dist-info/RECORD
REMOVED
@@ -1,18 +0,0 @@
-polars_bio-0.12.0.dist-info/METADATA,sha256=v04PZhV3EQNyoizWP0J2vbn7gzmWMwStUsSm69EdFMc,683
-polars_bio-0.12.0.dist-info/WHEEL,sha256=TiMJekJwYXi-5FCpHPqncJXv9UVKDzSHt4YRv5UDSSg,104
-polars_bio-0.12.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-polars_bio/__init__.py,sha256=t4McGJwYKf1BRS_AUZHdUnEhzz34_OrKBdyGRsdFbPY,1939
-polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
-polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
-polars_bio/interval_op_helpers.py,sha256=3xg6IEpfyTPb3y1QzkSVGFLhtFVypGQfDhbJEGdHpgo,3006
-polars_bio/io.py,sha256=9y9fYO_xZN1Efc7JJ0_G1SrbQ7xqT5HEfip9h1WyrrI,41285
-polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
-polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
-polars_bio/polars_bio.abi3.so,sha256=XSzH_QB9R_Oije8yjH8RNbS5qOK0Ohh-DY5l3XNMjhc,268636036
-polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
-polars_bio/range_op.py,sha256=xLpbomQGUYnzn2ik3PrtJWYJqN9e1s8tRwN1J7MUOdE,25138
-polars_bio/range_op_helpers.py,sha256=9MRGKhGmx_HnZEWP50tWQ4rdsdhoMf8m-08E0f_YxMs,6407
-polars_bio/range_op_io.py,sha256=Kcxv9ebQtM25OWNITol1AXpDDMDzIi9uHeI1FhhP0Lk,5717
-polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
-polars_bio/sql.py,sha256=PFTHeRyVouPyjSL26w60ByyAKZMLCsaeZ0wiJY_KH2k,24361
-polars_bio-0.12.0.dist-info/RECORD,,
{polars_bio-0.12.0.dist-info → polars_bio-0.13.0.dist-info}/WHEEL
File without changes
{polars_bio-0.12.0.dist-info → polars_bio-0.13.0.dist-info}/licenses/LICENSE
File without changes