polars-bio 0.14.0__cp39-abi3-macosx_11_0_arm64.whl → 0.15.0__cp39-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_bio/__init__.py +1 -1
- polars_bio/io.py +425 -180
- polars_bio/polars_bio.abi3.so +0 -0
- polars_bio/predicate_translator.py +464 -0
- polars_bio/sql_predicate_builder.py +293 -0
- polars_bio/utils.py +29 -4
- {polars_bio-0.14.0.dist-info → polars_bio-0.15.0.dist-info}/METADATA +1 -1
- {polars_bio-0.14.0.dist-info → polars_bio-0.15.0.dist-info}/RECORD +10 -8
- {polars_bio-0.14.0.dist-info → polars_bio-0.15.0.dist-info}/WHEEL +0 -0
- {polars_bio-0.14.0.dist-info → polars_bio-0.15.0.dist-info}/licenses/LICENSE +0 -0
polars_bio/polars_bio.abi3.so
CHANGED
Binary file
|
@@ -0,0 +1,464 @@
|
|
1
|
+
"""
|
2
|
+
Polars to DataFusion predicate translator for GFF table provider.
|
3
|
+
|
4
|
+
This module converts Polars expressions to DataFusion expressions for predicate pushdown optimization.
|
5
|
+
Uses the DataFusion Python DataFrame API instead of SQL string construction for better type safety.
|
6
|
+
|
7
|
+
Supports the following operators based on GFF table provider capabilities:
|
8
|
+
|
9
|
+
| Column | Data Type | Supported Operators | Example |
|
10
|
+
|-----------------------------|-----------|------------------------------|---------------------------------|
|
11
|
+
| chrom, source, type, strand | String | =, !=, IN, NOT IN | chrom = 'chr1' |
|
12
|
+
| start, end | UInt32 | =, !=, <, <=, >, >=, BETWEEN | start > 1000 |
|
13
|
+
| score | Float32 | =, !=, <, <=, >, >=, BETWEEN | score BETWEEN 50.0 AND 100.0 |
|
14
|
+
| Attribute fields | String | =, !=, IN, NOT IN | "ID" = 'gene1' |
|
15
|
+
| Complex | - | AND combinations | chrom = 'chr1' AND start > 1000 |
|
16
|
+
"""
|
17
|
+
|
18
|
+
import re
|
19
|
+
from typing import Any, List, Optional, Union
|
20
|
+
|
21
|
+
import polars as pl
|
22
|
+
from datafusion import col
|
23
|
+
from datafusion import functions as F
|
24
|
+
from datafusion import lit
|
25
|
+
|
26
|
+
# GFF schema column types for validation
|
27
|
+
# Fixed GFF columns grouped by the data type named in the module docstring.
GFF_STRING_COLUMNS = {"chrom", "source", "type", "strand"}
GFF_UINT32_COLUMNS = {"start", "end", "phase"}
GFF_FLOAT32_COLUMNS = {"score"}
# Every fixed column name; any other name is treated as an attribute field
# by the operator validation further down.
GFF_STATIC_COLUMNS = set().union(
    GFF_STRING_COLUMNS,
    GFF_UINT32_COLUMNS,
    GFF_FLOAT32_COLUMNS,
    {"attributes"},
)
|
33
|
+
|
34
|
+
|
35
|
+
class PredicateTranslationError(Exception):
    """Signal that a Polars predicate has no DataFusion expression equivalent."""
|
39
|
+
|
40
|
+
|
41
|
+
def translate_polars_predicate_to_datafusion(predicate: pl.Expr):
    """
    Translate a Polars filter expression into its DataFusion counterpart.

    Args:
        predicate: Polars expression describing the filter condition.

    Returns:
        A DataFusion expression suitable for ``DataFrame.filter()``.

    Raises:
        PredicateTranslationError: If translation fails for any reason; the
            underlying exception is attached as ``__cause__``.

    Examples:
        >>> df_expr = translate_polars_predicate_to_datafusion(pl.col("chrom") == "chr1")
        >>> datafusion_df.filter(df_expr)

        >>> df_expr = translate_polars_predicate_to_datafusion(
        ...     (pl.col("chrom") == "chr1") & (pl.col("start") > 100000)
        ... )
        >>> datafusion_df.filter(df_expr)
    """
    try:
        translated = _translate_polars_expr(predicate)
    except Exception as exc:
        # Every failure, whatever its type, surfaces as one exception class.
        message = f"Cannot translate predicate to DataFusion: {exc}"
        raise PredicateTranslationError(message) from exc
    return translated
|
69
|
+
|
70
|
+
|
71
|
+
def _translate_polars_expr(expr: pl.Expr):
    """
    Translate an expression to DataFusion by dispatching on its text form.

    The expression is converted with ``str()`` and matched against textual
    markers. The order of the checks matters because the markers overlap:

    * Conjunctions are tried first. A text such as
      ``(col("chrom") == "chr1") & (col("start") > 1000)`` also contains a
      comparison operator, and treating it as one comparison would turn
      everything after ``==`` into a single literal.
    * NOT IN is tried before IN, because a negated membership test also
      contains ``.is_in(`` and would otherwise lose its negation.

    Range predicates (``(c >= a) & (c <= b)``) always contain ``" & "`` and
    are therefore translated as a conjunction of two comparisons.

    Args:
        expr: Object whose ``str()`` yields the expression text.

    Returns:
        The translated DataFusion expression.

    Raises:
        PredicateTranslationError: If no translator recognises the text.
    """
    expr_str = str(expr)

    # Conjunctions (this includes range checks written as two comparisons).
    if _is_and_expr(expr_str):
        return _translate_and_expr(expr_str)

    # Binary operations (col op literal).
    if _is_binary_expr(expr_str):
        return _translate_binary_expr(expr_str)

    # Negated membership must be recognised before plain membership.
    if _is_not_in_expr(expr_str):
        return _translate_not_in_expr(expr_str)

    if _is_in_expr(expr_str):
        return _translate_in_expr(expr_str)

    # IS NOT NULL
    if _is_not_null_expr(expr_str):
        return _translate_not_null_expr(expr_str)

    # IS NULL
    if _is_null_expr(expr_str):
        return _translate_null_expr(expr_str)

    raise PredicateTranslationError(f"Unsupported expression type: {expr_str}")
|
105
|
+
|
106
|
+
|
107
|
+
def _is_binary_expr(expr_str: str) -> bool:
    """Report whether the text holds a whitespace-delimited comparison operator."""
    for operator_pattern in (
        r"\s==\s",
        r"\s!=\s",
        r"\s<\s",
        r"\s<=\s",
        r"\s>\s",
        r"\s>=\s",
    ):
        if re.search(operator_pattern, expr_str):
            return True
    return False
|
111
|
+
|
112
|
+
|
113
|
+
def _translate_binary_expr(expr_str: str):
    """
    Translate a single ``column <op> literal`` comparison.

    The candidates are tried in a fixed order with the two-character
    operators ``<=`` and ``>=`` ahead of ``<`` and ``>``.

    Raises:
        PredicateTranslationError: If no comparison pattern matches.
    """
    # (operator symbol, regex splitting the text around it, expression builder)
    comparisons = (
        ("==", r"(.+?)\s==\s(.+)", lambda name, value: col(name) == lit(value)),
        ("!=", r"(.+?)\s!=\s(.+)", lambda name, value: col(name) != lit(value)),
        ("<=", r"(.+?)\s<=\s(.+)", lambda name, value: col(name) <= lit(value)),
        (">=", r"(.+?)\s>=\s(.+)", lambda name, value: col(name) >= lit(value)),
        ("<", r"(.+?)\s<\s(.+)", lambda name, value: col(name) < lit(value)),
        (">", r"(.+?)\s>\s(.+)", lambda name, value: col(name) > lit(value)),
    )

    for symbol, pattern, build in comparisons:
        found = re.search(pattern, expr_str)
        if found is None:
            continue

        lhs_text, rhs_text = (group.strip() for group in found.groups())

        # Resolve both operands before validating, then build the expression.
        column = _extract_column_name(lhs_text)
        value = _extract_literal_value(rhs_text)
        _validate_column_operator(column, symbol)

        return build(column, value)

    raise PredicateTranslationError(f"Cannot parse binary expression: {expr_str}")
|
143
|
+
|
144
|
+
|
145
|
+
def _is_and_expr(expr_str: str) -> bool:
    """Report whether the text contains an ``&`` operator or an ``.and(`` call."""
    return any(marker in expr_str for marker in (" & ", ".and("))
|
148
|
+
|
149
|
+
|
150
|
+
def _strip_outer_parens(text: str) -> str:
    """Remove balanced parenthesis pairs that enclose the whole of *text*."""
    text = text.strip()
    while text.startswith("(") and text.endswith(")"):
        depth = 0
        for index, char in enumerate(text):
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0 and index != len(text) - 1:
                    # The first "(" closes before the end, so the outer
                    # characters are not one wrapping pair: "(a) & (b)".
                    return text
        if depth != 0:
            # Unbalanced input: leave it alone rather than guess.
            return text
        text = text[1:-1].strip()
    return text


def _translate_and_expr(expr_str: str):
    """
    Translate a conjunction written with the ``&`` operator.

    The text is split on every top-level ``" & "`` and each operand is
    translated recursively; the results are combined left to right, so two or
    more operands are accepted.

    Operands are unwrapped with a balanced-pair helper instead of
    ``str.strip("()")``. ``strip`` removes *any* run of the given characters
    from both ends, which also removes the call parentheses of an operand
    such as ``(col("x").is_null())`` or the closing parenthesis of
    ``(col("x").is_in([...]))``.

    Raises:
        PredicateTranslationError: If the text cannot be split into at least
            two operands, or an operand cannot be translated.
    """
    unwrapped = _strip_outer_parens(expr_str)

    if " & " in unwrapped:
        parts = _split_on_main_operator(unwrapped, " & ")
        if len(parts) >= 2:
            # Translate operands in order so errors surface left to right.
            operands = [
                _translate_polars_expr(_create_mock_expr(_strip_outer_parens(part)))
                for part in parts
            ]
            combined = operands[0]
            for operand in operands[1:]:
                combined = combined & operand
            return combined

    raise PredicateTranslationError(f"Cannot parse AND expression: {expr_str}")
|
167
|
+
|
168
|
+
|
169
|
+
def _is_in_expr(expr_str: str) -> bool:
    """Report whether the text contains an ``.is_in(`` call."""
    membership_marker = ".is_in("
    return membership_marker in expr_str
|
172
|
+
|
173
|
+
|
174
|
+
def _translate_in_expr(expr_str: str):
    """
    Translate a membership test of the form ``col("name").is_in([v1, v2])``.

    Returns:
        A DataFusion ``in_list`` expression over the parsed literals.

    Raises:
        PredicateTranslationError: If the text does not match the expected
            ``.is_in([...])`` shape, or the column does not support IN.
    """
    match = re.search(r"(.+?)\.is_in\(\[(.+?)\]\)", expr_str)
    if match is None:
        # Fail loudly instead of falling off the end and returning None,
        # which callers would otherwise try to use as an expression.
        raise PredicateTranslationError(f"Cannot parse IN expression: {expr_str}")

    column = _extract_column_name(match.group(1).strip())
    values = _parse_list_values(match.group(2).strip())

    # Validate column supports IN operation
    _validate_column_operator(column, "IN")

    # Convert values to DataFusion literals
    df_values = [lit(value) for value in values]

    return F.in_list(col(column), df_values)
|
193
|
+
|
194
|
+
|
195
|
+
def _is_not_in_expr(expr_str: str) -> bool:
    """
    Report whether the text is a negated ``is_in`` test.

    Two spellings are recognised after removing all spaces:
    ``~(col("x").is_in([..]))`` and ``col("x").is_in([..]).not()``.
    """
    compact = expr_str.replace(" ", "")
    if ".is_in([" not in compact:
        return False
    tilde_wrapped = compact.startswith("~(") and compact.endswith(")")
    return tilde_wrapped or ").not()" in compact
|
206
|
+
|
207
|
+
|
208
|
+
def _translate_not_in_expr(expr_str: str):
    """
    Translate a negated membership test as the negation of an IN expression.

    A leading ``~(`` with its closing ``)`` is removed first; any other
    spelling is handed to the IN translator unchanged, which searches for the
    ``.is_in([...])`` part inside it.

    Raises:
        PredicateTranslationError: If the inner IN expression cannot be built.
    """
    s = expr_str.strip()
    inner = s[2:-1] if s.startswith("~(") and s.endswith(")") else s

    # Reuse IN translator on inner and negate
    in_expr = _translate_in_expr(inner)
    if in_expr is None:
        # Negating a missing result would raise an unrelated TypeError.
        raise PredicateTranslationError(
            f"Cannot parse NOT IN expression: {expr_str}"
        )
    return ~in_expr
|
220
|
+
|
221
|
+
|
222
|
+
def _is_between_expr(expr_str: str) -> bool:
    """
    Report whether the text looks like a range check.

    A range check is an ``&`` conjunction containing either both inclusive
    operators (``>=`` and ``<=``) or both strict ones (``>`` and ``<``).
    """
    if " & " not in expr_str:
        return False
    inclusive_pair = " >= " in expr_str and " <= " in expr_str
    strict_pair = " > " in expr_str and " < " in expr_str
    return inclusive_pair or strict_pair
|
228
|
+
|
229
|
+
|
230
|
+
def _translate_between_expr(expr_str: str):
    """
    Translate a two-sided range check on a single column.

    The text must be two comparisons joined by ``" & "`` that name the same
    column, one giving a lower bound (``>`` or ``>=``) and the other an upper
    bound (``<`` or ``<=``), in either order.

    ``between`` is used only when both bounds are inclusive, matching the
    ``(col >= a) & (col <= b)`` form this module documents as BETWEEN. A
    strict bound is emitted as an explicit comparison so that the endpoint
    stays excluded.

    Raises:
        PredicateTranslationError: If the text is not a valid range check or
            the column does not support BETWEEN.
    """
    if " & " in expr_str:
        parts = _split_on_main_operator(expr_str, " & ")
        if len(parts) == 2:
            left_part = parts[0].strip().strip("()")
            right_part = parts[1].strip().strip("()")

            # Extract column and values from both parts
            left_col, left_op, left_val = _parse_comparison(left_part)
            right_col, right_op, right_val = _parse_comparison(right_part)

            # Verify same column in both parts
            if left_col == right_col:
                column = left_col

                # Work out which side carries the lower and the upper bound.
                if left_op in (">", ">=") and right_op in ("<", "<="):
                    lower_op, min_val = left_op, left_val
                    upper_op, max_val = right_op, right_val
                elif left_op in ("<", "<=") and right_op in (">", ">="):
                    lower_op, min_val = right_op, right_val
                    upper_op, max_val = left_op, left_val
                else:
                    raise PredicateTranslationError("Invalid BETWEEN pattern")

                # Validate column supports BETWEEN
                _validate_column_operator(column, "BETWEEN")

                if lower_op == ">=" and upper_op == "<=":
                    return col(column).between(lit(min_val), lit(max_val))

                # At least one bound is strict: keep it strict.
                if lower_op == ">=":
                    lower_bound = col(column) >= lit(min_val)
                else:
                    lower_bound = col(column) > lit(min_val)
                if upper_op == "<=":
                    upper_bound = col(column) <= lit(max_val)
                else:
                    upper_bound = col(column) < lit(max_val)
                return lower_bound & upper_bound

    raise PredicateTranslationError(f"Cannot parse BETWEEN expression: {expr_str}")
|
264
|
+
|
265
|
+
|
266
|
+
def _is_not_null_expr(expr_str: str) -> bool:
    """Report whether the text contains an ``.is_not_null()`` call."""
    not_null_marker = ".is_not_null()"
    return not_null_marker in expr_str
|
269
|
+
|
270
|
+
|
271
|
+
def _translate_not_null_expr(expr_str: str):
    """Build ``IS NOT NULL`` for the column named before ``.is_not_null()``."""
    # Everything ahead of the first marker is the column reference.
    column_text, _, _ = expr_str.partition(".is_not_null()")
    name = _extract_column_name(column_text)
    return col(name).is_not_null()
|
276
|
+
|
277
|
+
|
278
|
+
def _is_null_expr(expr_str: str) -> bool:
    """Report whether the text contains an ``.is_null()`` call."""
    null_marker = ".is_null()"
    return null_marker in expr_str
|
281
|
+
|
282
|
+
|
283
|
+
def _translate_null_expr(expr_str: str):
    """Build ``IS NULL`` for the column named before ``.is_null()``."""
    # Everything ahead of the first marker is the column reference.
    column_text, _, _ = expr_str.partition(".is_null()")
    name = _extract_column_name(column_text)
    return col(name).is_null()
|
288
|
+
|
289
|
+
|
290
|
+
# Helper functions
|
291
|
+
|
292
|
+
|
293
|
+
def _extract_column_name(col_expr: str) -> str:
    """
    Pull the column name out of a ``col("name")`` or ``col('name')`` reference.

    The text is searched as given (whitespace-trimmed) and then again with
    leading and trailing parenthesis characters removed. The double-quoted
    form is tried before the single-quoted one.

    Raises:
        PredicateTranslationError: If no ``col(...)`` reference is found.
    """
    trimmed = col_expr.strip()
    candidates = (trimmed, trimmed.strip("()"))
    name_patterns = (r'col\("([^"]+)"\)', r"col\('([^']+)'\)")

    for candidate in candidates:
        for name_pattern in name_patterns:
            hit = re.search(name_pattern, candidate)
            if hit is not None:
                return hit.group(1)

    raise PredicateTranslationError(
        f"Cannot extract column name from: {candidates[-1]}"
    )
|
313
|
+
|
314
|
+
|
315
|
+
def _extract_literal_value(literal_expr: str) -> Any:
    """
    Convert the text of a literal into a Python value.

    Recognised forms, in order:

    1. A string wrapped in matching single or double quotes; the quotes are
       removed. At least two characters are required, so a lone quote
       character is not mistaken for an empty quoted string.
    2. A number: ``float`` when the text contains ``.``, otherwise ``int``.
    3. Exponent notation without a decimal point (for example ``1e5``),
       returned as ``float``.
    4. ``true`` / ``false`` in any letter case.

    Anything else is returned unchanged (whitespace-trimmed) as a string.

    Args:
        literal_expr: Text of the literal.

    Returns:
        The parsed ``str``, ``int``, ``float`` or ``bool``.
    """
    text = literal_expr.strip()

    # Handle string literals
    if len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'"):
        return text[1:-1]

    # Handle numeric literals
    try:
        return float(text) if "." in text else int(text)
    except ValueError:
        pass

    # int() rejects exponent notation, so "1e5" needs an explicit float pass.
    if re.fullmatch(r"[+-]?\d+[eE][+-]?\d+", text):
        return float(text)

    # Handle boolean literals
    lowered = text.lower()
    if lowered == "true":
        return True
    if lowered == "false":
        return False

    return text
|
341
|
+
|
342
|
+
|
343
|
+
def _validate_column_operator(column: str, operator: str) -> None:
    """
    Reject operators that the given column does not support.

    String columns, and any name outside the fixed column set (treated as an
    attribute field), accept ``==``, ``!=``, ``IN`` and ``NOT IN``. The
    UInt32 and Float32 columns accept ``==``, ``!=``, the four ordering
    operators and ``BETWEEN``. Other fixed columns are not checked.

    Raises:
        PredicateTranslationError: If the operator is not allowed.
    """
    # A name that is not a fixed column is handled like a string column.
    string_like = column in GFF_STRING_COLUMNS or column not in GFF_STATIC_COLUMNS
    if string_like:
        if operator in ("==", "!=", "IN", "NOT IN"):
            return
        raise PredicateTranslationError(
            f"Column '{column}' (String) does not support operator '{operator}'. "
            f"Supported: ==, !=, IN, NOT IN"
        )

    numeric = column in GFF_UINT32_COLUMNS or column in GFF_FLOAT32_COLUMNS
    if numeric and operator not in ("==", "!=", "<", "<=", ">", ">=", "BETWEEN"):
        raise PredicateTranslationError(
            f"Column '{column}' (Numeric) does not support operator '{operator}'. "
            f"Supported: ==, !=, <, <=, >, >=, BETWEEN"
        )
|
363
|
+
|
364
|
+
|
365
|
+
def _parse_list_values(values_str: str) -> List[Any]:
    """
    Parse comma-separated literal text into a list of Python values.

    Blank input gives an empty list, and empty items between commas are
    skipped. The text is split on every comma.
    """
    if not values_str.strip():
        return []

    parsed = []
    for raw_item in values_str.split(","):
        token = raw_item.strip()
        if token:
            parsed.append(_extract_literal_value(token))
    return parsed
|
372
|
+
|
373
|
+
|
374
|
+
def _split_on_main_operator(expr_str: str, operator: str) -> List[str]:
    """
    Split text on an operator wherever it occurs outside parentheses.

    Parentheses are counted to track nesting and are kept in the returned
    pieces, so a piece such as ``(col("a") == 1)`` stays intact for further
    parsing. Occurrences of the operator inside parentheses are not split
    points.

    Args:
        expr_str: Text to split.
        operator: Separator text, for example ``" & "``. An empty operator
            never matches, so the text comes back as a single piece.

    Returns:
        The pieces in order; always at least one element.
    """
    parts = []
    current = []
    paren_depth = 0
    width = len(operator)
    i = 0

    while i < len(expr_str):
        char = expr_str[i]
        if char == "(":
            paren_depth += 1
            current.append(char)
        elif char == ")":
            paren_depth -= 1
            current.append(char)
        elif paren_depth == 0 and width and expr_str.startswith(operator, i):
            parts.append("".join(current))
            current = []
            # Jump past the whole operator, not just its first character.
            i += width
            continue
        else:
            current.append(char)
        i += 1

    parts.append("".join(current))
    return parts
|
396
|
+
|
397
|
+
|
398
|
+
def _parse_comparison(comp_str: str) -> tuple:
    """
    Break a comparison into ``(column, operator, value)``.

    Leading and trailing parenthesis characters are removed first. The
    operators are looked for in a fixed order (``>=``, ``<=``, ``>``, ``<``,
    ``==``, ``!=``), each surrounded by single spaces, and the text is split
    at the first occurrence of the first operator found.

    Raises:
        PredicateTranslationError: If none of the operators is present.
    """
    text = comp_str.strip("()")

    for spaced_op in (" >= ", " <= ", " > ", " < ", " == ", " != "):
        lhs, found, rhs = text.partition(spaced_op)
        if not found:
            continue
        column = _extract_column_name(lhs.strip())
        value = _extract_literal_value(rhs.strip())
        return column, spaced_op.strip(), value

    raise PredicateTranslationError(f"Cannot parse comparison: {text}")
|
413
|
+
|
414
|
+
|
415
|
+
def _create_mock_expr(expr_str: str) -> pl.Expr:
    """
    Wrap expression text in an object whose ``str()`` returns that text.

    The recursive translator only calls ``str()`` on its argument, so this
    stand-in lets sub-expression text be fed back into it. The returned
    object is not a real Polars expression.
    """
    trimmed = expr_str.strip()

    class MockExpr:
        # Holds the text and hands it back unchanged from str().
        def __init__(self, expr_str):
            self.expr_str = expr_str

        def __str__(self):
            return self.expr_str

    stand_in = MockExpr(trimmed)
    return stand_in
|
426
|
+
|
427
|
+
|
428
|
+
def is_predicate_pushdown_supported(predicate: pl.Expr) -> bool:
    """
    Tell whether a Polars predicate can be translated for pushdown.

    Args:
        predicate: Polars expression to check.

    Returns:
        True when translation succeeds, False when it raises
        ``PredicateTranslationError``.
    """
    try:
        translate_polars_predicate_to_datafusion(predicate)
    except PredicateTranslationError:
        return False
    else:
        return True
|
443
|
+
|
444
|
+
|
445
|
+
def get_supported_predicates_info() -> str:
    """
    Return a human-readable summary of the supported predicate types.

    Returns:
        A multi-line string with a table of columns, data types and operators,
        followed by example Polars expressions. The text is a fixed literal.
    """
    return """
Supported GFF Predicate Pushdown Operations:

| Column | Data Type | Supported Operators | Example |
|-----------------------------|-----------|------------------------------|---------------------------------|
| chrom, source, type, strand | String | =, !=, IN, NOT IN | chrom = 'chr1' |
| start, end | UInt32 | =, !=, <, <=, >, >=, BETWEEN | start > 1000 |
| score | Float32 | =, !=, <, <=, >, >=, BETWEEN | score BETWEEN 50.0 AND 100.0 |
| Attribute fields | String | =, !=, IN, NOT IN | "ID" = 'gene1' |
| Complex | - | AND combinations | chrom = 'chr1' AND start > 1000 |

Examples:
- pl.col("chrom") == "chr1"
- pl.col("start") > 1000
- pl.col("chrom").is_in(["chr1", "chr2"])
- (pl.col("chrom") == "chr1") & (pl.col("start") > 1000)
- (pl.col("start") >= 1000) & (pl.col("start") <= 2000) # BETWEEN
"""
|