polars-bio 0.14.0__cp39-abi3-win_amd64.whl → 0.15.0__cp39-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polars_bio/polars_bio.pyd CHANGED
Binary file
@@ -0,0 +1,464 @@
1
+ """
2
+ Polars to DataFusion predicate translator for GFF table provider.
3
+
4
+ This module converts Polars expressions to DataFusion expressions for predicate pushdown optimization.
5
+ Uses the DataFusion Python DataFrame API instead of SQL string construction for better type safety.
6
+
7
+ Supports the following operators based on GFF table provider capabilities:
8
+
9
+ | Column | Data Type | Supported Operators | Example |
10
+ |-----------------------------|-----------|------------------------------|---------------------------------|
11
+ | chrom, source, type, strand | String | =, !=, IN, NOT IN | chrom = 'chr1' |
12
+ | start, end | UInt32 | =, !=, <, <=, >, >=, BETWEEN | start > 1000 |
13
+ | score | Float32 | =, !=, <, <=, >, >=, BETWEEN | score BETWEEN 50.0 AND 100.0 |
14
+ | Attribute fields | String | =, !=, IN, NOT IN | "ID" = 'gene1' |
15
+ | Complex | - | AND combinations | chrom = 'chr1' AND start > 1000 |
16
+ """
17
+
18
+ import re
19
+ from typing import Any, List, Optional, Union
20
+
21
+ import polars as pl
22
+ from datafusion import col
23
+ from datafusion import functions as F
24
+ from datafusion import lit
25
+
26
# Column groupings of the GFF schema by data type. They are consulted when
# checking which operators may be applied to a given column.
GFF_STRING_COLUMNS = {"chrom", "source", "type", "strand"}
GFF_UINT32_COLUMNS = {"start", "end", "phase"}
GFF_FLOAT32_COLUMNS = {"score"}
# All fixed schema columns, including the raw "attributes" column. Column
# names outside this set are treated as attribute fields by the validator.
GFF_STATIC_COLUMNS = {"attributes"}.union(
    GFF_STRING_COLUMNS, GFF_UINT32_COLUMNS, GFF_FLOAT32_COLUMNS
)
33
+
34
+
35
class PredicateTranslationError(Exception):
    """Signal that a Polars predicate has no DataFusion expression equivalent."""
39
+
40
+
41
def translate_polars_predicate_to_datafusion(predicate: pl.Expr):
    """
    Translate a Polars filter expression into a DataFusion expression.

    The translation works on the string form of the predicate. Any failure
    raised while translating is re-raised as ``PredicateTranslationError``
    with the original exception attached as its cause.

    Args:
        predicate: Polars expression representing filter conditions

    Returns:
        DataFusion Expr object that can be used with DataFrame.filter()

    Raises:
        PredicateTranslationError: If predicate cannot be translated

    Examples:
        >>> df_expr = translate_polars_predicate_to_datafusion(pl.col("chrom") == "chr1")
        >>> datafusion_df.filter(df_expr)

        >>> df_expr = translate_polars_predicate_to_datafusion(
        ...     (pl.col("chrom") == "chr1") & (pl.col("start") > 100000)
        ... )
        >>> datafusion_df.filter(df_expr)
    """
    try:
        translated = _translate_polars_expr(predicate)
    except Exception as exc:
        # Single public failure type: callers only need to catch
        # PredicateTranslationError, whatever went wrong underneath.
        message = f"Cannot translate predicate to DataFusion: {exc}"
        raise PredicateTranslationError(message) from exc
    return translated
69
+
70
+
71
def _translate_polars_expr(expr: pl.Expr):
    """
    Recursively translate a Polars expression to a DataFusion expression.

    Dispatch is based on the string form of ``expr`` (only ``str(expr)`` is
    used). The order of the checks matters:

    1. Compound ``&`` predicates are handled first. Each operand contains a
       comparison operator, so testing for a plain binary comparison first
       would swallow the whole compound string as a single comparison.
    2. ``NOT IN`` is tested before ``IN`` because a negated ``is_in`` also
       contains ``.is_in(``; testing ``IN`` first would drop the negation.
    3. Any other negated expression is rejected instead of being passed to
       the non-negated handlers, which would ignore the negation.

    Args:
        expr: Object whose ``str()`` is a Polars-style predicate.

    Returns:
        The DataFusion expression produced by the matching handler.

    Raises:
        PredicateTranslationError: If no handler supports the expression.
    """
    expr_str = str(expr)

    # Compound predicates: (a) & (b)
    if _is_and_expr(expr_str):
        # Only purely inclusive ranges are tried as BETWEEN; strict bounds
        # are translated exactly through the generic AND path.
        inclusive_only = " > " not in expr_str and " < " not in expr_str
        if inclusive_only and _is_between_expr(expr_str):
            try:
                return _translate_between_expr(expr_str)
            except PredicateTranslationError:
                # Not a single-column range (e.g. different columns on each
                # side); fall back to a plain AND of both operands.
                pass
        return _translate_and_expr(expr_str)

    # Handle NOT IN operations (negated IN) before plain IN
    if _is_not_in_expr(expr_str):
        return _translate_not_in_expr(expr_str)

    # Refuse remaining negations rather than silently ignoring them.
    stripped = expr_str.strip()
    if stripped.startswith("~") or stripped.endswith(".not()"):
        raise PredicateTranslationError(f"Unsupported negated expression: {expr_str}")

    # Handle binary operations (col op literal)
    if _is_binary_expr(expr_str):
        return _translate_binary_expr(expr_str)

    # Handle IN operations
    if _is_in_expr(expr_str):
        return _translate_in_expr(expr_str)

    # Handle IS NOT NULL (checked before IS NULL, as in the original order)
    if _is_not_null_expr(expr_str):
        return _translate_not_null_expr(expr_str)

    # Handle IS NULL
    if _is_null_expr(expr_str):
        return _translate_null_expr(expr_str)

    raise PredicateTranslationError(f"Unsupported expression type: {expr_str}")
105
+
106
+
107
+ def _is_binary_expr(expr_str: str) -> bool:
108
+ """Check if expression is a binary operation (col op literal)."""
109
+ binary_patterns = [r"\s==\s", r"\s!=\s", r"\s<\s", r"\s<=\s", r"\s>\s", r"\s>=\s"]
110
+ return any(re.search(pattern, expr_str) for pattern in binary_patterns)
111
+
112
+
113
def _translate_binary_expr(expr_str: str):
    """
    Translate a comparison such as ``col == value`` or ``col > value``.

    The leftmost whitespace-delimited comparison operator splits the text
    into a column part and a literal part. Taking the leftmost operator,
    rather than trying ``==`` first over the whole string, keeps operator
    characters inside the right-hand literal from being picked up.

    Args:
        expr_str: String form of the comparison.

    Returns:
        DataFusion comparison between the column and the literal.

    Raises:
        PredicateTranslationError: If no comparison can be parsed, the column
            name cannot be extracted, or the column does not support the
            operator.
    """
    comparisons = {
        "==": lambda name, value: col(name) == lit(value),
        "!=": lambda name, value: col(name) != lit(value),
        "<=": lambda name, value: col(name) <= lit(value),
        ">=": lambda name, value: col(name) >= lit(value),
        "<": lambda name, value: col(name) < lit(value),
        ">": lambda name, value: col(name) > lit(value),
    }

    # The lazy first group makes the match stop at the first operator found.
    match = re.search(r"(.+?)\s(==|!=|<=|>=|<|>)\s(.+)", expr_str)
    if match is None:
        raise PredicateTranslationError(f"Cannot parse binary expression: {expr_str}")

    left_part = match.group(1).strip()
    op_symbol = match.group(2)
    right_part = match.group(3).strip()

    # Extract column name and literal value
    column = _extract_column_name(left_part)
    value = _extract_literal_value(right_part)

    # Validate column and operator combination
    _validate_column_operator(column, op_symbol)

    return comparisons[op_symbol](column, value)
143
+
144
+
145
+ def _is_and_expr(expr_str: str) -> bool:
146
+ """Check if expression is an AND operation."""
147
+ return " & " in expr_str or ".and(" in expr_str
148
+
149
+
150
def _translate_and_expr(expr_str: str):
    """
    Translate an ``&`` conjunction by translating each operand recursively.

    The text is split on top-level `` & `` occurrences. Each operand has one
    balanced pair of enclosing parentheses removed before being translated,
    and the results are combined with ``&``. Only the `` & `` spelling is
    handled here; other spellings raise.

    Args:
        expr_str: String form of the conjunction.

    Returns:
        DataFusion expression that is the AND of all operands.

    Raises:
        PredicateTranslationError: If the text cannot be split into at least
            two operands, or an operand cannot be translated.
    """
    if " & " in expr_str:
        parts = _split_on_main_operator(expr_str, " & ")
        if len(parts) >= 2:
            # Recursively translate every operand, left to right
            translated = [
                _translate_polars_expr(
                    _create_mock_expr(_strip_enclosing_parens(part))
                )
                for part in parts
            ]
            combined = translated[0]
            for operand in translated[1:]:
                combined = combined & operand
            return combined

    raise PredicateTranslationError(f"Cannot parse AND expression: {expr_str}")


def _strip_enclosing_parens(text: str) -> str:
    """Remove parentheses that wrap the whole text, keeping inner ones intact.

    A plain ``str.strip("()")`` would also eat the closing parentheses of a
    trailing call such as ``.is_null()`` or ``.is_in([...])``.
    """
    text = text.strip()
    while len(text) >= 2 and text[0] == "(" and text[-1] == ")":
        depth = 0
        index = 0
        for index, char in enumerate(text):
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0:
                    break
        # Strip only when the first "(" is closed by the very last character.
        if depth != 0 or index != len(text) - 1:
            return text
        text = text[1:-1].strip()
    return text
167
+
168
+
169
+ def _is_in_expr(expr_str: str) -> bool:
170
+ """Check if expression is an IN operation."""
171
+ return ".is_in(" in expr_str
172
+
173
+
174
def _translate_in_expr(expr_str: str):
    """
    Translate IN expressions like ``col.is_in([val1, val2])``.

    Args:
        expr_str: String form containing a ``<column>.is_in([...])`` call.

    Returns:
        DataFusion ``in_list`` expression over the column and parsed values.

    Raises:
        PredicateTranslationError: If the ``is_in([...])`` pattern is not
            found (instead of implicitly returning ``None``), the column
            name cannot be extracted, or the column does not support IN.
    """
    # Parse col("column").is_in([values]) pattern
    match = re.search(r"(.+?)\.is_in\(\[(.+?)\]\)", expr_str)
    if match is None:
        raise PredicateTranslationError(f"Cannot parse IN expression: {expr_str}")

    column = _extract_column_name(match.group(1).strip())
    values = _parse_list_values(match.group(2).strip())

    # Validate column supports IN operation
    _validate_column_operator(column, "IN")

    # Convert values to DataFusion literals
    return F.in_list(col(column), [lit(value) for value in values])
193
+
194
+
195
+ def _is_not_in_expr(expr_str: str) -> bool:
196
+ """Check if expression is a NOT IN operation (negated is_in)."""
197
+ # Common patterns from Polars repr:
198
+ # 1) ~(col("x").is_in([..]))
199
+ # 2) col("x").is_in([..]).not()
200
+ s = expr_str.replace(" ", "")
201
+ return (
202
+ (s.startswith("~(") and ".is_in([" in s and s.endswith(")"))
203
+ or ".is_in([" in s
204
+ and ").not()" in s
205
+ )
206
+
207
+
208
def _translate_not_in_expr(expr_str: str):
    """
    Translate NOT IN expressions as a negated ``in_list``.

    A leading ``~( ... )`` wrapper is removed (whitespace between ``~`` and
    the parenthesis is tolerated). The remaining text is handed to the IN
    translator and its result is inverted. A trailing ``.not()`` needs no
    removal because the IN translator searches for the ``is_in([...])`` call
    inside the text.

    Args:
        expr_str: String form of the negated ``is_in`` expression.

    Returns:
        DataFusion expression equal to the negation of the IN expression.

    Raises:
        PredicateTranslationError: If the inner IN expression cannot be
            translated.
    """
    inner = expr_str.strip()
    if inner.startswith("~"):
        unwrapped = inner[1:].lstrip()
        if unwrapped.startswith("(") and unwrapped.endswith(")"):
            inner = unwrapped[1:-1]

    # Reuse IN translator on inner and negate
    in_expr = _translate_in_expr(inner)
    if in_expr is None:
        # Defensive: never apply "~" to a missing translation result.
        raise PredicateTranslationError(f"Cannot parse NOT IN expression: {expr_str}")
    return ~in_expr
220
+
221
+
222
+ def _is_between_expr(expr_str: str) -> bool:
223
+ """Check if expression represents a BETWEEN operation."""
224
+ # Look for patterns like (col >= val1) & (col <= val2)
225
+ return (" >= " in expr_str and " <= " in expr_str and " & " in expr_str) or (
226
+ " > " in expr_str and " < " in expr_str and " & " in expr_str
227
+ )
228
+
229
+
230
def _translate_between_expr(expr_str: str):
    """
    Translate a two-sided range check on a single column.

    Handles ``(col >= lo) & (col <= hi)`` and the mirrored operand order.
    ``between`` is emitted only when both bounds are inclusive. A strict
    bound (``>`` or ``<``) would have its meaning changed by an inclusive
    BETWEEN, so in that case the two comparisons are kept and combined
    with ``&``.

    Args:
        expr_str: String form of the range predicate.

    Returns:
        DataFusion expression equivalent to the range check.

    Raises:
        PredicateTranslationError: If the text is not two comparisons joined
            by `` & ``, the comparisons use different columns, they do not
            form a lower/upper bound pair, or the column does not support
            BETWEEN.
    """
    # Parse (col >= min_val) & (col <= max_val) pattern
    if " & " in expr_str:
        parts = _split_on_main_operator(expr_str, " & ")
        if len(parts) == 2:
            left_part = parts[0].strip().strip("()")
            right_part = parts[1].strip().strip("()")

            # Extract column and values from both parts
            left_col, left_op, left_val = _parse_comparison(left_part)
            right_col, right_op, right_val = _parse_comparison(right_part)

            # Verify same column in both parts
            if left_col == right_col:
                column = left_col
                lower_ops = (">", ">=")
                upper_ops = ("<", "<=")

                # Determine which side is the lower and which the upper bound
                if left_op in lower_ops and right_op in upper_ops:
                    low_op, low_val = left_op, left_val
                    high_op, high_val = right_op, right_val
                elif left_op in upper_ops and right_op in lower_ops:
                    low_op, low_val = right_op, right_val
                    high_op, high_val = left_op, left_val
                else:
                    raise PredicateTranslationError("Invalid BETWEEN pattern")

                # Validate column supports BETWEEN
                _validate_column_operator(column, "BETWEEN")

                if low_op == ">=" and high_op == "<=":
                    return col(column).between(lit(low_val), lit(high_val))

                # At least one strict bound: keep the exact comparisons.
                if low_op == ">=":
                    lower_bound = col(column) >= lit(low_val)
                else:
                    lower_bound = col(column) > lit(low_val)
                if high_op == "<=":
                    upper_bound = col(column) <= lit(high_val)
                else:
                    upper_bound = col(column) < lit(high_val)
                return lower_bound & upper_bound

    raise PredicateTranslationError(f"Cannot parse BETWEEN expression: {expr_str}")
264
+
265
+
266
+ def _is_not_null_expr(expr_str: str) -> bool:
267
+ """Check if expression is IS NOT NULL."""
268
+ return ".is_not_null()" in expr_str
269
+
270
+
271
def _translate_not_null_expr(expr_str: str):
    """Translate ``<column>.is_not_null()`` into a DataFusion IS NOT NULL check."""
    # Everything before the first ".is_not_null()" holds the column reference.
    prefix, _, _ = expr_str.partition(".is_not_null()")
    column_name = _extract_column_name(prefix)
    return col(column_name).is_not_null()
276
+
277
+
278
+ def _is_null_expr(expr_str: str) -> bool:
279
+ """Check if expression is IS NULL."""
280
+ return ".is_null()" in expr_str
281
+
282
+
283
def _translate_null_expr(expr_str: str):
    """Translate ``<column>.is_null()`` into a DataFusion IS NULL check."""
    # Everything before the first ".is_null()" holds the column reference.
    prefix, _, _ = expr_str.partition(".is_null()")
    column_name = _extract_column_name(prefix)
    return col(column_name).is_null()
288
+
289
+
290
+ # Helper functions
291
+
292
+
293
+ def _extract_column_name(col_expr: str) -> str:
294
+ """Extract column name from col() expression."""
295
+ col_expr = col_expr.strip()
296
+
297
+ # Handle col("name") or col('name')
298
+ patterns = [r'col\("([^"]+)"\)', r"col\('([^']+)'\)"]
299
+
300
+ for pattern in patterns:
301
+ match = re.search(pattern, col_expr)
302
+ if match:
303
+ return match.group(1)
304
+
305
+ # Handle parentheses around the whole expression
306
+ col_expr = col_expr.strip("()")
307
+ for pattern in patterns:
308
+ match = re.search(pattern, col_expr)
309
+ if match:
310
+ return match.group(1)
311
+
312
+ raise PredicateTranslationError(f"Cannot extract column name from: {col_expr}")
313
+
314
+
315
+ def _extract_literal_value(literal_expr: str) -> Any:
316
+ """Extract literal value from expression."""
317
+ literal_expr = literal_expr.strip()
318
+
319
+ # Handle string literals
320
+ if (literal_expr.startswith('"') and literal_expr.endswith('"')) or (
321
+ literal_expr.startswith("'") and literal_expr.endswith("'")
322
+ ):
323
+ return literal_expr[1:-1]
324
+
325
+ # Handle numeric literals
326
+ try:
327
+ if "." in literal_expr:
328
+ return float(literal_expr)
329
+ else:
330
+ return int(literal_expr)
331
+ except ValueError:
332
+ pass
333
+
334
+ # Handle boolean literals
335
+ if literal_expr.lower() == "true":
336
+ return True
337
+ elif literal_expr.lower() == "false":
338
+ return False
339
+
340
+ return literal_expr
341
+
342
+
343
def _validate_column_operator(column: str, operator: str) -> None:
    """
    Check that ``operator`` is allowed for ``column``; raise if it is not.

    Columns in ``GFF_STRING_COLUMNS``, and any column name that is not one of
    ``GFF_STATIC_COLUMNS`` (treated as an attribute field), accept only
    ``==``, ``!=``, ``IN`` and ``NOT IN``. Columns in ``GFF_UINT32_COLUMNS``
    or ``GFF_FLOAT32_COLUMNS`` accept comparisons and ``BETWEEN``. A static
    column in neither group is not checked at all.

    Raises:
        PredicateTranslationError: If the operator is not allowed.
    """
    is_attribute_field = column not in GFF_STATIC_COLUMNS

    # String columns: =, !=, IN, NOT IN
    if column in GFF_STRING_COLUMNS or is_attribute_field:
        if operator not in ("==", "!=", "IN", "NOT IN"):
            raise PredicateTranslationError(
                f"Column '{column}' (String) does not support operator '{operator}'. "
                f"Supported: ==, !=, IN, NOT IN"
            )
        return

    # Numeric columns: =, !=, <, <=, >, >=, BETWEEN
    if column in GFF_UINT32_COLUMNS or column in GFF_FLOAT32_COLUMNS:
        if operator not in ("==", "!=", "<", "<=", ">", ">=", "BETWEEN"):
            raise PredicateTranslationError(
                f"Column '{column}' (Numeric) does not support operator '{operator}'. "
                f"Supported: ==, !=, <, <=, >, >=, BETWEEN"
            )
363
+
364
+
365
+ def _parse_list_values(values_str: str) -> List[Any]:
366
+ """Parse list of values from string."""
367
+ if not values_str.strip():
368
+ return []
369
+
370
+ items = [item.strip() for item in values_str.split(",")]
371
+ return [_extract_literal_value(item) for item in items if item.strip()]
372
+
373
+
374
+ def _split_on_main_operator(expr_str: str, operator: str) -> List[str]:
375
+ """Split expression on main operator, respecting parentheses."""
376
+ parts = []
377
+ current = ""
378
+ paren_depth = 0
379
+ i = 0
380
+
381
+ while i < len(expr_str):
382
+ if expr_str[i] == "(":
383
+ paren_depth += 1
384
+ elif expr_str[i] == ")":
385
+ paren_depth -= 1
386
+ elif paren_depth == 0 and expr_str[i : i + len(operator)] == operator:
387
+ parts.append(current)
388
+ current = ""
389
+ i += len(operator) - 1
390
+ else:
391
+ current += expr_str[i]
392
+ i += 1
393
+
394
+ parts.append(current)
395
+ return parts
396
+
397
+
398
def _parse_comparison(comp_str: str) -> tuple:
    """
    Break a comparison string into ``(column, operator, value)``.

    Leading and trailing parentheses are removed first. Operators are tried
    in a fixed order (two-character forms first), each with a space on both
    sides, and the text is split at the first occurrence of the first
    operator that is present.

    Raises:
        PredicateTranslationError: If no operator is present, or the column
            name cannot be extracted.
    """
    body = comp_str.strip("()")

    for token in (" >= ", " <= ", " > ", " < ", " == ", " != "):
        lhs, separator, rhs = body.partition(token)
        if not separator:
            continue
        column = _extract_column_name(lhs.strip())
        value = _extract_literal_value(rhs.strip())
        return column, token.strip(), value

    raise PredicateTranslationError(f"Cannot parse comparison: {body}")
413
+
414
+
415
def _create_mock_expr(expr_str: str) -> pl.Expr:
    """
    Wrap a string so it can be passed back into the recursive translator.

    The returned object is a minimal stand-in, not a real Polars expression.
    It only supports ``str()``, which returns the trimmed text.
    """

    class MockExpr:
        def __init__(self, text):
            self.expr_str = text

        def __str__(self):
            return self.expr_str

    trimmed = expr_str.strip()
    return MockExpr(trimmed)
426
+
427
+
428
def is_predicate_pushdown_supported(predicate: pl.Expr) -> bool:
    """
    Report whether a Polars predicate can be translated for pushdown.

    The predicate is translated and the result discarded; only success or
    failure matters.

    Args:
        predicate: Polars expression to check

    Returns:
        True if predicate can be translated and pushed down
    """
    try:
        translate_polars_predicate_to_datafusion(predicate)
    except PredicateTranslationError:
        return False
    return True
443
+
444
+
445
def get_supported_predicates_info() -> str:
    """Return a human-readable summary of the supported pushdown predicates."""
    summary = """
Supported GFF Predicate Pushdown Operations:

| Column                      | Data Type | Supported Operators          | Example                         |
|-----------------------------|-----------|------------------------------|---------------------------------|
| chrom, source, type, strand | String    | =, !=, IN, NOT IN            | chrom = 'chr1'                  |
| start, end                  | UInt32    | =, !=, <, <=, >, >=, BETWEEN | start > 1000                    |
| score                       | Float32   | =, !=, <, <=, >, >=, BETWEEN | score BETWEEN 50.0 AND 100.0    |
| Attribute fields            | String    | =, !=, IN, NOT IN            | "ID" = 'gene1'                  |
| Complex                     | -         | AND combinations             | chrom = 'chr1' AND start > 1000 |

Examples:
- pl.col("chrom") == "chr1"
- pl.col("start") > 1000
- pl.col("chrom").is_in(["chr1", "chr2"])
- (pl.col("chrom") == "chr1") & (pl.col("start") > 1000)
- (pl.col("start") >= 1000) & (pl.col("start") <= 2000)  # BETWEEN
"""
    return summary