polars-bio 0.14.1__cp39-abi3-win_amd64.whl → 0.15.0__cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_bio/__init__.py +1 -1
- polars_bio/io.py +425 -184
- polars_bio/polars_bio.pyd +0 -0
- polars_bio/predicate_translator.py +464 -0
- polars_bio/sql_predicate_builder.py +293 -0
- polars_bio/utils.py +29 -4
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/METADATA +1 -1
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/RECORD +10 -8
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/WHEEL +0 -0
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,293 @@
|
|
1
|
+
"""
|
2
|
+
Polars predicate -> SQL WHERE builder for DataFusion.
|
3
|
+
|
4
|
+
Supports GFF pushdown operators:
|
5
|
+
- Strings (chrom, source, type, strand, attribute fields): =, !=, IN, NOT IN
|
6
|
+
- UInt32 (start, end, phase): =, !=, <, <=, >, >=, BETWEEN
|
7
|
+
- Float32 (score): same as numeric
|
8
|
+
- AND combinations
|
9
|
+
- IS NULL / IS NOT NULL
|
10
|
+
"""
|
11
|
+
|
12
|
+
from __future__ import annotations
|
13
|
+
|
14
|
+
import re
|
15
|
+
from typing import Any, List, Tuple
|
16
|
+
|
17
|
+
import polars as pl
|
18
|
+
|
19
|
+
# Static GFF schema column groups; used to decide which SQL operators a
# predicate on a given column may be pushed down with (see module docstring).
GFF_STRING_COLUMNS = {"chrom", "source", "type", "strand"}
# Columns documented above as UInt32 (coordinates and phase).
GFF_UINT32_COLUMNS = {"start", "end", "phase"}
# Columns documented above as Float32.
GFF_FLOAT32_COLUMNS = {"score"}
# All statically known GFF columns; any other name is treated as an
# attribute field by _validate_column_operator.
GFF_STATIC_COLUMNS = (
    GFF_STRING_COLUMNS | GFF_UINT32_COLUMNS | GFF_FLOAT32_COLUMNS | {"attributes"}
)
|
25
|
+
|
26
|
+
|
27
|
+
class SqlPredicateBuildError(Exception):
    """Raised when a Polars predicate cannot be translated to a SQL WHERE clause."""

    pass
|
29
|
+
|
30
|
+
|
31
|
+
def polars_predicate_to_sql(predicate: pl.Expr) -> str:
    """Translate a Polars predicate expression into a SQL WHERE fragment.

    Dispatches on the string form of the expression. Compound forms (AND /
    BETWEEN) and method forms (IN, NOT IN, IS [NOT] NULL) are matched before
    the generic binary-comparison form: their string representation also
    contains comparison substrings such as " == " or " >= ", so checking the
    binary form first would mis-parse them as a single comparison.

    Raises:
        SqlPredicateBuildError: if the expression shape is not supported.
    """
    expr_str = str(predicate)

    if _is_and_expr(expr_str):
        # Prefer a BETWEEN rendering for two-sided ranges on one column.
        if _is_between_expr(expr_str):
            try:
                return _translate_between_expr(expr_str)
            except SqlPredicateBuildError:
                # E.g. the two bounds target different columns; a plain
                # AND of two comparisons is still expressible below.
                pass
        return _translate_and_expr(expr_str)

    # NOT IN must precede IN: a negated membership test still contains
    # ".is_in([" and would otherwise match the plain IN branch.
    if _is_not_in_expr(expr_str):
        return _translate_not_in_expr(expr_str)
    if _is_in_expr(expr_str):
        return _translate_in_expr(expr_str)

    # IS NOT NULL / IS NULL method forms.
    if _is_not_null_expr(expr_str):
        return _translate_not_null_expr(expr_str)
    if _is_null_expr(expr_str):
        return _translate_null_expr(expr_str)

    # Simple binary comparison: the most general pattern, so it is checked last.
    if _is_binary_expr(expr_str):
        return _translate_binary_expr(expr_str)

    raise SqlPredicateBuildError(f"Unsupported predicate: {expr_str}")
|
59
|
+
|
60
|
+
|
61
|
+
def _is_binary_expr(expr_str: str) -> bool:
|
62
|
+
return any(op in expr_str for op in [" == ", " != ", " <= ", " >= ", " < ", " > "])
|
63
|
+
|
64
|
+
|
65
|
+
def _translate_binary_expr(expr_str: str) -> str:
    """Translate a single binary comparison into a quoted-column SQL clause.

    Two-character operators are tried before their single-character
    prefixes so that e.g. '<=' is never matched as '<'.
    """
    operator_table = (
        ("==", "="),
        ("!=", "!="),
        ("<=", "<="),
        (">=", ">="),
        ("<", "<"),
        (">", ">"),
    )
    for expr_op, sql_op in operator_table:
        match = re.search(rf"(.+?)\s{expr_op}\s(.+)", expr_str)
        if match is None:
            continue
        column = _extract_column_name(match.group(1).strip())
        literal = _extract_sql_literal(match.group(2).strip())
        _validate_column_operator(column, sql_op)
        return f'"{column}" {sql_op} {literal}'
    raise SqlPredicateBuildError(f"Cannot parse binary expr: {expr_str}")
|
84
|
+
|
85
|
+
|
86
|
+
def _is_and_expr(expr_str: str) -> bool:
|
87
|
+
return " & " in expr_str
|
88
|
+
|
89
|
+
|
90
|
+
def _translate_and_expr(expr_str: str) -> str:
    """Translate a two-operand '&' conjunction into '(...) AND (...)' SQL."""
    operands = _split_on(expr_str, " & ")
    if len(operands) != 2:
        raise SqlPredicateBuildError(f"Cannot parse AND expression: {expr_str}")
    # Each operand is translated recursively through the public entry point.
    lhs_sql = polars_predicate_to_sql(_mock_expr(operands[0]))
    rhs_sql = polars_predicate_to_sql(_mock_expr(operands[1]))
    return f"({lhs_sql}) AND ({rhs_sql})"
|
97
|
+
|
98
|
+
|
99
|
+
def _is_in_expr(expr_str: str) -> bool:
|
100
|
+
return ".is_in([" in expr_str
|
101
|
+
|
102
|
+
|
103
|
+
def _translate_in_expr(expr_str: str) -> str:
    """Translate an `.is_in([...])` membership test into a SQL IN clause."""
    match = re.search(r"(.+?)\.is_in\(\[(.+?)\]\)", expr_str)
    if match is None:
        raise SqlPredicateBuildError(f"Cannot parse IN expr: {expr_str}")
    column = _extract_column_name(match.group(1).strip())
    _validate_column_operator(column, "IN")
    # NOTE(review): naive comma split — a quoted value containing ','
    # would be split apart; acceptable for simple literal lists.
    literals = []
    for raw in match.group(2).strip().split(","):
        raw = raw.strip()
        if raw:
            literals.append(_extract_sql_literal(raw))
    joined = ", ".join(literals)
    return f'"{column}" IN ({joined})'
|
113
|
+
|
114
|
+
|
115
|
+
def _is_not_in_expr(expr_str: str) -> bool:
|
116
|
+
s = expr_str.replace(" ", "")
|
117
|
+
return (
|
118
|
+
(s.startswith("~(") and ".is_in([" in s and s.endswith(")"))
|
119
|
+
or ".is_in([" in s
|
120
|
+
and ").not()" in s
|
121
|
+
)
|
122
|
+
|
123
|
+
|
124
|
+
def _translate_not_in_expr(expr_str: str) -> str:
    """Translate a negated membership test into a SQL NOT IN clause.

    Accepts either the ``~(expr.is_in([...]))`` form or the
    ``expr.is_in([...]).not()`` form; both reduce to the inner IN clause
    with the operator negated.
    """
    s = expr_str.strip()
    inner = s
    # Unwrap the leading negation operator if present.
    if s.startswith("~(") and s.endswith(")"):
        inner = s[2:-1]
    in_sql = _translate_in_expr(inner)
    # _translate_in_expr emits '"col" IN (...)'. Negate only the first
    # occurrence (count=1) so a quoted value that itself contains ' IN '
    # is not corrupted by a global replace.
    return in_sql.replace(" IN ", " NOT IN ", 1)
|
132
|
+
|
133
|
+
|
134
|
+
def _is_between_expr(expr_str: str) -> bool:
|
135
|
+
return (
|
136
|
+
(" & " in expr_str)
|
137
|
+
and any(op in expr_str for op in [" >= ", " > "])
|
138
|
+
and any(op in expr_str for op in [" <= ", " < "])
|
139
|
+
)
|
140
|
+
|
141
|
+
|
142
|
+
def _translate_between_expr(expr_str: str) -> str:
    """Translate a two-sided numeric range on one column.

    Emits SQL ``BETWEEN`` only when both bounds are inclusive (>= and <=),
    because SQL BETWEEN is inclusive on both ends; a strict bound (> or <)
    is emitted as an explicit AND of two comparisons so the predicate's
    semantics are preserved exactly.

    Raises:
        SqlPredicateBuildError: if the expression does not split into two
            comparisons, or the two comparisons target different columns.
    """
    parts = _split_on(expr_str, " & ")
    if len(parts) != 2:
        raise SqlPredicateBuildError(f"Cannot parse BETWEEN: {expr_str}")
    l_col, l_op, l_val = _parse_comparison(parts[0])
    r_col, r_op, r_val = _parse_comparison(parts[1])
    if l_col != r_col:
        raise SqlPredicateBuildError("BETWEEN parts refer to different columns")
    col = l_col
    _validate_column_operator(col, "BETWEEN")
    # Normalize so (lower_op, lower) is the >-side and (upper_op, upper)
    # the <-side, regardless of the order the user wrote them in.
    if l_op in (">", ">="):
        lower_op, lower, upper_op, upper = l_op, l_val, r_op, r_val
    else:
        lower_op, lower, upper_op, upper = r_op, r_val, l_op, l_val
    lo = _to_sql_number(col, lower)
    hi = _to_sql_number(col, upper)
    if lower_op == ">=" and upper_op == "<=":
        # Both bounds inclusive: matches SQL BETWEEN exactly.
        return f'"{col}" BETWEEN {lo} AND {hi}'
    # At least one strict bound: BETWEEN would wrongly include the endpoint.
    return f'("{col}" {lower_op} {lo}) AND ("{col}" {upper_op} {hi})'
|
158
|
+
|
159
|
+
|
160
|
+
def _is_not_null_expr(expr_str: str) -> bool:
|
161
|
+
return ".is_not_null()" in expr_str
|
162
|
+
|
163
|
+
|
164
|
+
def _translate_not_null_expr(expr_str: str) -> str:
    """Translate an `.is_not_null()` test into a SQL IS NOT NULL clause."""
    prefix, _, _ = expr_str.partition(".is_not_null()")
    column = _extract_column_name(prefix)
    return f'"{column}" IS NOT NULL'
|
167
|
+
|
168
|
+
|
169
|
+
def _is_null_expr(expr_str: str) -> bool:
|
170
|
+
return ".is_null()" in expr_str
|
171
|
+
|
172
|
+
|
173
|
+
def _translate_null_expr(expr_str: str) -> str:
    """Translate an `.is_null()` test into a SQL IS NULL clause."""
    prefix, _, _ = expr_str.partition(".is_null()")
    column = _extract_column_name(prefix)
    return f'"{column}" IS NULL'
|
176
|
+
|
177
|
+
|
178
|
+
# Helpers
|
179
|
+
|
180
|
+
|
181
|
+
def _extract_column_name(col_expr: str) -> str:
|
182
|
+
s = col_expr.strip().strip("()").strip()
|
183
|
+
for pat in [r'col\("([^"]+)"\)', r"col\('([^']+)'\)"]:
|
184
|
+
m = re.search(pat, s)
|
185
|
+
if m:
|
186
|
+
return m.group(1)
|
187
|
+
# Sometimes string form may already be bare column
|
188
|
+
return s
|
189
|
+
|
190
|
+
|
191
|
+
def _extract_sql_literal(literal_expr: str) -> str:
|
192
|
+
s = literal_expr.strip()
|
193
|
+
# Strip parentheses
|
194
|
+
if s.startswith("(") and s.endswith(")"):
|
195
|
+
s = s[1:-1].strip()
|
196
|
+
# Strip Polars debug prefixes like 'dyn int:'
|
197
|
+
if ":" in s:
|
198
|
+
head, tail = s.split(":", 1)
|
199
|
+
head_l = head.strip().lower()
|
200
|
+
if head_l.startswith("dyn ") or head_l in {
|
201
|
+
"int",
|
202
|
+
"float",
|
203
|
+
"string",
|
204
|
+
"lit",
|
205
|
+
"literal",
|
206
|
+
}:
|
207
|
+
s = tail.strip()
|
208
|
+
# String literal
|
209
|
+
if (s.startswith('"') and s.endswith('"')) or (
|
210
|
+
s.startswith("'") and s.endswith("'")
|
211
|
+
):
|
212
|
+
return _quote_string(s[1:-1])
|
213
|
+
# Boolean
|
214
|
+
if s.lower() in ("true", "false"):
|
215
|
+
return s.upper()
|
216
|
+
# Numeric (leave unquoted)
|
217
|
+
try:
|
218
|
+
float(s) # validate numeric
|
219
|
+
return s
|
220
|
+
except Exception:
|
221
|
+
pass
|
222
|
+
# Fallback: quote as string
|
223
|
+
return _quote_string(s)
|
224
|
+
|
225
|
+
|
226
|
+
def _to_sql_number(column: str, value: Any) -> str:
|
227
|
+
# value may already be numeric; ensure output is numeric literal (no quotes)
|
228
|
+
if isinstance(value, (int, float)):
|
229
|
+
return str(value)
|
230
|
+
try:
|
231
|
+
return str(int(value)) if "." not in str(value) else str(float(value))
|
232
|
+
except Exception:
|
233
|
+
return str(value)
|
234
|
+
|
235
|
+
|
236
|
+
def _quote_string(val: str) -> str:
|
237
|
+
return "'" + val.replace("'", "''") + "'"
|
238
|
+
|
239
|
+
|
240
|
+
def _validate_column_operator(column: str, operator: str) -> None:
    """Raise if *operator* is not pushdown-safe for *column*'s type.

    Known string columns — and any column outside the static GFF schema,
    i.e. attribute fields — only allow equality/membership operators;
    numeric columns additionally allow ordering comparisons and BETWEEN.
    """
    string_ops = ("=", "!=", "IN", "NOT IN")
    numeric_ops = ("=", "!=", "<", "<=", ">", ">=", "BETWEEN")
    is_stringlike = column in GFF_STRING_COLUMNS or column not in GFF_STATIC_COLUMNS
    if is_stringlike:
        if operator not in string_ops:
            raise SqlPredicateBuildError(
                f"Column '{column}' (String) unsupported op '{operator}'"
            )
        return
    if column in GFF_UINT32_COLUMNS or column in GFF_FLOAT32_COLUMNS:
        if operator not in numeric_ops:
            raise SqlPredicateBuildError(
                f"Column '{column}' (Numeric) unsupported op '{operator}'"
            )
|
251
|
+
|
252
|
+
|
253
|
+
def _parse_comparison(comp_str: str) -> tuple[str, str, Any]:
    """Split one comparison into (column, operator, sql_literal).

    Two-character operators are tried before single-character ones so
    '>=' is never mistaken for '>'.
    """
    text = comp_str.strip().strip("()")
    operators = (" >= ", " <= ", " > ", " < ", " == ", " != ")
    for padded_op in operators:
        if padded_op not in text:
            continue
        lhs, rhs = text.split(padded_op, 1)
        return (
            _extract_column_name(lhs),
            padded_op.strip(),
            _extract_sql_literal(rhs),
        )
    raise SqlPredicateBuildError(f"Cannot parse comparison: {comp_str}")
|
262
|
+
|
263
|
+
|
264
|
+
def _split_on(expr: str, sep: str) -> List[str]:
|
265
|
+
parts: List[str] = []
|
266
|
+
cur = ""
|
267
|
+
depth = 0
|
268
|
+
i = 0
|
269
|
+
while i < len(expr):
|
270
|
+
if expr[i] == "(":
|
271
|
+
depth += 1
|
272
|
+
elif expr[i] == ")":
|
273
|
+
depth -= 1
|
274
|
+
if depth == 0 and expr[i : i + len(sep)] == sep:
|
275
|
+
parts.append(cur)
|
276
|
+
cur = ""
|
277
|
+
i += len(sep)
|
278
|
+
continue
|
279
|
+
cur += expr[i]
|
280
|
+
i += 1
|
281
|
+
parts.append(cur)
|
282
|
+
return [p.strip().strip("()") for p in parts if p.strip()]
|
283
|
+
|
284
|
+
|
285
|
+
def _mock_expr(s: str) -> pl.Expr:
|
286
|
+
class E:
|
287
|
+
def __init__(self, expr_str: str) -> None:
|
288
|
+
self._s = expr_str
|
289
|
+
|
290
|
+
def __str__(self) -> str:
|
291
|
+
return self._s
|
292
|
+
|
293
|
+
return E(s)
|
polars_bio/utils.py
CHANGED
@@ -13,7 +13,12 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
|
|
13
13
|
|
14
14
|
|
15
15
|
def _lazy_scan(
|
16
|
-
df: Union[pl.DataFrame, pl.LazyFrame],
|
16
|
+
df: Union[pl.DataFrame, pl.LazyFrame],
|
17
|
+
projection_pushdown: bool = False,
|
18
|
+
predicate_pushdown: bool = False,
|
19
|
+
table_name: str = None,
|
20
|
+
input_format=None,
|
21
|
+
file_path: str = None,
|
17
22
|
) -> pl.LazyFrame:
|
18
23
|
df_lazy: DataFrame = df
|
19
24
|
original_schema = df_lazy.schema()
|
@@ -29,9 +34,27 @@ def _lazy_scan(
|
|
29
34
|
if projection_pushdown and with_columns is not None:
|
30
35
|
projected_columns = _extract_column_names_from_expr(with_columns)
|
31
36
|
|
32
|
-
# Apply column projection to DataFusion query if enabled
|
37
|
+
# Apply column projection and predicate pushdown to DataFusion query if enabled
|
33
38
|
query_df = df_lazy
|
34
39
|
datafusion_projection_applied = False
|
40
|
+
datafusion_predicate_applied = False
|
41
|
+
|
42
|
+
# Handle predicate pushdown first
|
43
|
+
if predicate_pushdown and predicate is not None:
|
44
|
+
try:
|
45
|
+
from .predicate_translator import (
|
46
|
+
translate_polars_predicate_to_datafusion,
|
47
|
+
)
|
48
|
+
|
49
|
+
datafusion_predicate = translate_polars_predicate_to_datafusion(
|
50
|
+
predicate
|
51
|
+
)
|
52
|
+
query_df = query_df.filter(datafusion_predicate)
|
53
|
+
datafusion_predicate_applied = True
|
54
|
+
except Exception as e:
|
55
|
+
# Fallback to Python-level filtering if predicate pushdown fails
|
56
|
+
datafusion_predicate_applied = False
|
57
|
+
# Note: error handling for debugging could be added here if needed
|
35
58
|
if projection_pushdown and projected_columns:
|
36
59
|
try:
|
37
60
|
query_df = df_lazy.select(projected_columns)
|
@@ -65,7 +88,8 @@ def _lazy_scan(
|
|
65
88
|
if n_rows and n_rows < 8192: # 8192 is the default batch size in datafusion
|
66
89
|
df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
|
67
90
|
df = pl.DataFrame(df).limit(n_rows)
|
68
|
-
if predicate
|
91
|
+
# Apply Python-level predicate only if DataFusion predicate pushdown failed
|
92
|
+
if predicate is not None and not datafusion_predicate_applied:
|
69
93
|
df = df.filter(predicate)
|
70
94
|
# Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
|
71
95
|
if with_columns is not None and (
|
@@ -80,7 +104,8 @@ def _lazy_scan(
|
|
80
104
|
for r in df_stream:
|
81
105
|
py_df = r.to_pyarrow()
|
82
106
|
df = pl.DataFrame(py_df)
|
83
|
-
if predicate
|
107
|
+
# Apply Python-level predicate only if DataFusion predicate pushdown failed
|
108
|
+
if predicate is not None and not datafusion_predicate_applied:
|
84
109
|
df = df.filter(predicate)
|
85
110
|
# Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
|
86
111
|
if with_columns is not None and (
|
@@ -1,19 +1,21 @@
|
|
1
|
-
polars_bio-0.
|
2
|
-
polars_bio-0.
|
3
|
-
polars_bio-0.
|
4
|
-
polars_bio/__init__.py,sha256=
|
1
|
+
polars_bio-0.15.0.dist-info/METADATA,sha256=v_bR6U9u1JBrZzOPsw35M6oJJN-Ip21TnBoCPNYp1KE,729
|
2
|
+
polars_bio-0.15.0.dist-info/WHEEL,sha256=-M5O7l5EczTA8VFaBQsg2Fpg0dKz0WOuvpt3nEh86bo,94
|
3
|
+
polars_bio-0.15.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
4
|
+
polars_bio/__init__.py,sha256=1goHEmgkMMqXLQRHAJMDQgwNp_qoPtwIte0uMC8IC34,3097
|
5
5
|
polars_bio/constants.py,sha256=m9jMLB8PpbmWcsrCQZhRBGsBAE6X8frsSlgteVeEyo4,119
|
6
6
|
polars_bio/context.py,sha256=AP5EM2TsB9zcomlsPEz8xMwQnEXwqpRsgBTnZsHYQwA,1723
|
7
7
|
polars_bio/interval_op_helpers.py,sha256=DQIo4lUxzd-ySUbjfwNSk5zYcxpprwQe32kTPE28ypw,2930
|
8
|
-
polars_bio/io.py,sha256=
|
8
|
+
polars_bio/io.py,sha256=Dgk4Ei8jm-9a50wNrzpyHaUNxpYYWl17MfrMymJE4os,60651
|
9
9
|
polars_bio/logging.py,sha256=Q25cv4qiwLmAiGJq6ZlqYJn2WJ_uN-c5_eopib2z8bc,1354
|
10
10
|
polars_bio/operations.py,sha256=amhaff8Ha3UuQmS8OCVFXRQWvQOW_4G2T5U8tF1f7mc,2272
|
11
|
-
polars_bio/polars_bio.pyd,sha256=
|
11
|
+
polars_bio/polars_bio.pyd,sha256=kYXCJLsAqMwWweeeOlxZl0UL8M_LVlL0tTyuXU0NvnQ,276247552
|
12
12
|
polars_bio/polars_ext.py,sha256=lT8-cYAvSyhbzbpozjlF59VWTCYOzLafSZ-7bi9f49Y,9658
|
13
|
+
polars_bio/predicate_translator.py,sha256=Zg0Zq1yX_K3BKAbKW3xZMI8xpK3RAaRUznkt_QT22mc,16442
|
13
14
|
polars_bio/range_op.py,sha256=UbWKBf06rPf2GXAQT0TzXR6H0rVZeCcFCqxISMuzNpk,26289
|
14
15
|
polars_bio/range_op_helpers.py,sha256=RcvXc52cJVnK4fyCtwEcYvOB5TmKItGyiReiHBGHDng,6200
|
15
16
|
polars_bio/range_op_io.py,sha256=XTBTclFCCe4utMRAju9rOUzHvLkpKo5dCn-aCBwzRfY,7275
|
16
17
|
polars_bio/range_utils.py,sha256=Q0UPB7DV4mPjOlQ_xDVLN3vJaY9ZEr4IHFVfVBnPLDY,1446
|
17
18
|
polars_bio/sql.py,sha256=vWdZCyAXTPUHTko9al90JK8tgrChnB7Fn2hUiE0bw5c,24986
|
18
|
-
polars_bio/
|
19
|
-
polars_bio
|
19
|
+
polars_bio/sql_predicate_builder.py,sha256=dx46fZNNfFNSi_B8iv3dTjvl2gvNH4X0YapQacNhrOU,9059
|
20
|
+
polars_bio/utils.py,sha256=U3-z0Laz9OB8OliVX8MHuFHpRmt4B6DCPXfrwkMOCJQ,6308
|
21
|
+
polars_bio-0.15.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|