polars-bio 0.14.1__cp39-abi3-macosx_11_0_arm64.whl → 0.15.0__cp39-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
+ """
2
+ Polars predicate -> SQL WHERE builder for DataFusion.
3
+
4
+ Supports GFF pushdown operators:
5
+ - Strings (chrom, source, type, strand, attribute fields): =, !=, IN, NOT IN
6
+ - UInt32 (start, end, phase): =, !=, <, <=, >, >=, BETWEEN
7
+ - Float32 (score): same as numeric
8
+ - AND combinations
9
+ - IS NULL / IS NOT NULL
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from typing import Any, List, Tuple
16
+
17
+ import polars as pl
18
+
19
# GFF columns grouped by the value category used when validating operators.
GFF_STRING_COLUMNS = {"chrom", "source", "type", "strand"}
GFF_UINT32_COLUMNS = {"start", "end", "phase"}
GFF_FLOAT32_COLUMNS = {"score"}
# Every fixed column name, i.e. the typed groups above plus "attributes".
GFF_STATIC_COLUMNS = GFF_STRING_COLUMNS.union(
    GFF_UINT32_COLUMNS, GFF_FLOAT32_COLUMNS, {"attributes"}
)
25
+
26
+
27
class SqlPredicateBuildError(Exception):
    """Raised when a predicate cannot be turned into a SQL WHERE fragment."""
29
+
30
+
31
def polars_predicate_to_sql(predicate: pl.Expr) -> str:
    """Translate a predicate into a SQL WHERE-clause fragment.

    The predicate is converted with ``str()`` and classified by substring
    checks. Compound and wrapping forms are tested before the plain
    comparison, because their text also contains the comparison operators
    of their operands and would otherwise be parsed as one comparison.

    Args:
        predicate: Object whose ``str()`` form is the expression text (a
            Polars expression, or the stand-in built by ``_mock_expr``).

    Returns:
        The SQL fragment produced by the matching ``_translate_*`` helper.

    Raises:
        SqlPredicateBuildError: If no supported form matches, or the
            matching translator rejects the expression.
    """
    expr_str = str(predicate)

    # Conjunctions first. The separator is only honoured outside quoted
    # literals so a value such as "A & B" stays a plain comparison.
    if _has_unquoted_token(expr_str, " & "):
        # Only a single conjunction of two inclusive bounds is a BETWEEN
        # candidate; strict bounds go through the generic AND path, which
        # keeps their exclusivity.
        inclusive_range = (
            expr_str.count(" & ") == 1
            and " > " not in expr_str
            and " < " not in expr_str
            and _is_between_expr(expr_str)
        )
        if inclusive_range:
            try:
                return _translate_between_expr(expr_str)
            except SqlPredicateBuildError:
                # Not a single-column range (e.g. bounds on two different
                # columns); the generic AND translation still applies.
                pass
        return _translate_and_expr(expr_str)

    # Negated membership must be tested before plain membership: its text
    # contains ".is_in([" too, and the IN translator would drop the negation.
    if _is_not_in_expr(expr_str):
        return _translate_not_in_expr(expr_str)
    if _is_in_expr(expr_str):
        return _translate_in_expr(expr_str)

    # IS NULL / IS NOT NULL
    if _is_not_null_expr(expr_str):
        return _translate_not_null_expr(expr_str)
    if _is_null_expr(expr_str):
        return _translate_null_expr(expr_str)

    # Plain binary comparison last, since every form above may embed one.
    if _is_binary_expr(expr_str):
        return _translate_binary_expr(expr_str)

    raise SqlPredicateBuildError(f"Unsupported predicate: {expr_str}")


def _has_unquoted_token(text: str, token: str) -> bool:
    """Return True when *token* occurs in *text* outside quoted spans."""
    quote = ""
    for idx, ch in enumerate(text):
        if quote:
            if ch == quote:
                quote = ""
        elif ch in "\"'":
            quote = ch
        elif text.startswith(token, idx):
            return True
    return False
59
+
60
+
61
+ def _is_binary_expr(expr_str: str) -> bool:
62
+ return any(op in expr_str for op in [" == ", " != ", " <= ", " >= ", " < ", " > "])
63
+
64
+
65
def _translate_binary_expr(expr_str: str) -> str:
    """Translate a single ``<column> <op> <literal>`` comparison to SQL.

    The comparison is split at the first operator that appears in the text.
    Operator-like text later in the string (for instance inside the
    right-hand literal, as in ``x != "a == b"``) therefore stays part of
    the literal instead of being picked because of its position in a
    pattern list.

    Args:
        expr_str: Expression text containing one of ``==``, ``!=``, ``<=``,
            ``>=``, ``<``, ``>`` surrounded by whitespace.

    Returns:
        ``"<column>" <sql-op> <literal>`` with the column double-quoted.

    Raises:
        SqlPredicateBuildError: If no operator is found, or the operator is
            not allowed for the column.
    """
    # Only equality is spelled differently in SQL.
    sql_ops = {"==": "=", "!=": "!=", "<=": "<=", ">=": ">=", "<": "<", ">": ">"}
    # The lazy left group makes the match stop at the earliest operator.
    m = re.search(r"(.+?)\s(==|!=|<=|>=|<|>)\s(.+)", expr_str)
    if not m:
        raise SqlPredicateBuildError(f"Cannot parse binary expr: {expr_str}")
    col = _extract_column_name(m.group(1).strip())
    op = sql_ops[m.group(2)]
    lit = _extract_sql_literal(m.group(3).strip())
    _validate_column_operator(col, op)
    return f'"{col}" {op} {lit}'
84
+
85
+
86
+ def _is_and_expr(expr_str: str) -> bool:
87
+ return " & " in expr_str
88
+
89
+
90
def _translate_and_expr(expr_str: str) -> str:
    """Translate ``<left> & <right>`` into ``(<left sql>) AND (<right sql>)``.

    Each operand is translated by re-entering ``polars_predicate_to_sql``
    through a ``_mock_expr`` stand-in.

    Raises:
        SqlPredicateBuildError: If the text does not split into exactly two
            operands, or an operand cannot be translated.
    """
    operands = _split_on(expr_str, " & ")
    if len(operands) != 2:
        raise SqlPredicateBuildError(f"Cannot parse AND expression: {expr_str}")
    lhs_text, rhs_text = operands
    lhs_sql = polars_predicate_to_sql(_mock_expr(lhs_text))
    rhs_sql = polars_predicate_to_sql(_mock_expr(rhs_text))
    return "(" + lhs_sql + ") AND (" + rhs_sql + ")"
97
+
98
+
99
+ def _is_in_expr(expr_str: str) -> bool:
100
+ return ".is_in([" in expr_str
101
+
102
+
103
def _translate_in_expr(expr_str: str) -> str:
    """Translate ``<column>.is_in([v1, v2, ...])`` into ``"<column>" IN (...)``.

    The value list is split on commas that lie outside quoted literals, so
    a value such as ``"a,b"`` stays a single item.

    Args:
        expr_str: Expression text containing ``.is_in([`` ... ``])``.

    Returns:
        ``"<column>" IN (<lit>, <lit>, ...)``.

    Raises:
        SqlPredicateBuildError: If the call cannot be located, the list has
            no values, or IN is not allowed for the column.
    """
    # Greedy capture up to the last "])" so a "])" inside a value does not
    # end the list early.
    m = re.search(r"(.+?)\.is_in\(\[(.*)\]\)", expr_str)
    if not m:
        raise SqlPredicateBuildError(f"Cannot parse IN expr: {expr_str}")
    col = _extract_column_name(m.group(1).strip())
    _validate_column_operator(col, "IN")
    vals = [_extract_sql_literal(item) for item in _split_in_values(m.group(2))]
    if not vals:
        # "IN ()" is not valid SQL; reject rather than emit it.
        raise SqlPredicateBuildError(f"Cannot parse IN expr: {expr_str}")
    return f'"{col}" IN ({", ".join(vals)})'


def _split_in_values(vals_part: str) -> List[str]:
    """Split a comma-separated value list, ignoring commas inside quotes."""
    items: List[str] = []
    current: List[str] = []
    quote = ""
    for ch in vals_part:
        if quote:
            current.append(ch)
            if ch == quote:
                quote = ""
        elif ch in "\"'":
            quote = ch
            current.append(ch)
        elif ch == ",":
            items.append("".join(current).strip())
            current = []
        else:
            current.append(ch)
    items.append("".join(current).strip())
    return [item for item in items if item]
113
+
114
+
115
+ def _is_not_in_expr(expr_str: str) -> bool:
116
+ s = expr_str.replace(" ", "")
117
+ return (
118
+ (s.startswith("~(") and ".is_in([" in s and s.endswith(")"))
119
+ or ".is_in([" in s
120
+ and ").not()" in s
121
+ )
122
+
123
+
124
def _translate_not_in_expr(expr_str: str) -> str:
    """Translate a negated ``.is_in([...])`` into ``"<column>" NOT IN (...)``.

    The inner membership test is translated by ``_translate_in_expr`` and
    only the operator that follows the quoted column name is rewritten.
    Literal values that happen to contain `` IN `` are left untouched.

    Args:
        expr_str: Text of the form ``~(<in-expr>)`` or an in-expression
            followed by ``.not()``.

    Returns:
        ``"<column>" NOT IN (<lit>, ...)``.

    Raises:
        SqlPredicateBuildError: If the inner IN expression cannot be
            translated or its operator cannot be located.
    """
    s = expr_str.strip()
    # Drop the "~( ... )" wrapper when present; the ".not()" form is left
    # as is because the IN translator searches within the text.
    inner = s[2:-1] if s.startswith("~(") and s.endswith(")") else s
    in_sql = _translate_in_expr(inner)
    # The IN translator emits '"<col>" IN (<values>)'; split at the first
    # operator, which directly follows the closing quote of the column.
    head, sep, tail = in_sql.partition('" IN (')
    if not sep:
        raise SqlPredicateBuildError(f"Cannot parse NOT IN expr: {expr_str}")
    return f'{head}" NOT IN ({tail}'
132
+
133
+
134
+ def _is_between_expr(expr_str: str) -> bool:
135
+ return (
136
+ (" & " in expr_str)
137
+ and any(op in expr_str for op in [" >= ", " > "])
138
+ and any(op in expr_str for op in [" <= ", " < "])
139
+ )
140
+
141
+
142
def _translate_between_expr(expr_str: str) -> str:
    """Translate a two-sided range on one column into SQL.

    ``BETWEEN`` is inclusive at both ends, so it is emitted only when both
    bounds are inclusive (``>=`` and ``<=``). When either bound is strict,
    the two comparisons are emitted joined by ``AND`` so that the
    exclusivity is preserved.

    Args:
        expr_str: Text of the form ``<cmp> & <cmp>`` where both comparisons
            refer to the same column, one being a lower bound (``>`` /
            ``>=``) and the other an upper bound (``<`` / ``<=``), in
            either order.

    Returns:
        ``"<col>" BETWEEN <lo> AND <hi>`` for inclusive bounds, otherwise
        ``"<col>" <op> <lo> AND "<col>" <op> <hi>``.

    Raises:
        SqlPredicateBuildError: If the text is not exactly two comparisons
            on the same column forming one lower and one upper bound, or
            the operator is not allowed for the column.
    """
    parts = _split_on(expr_str, " & ")
    if len(parts) != 2:
        raise SqlPredicateBuildError(f"Cannot parse BETWEEN: {expr_str}")
    l_col, l_op, l_val = _parse_comparison(parts[0])
    r_col, r_op, r_val = _parse_comparison(parts[1])
    if l_col != r_col:
        raise SqlPredicateBuildError("BETWEEN parts refer to different columns")
    col = l_col

    lower_ops = (">", ">=")
    upper_ops = ("<", "<=")
    # Accept the bounds in either order, but insist on one of each kind;
    # two bounds pointing the same way do not describe a range.
    if l_op in lower_ops and r_op in upper_ops:
        lo_op, lo_val, hi_op, hi_val = l_op, l_val, r_op, r_val
    elif l_op in upper_ops and r_op in lower_ops:
        lo_op, lo_val, hi_op, hi_val = r_op, r_val, l_op, l_val
    else:
        raise SqlPredicateBuildError(f"Cannot parse BETWEEN: {expr_str}")

    lower = _to_sql_number(col, lo_val)
    upper = _to_sql_number(col, hi_val)
    if lo_op == ">=" and hi_op == "<=":
        _validate_column_operator(col, "BETWEEN")
        return f'"{col}" BETWEEN {lower} AND {upper}'

    # At least one strict bound: BETWEEN would wrongly include the endpoint.
    _validate_column_operator(col, lo_op)
    _validate_column_operator(col, hi_op)
    return f'"{col}" {lo_op} {lower} AND "{col}" {hi_op} {upper}'
158
+
159
+
160
+ def _is_not_null_expr(expr_str: str) -> bool:
161
+ return ".is_not_null()" in expr_str
162
+
163
+
164
def _translate_not_null_expr(expr_str: str) -> str:
    """Translate ``<column>.is_not_null()`` into ``"<column>" IS NOT NULL``.

    The column is taken from the text before the first ``.is_not_null()``.
    """
    column_text, _, _ = expr_str.partition(".is_not_null()")
    return '"' + _extract_column_name(column_text) + '" IS NOT NULL'
167
+
168
+
169
+ def _is_null_expr(expr_str: str) -> bool:
170
+ return ".is_null()" in expr_str
171
+
172
+
173
def _translate_null_expr(expr_str: str) -> str:
    """Translate ``<column>.is_null()`` into ``"<column>" IS NULL``.

    The column is taken from the text before the first ``.is_null()``.
    """
    column_text, _, _ = expr_str.partition(".is_null()")
    return '"' + _extract_column_name(column_text) + '" IS NULL'
176
+
177
+
178
+ # Helpers
179
+
180
+
181
+ def _extract_column_name(col_expr: str) -> str:
182
+ s = col_expr.strip().strip("()").strip()
183
+ for pat in [r'col\("([^"]+)"\)', r"col\('([^']+)'\)"]:
184
+ m = re.search(pat, s)
185
+ if m:
186
+ return m.group(1)
187
+ # Sometimes string form may already be bare column
188
+ return s
189
+
190
+
191
def _extract_sql_literal(literal_expr: str) -> str:
    """Convert the textual form of a literal into a SQL literal.

    Processing order: one enclosing pair of parentheses is removed, a
    ``dyn ...:`` / ``int:`` / ``float:`` / ``string:`` / ``lit:`` /
    ``literal:`` prefix is dropped, then the remainder is classified as a
    quoted string, a boolean, a number, or (fallback) an unquoted string.

    Numbers are recognised by an explicit decimal / scientific-notation
    pattern. Spellings that ``float()`` would accept but that are not SQL
    numeric literals (``1_000``, ``nan``, ``inf``) are quoted as strings
    instead of being emitted bare.

    Args:
        literal_expr: Text of the right-hand side of a comparison or of one
            list item.

    Returns:
        A single-quoted SQL string, ``TRUE`` / ``FALSE``, or the numeric
        text unchanged.
    """
    s = literal_expr.strip()
    # Strip parentheses
    if s.startswith("(") and s.endswith(")"):
        s = s[1:-1].strip()
    # Strip debug prefixes such as 'dyn int:'
    if ":" in s:
        head, tail = s.split(":", 1)
        head_l = head.strip().lower()
        if head_l.startswith("dyn ") or head_l in {
            "int",
            "float",
            "string",
            "lit",
            "literal",
        }:
            s = tail.strip()
    # String literal: needs an opening AND a closing quote, so a lone quote
    # character is not mistaken for an empty string.
    if len(s) >= 2 and s[0] == s[-1] and s[0] in "\"'":
        return _quote_string(s[1:-1])
    # Boolean
    if s.lower() in ("true", "false"):
        return s.upper()
    # Numeric (leave unquoted)
    if re.fullmatch(r"[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?", s):
        return s
    # Fallback: quote as string
    return _quote_string(s)
224
+
225
+
226
+ def _to_sql_number(column: str, value: Any) -> str:
227
+ # value may already be numeric; ensure output is numeric literal (no quotes)
228
+ if isinstance(value, (int, float)):
229
+ return str(value)
230
+ try:
231
+ return str(int(value)) if "." not in str(value) else str(float(value))
232
+ except Exception:
233
+ return str(value)
234
+
235
+
236
+ def _quote_string(val: str) -> str:
237
+ return "'" + val.replace("'", "''") + "'"
238
+
239
+
240
def _validate_column_operator(column: str, operator: str) -> None:
    """Reject operators that are not allowed for the column's category.

    Columns in ``GFF_STRING_COLUMNS`` and columns outside
    ``GFF_STATIC_COLUMNS`` are checked against the string operators;
    columns in ``GFF_UINT32_COLUMNS`` or ``GFF_FLOAT32_COLUMNS`` against
    the numeric operators. Any other column is not checked.

    Raises:
        SqlPredicateBuildError: If *operator* is not allowed for *column*.
    """
    string_ops = ("=", "!=", "IN", "NOT IN")
    numeric_ops = ("=", "!=", "<", "<=", ">", ">=", "BETWEEN")

    treated_as_string = (
        column in GFF_STRING_COLUMNS or column not in GFF_STATIC_COLUMNS
    )
    if treated_as_string:
        if operator not in string_ops:
            raise SqlPredicateBuildError(
                f"Column '{column}' (String) unsupported op '{operator}'"
            )
        return

    if column in GFF_UINT32_COLUMNS or column in GFF_FLOAT32_COLUMNS:
        if operator not in numeric_ops:
            raise SqlPredicateBuildError(
                f"Column '{column}' (Numeric) unsupported op '{operator}'"
            )
251
+
252
+
253
def _parse_comparison(comp_str: str) -> tuple[str, str, Any]:
    """Split one comparison into ``(column, operator, literal)``.

    The text is split at the first operator it contains, so operator-like
    text inside the right-hand literal is not chosen. Parentheses that
    wrap the whole comparison are removed by trimming only the surplus
    opening parens on the left and surplus closing parens on the right;
    balanced parens belonging to an operand are kept.

    Args:
        comp_str: Text such as ``(col("start") >= 5)``.

    Returns:
        The column name, the operator exactly as written (``==`` is not
        rewritten to ``=``), and the SQL literal.

    Raises:
        SqlPredicateBuildError: If no comparison operator is found.
    """
    m = re.search(r"(.+?)\s(>=|<=|==|!=|>|<)\s(.+)", comp_str.strip())
    if not m:
        raise SqlPredicateBuildError(f"Cannot parse comparison: {comp_str}")
    left = m.group(1).strip()
    op = m.group(2)
    right = m.group(3).strip()
    # A wrapper "(" ends up on the left operand and its ")" on the right
    # one; drop only those unmatched characters.
    while left.startswith("(") and left.count("(") > left.count(")"):
        left = left[1:].lstrip()
    while right.endswith(")") and right.count(")") > right.count("("):
        right = right[:-1].rstrip()
    return _extract_column_name(left), op, _extract_sql_literal(right)
262
+
263
+
264
+ def _split_on(expr: str, sep: str) -> List[str]:
265
+ parts: List[str] = []
266
+ cur = ""
267
+ depth = 0
268
+ i = 0
269
+ while i < len(expr):
270
+ if expr[i] == "(":
271
+ depth += 1
272
+ elif expr[i] == ")":
273
+ depth -= 1
274
+ if depth == 0 and expr[i : i + len(sep)] == sep:
275
+ parts.append(cur)
276
+ cur = ""
277
+ i += len(sep)
278
+ continue
279
+ cur += expr[i]
280
+ i += 1
281
+ parts.append(cur)
282
+ return [p.strip().strip("()") for p in parts if p.strip()]
283
+
284
+
285
def _mock_expr(s: str) -> pl.Expr:
    """Wrap expression text in an object whose ``str()`` returns that text.

    The returned object is an instance of a local stand-in class, not a
    real ``pl.Expr``; it supports only ``str()``.
    """

    class E:
        """Stand-in exposing only ``__str__``."""

        def __init__(self, text: str) -> None:
            self._s = text

        def __str__(self) -> str:
            return self._s

    stand_in = E(s)
    return stand_in
polars_bio/utils.py CHANGED
@@ -13,7 +13,12 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
13
13
 
14
14
 
15
15
  def _lazy_scan(
16
- df: Union[pl.DataFrame, pl.LazyFrame], projection_pushdown: bool = False
16
+ df: Union[pl.DataFrame, pl.LazyFrame],
17
+ projection_pushdown: bool = False,
18
+ predicate_pushdown: bool = False,
19
+ table_name: str = None,
20
+ input_format=None,
21
+ file_path: str = None,
17
22
  ) -> pl.LazyFrame:
18
23
  df_lazy: DataFrame = df
19
24
  original_schema = df_lazy.schema()
@@ -29,9 +34,27 @@ def _lazy_scan(
29
34
  if projection_pushdown and with_columns is not None:
30
35
  projected_columns = _extract_column_names_from_expr(with_columns)
31
36
 
32
- # Apply column projection to DataFusion query if enabled
37
+ # Apply column projection and predicate pushdown to DataFusion query if enabled
33
38
  query_df = df_lazy
34
39
  datafusion_projection_applied = False
40
+ datafusion_predicate_applied = False
41
+
42
+ # Handle predicate pushdown first
43
+ if predicate_pushdown and predicate is not None:
44
+ try:
45
+ from .predicate_translator import (
46
+ translate_polars_predicate_to_datafusion,
47
+ )
48
+
49
+ datafusion_predicate = translate_polars_predicate_to_datafusion(
50
+ predicate
51
+ )
52
+ query_df = query_df.filter(datafusion_predicate)
53
+ datafusion_predicate_applied = True
54
+ except Exception as e:
55
+ # Fallback to Python-level filtering if predicate pushdown fails
56
+ datafusion_predicate_applied = False
57
+ # Note: error handling for debugging could be added here if needed
35
58
  if projection_pushdown and projected_columns:
36
59
  try:
37
60
  query_df = df_lazy.select(projected_columns)
@@ -65,7 +88,8 @@ def _lazy_scan(
65
88
  if n_rows and n_rows < 8192: # 8192 is the default batch size in datafusion
66
89
  df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
67
90
  df = pl.DataFrame(df).limit(n_rows)
68
- if predicate is not None:
91
+ # Apply Python-level predicate only if DataFusion predicate pushdown failed
92
+ if predicate is not None and not datafusion_predicate_applied:
69
93
  df = df.filter(predicate)
70
94
  # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
71
95
  if with_columns is not None and (
@@ -80,7 +104,8 @@ def _lazy_scan(
80
104
  for r in df_stream:
81
105
  py_df = r.to_pyarrow()
82
106
  df = pl.DataFrame(py_df)
83
- if predicate is not None:
107
+ # Apply Python-level predicate only if DataFusion predicate pushdown failed
108
+ if predicate is not None and not datafusion_predicate_applied:
84
109
  df = df.filter(predicate)
85
110
  # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
86
111
  if with_columns is not None and (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polars-bio
3
- Version: 0.14.1
3
+ Version: 0.15.0
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -1,19 +1,21 @@
1
- polars_bio-0.14.1.dist-info/METADATA,sha256=8KJaeMbWl9JubvvkFtXgXIqguS51h_Q9By9zS_LqgDQ,729
2
- polars_bio-0.14.1.dist-info/WHEEL,sha256=DLqF2HZq4W_umZdP6RnfAuqhmtX_UrV4mkqrSIMhipE,102
3
- polars_bio-0.14.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
- polars_bio/__init__.py,sha256=7zRueEP9WGpQFe3U9rgbB6fZlOxLmbFjdS4nS56T74c,2977
1
+ polars_bio-0.15.0.dist-info/METADATA,sha256=v_bR6U9u1JBrZzOPsw35M6oJJN-Ip21TnBoCPNYp1KE,729
2
+ polars_bio-0.15.0.dist-info/WHEEL,sha256=DLqF2HZq4W_umZdP6RnfAuqhmtX_UrV4mkqrSIMhipE,102
3
+ polars_bio-0.15.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
+ polars_bio/__init__.py,sha256=0KO1IT6lsQeMdrw__poE9Ox_vIRHRZJVxkiqcjUfN1o,2977
5
5
  polars_bio/constants.py,sha256=FSElRAICog-CpyfK6OumYcfChKouYfpWLKLpMvi8zaY,116
6
6
  polars_bio/context.py,sha256=1dn-tYqq5S2c_kW4baV6_AE8BzYavnwPoWhCmGdvzBU,1666
7
7
  polars_bio/interval_op_helpers.py,sha256=xMWxu2y3jIwt0KCtzIPF_cvbUMdhrb8Mif74MbHU1qY,2834
8
- polars_bio/io.py,sha256=CZfAd_rJfSvvdSuBXPQJw-BOprpbcgmDBjo2CdBQI2E,51133
8
+ polars_bio/io.py,sha256=iRV95rctEekB3aoK4LaYI56d5qBmLtcB-7w8b1HuVkg,59344
9
9
  polars_bio/logging.py,sha256=7vu1zLq2QOe9C2svD_ZDdwo3w0EI1MWF7ZXoYqdhOjE,1315
10
10
  polars_bio/operations.py,sha256=hYFr40OeoEEq_S4g-zHBvHRQhXpAOiltS1DwxnbFa1I,2212
11
- polars_bio/polars_bio.abi3.so,sha256=BqhH19R2sOoa9nnntVr-a57qapwCNWpSM0rKK8e3pPE,263686080
11
+ polars_bio/polars_bio.abi3.so,sha256=dEuhwrzJx0huIQqyYUQA7sTTTr6xtcW0zQ52Mt8P79Q,264662736
12
12
  polars_bio/polars_ext.py,sha256=zELk_w_ScMFYJsfQl4qI3UucdahkCNovWKY595HrkK8,9416
13
+ polars_bio/predicate_translator.py,sha256=JTelMVd99qiDRhgMbRpeloU5cdSBPJdCvBvBnoKMKws,15978
13
14
  polars_bio/range_op.py,sha256=awmzuCfsy19osJ-M3UwTCr2zT2oSsJtzwl5I3KcB5aI,25693
14
15
  polars_bio/range_op_helpers.py,sha256=pgia2ewu9IzZMMcNvxoeHdaJmBdxVyhSxpHPM6Vc7lw,6040
15
16
  polars_bio/range_op_io.py,sha256=Cs30bagbiJvmjebDaD1go9WIFlSlXeLgmmr5tHvZTII,7076
16
17
  polars_bio/range_utils.py,sha256=Y9vJVfL50gLP3kLE0Z7xjTc8qggoFpHX1WBRzIOvXpU,1398
17
18
  polars_bio/sql.py,sha256=L3uwHEOT6BNmKmJVBD-8Mm0iWqrDyKLVkOwFzV4UlBw,24517
18
- polars_bio/utils.py,sha256=AFrVVGpTwrhwhbVApbra2fH7wqo2IaPNMIwi796P-hQ,4972
19
- polars_bio-0.14.1.dist-info/RECORD,,
19
+ polars_bio/sql_predicate_builder.py,sha256=HboE57drBphIap2c35hUG2YcBchuatT0xFGsR_qEuBA,8766
20
+ polars_bio/utils.py,sha256=u9s626x1ZQN6-6kq8taZ0opxDCY-ZjHt-DqXyhQx6JY,6159
21
+ polars_bio-0.15.0.dist-info/RECORD,,