polars-bio 0.14.1__cp39-abi3-win_amd64.whl → 0.15.0__cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_bio/__init__.py +1 -1
- polars_bio/io.py +425 -184
- polars_bio/polars_bio.pyd +0 -0
- polars_bio/predicate_translator.py +464 -0
- polars_bio/sql_predicate_builder.py +293 -0
- polars_bio/utils.py +29 -4
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/METADATA +1 -1
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/RECORD +10 -8
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/WHEEL +0 -0
- {polars_bio-0.14.1.dist-info → polars_bio-0.15.0.dist-info}/licenses/LICENSE +0 -0
polars_bio/io.py
CHANGED
@@ -316,6 +316,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         projection_pushdown: bool = False,
+        predicate_pushdown: bool = False,
         parallel: bool = False,
     ) -> pl.DataFrame:
         """
@@ -332,6 +333,7 @@ class IOOperations:
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+           predicate_pushdown: Enable predicate pushdown optimization to push filter conditions down to the DataFusion table provider level, reducing data processing and I/O.
            parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).

        !!! note
@@ -348,6 +350,7 @@ class IOOperations:
            timeout,
            compression_type,
            projection_pushdown,
+           predicate_pushdown,
            parallel,
        ).collect()

@@ -363,6 +366,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         projection_pushdown: bool = False,
+        predicate_pushdown: bool = False,
         parallel: bool = False,
     ) -> pl.LazyFrame:
         """
@@ -379,6 +383,7 @@ class IOOperations:
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+           predicate_pushdown: Enable predicate pushdown optimization to push filter conditions down to the DataFusion table provider level, reducing data processing and I/O.
            parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).

        !!! note
@@ -401,7 +406,9 @@ class IOOperations:
            parallel=parallel,
        )
        read_options = ReadOptions(gff_read_options=gff_read_options)
-       return _read_file(path, InputFormat.Gff, read_options, projection_pushdown)
+       return _read_file(
+           path, InputFormat.Gff, read_options, projection_pushdown, predicate_pushdown
+       )

    @staticmethod
    def read_bam(
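The four hunks above thread the new `predicate_pushdown` flag through `read_gff` and `scan_gff`. A minimal usage sketch (the file path is hypothetical; the flag names and semantics come from this diff):

```python
import polars as pl
import polars_bio as pb

# Hypothetical local file; with both flags on, the filter and the column list
# below are handed to the DataFusion table provider instead of being applied
# after a full scan.
lf = pb.scan_gff(
    "annotation.gff.gz",
    projection_pushdown=True,
    predicate_pushdown=True,
)
df = (
    lf.filter((pl.col("chrom") == "chr1") & (pl.col("start") > 10_000))
    .select(["chrom", "start", "end", "type"])
    .collect()
)
```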
@@ -760,13 +767,160 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
     return [x.strip() for x in t]


+def _apply_combined_pushdown_via_sql(
+    ctx,
+    table_name,
+    original_df,
+    predicate,
+    projected_columns,
+    predicate_pushdown,
+    projection_pushdown,
+):
+    """Apply both predicate and projection pushdown using SQL approach."""
+    from polars_bio.polars_bio import py_read_sql
+
+    # Build SQL query with combined optimizations
+    select_clause = "*"
+    if projection_pushdown and projected_columns:
+        select_clause = ", ".join([f'"{c}"' for c in projected_columns])
+
+    where_clause = ""
+    if predicate_pushdown and predicate is not None:
+        try:
+            # Use the proven regex-based predicate translation
+            where_clause = _build_sql_where_from_predicate_safe(predicate)
+        except Exception as e:
+            where_clause = ""
+
+            # No fallback - if we can't parse to SQL, just use projection only
+            # This keeps us in pure SQL mode for maximum performance
+
+    # Construct optimized SQL query
+    if where_clause:
+        sql = f"SELECT {select_clause} FROM {table_name} WHERE {where_clause}"
+    else:
+        sql = f"SELECT {select_clause} FROM {table_name}"
+
+    # Execute with DataFusion - this leverages the proven 4x+ optimization
+    return py_read_sql(ctx, sql)
+
+
+def _build_sql_where_from_predicate_safe(predicate):
+    """Build SQL WHERE clause by parsing all individual conditions and connecting with AND."""
+    import re
+
+    pred_str = str(predicate).strip("[]")
+
+    # Find all individual conditions in the nested structure
+    conditions = []
+
+    # String equality/inequality patterns (including empty strings)
+    # Accept both with and without surrounding parentheses in Polars repr
+    str_eq_patterns = [
+        r'\(col\("([^"]+)"\)\)\s*==\s*\("([^"]*)"\)',  # (col("x")) == ("v")
+        r'col\("([^"]+)"\)\s*==\s*"([^"]*)"',  # col("x") == "v"
+    ]
+    for pat in str_eq_patterns:
+        for column, value in re.findall(pat, pred_str):
+            conditions.append(f"\"{column}\" = '{value}'")
+
+    # Numeric comparison patterns (handle both formats: with and without "dyn int:")
+    numeric_patterns = [
+        (r'\(col\("([^"]+)"\)\)\s*>\s*\((?:dyn int:\s*)?(\d+)\)', ">"),
+        (r'\(col\("([^"]+)"\)\)\s*<\s*\((?:dyn int:\s*)?(\d+)\)', "<"),
+        (r'\(col\("([^"]+)"\)\)\s*>=\s*\((?:dyn int:\s*)?(\d+)\)', ">="),
+        (r'\(col\("([^"]+)"\)\)\s*<=\s*\((?:dyn int:\s*)?(\d+)\)', "<="),
+        (r'\(col\("([^"]+)"\)\)\s*!=\s*\((?:dyn int:\s*)?(\d+)\)', "!="),
+        (r'\(col\("([^"]+)"\)\)\s*==\s*\((?:dyn int:\s*)?(\d+)\)', "="),
+        (r'col\("([^"]+)"\)\s*>\s*(\d+)', ">"),
+        (r'col\("([^"]+)"\)\s*<\s*(\d+)', "<"),
+        (r'col\("([^"]+)"\)\s*>=\s*(\d+)', ">="),
+        (r'col\("([^"]+)"\)\s*<=\s*(\d+)', "<="),
+        (r'col\("([^"]+)"\)\s*!=\s*(\d+)', "!="),
+        (r'col\("([^"]+)"\)\s*==\s*(\d+)', "="),
+    ]
+
+    for pattern, op in numeric_patterns:
+        matches = re.findall(pattern, pred_str)
+        for column, value in matches:
+            conditions.append(f'"{column}" {op} {value}')
+
+    # Float comparison patterns (handle both formats: with and without "dyn float:")
+    float_patterns = [
+        (r'\(col\("([^"]+)"\)\)\s*>\s*\((?:dyn float:\s*)?([\d.]+)\)', ">"),
+        (r'\(col\("([^"]+)"\)\)\s*<\s*\((?:dyn float:\s*)?([\d.]+)\)', "<"),
+        (r'\(col\("([^"]+)"\)\)\s*>=\s*\((?:dyn float:\s*)?([\d.]+)\)', ">="),
+        (r'\(col\("([^"]+)"\)\)\s*<=\s*\((?:dyn float:\s*)?([\d.]+)\)', "<="),
+        (r'\(col\("([^"]+)"\)\)\s*!=\s*\((?:dyn float:\s*)?([\d.]+)\)', "!="),
+        (r'\(col\("([^"]+)"\)\)\s*==\s*\((?:dyn float:\s*)?([\d.]+)\)', "="),
+        (r'col\("([^"]+)"\)\s*>\s*([\d.]+)', ">"),
+        (r'col\("([^"]+)"\)\s*<\s*([\d.]+)', "<"),
+        (r'col\("([^"]+)"\)\s*>=\s*([\d.]+)', ">="),
+        (r'col\("([^"]+)"\)\s*<=\s*([\d.]+)', "<="),
+        (r'col\("([^"]+)"\)\s*!=\s*([\d.]+)', "!="),
+        (r'col\("([^"]+)"\)\s*==\s*([\d.]+)', "="),
+    ]
+
+    for pattern, op in float_patterns:
+        matches = re.findall(pattern, pred_str)
+        for column, value in matches:
+            conditions.append(f'"{column}" {op} {value}')
+
+    # IN list pattern: col("x").is_in([v1, v2, ...])
+    in_matches = re.findall(r'col\("([^"]+)"\)\.is_in\(\[(.*?)\]\)', pred_str)
+    for column, values_str in in_matches:
+        # Tokenize values: quoted strings or numbers
+        tokens = re.findall(r"'(?:[^']*)'|\"(?:[^\"]*)\"|\d+(?:\.\d+)?", values_str)
+        items = []
+        for t in tokens:
+            if t.startswith('"') and t.endswith('"'):
+                items.append("'" + t[1:-1] + "'")
+            else:
+                items.append(t)
+        if items:
+            conditions.append(f'"{column}" IN ({", ".join(items)})')
+
+    # Join all conditions with AND
+    if conditions:
+        where = " AND ".join(conditions)
+        # Clean up any residual bracketed list formatting from IN clause (defensive)
+        where = (
+            where.replace("IN ([", "IN (")
+            .replace("])", ")")
+            .replace("[ ", "")
+            .replace(" ]", "")
+        )
+        # Collapse simple >= and <= pairs into BETWEEN when possible
+        try:
+            import re as _re
+
+            where = _re.sub(
+                r'"([^"]+)"\s*>=\s*([\d.]+)\s*AND\s*"\1"\s*<=\s*([\d.]+)',
+                r'"\1" BETWEEN \2 AND \3',
+                where,
+            )
+            where = _re.sub(
+                r'"([^"]+)"\s*<=\s*([\d.]+)\s*AND\s*"\1"\s*>=\s*([\d.]+)',
+                r'"\1" BETWEEN \3 AND \2',
+                where,
+            )
+        except Exception:
+            pass
+        return where
+
+    return ""
+
+
 def _lazy_scan(
     df: Union[pl.DataFrame, pl.LazyFrame],
     projection_pushdown: bool = False,
+    predicate_pushdown: bool = False,
     table_name: str = None,
     input_format: InputFormat = None,
     file_path: str = None,
+    read_options: ReadOptions = None,
 ) -> pl.LazyFrame:
+
     df_lazy: DataFrame = df
     original_schema = df_lazy.schema()

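To see what `_build_sql_where_from_predicate_safe` is matching against, it helps to look at the string repr of a Polars expression, since that is all the regexes consume. A sketch (the exact repr differs across Polars versions, so the shapes shown are indicative only):

```python
import polars as pl

pred = (pl.col("chrom") == "chr1") & (pl.col("start") >= 100) & (pl.col("start") <= 200)
print(str(pred))
# The repr nests each condition in brackets/parens; numeric literals render
# in some versions as (col("start")) >= (dyn int: 100). The varying shapes
# are exactly why the translator tries several regex variants per operator.

# On a repr the patterns recognize, the expected WHERE fragment (after the
# final BETWEEN collapse) would be:
#   "chrom" = 'chr1' AND "start" BETWEEN 100 AND 200
```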
@@ -776,67 +930,160 @@ def _lazy_scan(
         n_rows: Union[int, None],
         _batch_size: Union[int, None],
     ) -> Iterator[pl.DataFrame]:
-        #
[old lines 780-785 removed; content not shown in the source view]
+        # If this is a GFF scan, perform pushdown by building a single SELECT ... WHERE ...
+        if input_format == InputFormat.Gff and file_path is not None:
+            from polars_bio.polars_bio import GffReadOptions, PyObjectStorageOptions
+            from polars_bio.polars_bio import ReadOptions as _ReadOptions
+            from polars_bio.polars_bio import (
+                py_read_sql,
+                py_read_table,
+                py_register_table,
+                py_register_view,
+            )

[old line 787 removed; content not shown in the source view]
-        datafusion_projection_applied = False
+            from .context import ctx

[old lines 790-794 removed; content not shown in the source view]
+            # Extract columns requested by Polars optimizer
+            requested_cols = (
+                _extract_column_names_from_expr(with_columns)
+                if with_columns is not None
+                else []
+            )

[old lines 796-797 removed; content not shown in the source view]
+            # Compute attribute fields to request based on selected columns
+            STATIC = {
+                "chrom",
+                "start",
+                "end",
+                "type",
+                "source",
+                "score",
+                "strand",
+                "phase",
+                "attributes",
+            }
+            attr_fields = [c for c in requested_cols if c not in STATIC]
+
+            # Derive thread/parallel from read_options when available
+            thread_num = 1
+            parallel = False
+            if read_options is not None:
+                try:
+                    gopt = getattr(read_options, "gff_read_options", None)
+                    if gopt is not None:
+                        tn = getattr(gopt, "thread_num", None)
+                        if tn is not None:
+                            thread_num = tn
+                        par = getattr(gopt, "parallel", None)
+                        if par is not None:
+                            parallel = par
+                except Exception:
+                    pass

[old lines 799-800 removed; content not shown in the source view]
+            # Build fresh read options (object storage options are not readable from Rust class; use safe defaults)
+            obj = PyObjectStorageOptions(
+                allow_anonymous=True,
+                enable_request_payer=False,
+                chunk_size=8,
+                concurrent_fetches=1,
+                max_retries=5,
+                timeout=300,
+                compression_type="auto",
+            )
+            # Determine attribute parsing behavior:
+            # - if user selected raw "attributes" column: keep provider defaults (None)
+            # - if user selected specific attribute columns: pass that list
+            # - otherwise: disable attribute parsing with empty list for performance
+            if "attributes" in requested_cols:
+                _attr = None
+            elif attr_fields:
+                _attr = attr_fields
+            else:
+                _attr = []

[old lines 802-803 removed; content not shown in the source view]
+            gff_opts = GffReadOptions(
+                attr_fields=_attr,
+                thread_num=thread_num,
+                object_storage_options=obj,
+                parallel=parallel,
+            )
+            ropts = _ReadOptions(gff_read_options=gff_opts)
+
+            # Determine which table to query: reuse original unless we must change attr_fields
+            table_name_use = table_name
+            if projection_pushdown and requested_cols:
+                # Only re-register when projection is active (we know column needs)
+                table_obj = py_register_table(
+                    ctx, file_path, None, InputFormat.Gff, ropts
                 )
[old lines 805-823 removed; content not shown in the source view]
+                table_name_use = table_obj.name
+
+            # Build SELECT clause respecting projection flag
+            if projection_pushdown and requested_cols:
+                select_clause = ", ".join([f'"{c}"' for c in requested_cols])
+            else:
+                select_clause = "*"
+
+            # Build WHERE clause respecting predicate flag
+            where_clause = ""
+            if predicate_pushdown and predicate is not None:
+                try:
+                    where_clause = _build_sql_where_from_predicate_safe(predicate)
+                except Exception:
+                    where_clause = ""
+
+            sql = f"SELECT {select_clause} FROM {table_name_use}"
+            if where_clause:
+                sql += f" WHERE {where_clause}"
+            if n_rows and n_rows > 0:
+                sql += f" LIMIT {int(n_rows)}"
+
+            query_df = py_read_sql(ctx, sql)
+
+            # Stream results, applying any non-pushed operations locally
+            df_stream = query_df.execute_stream()
+            progress_bar = tqdm(unit="rows")
+            for r in df_stream:
+                py_df = r.to_pyarrow()
+                out = pl.DataFrame(py_df)
+                # Apply local filter if we didn't push it down
+                if predicate is not None and (
+                    not predicate_pushdown or not where_clause
+                ):
+                    out = out.filter(predicate)
+                # Apply local projection if we didn't push it down
+                if with_columns is not None and (
+                    not projection_pushdown or not requested_cols
+                ):
+                    out = out.select(with_columns)
+                progress_bar.update(len(out))
+                yield out
             return

+        # Default path (non-GFF): stream and optionally apply local filter/projection
+        query_df = df_lazy
         df_stream = query_df.execute_stream()
         progress_bar = tqdm(unit="rows")
+        remaining = int(n_rows) if n_rows is not None else None
         for r in df_stream:
             py_df = r.to_pyarrow()
[old line 830 removed; content not shown in the source view]
+            out = pl.DataFrame(py_df)
             if predicate is not None:
[old lines 832-839 removed; content not shown in the source view]
+                out = out.filter(predicate)
+            if with_columns is not None:
+                out = out.select(with_columns)
+
+            if remaining is not None:
+                if remaining <= 0:
+                    break
+                if len(out) > remaining:
+                    out = out.head(remaining)
+                remaining -= len(out)
+
+            progress_bar.update(len(out))
+            yield out
+            if remaining is not None and remaining <= 0:
+                return

     return register_io_source(_overlap_source, schema=original_schema)

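The rewritten `_overlap_source` folds projection, predicate, and row limit into a single statement before calling `py_read_sql`. A sketch of the query assembly, mirroring the string-building code in the hunk above (table name and values are illustrative):

```python
# All values here are made up; only the assembly logic matches the diff.
select_clause = '"chrom", "start", "end"'
where_clause = "\"chrom\" = 'chr1' AND \"start\" > 10000"
table_name_use = "gff_table_1"
n_rows = 1000

sql = f"SELECT {select_clause} FROM {table_name_use}"
if where_clause:
    sql += f" WHERE {where_clause}"
if n_rows and n_rows > 0:
    sql += f" LIMIT {int(n_rows)}"

print(sql)
# SELECT "chrom", "start", "end" FROM gff_table_1
#   WHERE "chrom" = 'chr1' AND "start" > 10000 LIMIT 1000
```

Note the two local fallbacks after streaming starts: if the predicate could not be translated to SQL, it is re-applied with `out.filter(predicate)`, and if projection was not pushed down, `out.select(with_columns)` trims the columns, so results stay correct either way.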
@@ -877,21 +1124,36 @@ def _read_file(
     input_format: InputFormat,
     read_options: ReadOptions,
     projection_pushdown: bool = False,
+    predicate_pushdown: bool = False,
 ) -> pl.LazyFrame:
     table = py_register_table(ctx, path, None, input_format, read_options)
     df = py_read_table(ctx, table.name)

-    lf = _lazy_scan(df, projection_pushdown, table.name, input_format, path)
+    lf = _lazy_scan(
+        df,
+        projection_pushdown,
+        predicate_pushdown,
+        table.name,
+        input_format,
+        path,
+        read_options,
+    )

     # Wrap GFF LazyFrames with projection-aware wrapper for consistent attribute field handling
     if input_format == InputFormat.Gff:
-        return GffLazyFrameWrapper(lf, path, read_options, projection_pushdown)
+        return GffLazyFrameWrapper(
+            lf, path, read_options, projection_pushdown, predicate_pushdown
+        )

     return lf


 class GffLazyFrameWrapper:
-    """
+    """Thin wrapper that preserves type while delegating to the underlying LazyFrame.
+
+    Pushdown is decided exclusively inside the io_source callback based on
+    with_columns and predicate; this wrapper only keeps chain type stable.
+    """

     def __init__(
         self,
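Note that `_read_file` passes `_lazy_scan` its arguments positionally, so the new `predicate_pushdown` parameter had to be inserted between `projection_pushdown` and `table_name` in both the signature and every call site. For reference (names as they appear in the diff; the literal values are illustrative):

```python
# Positional order after this change:
# _lazy_scan(df, projection_pushdown, predicate_pushdown,
#            table_name, input_format, file_path, read_options)
lf = _lazy_scan(df, True, True, table.name, InputFormat.Gff, path, read_options)
```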
@@ -899,45 +1161,33 @@ class GffLazyFrameWrapper:
         file_path: str,
         read_options: ReadOptions,
         projection_pushdown: bool = True,
+        predicate_pushdown: bool = True,
     ):
         self._base_lf = base_lf
         self._file_path = file_path
         self._read_options = read_options
         self._projection_pushdown = projection_pushdown
+        self._predicate_pushdown = predicate_pushdown

     def select(self, exprs):
[old lines 909-917 removed; content not shown in the source view]
-        columns = []
-        for expr in exprs:
-            if isinstance(expr, str):
-                columns.append(expr)
-            elif hasattr(expr, "meta") and hasattr(expr.meta, "output_name"):
-                try:
-                    columns.append(expr.meta.output_name())
-                except:
-                    pass
-        else:
-            # Single expression
-            if isinstance(exprs, str):
-                columns = [exprs]
-            elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
-                try:
-                    columns = [exprs.meta.output_name()]
-                except:
-                    columns = []
+        # Extract requested column names
+        columns = []
+        try:
+            if isinstance(exprs, (list, tuple)):
+                for e in exprs:
+                    if isinstance(e, str):
+                        columns.append(e)
+                    elif hasattr(e, "meta") and hasattr(e.meta, "output_name"):
+                        columns.append(e.meta.output_name())
             else:
[old line 937 removed; content not shown in the source view]
+                if isinstance(exprs, str):
+                    columns = [exprs]
+                elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
+                    columns = [exprs.meta.output_name()]
+        except Exception:
+            columns = []

[old line 939 removed; content not shown in the source view]
-        GFF_STATIC_COLUMNS = {
+        STATIC = {
             "chrom",
             "start",
             "end",
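The rewritten `select` resolves expressions to plain column names through Polars' expression metadata instead of the old hand-rolled branching with bare `except:` clauses. A quick illustration of the API it relies on (standard Polars, not specific to this package):

```python
import polars as pl

# meta.output_name() reports the column name an expression will produce.
print(pl.col("start").meta.output_name())  # start
print((pl.col("end") - pl.col("start")).alias("length").meta.output_name())  # length

# For expressions whose output name cannot be determined, the call can raise;
# that is why the diff wraps the whole extraction in a single try/except and
# falls back to columns = [].
```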
@@ -948,119 +1198,110 @@ class GffLazyFrameWrapper:
             "phase",
             "attributes",
         }
[old lines 951-952 removed; content not shown in the source view]
+        attr_cols = [c for c in columns if c not in STATIC]
+
+        # If selecting attribute fields, run one-shot SQL projection with proper attr_fields
+        if columns and (attr_cols or "attributes" in columns):
+            from polars_bio.polars_bio import GffReadOptions
+            from polars_bio.polars_bio import InputFormat as _InputFormat
+            from polars_bio.polars_bio import PyObjectStorageOptions
+            from polars_bio.polars_bio import ReadOptions as _ReadOptions
+            from polars_bio.polars_bio import (
+                py_read_sql,
+                py_read_table,
+                py_register_table,
+                py_register_view,
+            )

-        # If 'attributes' is requested, ensure the registered table exposes it.
-        # Some parallel GFF providers omit the raw 'attributes' column; switch
-        # to a registration that includes it while keeping projection pushdown.
-        if "attributes" in static_cols:
             from .context import ctx

-            #
[old lines 961-966 removed; content not shown in the source view]
+            # Pull thread_num/parallel from original read options
+            thread_num = 1
+            parallel = False
+            try:
+                gopt = getattr(self._read_options, "gff_read_options", None)
+                if gopt is not None:
+                    tn = getattr(gopt, "thread_num", None)
+                    if tn is not None:
+                        thread_num = tn
+                    par = getattr(gopt, "parallel", None)
+                    if par is not None:
+                        parallel = par
+            except Exception:
+                pass
+
+            obj = PyObjectStorageOptions(
+                allow_anonymous=True,
+                enable_request_payer=False,
+                chunk_size=8,
+                concurrent_fetches=1,
+                max_retries=5,
+                timeout=300,
+                compression_type="auto",
             )
+            if "attributes" in columns:
+                _attr = None
+            elif attr_cols:
+                _attr = attr_cols
+            else:
+                _attr = []

[old lines 969-973 removed; content not shown in the source view]
-                    allow_anonymous=True,
-                    enable_request_payer=False,
-                    chunk_size=8,
-                    concurrent_fetches=1,
-                    max_retries=5,
-                    timeout=300,
-                    compression_type="auto",
-                ),
-                parallel=orig_parallel,
+            gff_opts = GffReadOptions(
+                attr_fields=_attr,
+                thread_num=thread_num,
+                object_storage_options=obj,
+                parallel=parallel,
             )
[old line 984 removed; content not shown in the source view]
+            ropts = _ReadOptions(gff_read_options=gff_opts)
             table = py_register_table(
-                ctx, self._file_path, None,
+                ctx, self._file_path, None, _InputFormat.Gff, ropts
             )
[old lines 988-991 removed; content not shown in the source view]
-            if self._projection_pushdown:
-                # Optimized path: when selecting specific unnested attribute fields, re-register
-                # GFF table with those fields so DataFusion can project them efficiently.
-
-                # Use optimized table re-registration (fast path)
-                from .context import ctx
-
-                gff_options = GffReadOptions(
-                    attr_fields=attribute_cols,
-                    thread_num=getattr(
-                        getattr(self._read_options, "gff_read_options", None),
-                        "thread_num",
-                        1,
-                    ),
-                    object_storage_options=PyObjectStorageOptions(
-                        allow_anonymous=True,
-                        enable_request_payer=False,
-                        chunk_size=8,
-                        concurrent_fetches=1,
-                        max_retries=5,
-                        timeout=300,
-                        compression_type="auto",
-                    ),
-                    # Keep parallel reading consistent with base options when possible
-                    parallel=getattr(
-                        getattr(self._read_options, "gff_read_options", None),
-                        "parallel",
-                        False,
-                    ),
+            select_clause = ", ".join([f'"{c}"' for c in columns])
+            view_name = f"{table.name}_proj"
+            py_register_view(
+                ctx, view_name, f"SELECT {select_clause} FROM {table.name}"
             )
[old lines 1022-1025 removed; content not shown in the source view]
+            df_view = py_read_table(ctx, view_name)
+
+            new_lf = _lazy_scan(
+                df_view,
+                False,
+                self._predicate_pushdown,
+                view_name,
+                _InputFormat.Gff,
+                self._file_path,
+                self._read_options,
+            )
+            return GffLazyFrameWrapper(
+                new_lf,
+                self._file_path,
+                self._read_options,
+                False,
+                self._predicate_pushdown,
             )
-            df = py_read_table(ctx, table.name)
-
-            # Create new LazyFrame with optimized schema
-            new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
-            return new_lf.select(exprs)
-
-        elif attribute_cols:
-            # Extract attribute fields from nested structure (compatibility path)
-            import polars as pl
-
-            # Build selection with attribute field extraction
-            selection_exprs = []
-
-            # Add static columns as-is
-            for col in static_cols:
-                selection_exprs.append(pl.col(col))
-
-            # Add attribute field extractions
-            for attr_col in attribute_cols:
-                attr_expr = (
-                    pl.col("attributes")
-                    .list.eval(
-                        pl.when(pl.element().struct.field("tag") == attr_col).then(
-                            pl.element().struct.field("value")
-                        )
-                    )
-                    .list.drop_nulls()
-                    .list.first()
-                    .alias(attr_col)
-                )
-                selection_exprs.append(attr_expr)

[old lines 1059-1062 removed; content not shown in the source view]
+        # Otherwise delegate to Polars
+        return GffLazyFrameWrapper(
+            self._base_lf.select(exprs),
+            self._file_path,
+            self._read_options,
+            self._projection_pushdown,
+            self._predicate_pushdown,
+        )
+
+    def filter(self, *predicates):
+        if not predicates:
+            return self
+        pred = predicates[0]
+        for p in predicates[1:]:
+            pred = pred & p
+        return GffLazyFrameWrapper(
+            self._base_lf.filter(pred),
+            self._file_path,
+            self._read_options,
+            self._projection_pushdown,
+            self._predicate_pushdown,
+        )

     def __getattr__(self, name):
-        """Delegate all other operations to base LazyFrame."""
         return getattr(self._base_lf, name)
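The new `filter` override AND-combines multiple predicates before delegating, and `__getattr__` forwards everything else to the wrapped LazyFrame, so the wrapper stays transparent in a method chain. A behavior sketch (the file path is hypothetical):

```python
import polars as pl
import polars_bio as pb

lf = pb.scan_gff("annotation.gff", predicate_pushdown=True)  # hypothetical file
filtered = lf.filter(pl.col("type") == "gene", pl.col("start") > 1_000)
# equivalent to: lf.filter((pl.col("type") == "gene") & (pl.col("start") > 1_000))

# Methods not defined on the wrapper (e.g. limit, collect) fall through to
# the underlying LazyFrame via __getattr__.
df = filtered.limit(5).collect()
```

Wrapping the result of `select` and `filter` back in `GffLazyFrameWrapper` is what keeps the chain type stable, as the new class docstring notes: the pushdown decisions themselves happen inside the io_source callback, not here.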