polars-bio 0.14.0__cp39-abi3-macosx_11_0_arm64.whl → 0.15.0__cp39-abi3-macosx_11_0_arm64.whl

polars_bio/io.py CHANGED
@@ -316,6 +316,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         projection_pushdown: bool = False,
+        predicate_pushdown: bool = False,
         parallel: bool = False,
     ) -> pl.DataFrame:
         """
@@ -332,6 +333,7 @@ class IOOperations:
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
             projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+            predicate_pushdown: Enable predicate pushdown optimization to push filter conditions down to the DataFusion table provider level, reducing data processing and I/O.
             parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).
 
             !!! note
@@ -348,6 +350,7 @@ class IOOperations:
             timeout,
             compression_type,
             projection_pushdown,
+            predicate_pushdown,
             parallel,
         ).collect()
 
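The three hunks above thread the new `predicate_pushdown` flag through the eager GFF read path; the eager method simply forwards it into the internal lazy scan before calling `.collect()`. A minimal usage sketch (illustrative only — it assumes the enclosing method is the `read_gff` that polars-bio exposes, which lies outside these hunks' context):

    import polars_bio as pb

    # Hypothetical file name; both pushdown flags default to False.
    df = pb.read_gff(
        "annotations.gff3.bgz",
        projection_pushdown=True,
        predicate_pushdown=True,
    )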
@@ -363,6 +366,7 @@ class IOOperations:
         timeout: int = 300,
         compression_type: str = "auto",
         projection_pushdown: bool = False,
+        predicate_pushdown: bool = False,
         parallel: bool = False,
     ) -> pl.LazyFrame:
         """
@@ -379,6 +383,7 @@ class IOOperations:
             timeout: The timeout in seconds for reading the file from object storage.
             compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
             projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
+            predicate_pushdown: Enable predicate pushdown optimization to push filter conditions down to the DataFusion table provider level, reducing data processing and I/O.
             parallel: Whether to use the parallel reader for BGZF-compressed local files (uses BGZF chunk-level parallelism similar to FASTQ).
 
             !!! note
@@ -401,7 +406,9 @@ class IOOperations:
             parallel=parallel,
         )
         read_options = ReadOptions(gff_read_options=gff_read_options)
-        return _read_file(path, InputFormat.Gff, read_options, projection_pushdown)
+        return _read_file(
+            path, InputFormat.Gff, read_options, projection_pushdown, predicate_pushdown
+        )
 
     @staticmethod
     def read_bam(
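On the lazy path the flag reaches `_read_file`, so filters applied later in the chain can be evaluated by DataFusion at scan time. A sketch of the intended effect, assuming the enclosing method is the public `scan_gff` (the name is not visible in these hunks):

    import polars as pl
    import polars_bio as pb

    lf = pb.scan_gff(
        "annotations.gff3.bgz",  # hypothetical path
        projection_pushdown=True,
        predicate_pushdown=True,
    )
    out = (
        lf.filter((pl.col("chrom") == "chr1") & (pl.col("start") > 1000))
        .select(["chrom", "start", "end", "type"])
        .collect()
    )
    # With both flags set, the filter and the column list are intended to reach
    # DataFusion as a single SELECT ... WHERE ... instead of being applied to
    # fully materialized record batches in Python.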
@@ -760,13 +767,160 @@ def _cleanse_fields(t: Union[list[str], None]) -> Union[list[str], None]:
     return [x.strip() for x in t]
 
 
+def _apply_combined_pushdown_via_sql(
+    ctx,
+    table_name,
+    original_df,
+    predicate,
+    projected_columns,
+    predicate_pushdown,
+    projection_pushdown,
+):
+    """Apply both predicate and projection pushdown using SQL approach."""
+    from polars_bio.polars_bio import py_read_sql
+
+    # Build SQL query with combined optimizations
+    select_clause = "*"
+    if projection_pushdown and projected_columns:
+        select_clause = ", ".join([f'"{c}"' for c in projected_columns])
+
+    where_clause = ""
+    if predicate_pushdown and predicate is not None:
+        try:
+            # Use the proven regex-based predicate translation
+            where_clause = _build_sql_where_from_predicate_safe(predicate)
+        except Exception as e:
+            where_clause = ""
+
+    # No fallback - if we can't parse to SQL, just use projection only
+    # This keeps us in pure SQL mode for maximum performance
+
+    # Construct optimized SQL query
+    if where_clause:
+        sql = f"SELECT {select_clause} FROM {table_name} WHERE {where_clause}"
+    else:
+        sql = f"SELECT {select_clause} FROM {table_name}"
+
+    # Execute with DataFusion - this leverages the proven 4x+ optimization
+    return py_read_sql(ctx, sql)
+
+
+def _build_sql_where_from_predicate_safe(predicate):
+    """Build SQL WHERE clause by parsing all individual conditions and connecting with AND."""
+    import re
+
+    pred_str = str(predicate).strip("[]")
+
+    # Find all individual conditions in the nested structure
+    conditions = []
+
+    # String equality/inequality patterns (including empty strings)
+    # Accept both with and without surrounding parentheses in Polars repr
+    str_eq_patterns = [
+        r'\(col\("([^"]+)"\)\)\s*==\s*\("([^"]*)"\)',  # (col("x")) == ("v")
+        r'col\("([^"]+)"\)\s*==\s*"([^"]*)"',  # col("x") == "v"
+    ]
+    for pat in str_eq_patterns:
+        for column, value in re.findall(pat, pred_str):
+            conditions.append(f"\"{column}\" = '{value}'")
+
+    # Numeric comparison patterns (handle both formats: with and without "dyn int:")
+    numeric_patterns = [
+        (r'\(col\("([^"]+)"\)\)\s*>\s*\((?:dyn int:\s*)?(\d+)\)', ">"),
+        (r'\(col\("([^"]+)"\)\)\s*<\s*\((?:dyn int:\s*)?(\d+)\)', "<"),
+        (r'\(col\("([^"]+)"\)\)\s*>=\s*\((?:dyn int:\s*)?(\d+)\)', ">="),
+        (r'\(col\("([^"]+)"\)\)\s*<=\s*\((?:dyn int:\s*)?(\d+)\)', "<="),
+        (r'\(col\("([^"]+)"\)\)\s*!=\s*\((?:dyn int:\s*)?(\d+)\)', "!="),
+        (r'\(col\("([^"]+)"\)\)\s*==\s*\((?:dyn int:\s*)?(\d+)\)', "="),
+        (r'col\("([^"]+)"\)\s*>\s*(\d+)', ">"),
+        (r'col\("([^"]+)"\)\s*<\s*(\d+)', "<"),
+        (r'col\("([^"]+)"\)\s*>=\s*(\d+)', ">="),
+        (r'col\("([^"]+)"\)\s*<=\s*(\d+)', "<="),
+        (r'col\("([^"]+)"\)\s*!=\s*(\d+)', "!="),
+        (r'col\("([^"]+)"\)\s*==\s*(\d+)', "="),
+    ]
+
+    for pattern, op in numeric_patterns:
+        matches = re.findall(pattern, pred_str)
+        for column, value in matches:
+            conditions.append(f'"{column}" {op} {value}')
+
+    # Float comparison patterns (handle both formats: with and without "dyn float:")
+    float_patterns = [
+        (r'\(col\("([^"]+)"\)\)\s*>\s*\((?:dyn float:\s*)?([\d.]+)\)', ">"),
+        (r'\(col\("([^"]+)"\)\)\s*<\s*\((?:dyn float:\s*)?([\d.]+)\)', "<"),
+        (r'\(col\("([^"]+)"\)\)\s*>=\s*\((?:dyn float:\s*)?([\d.]+)\)', ">="),
+        (r'\(col\("([^"]+)"\)\)\s*<=\s*\((?:dyn float:\s*)?([\d.]+)\)', "<="),
+        (r'\(col\("([^"]+)"\)\)\s*!=\s*\((?:dyn float:\s*)?([\d.]+)\)', "!="),
+        (r'\(col\("([^"]+)"\)\)\s*==\s*\((?:dyn float:\s*)?([\d.]+)\)', "="),
+        (r'col\("([^"]+)"\)\s*>\s*([\d.]+)', ">"),
+        (r'col\("([^"]+)"\)\s*<\s*([\d.]+)', "<"),
+        (r'col\("([^"]+)"\)\s*>=\s*([\d.]+)', ">="),
+        (r'col\("([^"]+)"\)\s*<=\s*([\d.]+)', "<="),
+        (r'col\("([^"]+)"\)\s*!=\s*([\d.]+)', "!="),
+        (r'col\("([^"]+)"\)\s*==\s*([\d.]+)', "="),
+    ]
+
+    for pattern, op in float_patterns:
+        matches = re.findall(pattern, pred_str)
+        for column, value in matches:
+            conditions.append(f'"{column}" {op} {value}')
+
+    # IN list pattern: col("x").is_in([v1, v2, ...])
+    in_matches = re.findall(r'col\("([^"]+)"\)\.is_in\(\[(.*?)\]\)', pred_str)
+    for column, values_str in in_matches:
+        # Tokenize values: quoted strings or numbers
+        tokens = re.findall(r"'(?:[^']*)'|\"(?:[^\"]*)\"|\d+(?:\.\d+)?", values_str)
+        items = []
+        for t in tokens:
+            if t.startswith('"') and t.endswith('"'):
+                items.append("'" + t[1:-1] + "'")
+            else:
+                items.append(t)
+        if items:
+            conditions.append(f'"{column}" IN ({", ".join(items)})')
+
+    # Join all conditions with AND
+    if conditions:
+        where = " AND ".join(conditions)
+        # Clean up any residual bracketed list formatting from IN clause (defensive)
+        where = (
+            where.replace("IN ([", "IN (")
+            .replace("])", ")")
+            .replace("[ ", "")
+            .replace(" ]", "")
+        )
+        # Collapse simple >= and <= pairs into BETWEEN when possible
+        try:
+            import re as _re
+
+            where = _re.sub(
+                r'"([^"]+)"\s*>=\s*([\d.]+)\s*AND\s*"\1"\s*<=\s*([\d.]+)',
+                r'"\1" BETWEEN \2 AND \3',
+                where,
+            )
+            where = _re.sub(
+                r'"([^"]+)"\s*<=\s*([\d.]+)\s*AND\s*"\1"\s*>=\s*([\d.]+)',
+                r'"\1" BETWEEN \3 AND \2',
+                where,
+            )
+        except Exception:
+            pass
+        return where
+
+    return ""
+
+
 def _lazy_scan(
     df: Union[pl.DataFrame, pl.LazyFrame],
     projection_pushdown: bool = False,
+    predicate_pushdown: bool = False,
     table_name: str = None,
     input_format: InputFormat = None,
     file_path: str = None,
+    read_options: ReadOptions = None,
 ) -> pl.LazyFrame:
+
     df_lazy: DataFrame = df
     original_schema = df_lazy.schema()
 
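`_build_sql_where_from_predicate_safe` pattern-matches the string repr of a Polars expression, so what it recovers depends on the repr format of the installed Polars version. A sketch of the translation the regexes above target (the repr shown is the format named in the code's own comments, not a guaranteed output):

    import polars as pl

    predicate = (pl.col("chrom") == "chr1") & (pl.col("start") > 1000)
    # str(predicate) is expected to look roughly like:
    #   [([(col("chrom")) == ("chr1")]) & ([(col("start")) > (dyn int: 1000)])]
    # which the patterns above translate to:
    #   "chrom" = 'chr1' AND "start" > 1000
    # A >=/<= pair on the same column is further collapsed, e.g.
    #   (pl.col("start") >= 100) & (pl.col("start") <= 200)
    #   -> "start" BETWEEN 100 AND 200
    where = _build_sql_where_from_predicate_safe(predicate)

If no condition parses, the function returns an empty string and the scan callback falls back to filtering locally.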
@@ -776,67 +930,160 @@ def _lazy_scan(
         n_rows: Union[int, None],
         _batch_size: Union[int, None],
     ) -> Iterator[pl.DataFrame]:
-        # Extract column names from with_columns if projection pushdown is enabled
-        projected_columns = None
-        if projection_pushdown and with_columns is not None:
-            projected_columns = _extract_column_names_from_expr(with_columns)
-
-        # Projection pushdown is handled natively by table providers
-        query_df = df_lazy
-
-        # Apply column projection to DataFusion query if enabled
-        datafusion_projection_applied = False
-
-        if projection_pushdown and projected_columns:
-            try:
-                # Apply projection at the DataFusion level using SQL
-                # This approach works reliably with the DataFusion Python API
-                columns_sql = ", ".join([f'"{c}"' for c in projected_columns])
-
-                # Use the table name passed from _read_file, fallback if not available
-                table_to_query = table_name if table_name else "temp_table"
-
-                # Use py_read_sql to execute SQL projection (same as pb.sql() does)
-                from .context import ctx
-
-                query_df = py_read_sql(
-                    ctx, f"SELECT {columns_sql} FROM {table_to_query}"
-                )
-                datafusion_projection_applied = True
-            except Exception as e:
-                # Fallback to original behavior if projection fails
-                print(f"DataFusion projection failed: {e}")
-                query_df = df_lazy
-                projected_columns = None
-                datafusion_projection_applied = False
-
-        if n_rows and n_rows < 8192:  # 8192 is the default batch size in datafusion
-            df = query_df.limit(n_rows).execute_stream().next().to_pyarrow()
-            df = pl.DataFrame(df).limit(n_rows)
-            if predicate is not None:
-                df = df.filter(predicate)
-            # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
-            if with_columns is not None and (
-                not projection_pushdown or not datafusion_projection_applied
-            ):
-                df = df.select(with_columns)
-            yield df
+        # If this is a GFF scan, perform pushdown by building a single SELECT ... WHERE ...
+        if input_format == InputFormat.Gff and file_path is not None:
+            from polars_bio.polars_bio import GffReadOptions, PyObjectStorageOptions
+            from polars_bio.polars_bio import ReadOptions as _ReadOptions
+            from polars_bio.polars_bio import (
+                py_read_sql,
+                py_read_table,
+                py_register_table,
+                py_register_view,
+            )
+
+            from .context import ctx
+
+            # Extract columns requested by Polars optimizer
+            requested_cols = (
+                _extract_column_names_from_expr(with_columns)
+                if with_columns is not None
+                else []
+            )
+
+            # Compute attribute fields to request based on selected columns
+            STATIC = {
+                "chrom",
+                "start",
+                "end",
+                "type",
+                "source",
+                "score",
+                "strand",
+                "phase",
+                "attributes",
+            }
+            attr_fields = [c for c in requested_cols if c not in STATIC]
+
+            # Derive thread/parallel from read_options when available
+            thread_num = 1
+            parallel = False
+            if read_options is not None:
+                try:
+                    gopt = getattr(read_options, "gff_read_options", None)
+                    if gopt is not None:
+                        tn = getattr(gopt, "thread_num", None)
+                        if tn is not None:
+                            thread_num = tn
+                        par = getattr(gopt, "parallel", None)
+                        if par is not None:
+                            parallel = par
+                except Exception:
+                    pass
+
+            # Build fresh read options (object storage options are not readable from Rust class; use safe defaults)
+            obj = PyObjectStorageOptions(
+                allow_anonymous=True,
+                enable_request_payer=False,
+                chunk_size=8,
+                concurrent_fetches=1,
+                max_retries=5,
+                timeout=300,
+                compression_type="auto",
+            )
+            # Determine attribute parsing behavior:
+            # - if user selected raw "attributes" column: keep provider defaults (None)
+            # - if user selected specific attribute columns: pass that list
+            # - otherwise: disable attribute parsing with empty list for performance
+            if "attributes" in requested_cols:
+                _attr = None
+            elif attr_fields:
+                _attr = attr_fields
+            else:
+                _attr = []
+
+            gff_opts = GffReadOptions(
+                attr_fields=_attr,
+                thread_num=thread_num,
+                object_storage_options=obj,
+                parallel=parallel,
+            )
+            ropts = _ReadOptions(gff_read_options=gff_opts)
+
+            # Determine which table to query: reuse original unless we must change attr_fields
+            table_name_use = table_name
+            if projection_pushdown and requested_cols:
+                # Only re-register when projection is active (we know column needs)
+                table_obj = py_register_table(
+                    ctx, file_path, None, InputFormat.Gff, ropts
+                )
+                table_name_use = table_obj.name
+
+            # Build SELECT clause respecting projection flag
+            if projection_pushdown and requested_cols:
+                select_clause = ", ".join([f'"{c}"' for c in requested_cols])
+            else:
+                select_clause = "*"
+
+            # Build WHERE clause respecting predicate flag
+            where_clause = ""
+            if predicate_pushdown and predicate is not None:
+                try:
+                    where_clause = _build_sql_where_from_predicate_safe(predicate)
+                except Exception:
+                    where_clause = ""
+
+            sql = f"SELECT {select_clause} FROM {table_name_use}"
+            if where_clause:
+                sql += f" WHERE {where_clause}"
+            if n_rows and n_rows > 0:
+                sql += f" LIMIT {int(n_rows)}"
+
+            query_df = py_read_sql(ctx, sql)
+
+            # Stream results, applying any non-pushed operations locally
+            df_stream = query_df.execute_stream()
+            progress_bar = tqdm(unit="rows")
+            for r in df_stream:
+                py_df = r.to_pyarrow()
+                out = pl.DataFrame(py_df)
+                # Apply local filter if we didn't push it down
+                if predicate is not None and (
+                    not predicate_pushdown or not where_clause
+                ):
+                    out = out.filter(predicate)
+                # Apply local projection if we didn't push it down
+                if with_columns is not None and (
+                    not projection_pushdown or not requested_cols
+                ):
+                    out = out.select(with_columns)
+                progress_bar.update(len(out))
+                yield out
             return
 
+        # Default path (non-GFF): stream and optionally apply local filter/projection
+        query_df = df_lazy
         df_stream = query_df.execute_stream()
         progress_bar = tqdm(unit="rows")
+        remaining = int(n_rows) if n_rows is not None else None
         for r in df_stream:
             py_df = r.to_pyarrow()
-            df = pl.DataFrame(py_df)
+            out = pl.DataFrame(py_df)
             if predicate is not None:
-                df = df.filter(predicate)
-            # Apply Python-level projection if DataFusion projection failed or projection pushdown is disabled
-            if with_columns is not None and (
-                not projection_pushdown or not datafusion_projection_applied
-            ):
-                df = df.select(with_columns)
-            progress_bar.update(len(df))
-            yield df
+                out = out.filter(predicate)
+            if with_columns is not None:
+                out = out.select(with_columns)
+
+            if remaining is not None:
+                if remaining <= 0:
+                    break
+                if len(out) > remaining:
+                    out = out.head(remaining)
+                remaining -= len(out)
+
+            progress_bar.update(len(out))
+            yield out
+            if remaining is not None and remaining <= 0:
+                return
 
     return register_io_source(_overlap_source, schema=original_schema)
 
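`_overlap_source` is the callback registered through `register_io_source`; Polars hands it the pushed-down projection (`with_columns`), filter (`predicate`), and row limit (`n_rows`). A schematic of what the new GFF branch builds from those arguments (table and column names hypothetical):

    # with_columns -> requested_cols = ["chrom", "start", "end"]
    # predicate    -> (pl.col("chrom") == "chr1") & (pl.col("start") > 1000)
    # n_rows       -> 5
    #
    # With both flags enabled, the branch issues a single DataFusion query:
    #   SELECT "chrom", "start", "end" FROM gff_table_0
    #   WHERE "chrom" = 'chr1' AND "start" > 1000
    #   LIMIT 5
    # Whatever was not pushed down (flag off, or an untranslatable predicate)
    # is re-applied per batch via out.filter(...) / out.select(...).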
@@ -877,21 +1124,36 @@ def _read_file(
     input_format: InputFormat,
     read_options: ReadOptions,
     projection_pushdown: bool = False,
+    predicate_pushdown: bool = False,
 ) -> pl.LazyFrame:
     table = py_register_table(ctx, path, None, input_format, read_options)
     df = py_read_table(ctx, table.name)
 
-    lf = _lazy_scan(df, projection_pushdown, table.name, input_format, path)
+    lf = _lazy_scan(
+        df,
+        projection_pushdown,
+        predicate_pushdown,
+        table.name,
+        input_format,
+        path,
+        read_options,
+    )
 
     # Wrap GFF LazyFrames with projection-aware wrapper for consistent attribute field handling
     if input_format == InputFormat.Gff:
-        return GffLazyFrameWrapper(lf, path, read_options, projection_pushdown)
+        return GffLazyFrameWrapper(
+            lf, path, read_options, projection_pushdown, predicate_pushdown
+        )
 
     return lf
 
 
 class GffLazyFrameWrapper:
-    """Wrapper for GFF LazyFrames that handles attribute field detection in select operations."""
+    """Thin wrapper that preserves type while delegating to the underlying LazyFrame.
+
+    Pushdown is decided exclusively inside the io_source callback based on
+    with_columns and predicate; this wrapper only keeps chain type stable.
+    """
 
     def __init__(
         self,
@@ -899,45 +1161,33 @@ class GffLazyFrameWrapper:
         file_path: str,
         read_options: ReadOptions,
         projection_pushdown: bool = True,
+        predicate_pushdown: bool = True,
     ):
         self._base_lf = base_lf
         self._file_path = file_path
         self._read_options = read_options
         self._projection_pushdown = projection_pushdown
+        self._predicate_pushdown = predicate_pushdown
 
     def select(self, exprs):
-        """Override select to handle GFF attribute field detection.
-
-        Ensures queries requesting the raw `attributes` column use a registration
-        that exposes it, while preserving projection pushdown. For unnested
-        attribute fields (e.g., `gene_id`), re-registers with those fields to
-        enable efficient projection.
-        """
-        # Extract column names from expressions
-        if isinstance(exprs, (list, tuple)):
-            columns = []
-            for expr in exprs:
-                if isinstance(expr, str):
-                    columns.append(expr)
-                elif hasattr(expr, "meta") and hasattr(expr.meta, "output_name"):
-                    try:
-                        columns.append(expr.meta.output_name())
-                    except:
-                        pass
-        else:
-            # Single expression
-            if isinstance(exprs, str):
-                columns = [exprs]
-            elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
-                try:
-                    columns = [exprs.meta.output_name()]
-                except:
-                    columns = []
-            else:
-                columns = []
+        # Extract requested column names
+        columns = []
+        try:
+            if isinstance(exprs, (list, tuple)):
+                for e in exprs:
+                    if isinstance(e, str):
+                        columns.append(e)
+                    elif hasattr(e, "meta") and hasattr(e.meta, "output_name"):
+                        columns.append(e.meta.output_name())
+            else:
+                if isinstance(exprs, str):
+                    columns = [exprs]
+                elif hasattr(exprs, "meta") and hasattr(exprs.meta, "output_name"):
+                    columns = [exprs.meta.output_name()]
+        except Exception:
+            columns = []
 
-        # Categorize columns
-        GFF_STATIC_COLUMNS = {
+        STATIC = {
             "chrom",
             "start",
             "end",
@@ -948,115 +1198,110 @@ class GffLazyFrameWrapper:
             "phase",
             "attributes",
         }
-        static_cols = [col for col in columns if col in GFF_STATIC_COLUMNS]
-        attribute_cols = [col for col in columns if col not in GFF_STATIC_COLUMNS]
+        attr_cols = [c for c in columns if c not in STATIC]
+
+        # If selecting attribute fields, run one-shot SQL projection with proper attr_fields
+        if columns and (attr_cols or "attributes" in columns):
+            from polars_bio.polars_bio import GffReadOptions
+            from polars_bio.polars_bio import InputFormat as _InputFormat
+            from polars_bio.polars_bio import PyObjectStorageOptions
+            from polars_bio.polars_bio import ReadOptions as _ReadOptions
+            from polars_bio.polars_bio import (
+                py_read_sql,
+                py_read_table,
+                py_register_table,
+                py_register_view,
+            )
 
-        # If 'attributes' is requested, ensure the registered table exposes it.
-        # Some parallel GFF providers omit the raw 'attributes' column; switch
-        # to a registration that includes it while keeping projection pushdown.
-        if "attributes" in static_cols:
             from .context import ctx
 
-            # Preserve original parallelism and thread config when re-registering
-            orig_gff_opts = getattr(self._read_options, "gff_read_options", None)
-            orig_parallel = (
-                getattr(orig_gff_opts, "parallel", False) if orig_gff_opts else False
-            )
-            orig_thread = (
-                getattr(orig_gff_opts, "thread_num", None) if orig_gff_opts else None
-            )
+            # Pull thread_num/parallel from original read options
+            thread_num = 1
+            parallel = False
+            try:
+                gopt = getattr(self._read_options, "gff_read_options", None)
+                if gopt is not None:
+                    tn = getattr(gopt, "thread_num", None)
+                    if tn is not None:
+                        thread_num = tn
+                    par = getattr(gopt, "parallel", None)
+                    if par is not None:
+                        parallel = par
+            except Exception:
+                pass
 
-            # Build read options that ensure raw attributes are present
-            gff_options = GffReadOptions(
-                attr_fields=None,  # keep nested 'attributes' column
-                thread_num=orig_thread if orig_thread is not None else 1,
-                object_storage_options=PyObjectStorageOptions(
-                    allow_anonymous=True,
-                    enable_request_payer=False,
-                    chunk_size=8,
-                    concurrent_fetches=1,
-                    max_retries=5,
-                    timeout=300,
-                    compression_type="auto",
-                ),
-                parallel=orig_parallel,
-            )
-            read_options = ReadOptions(gff_read_options=gff_options)
-            table = py_register_table(
-                ctx, self._file_path, None, InputFormat.Gff, read_options
-            )
-            df = py_read_table(ctx, table.name)
-            new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
-            return new_lf.select(exprs)
-
-        if self._projection_pushdown:
-            # Optimized path: when selecting specific unnested attribute fields, re-register
-            # GFF table with those fields so DataFusion can project them efficiently.
-
-            # Use optimized table re-registration (fast path)
-            from .context import ctx
-
-            gff_options = GffReadOptions(
-                attr_fields=attribute_cols if attribute_cols else None,
-                thread_num=1,
-                object_storage_options=PyObjectStorageOptions(
-                    allow_anonymous=True,
-                    enable_request_payer=False,
-                    chunk_size=8,
-                    concurrent_fetches=1,
-                    max_retries=5,
-                    timeout=300,
-                    compression_type="auto",
-                ),
-                # Keep parallel reading consistent with base options when possible
-                parallel=getattr(
-                    getattr(self._read_options, "gff_read_options", None),
-                    "parallel",
-                    False,
-                ),
-            )
-
-            read_options = ReadOptions(gff_read_options=gff_options)
-            table = py_register_table(
-                ctx, self._file_path, None, InputFormat.Gff, read_options
-            )
-            df = py_read_table(ctx, table.name)
-
-            # Create new LazyFrame with optimized schema
-            new_lf = _lazy_scan(df, True, table.name, InputFormat.Gff, self._file_path)
-            return new_lf.select(exprs)
-
-        elif attribute_cols:
-            # Extract attribute fields from nested structure (compatibility path)
-            import polars as pl
-
-            # Build selection with attribute field extraction
-            selection_exprs = []
-
-            # Add static columns as-is
-            for col in static_cols:
-                selection_exprs.append(pl.col(col))
-
-            # Add attribute field extractions
-            for attr_col in attribute_cols:
-                attr_expr = (
-                    pl.col("attributes")
-                    .list.eval(
-                        pl.when(pl.element().struct.field("tag") == attr_col).then(
-                            pl.element().struct.field("value")
-                        )
-                    )
-                    .list.drop_nulls()
-                    .list.first()
-                    .alias(attr_col)
-                )
-                selection_exprs.append(attr_expr)
-
-            return self._base_lf.select(selection_exprs)
-        else:
-            # Static columns only, use base LazyFrame
-            return self._base_lf.select(exprs)
+            obj = PyObjectStorageOptions(
+                allow_anonymous=True,
+                enable_request_payer=False,
+                chunk_size=8,
+                concurrent_fetches=1,
+                max_retries=5,
+                timeout=300,
+                compression_type="auto",
+            )
+            if "attributes" in columns:
+                _attr = None
+            elif attr_cols:
+                _attr = attr_cols
+            else:
+                _attr = []
+
+            gff_opts = GffReadOptions(
+                attr_fields=_attr,
+                thread_num=thread_num,
+                object_storage_options=obj,
+                parallel=parallel,
+            )
+            ropts = _ReadOptions(gff_read_options=gff_opts)
+            table = py_register_table(
+                ctx, self._file_path, None, _InputFormat.Gff, ropts
+            )
+            select_clause = ", ".join([f'"{c}"' for c in columns])
+            view_name = f"{table.name}_proj"
+            py_register_view(
+                ctx, view_name, f"SELECT {select_clause} FROM {table.name}"
+            )
+            df_view = py_read_table(ctx, view_name)
+
+            new_lf = _lazy_scan(
+                df_view,
+                False,
+                self._predicate_pushdown,
+                view_name,
+                _InputFormat.Gff,
+                self._file_path,
+                self._read_options,
+            )
+            return GffLazyFrameWrapper(
+                new_lf,
+                self._file_path,
+                self._read_options,
+                False,
+                self._predicate_pushdown,
+            )
+
+        # Otherwise delegate to Polars
+        return GffLazyFrameWrapper(
+            self._base_lf.select(exprs),
+            self._file_path,
+            self._read_options,
+            self._projection_pushdown,
+            self._predicate_pushdown,
+        )
+
+    def filter(self, *predicates):
+        if not predicates:
+            return self
+        pred = predicates[0]
+        for p in predicates[1:]:
+            pred = pred & p
+        return GffLazyFrameWrapper(
+            self._base_lf.filter(pred),
+            self._file_path,
+            self._read_options,
+            self._projection_pushdown,
+            self._predicate_pushdown,
+        )
 
     def __getattr__(self, name):
-        """Delegate all other operations to base LazyFrame."""
         return getattr(self._base_lf, name)
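Since `__getattr__` forwards everything else to the wrapped LazyFrame, only `select` and `filter` need explicit overrides to keep the chain typed as `GffLazyFrameWrapper`. A short sketch of the resulting chaining behavior (file and attribute names hypothetical; `gene_id` must actually occur in the file's attributes):

    import polars as pl
    import polars_bio as pb

    lf = pb.scan_gff(
        "annotations.gff3.bgz",
        projection_pushdown=True,
        predicate_pushdown=True,
    )

    # filter() folds multiple predicates with & and re-wraps; select() with an
    # attribute column such as "gene_id" triggers the one-shot SQL projection
    # above. Terminal calls like collect() fall through __getattr__ to the
    # underlying LazyFrame.
    df = (
        lf.filter(pl.col("type") == "gene", pl.col("chrom") == "chr1")
        .select(["chrom", "start", "end", "gene_id"])
        .collect()
    )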