rdf-starbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1914 @@
1
+ """
2
+ SPARQL★ Executor for the new dictionary-encoded storage layer.
3
+
4
+ Implements efficient query execution using integer-only operations
5
+ with expansion patterns for RDF-Star metadata queries (Q6-Q12).
6
+ """
7
+
8
+ from __future__ import annotations
9
+ from typing import Optional, Union, Any
10
+ from datetime import datetime
11
+
12
+ import polars as pl
13
+
14
+ from rdf_starbase.sparql.ast import (
15
+ Query, SelectQuery, AskQuery, ConstructQuery, DescribeQuery,
16
+ TriplePattern, QuotedTriplePattern,
17
+ Variable, IRI, Literal, BlankNode,
18
+ Filter, Comparison, LogicalExpression, FunctionCall,
19
+ ComparisonOp, LogicalOp,
20
+ WhereClause, ProvenanceFilter,
21
+ Term,
22
+ # Property path types
23
+ PropertyPath, PathIRI, PathSequence, PathAlternative,
24
+ PathInverse, PathMod, PathNegatedPropertySet, PropertyPathModifier,
25
+ # Pattern types
26
+ MinusPattern, OptionalPattern, UnionPattern,
27
+ # Aggregate types
28
+ AggregateExpression,
29
+ )
30
+ from rdf_starbase.storage.terms import TermDict, TermId, TermKind
31
+ from rdf_starbase.storage.quoted_triples import QtDict
32
+ from rdf_starbase.storage.facts import FactStore
33
+
34
+
35
+ class StorageExecutor:
36
+ """
37
+ Executes SPARQL★ queries against the new dictionary-encoded storage.
38
+
39
+ Key optimizations:
40
+ - All comparisons use integer IDs (no string comparisons in the hot path)
41
+ - Quoted triple expansion via efficient joins
42
+ - Predicate partitioning for scan pruning
43
+ - Lazy evaluation for query optimization
44
+ """
45
+
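The dictionary-encoded layout this class assumes can be pictured with a tiny, self-contained sketch (made-up IDs and column names, not code from this package): every term is interned to an integer once, and the fact table holds only those integers, so pattern matching is plain integer comparison.

import polars as pl

# Hypothetical term dictionary: every IRI/literal is interned to an integer ID.
term_ids = {
    "http://example.org/alice": 1,
    "http://example.org/knows": 2,
    "http://example.org/bob": 3,
}

# Facts are stored as integer columns; a triple-pattern scan is then a cheap
# integer filter instead of a string comparison.
facts = pl.DataFrame({"s": [1], "p": [2], "o": [3]})
print(facts.filter(pl.col("p") == term_ids["http://example.org/knows"]))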
46
+ def __init__(
47
+ self,
48
+ term_dict: TermDict,
49
+ qt_dict: QtDict,
50
+ fact_store: FactStore
51
+ ):
52
+ """
53
+ Initialize executor with storage components.
54
+
55
+ Args:
56
+ term_dict: Dictionary mapping terms to integer IDs
57
+ qt_dict: Dictionary mapping quoted triples to IDs
58
+ fact_store: Integer-based fact storage
59
+ """
60
+ self.term_dict = term_dict
61
+ self.qt_dict = qt_dict
62
+ self.fact_store = fact_store
63
+ self._var_counter = 0
64
+
65
+ def execute(self, query: Query) -> Union[pl.DataFrame, bool, list[tuple[str, str, str]]]:
66
+ """
67
+ Execute a SPARQL★ query.
68
+
69
+ Args:
70
+ query: Parsed Query AST
71
+
72
+ Returns:
73
+ DataFrame for SELECT queries, bool for ASK queries,
74
+ list of triples for CONSTRUCT/DESCRIBE queries
75
+ """
76
+ if isinstance(query, SelectQuery):
77
+ return self._execute_select(query)
78
+ elif isinstance(query, AskQuery):
79
+ return self._execute_ask(query)
80
+ elif isinstance(query, ConstructQuery):
81
+ return self._execute_construct(query)
82
+ elif isinstance(query, DescribeQuery):
83
+ return self._execute_describe(query)
84
+ else:
85
+ raise NotImplementedError(f"Query type {type(query)} not yet supported")
86
+
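A minimal usage sketch for the dispatcher above; term_dict, qt_dict, fact_store and the parsed query object are assumed to already exist (their construction is outside this file), and the result type depends on the query form.

import polars as pl

# `term_dict`, `qt_dict`, `fact_store` and `query` (a parsed Query AST) are
# assumed to be built elsewhere; only the call pattern is shown here.
executor = StorageExecutor(term_dict, qt_dict, fact_store)
result = executor.execute(query)

if isinstance(result, bool):            # ASK
    print("match" if result else "no match")
elif isinstance(result, pl.DataFrame):  # SELECT
    print(result)
else:                                   # CONSTRUCT / DESCRIBE
    for s, p, o in result:
        print(s, p, o)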
87
+ def _execute_select(self, query: SelectQuery) -> pl.DataFrame:
88
+ """Execute a SELECT query."""
89
+ # Execute WHERE clause with integer IDs
90
+ df = self._execute_where(query.where, query.prefixes)
91
+
92
+ # Check if we have aggregates
93
+ has_aggregates = any(isinstance(v, AggregateExpression) for v in query.variables)
94
+
95
+ if has_aggregates or query.group_by:
96
+ # Handle GROUP BY and aggregates
97
+ df = self._apply_aggregates(df, query)
98
+
99
+ # Decode term IDs back to lexical forms for output
100
+ df = self._decode_result(df)
101
+
102
+ # Apply DISTINCT if requested
103
+ if query.distinct:
104
+ df = df.unique()
105
+
106
+ # Apply ORDER BY
107
+ if query.order_by:
108
+ order_cols = []
109
+ descending = []
110
+ for var, asc in query.order_by:
111
+ if var.name in df.columns:
112
+ order_cols.append(var.name)
113
+ descending.append(not asc)
114
+ if order_cols:
115
+ df = df.sort(order_cols, descending=descending)
116
+
117
+ # Apply LIMIT and OFFSET
118
+ if query.offset:
119
+ df = df.slice(query.offset, query.limit or len(df))
120
+ elif query.limit:
121
+ df = df.head(query.limit)
122
+
123
+ # Select only requested variables (or all if SELECT *)
124
+ if not query.is_select_all():
125
+ select_cols = []
126
+ for v in query.variables:
127
+ if isinstance(v, Variable):
128
+ if v.name in df.columns:
129
+ select_cols.append(v.name)
130
+ elif isinstance(v, AggregateExpression):
131
+ # Use the alias name
132
+ if v.alias and v.alias.name in df.columns:
133
+ select_cols.append(v.alias.name)
134
+ if select_cols:
135
+ df = df.select(select_cols)
136
+
137
+ return df
138
+
139
+ def _apply_aggregates(self, df: pl.DataFrame, query: SelectQuery) -> pl.DataFrame:
140
+ """Apply GROUP BY and aggregate functions."""
141
+ if len(df) == 0:
142
+ # Create empty result with correct columns
143
+ result_cols = {}
144
+ for v in query.variables:
145
+ if isinstance(v, Variable):
146
+ result_cols[v.name] = pl.Series([], dtype=pl.Utf8)
147
+ elif isinstance(v, AggregateExpression) and v.alias:
148
+ result_cols[v.alias.name] = pl.Series([], dtype=pl.Int64)
149
+ return pl.DataFrame(result_cols)
150
+
151
+ # First decode the columns we'll need for grouping and aggregation
152
+ df = self._decode_result(df)
153
+
154
+ # Build the aggregate expressions
155
+ agg_exprs = []
156
+ for v in query.variables:
157
+ if isinstance(v, AggregateExpression):
158
+ agg_expr = self._build_aggregate_expr(v, df)
159
+ if agg_expr is not None:
160
+ agg_exprs.append(agg_expr)
161
+
162
+ if not agg_exprs:
163
+ return df
164
+
165
+ # Apply GROUP BY if specified
166
+ if query.group_by:
167
+ group_cols = [g.name for g in query.group_by if isinstance(g, Variable) and g.name in df.columns]
168
+ if group_cols:
169
+ result = df.group_by(group_cols).agg(agg_exprs)
170
+ else:
171
+ # No valid group columns - aggregate all
172
+ result = df.select(agg_exprs)
173
+ else:
174
+ # No GROUP BY - aggregate the entire dataset
175
+ result = df.select(agg_exprs)
176
+
177
+ return result
178
+
179
+ def _build_aggregate_expr(self, agg: AggregateExpression, df: pl.DataFrame) -> Optional[pl.Expr]:
180
+ """Build a Polars aggregate expression from an AggregateExpression."""
181
+ func = agg.function.upper()
182
+ alias = agg.alias.name if agg.alias else f"_{func}"
183
+
184
+ if agg.argument is None:
185
+ # COUNT(*) - count all rows
186
+ if func == "COUNT":
187
+ return pl.len().alias(alias)
188
+ return None
189
+
190
+ if isinstance(agg.argument, Variable):
191
+ col_name = agg.argument.name
192
+ if col_name not in df.columns:
193
+ return None
194
+
195
+ col = pl.col(col_name)
196
+
197
+ if func == "COUNT":
198
+ if agg.distinct:
199
+ return col.n_unique().alias(alias)
200
+ else:
201
+ return col.count().alias(alias)
202
+ elif func == "SUM":
203
+ # Need to convert to numeric first
204
+ return col.cast(pl.Float64, strict=False).sum().alias(alias)
205
+ elif func == "AVG":
206
+ return col.cast(pl.Float64, strict=False).mean().alias(alias)
207
+ elif func == "MIN":
208
+ return col.min().alias(alias)
209
+ elif func == "MAX":
210
+ return col.max().alias(alias)
211
+ elif func == "GROUP_CONCAT":
212
+ sep = agg.separator or " "
213
+ return col.str.concat(sep).alias(alias)
214
+ elif func == "SAMPLE":
215
+ return col.first().alias(alias)
216
+
217
+ return None
218
+
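To make the mapping concrete, here is a self-contained Polars sketch (toy data, not package code) of what a query such as SELECT ?dept (COUNT(?emp) AS ?n) (AVG(?salary) AS ?avg) ... GROUP BY ?dept reduces to after decoding:

import polars as pl

rows = pl.DataFrame({
    "dept": ["eng", "eng", "sales"],
    "emp": ["a", "b", "c"],
    "salary": ["100", "200", "150"],  # lexical forms, as decoded from the term dict
})

print(rows.group_by("dept").agg(
    pl.col("emp").count().alias("n"),                                     # COUNT(?emp)
    pl.col("salary").cast(pl.Float64, strict=False).mean().alias("avg"),  # AVG(?salary)
))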
219
+ def _execute_ask(self, query: AskQuery) -> bool:
220
+ """Execute an ASK query."""
221
+ df = self._execute_where(query.where, query.prefixes)
222
+ return len(df) > 0
223
+
224
+ def _execute_construct(self, query: ConstructQuery) -> list[tuple[str, str, str]]:
225
+ """
226
+ Execute a CONSTRUCT query.
227
+
228
+ Returns a list of triples (s, p, o) as strings, with template variables
229
+ substituted from the WHERE clause results.
230
+ """
231
+ # Execute WHERE clause to get bindings
232
+ df = self._execute_where(query.where, query.prefixes)
233
+
234
+ if len(df) == 0:
235
+ return []
236
+
237
+ # Decode term IDs to strings
238
+ df = self._decode_result(df)
239
+
240
+ # Generate triples by substituting template with each binding
241
+ triples = []
242
+ for row in df.iter_rows(named=True):
243
+ for pattern in query.template:
244
+ s = self._substitute_term(pattern.subject, row, query.prefixes)
245
+ p = self._substitute_term(pattern.predicate, row, query.prefixes)
246
+ o = self._substitute_term(pattern.object, row, query.prefixes)
247
+
248
+ if s is not None and p is not None and o is not None:
249
+ triples.append((s, p, o))
250
+
251
+ # Remove duplicates while preserving order
252
+ seen = set()
253
+ unique_triples = []
254
+ for t in triples:
255
+ if t not in seen:
256
+ seen.add(t)
257
+ unique_triples.append(t)
258
+
259
+ return unique_triples
260
+
261
+ def _substitute_term(
262
+ self,
263
+ term: Term,
264
+ bindings: dict[str, Any],
265
+ prefixes: dict[str, str]
266
+ ) -> Optional[str]:
267
+ """Substitute a term using variable bindings."""
268
+ if isinstance(term, Variable):
269
+ value = bindings.get(term.name)
270
+ return value if value is not None else None
271
+ elif isinstance(term, IRI):
272
+ return self._expand_iri(term.value, prefixes)
273
+ elif isinstance(term, Literal):
274
+ if term.language:
275
+ return f'"{term.value}"@{term.language}'
276
+ elif term.datatype:
277
+ return f'"{term.value}"^^<{term.datatype}>'
278
+ else:
279
+ return f'"{term.value}"'
280
+ elif isinstance(term, BlankNode):
281
+ return f"_:{term.label}"
282
+ return None
283
+
284
+ def _execute_describe(self, query: DescribeQuery) -> list[tuple[str, str, str]]:
285
+ """
286
+ Execute a DESCRIBE query.
287
+
288
+ Returns all triples where the described resources appear as subject or object.
289
+ """
290
+ # Collect resources to describe
291
+ resources_to_describe = set()
292
+
293
+ if query.where:
294
+ # Execute WHERE to get variable bindings
295
+ df = self._execute_where(query.where, query.prefixes)
296
+ df = self._decode_result(df)
297
+
298
+ for resource in query.resources:
299
+ if isinstance(resource, Variable):
300
+ # Get all values for this variable
301
+ if resource.name in df.columns:
302
+ resources_to_describe.update(df[resource.name].to_list())
303
+ elif isinstance(resource, IRI):
304
+ resources_to_describe.add(self._expand_iri(resource.value, query.prefixes))
305
+ else:
306
+ # No WHERE clause - just describe the listed resources
307
+ for resource in query.resources:
308
+ if isinstance(resource, IRI):
309
+ resources_to_describe.add(self._expand_iri(resource.value, query.prefixes))
310
+
311
+ if not resources_to_describe:
312
+ return []
313
+
314
+ # Get all triples about these resources
315
+ triples = []
316
+ df = self.fact_store.scan_facts()
317
+
318
+ for resource in resources_to_describe:
319
+ resource_id = self.term_dict.lookup_iri(resource)
320
+ if resource_id is None:
321
+ continue
322
+
323
+ # As subject
324
+ subj_df = df.filter(pl.col("s") == resource_id)
325
+ for row in subj_df.iter_rows(named=True):
326
+ s_lex = self.term_dict.get_lex(row["s"]) or resource
327
+ p_lex = self.term_dict.get_lex(row["p"]) or f"<unknown:{row['p']}>"
328
+ o_lex = self.term_dict.get_lex(row["o"]) or f"<unknown:{row['o']}>"
329
+ triples.append((s_lex, p_lex, o_lex))
330
+
331
+ # As object
332
+ obj_df = df.filter(pl.col("o") == resource_id)
333
+ for row in obj_df.iter_rows(named=True):
334
+ s_lex = self.term_dict.get_lex(row["s"]) or f"<unknown:{row['s']}>"
335
+ p_lex = self.term_dict.get_lex(row["p"]) or f"<unknown:{row['p']}>"
336
+ o_lex = self.term_dict.get_lex(row["o"]) or resource
337
+ triples.append((s_lex, p_lex, o_lex))
338
+
339
+ # Remove duplicates
340
+ return list(set(triples))
341
+
342
+ def _execute_where(
343
+ self,
344
+ where: WhereClause,
345
+ prefixes: dict[str, str]
346
+ ) -> pl.DataFrame:
347
+ """Execute WHERE clause, returning DataFrame with integer term IDs."""
348
+ # Check if there's no work to do at all
349
+ if not where.patterns and not where.union_patterns and not where.optional_patterns:
350
+ return pl.DataFrame()
351
+
352
+ result_df: Optional[pl.DataFrame] = None
353
+
354
+ # Process basic triple patterns
355
+ for i, pattern in enumerate(where.patterns):
356
+ if isinstance(pattern, QuotedTriplePattern):
357
+ # Handle quoted triple patterns (Q6 style)
358
+ pattern_df = self._execute_quoted_pattern(pattern, prefixes, i)
359
+ else:
360
+ pattern_df = self._execute_pattern(pattern, prefixes, i)
361
+
362
+ if result_df is None:
363
+ result_df = pattern_df
364
+ else:
365
+ # Join on shared variables
366
+ shared_cols = set(result_df.columns) & set(pattern_df.columns)
367
+ shared_cols -= {"_pattern_idx"}
368
+
369
+ if shared_cols:
370
+ result_df = result_df.join(
371
+ pattern_df,
372
+ on=list(shared_cols),
373
+ how="inner"
374
+ )
375
+ else:
376
+ result_df = result_df.join(pattern_df, how="cross")
377
+
378
+ # If we only have UNION patterns (no basic patterns), process UNION first
379
+ if result_df is None and where.union_patterns:
380
+ # Process first UNION to establish result_df
381
+ first_union = where.union_patterns[0]
382
+ result_df = self._apply_union_standalone(first_union, prefixes)
383
+
384
+ # Process remaining UNION patterns
385
+ for union in where.union_patterns[1:]:
386
+ result_df = self._apply_union(result_df, union, prefixes)
387
+ elif result_df is None:
388
+ return pl.DataFrame()
389
+
390
+ # Apply OPTIONAL patterns (left outer join) - must come before FILTER
391
+ # so that FILTER can reference optional variables
392
+ for optional in where.optional_patterns:
393
+ result_df = self._apply_optional(result_df, optional, prefixes)
394
+
395
+ # Apply MINUS patterns (anti-join)
396
+ for minus in where.minus_patterns:
397
+ result_df = self._apply_minus(result_df, minus, prefixes)
398
+
399
+ # Apply FILTER clauses - after OPTIONAL so all variables are available
400
+ for filter_clause in where.filters:
401
+ if isinstance(filter_clause, Filter):
402
+ result_df = self._apply_filter(result_df, filter_clause, prefixes)
403
+ elif isinstance(filter_clause, ProvenanceFilter):
404
+ result_df = self._apply_provenance_filter(result_df, filter_clause)
405
+
406
+ # Apply UNION patterns (skipped if already applied above when there were no basic patterns)
407
+ for union in (where.union_patterns if where.patterns else []):
408
+ result_df = self._apply_union(result_df, union, prefixes)
409
+
410
+ # Remove internal columns
411
+ internal_cols = [c for c in result_df.columns if c.startswith("_")]
412
+ if internal_cols:
413
+ result_df = result_df.drop(internal_cols)
414
+
415
+ return result_df
416
+
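The heart of the pattern loop above is a natural join on shared variable columns; a toy sketch of that idea with made-up integer IDs:

import polars as pl

# Bindings for `?person :knows ?friend` and `?friend :age ?age`.
p1 = pl.DataFrame({"person": [1, 1], "friend": [3, 4]})
p2 = pl.DataFrame({"friend": [3, 5], "age": [7, 8]})

shared = set(p1.columns) & set(p2.columns)          # {"friend"}
print(p1.join(p2, on=list(shared), how="inner"))    # only ?friend = 3 survives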
417
+ def _execute_pattern(
418
+ self,
419
+ pattern: TriplePattern,
420
+ prefixes: dict[str, str],
421
+ pattern_idx: int
422
+ ) -> pl.DataFrame:
423
+ """Execute a triple pattern using integer comparisons."""
424
+ # Check if this pattern has a property path predicate
425
+ if pattern.has_property_path():
426
+ return self._execute_property_path_pattern(pattern, prefixes, pattern_idx)
427
+
428
+ # Get the facts DataFrame
429
+ df = self.fact_store.scan_facts()
430
+
431
+ # Apply filters for concrete terms using integer IDs
432
+ if not isinstance(pattern.subject, Variable):
433
+ term_id = self._resolve_term_id(pattern.subject, prefixes)
434
+ if term_id is None:
435
+ return pl.DataFrame() # Term not in store
436
+ df = df.filter(pl.col("s") == term_id)
437
+
438
+ if not isinstance(pattern.predicate, Variable):
439
+ term_id = self._resolve_term_id(pattern.predicate, prefixes)
440
+ if term_id is None:
441
+ return pl.DataFrame()
442
+ df = df.filter(pl.col("p") == term_id)
443
+
444
+ if not isinstance(pattern.object, (Variable, QuotedTriplePattern)):
445
+ term_id = self._resolve_term_id(pattern.object, prefixes)
446
+ if term_id is None:
447
+ return pl.DataFrame()
448
+ df = df.filter(pl.col("o") == term_id)
449
+
450
+ # Build result with variable bindings
451
+ renames = {}
452
+ select_cols = []
453
+
454
+ if isinstance(pattern.subject, Variable):
455
+ renames["s"] = pattern.subject.name
456
+ select_cols.append("s")
457
+
458
+ if isinstance(pattern.predicate, Variable):
459
+ renames["p"] = pattern.predicate.name
460
+ select_cols.append("p")
461
+
462
+ if isinstance(pattern.object, Variable):
463
+ renames["o"] = pattern.object.name
464
+ select_cols.append("o")
465
+
466
+ # Include metadata columns for provenance filters
467
+ for col in ["source", "confidence", "t_added", "process"]:
468
+ if col in df.columns:
469
+ renames[col] = f"_prov_{pattern_idx}_{col}"
470
+ select_cols.append(col)
471
+
472
+ if select_cols:
473
+ result = df.select(select_cols).rename(renames)
474
+ else:
475
+ result = pl.DataFrame({"_match": [True] * len(df)})
476
+
477
+ return result
478
+
479
+ # =========================================================================
480
+ # Property Path Execution
481
+ # =========================================================================
482
+
483
+ def _execute_property_path_pattern(
484
+ self,
485
+ pattern: TriplePattern,
486
+ prefixes: dict[str, str],
487
+ pattern_idx: int
488
+ ) -> pl.DataFrame:
489
+ """
490
+ Execute a triple pattern with a property path predicate.
491
+
492
+ Supports:
493
+ - PathIRI: simple predicate (treated as normal pattern)
494
+ - PathSequence: a/b/c navigation
495
+ - PathAlternative: a|b|c any of these predicates
496
+ - PathInverse: ^a reverse direction
497
+ - PathMod: a*, a+, a? repetition
498
+ - PathNegatedPropertySet: !(a|b) any predicate except these
499
+ """
500
+ path = pattern.predicate
501
+
502
+ # Resolve subject/object
503
+ subj_id = None
504
+ if not isinstance(pattern.subject, Variable):
505
+ subj_id = self._resolve_term_id(pattern.subject, prefixes)
506
+ if subj_id is None:
507
+ return pl.DataFrame()
508
+
509
+ obj_id = None
510
+ if not isinstance(pattern.object, Variable):
511
+ obj_id = self._resolve_term_id(pattern.object, prefixes)
512
+ if obj_id is None:
513
+ return pl.DataFrame()
514
+
515
+ # Execute path
516
+ result = self._execute_path(path, subj_id, obj_id, prefixes)
517
+
518
+ # Build output with variable bindings
519
+ renames = {}
520
+ if isinstance(pattern.subject, Variable):
521
+ renames["start"] = pattern.subject.name
522
+ if isinstance(pattern.object, Variable):
523
+ renames["end"] = pattern.object.name
524
+
525
+ if renames:
526
+ result = result.rename(renames)
527
+
528
+ # Select only needed columns
529
+ select_cols = list(renames.values()) if renames else ["start", "end"]
530
+ select_cols = [c for c in select_cols if c in result.columns]
531
+ if select_cols:
532
+ result = result.select(select_cols)
533
+
534
+ return result.unique()
535
+
536
+ def _execute_path(
537
+ self,
538
+ path: PropertyPath,
539
+ start_id: Optional[int],
540
+ end_id: Optional[int],
541
+ prefixes: dict[str, str]
542
+ ) -> pl.DataFrame:
543
+ """
544
+ Execute a property path, returning (start, end) pairs.
545
+
546
+ Args:
547
+ path: The property path to execute
548
+ start_id: Fixed start node (or None for variable)
549
+ end_id: Fixed end node (or None for variable)
550
+ prefixes: Namespace prefixes
551
+
552
+ Returns:
553
+ DataFrame with 'start' and 'end' columns
554
+ """
555
+ if isinstance(path, PathIRI):
556
+ return self._execute_path_iri(path, start_id, end_id, prefixes)
557
+ elif isinstance(path, PathSequence):
558
+ return self._execute_path_sequence(path, start_id, end_id, prefixes)
559
+ elif isinstance(path, PathAlternative):
560
+ return self._execute_path_alternative(path, start_id, end_id, prefixes)
561
+ elif isinstance(path, PathInverse):
562
+ return self._execute_path_inverse(path, start_id, end_id, prefixes)
563
+ elif isinstance(path, PathMod):
564
+ return self._execute_path_mod(path, start_id, end_id, prefixes)
565
+ elif isinstance(path, PathNegatedPropertySet):
566
+ return self._execute_path_negated(path, start_id, end_id, prefixes)
567
+ else:
568
+ raise NotImplementedError(f"Path type {type(path)} not implemented")
569
+
570
+ def _execute_path_iri(
571
+ self,
572
+ path: PathIRI,
573
+ start_id: Optional[int],
574
+ end_id: Optional[int],
575
+ prefixes: dict[str, str]
576
+ ) -> pl.DataFrame:
577
+ """Execute a simple IRI path (single predicate)."""
578
+ pred_id = self._resolve_term_id(path.iri, prefixes)
579
+ if pred_id is None:
580
+ return pl.DataFrame({"start": [], "end": []})
581
+
582
+ df = self.fact_store.scan_facts()
583
+ df = df.filter(pl.col("p") == pred_id)
584
+
585
+ if start_id is not None:
586
+ df = df.filter(pl.col("s") == start_id)
587
+ if end_id is not None:
588
+ df = df.filter(pl.col("o") == end_id)
589
+
590
+ return df.select([
591
+ pl.col("s").alias("start"),
592
+ pl.col("o").alias("end")
593
+ ])
594
+
595
+ def _execute_path_sequence(
596
+ self,
597
+ path: PathSequence,
598
+ start_id: Optional[int],
599
+ end_id: Optional[int],
600
+ prefixes: dict[str, str]
601
+ ) -> pl.DataFrame:
602
+ """Execute a path sequence (a/b/c)."""
603
+ if not path.paths:
604
+ return pl.DataFrame({"start": [], "end": []})
605
+
606
+ # Execute first path
607
+ result = self._execute_path(path.paths[0], start_id, None, prefixes)
608
+
609
+ # Chain through remaining paths
610
+ for i, subpath in enumerate(path.paths[1:], 1):
611
+ is_last = i == len(path.paths) - 1
612
+
613
+ # Execute next path segment
614
+ next_end = end_id if is_last else None
615
+ next_df = self._execute_path(subpath, None, next_end, prefixes)
616
+
617
+ # Join: result.end = next_df.start
618
+ result = result.join(
619
+ next_df.rename({"start": "_join_start", "end": "_next_end"}),
620
+ left_on="end",
621
+ right_on="_join_start",
622
+ how="inner"
623
+ ).select([
624
+ pl.col("start"),
625
+ pl.col("_next_end").alias("end")
626
+ ])
627
+
628
+ return result
629
+
630
+ def _execute_path_alternative(
631
+ self,
632
+ path: PathAlternative,
633
+ start_id: Optional[int],
634
+ end_id: Optional[int],
635
+ prefixes: dict[str, str]
636
+ ) -> pl.DataFrame:
637
+ """Execute a path alternative (a|b|c)."""
638
+ results = []
639
+ for subpath in path.paths:
640
+ df = self._execute_path(subpath, start_id, end_id, prefixes)
641
+ if len(df) > 0:
642
+ results.append(df)
643
+
644
+ if not results:
645
+ return pl.DataFrame({"start": [], "end": []})
646
+
647
+ return pl.concat(results).unique()
648
+
649
+ def _execute_path_inverse(
650
+ self,
651
+ path: PathInverse,
652
+ start_id: Optional[int],
653
+ end_id: Optional[int],
654
+ prefixes: dict[str, str]
655
+ ) -> pl.DataFrame:
656
+ """Execute an inverse path (^a) - swap start and end."""
657
+ # For inverse, we swap the direction
658
+ inner = self._execute_path(path.path, end_id, start_id, prefixes)
659
+
660
+ # Swap columns
661
+ return inner.select([
662
+ pl.col("end").alias("start"),
663
+ pl.col("start").alias("end")
664
+ ])
665
+
666
+ def _execute_path_mod(
667
+ self,
668
+ path: PathMod,
669
+ start_id: Optional[int],
670
+ end_id: Optional[int],
671
+ prefixes: dict[str, str],
672
+ max_depth: int = 10
673
+ ) -> pl.DataFrame:
674
+ """Execute a modified path (a*, a+, a?)."""
675
+ if path.modifier == PropertyPathModifier.ZERO_OR_ONE:
676
+ # a? = identity OR one step
677
+ one_step = self._execute_path(path.path, start_id, end_id, prefixes)
678
+
679
+ # Add identity (start = end) for nodes
680
+ if start_id is not None:
681
+ identity = pl.DataFrame({"start": [start_id], "end": [start_id]})
682
+ elif end_id is not None:
683
+ identity = pl.DataFrame({"start": [end_id], "end": [end_id]})
684
+ else:
685
+ # Get all nodes
686
+ all_nodes = self._get_all_nodes()
687
+ identity = pl.DataFrame({"start": all_nodes, "end": all_nodes})
688
+
689
+ return pl.concat([one_step, identity]).unique()
690
+
691
+ elif path.modifier == PropertyPathModifier.ZERO_OR_MORE:
692
+ # a* = identity + transitive closure
693
+ return self._execute_transitive_closure(
694
+ path.path, start_id, end_id, prefixes,
695
+ include_identity=True, max_depth=max_depth
696
+ )
697
+
698
+ elif path.modifier == PropertyPathModifier.ONE_OR_MORE:
699
+ # a+ = at least one step, then transitive closure
700
+ return self._execute_transitive_closure(
701
+ path.path, start_id, end_id, prefixes,
702
+ include_identity=False, max_depth=max_depth
703
+ )
704
+
705
+ else:
706
+ raise NotImplementedError(f"Path modifier {path.modifier} not implemented")
707
+
708
+ def _execute_transitive_closure(
709
+ self,
710
+ path: PropertyPath,
711
+ start_id: Optional[int],
712
+ end_id: Optional[int],
713
+ prefixes: dict[str, str],
714
+ include_identity: bool,
715
+ max_depth: int = 10
716
+ ) -> pl.DataFrame:
717
+ """Compute transitive closure for path+ or path*."""
718
+ # Get single-step edges
719
+ edges = self._execute_path(path, None, None, prefixes)
720
+
721
+ # Ensure edges have proper schema
722
+ if len(edges) == 0:
723
+ if include_identity:
724
+ if start_id is not None:
725
+ return pl.DataFrame({
726
+ "start": pl.Series([start_id], dtype=pl.UInt64),
727
+ "end": pl.Series([start_id], dtype=pl.UInt64)
728
+ })
729
+ elif end_id is not None:
730
+ return pl.DataFrame({
731
+ "start": pl.Series([end_id], dtype=pl.UInt64),
732
+ "end": pl.Series([end_id], dtype=pl.UInt64)
733
+ })
734
+ return pl.DataFrame({
735
+ "start": pl.Series([], dtype=pl.UInt64),
736
+ "end": pl.Series([], dtype=pl.UInt64)
737
+ })
738
+
739
+ # Initialize reachable set with proper schema matching edges
740
+ if include_identity:
741
+ all_nodes = self._get_all_nodes()
742
+ reachable = pl.DataFrame({
743
+ "start": pl.Series(all_nodes, dtype=pl.UInt64),
744
+ "end": pl.Series(all_nodes, dtype=pl.UInt64)
745
+ })
746
+ # Add single-step edges
747
+ reachable = pl.concat([reachable, edges]).unique()
748
+ else:
749
+ # For ONE_OR_MORE, start with just the edges
750
+ reachable = edges.clone()
751
+
752
+ # Iteratively expand (fixed-point computation)
753
+ for _ in range(max_depth):
754
+ prev_len = len(reachable)
755
+
756
+ # Join reachable with edges: (a, b) + (b, c) => (a, c)
757
+ new_pairs = reachable.join(
758
+ edges.rename({"start": "_mid", "end": "_new_end"}),
759
+ left_on="end",
760
+ right_on="_mid",
761
+ how="inner"
762
+ ).select([
763
+ pl.col("start"),
764
+ pl.col("_new_end").alias("end")
765
+ ])
766
+
767
+ reachable = pl.concat([reachable, new_pairs]).unique()
768
+
769
+ if len(reachable) == prev_len:
770
+ break # Fixed point reached
771
+
772
+ # Apply start/end filters
773
+ if start_id is not None:
774
+ reachable = reachable.filter(pl.col("start") == start_id)
775
+ if end_id is not None:
776
+ reachable = reachable.filter(pl.col("end") == end_id)
777
+
778
+ return reachable
779
+
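A self-contained sketch of the same fixed-point expansion for a `p+` path over a toy edge list (1 -> 2 -> 3 -> 4), not part of the packaged file:

import polars as pl

edges = pl.DataFrame({"start": [1, 2, 3], "end": [2, 3, 4]})
reachable = edges.clone()

for _ in range(10):                       # bounded, like max_depth above
    before = len(reachable)
    step = reachable.join(
        edges.rename({"start": "_mid", "end": "_new_end"}),
        left_on="end", right_on="_mid", how="inner",
    ).select(pl.col("start"), pl.col("_new_end").alias("end"))
    reachable = pl.concat([reachable, step]).unique()
    if len(reachable) == before:          # fixed point reached
        break

print(reachable.sort(["start", "end"]))   # now also contains 1->3, 1->4, 2->4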
780
+ def _execute_path_negated(
781
+ self,
782
+ path: PathNegatedPropertySet,
783
+ start_id: Optional[int],
784
+ end_id: Optional[int],
785
+ prefixes: dict[str, str]
786
+ ) -> pl.DataFrame:
787
+ """Execute a negated property set !(a|b) - any predicate except these."""
788
+ # Get excluded predicate IDs
789
+ excluded_ids = set()
790
+ for iri in path.iris:
791
+ pred_id = self._resolve_term_id(iri, prefixes)
792
+ if pred_id is not None:
793
+ excluded_ids.add(pred_id)
794
+
795
+ df = self.fact_store.scan_facts()
796
+
797
+ # Exclude the specified predicates
798
+ if excluded_ids:
799
+ df = df.filter(~pl.col("p").is_in(list(excluded_ids)))
800
+
801
+ if start_id is not None:
802
+ df = df.filter(pl.col("s") == start_id)
803
+ if end_id is not None:
804
+ df = df.filter(pl.col("o") == end_id)
805
+
806
+ return df.select([
807
+ pl.col("s").alias("start"),
808
+ pl.col("o").alias("end")
809
+ ])
810
+
811
+ def _get_all_nodes(self) -> list[int]:
812
+ """Get all unique node IDs (subjects and objects)."""
813
+ df = self.fact_store.scan_facts()
814
+ subjects = df.select("s").unique()["s"].to_list()
815
+ objects = df.select("o").unique()["o"].to_list()
816
+ return list(set(subjects) | set(objects))
817
+
818
+ # =========================================================================
819
+ # MINUS Pattern Execution
820
+ # =========================================================================
821
+
822
+ def _apply_minus(
823
+ self,
824
+ result_df: pl.DataFrame,
825
+ minus: MinusPattern,
826
+ prefixes: dict[str, str]
827
+ ) -> pl.DataFrame:
828
+ """
829
+ Apply a MINUS pattern to filter out matching solutions.
830
+
831
+ MINUS implements set difference: returns rows from result_df
832
+ that don't have compatible bindings in the minus pattern.
833
+
834
+ SPARQL semantics: A solution µ1 is removed if there exists
835
+ a solution µ2 in the MINUS clause such that:
836
+ - µ1 and µ2 are compatible (agree on shared variables)
837
+ - dom(µ1) ∩ dom(µ2) ≠ ∅ (they share at least one variable)
838
+ """
839
+ if len(result_df) == 0:
840
+ return result_df
841
+
842
+ # Execute the MINUS patterns to get solutions to exclude
843
+ minus_df: Optional[pl.DataFrame] = None
844
+
845
+ for i, pattern in enumerate(minus.patterns):
846
+ if isinstance(pattern, QuotedTriplePattern):
847
+ pattern_df = self._execute_quoted_pattern(pattern, prefixes, i)
848
+ else:
849
+ pattern_df = self._execute_pattern(pattern, prefixes, i)
850
+
851
+ if minus_df is None:
852
+ minus_df = pattern_df
853
+ else:
854
+ # Join on shared variables
855
+ shared_cols = set(minus_df.columns) & set(pattern_df.columns)
856
+ shared_cols -= {"_pattern_idx"}
857
+
858
+ if shared_cols:
859
+ minus_df = minus_df.join(
860
+ pattern_df,
861
+ on=list(shared_cols),
862
+ how="inner"
863
+ )
864
+ else:
865
+ minus_df = minus_df.join(pattern_df, how="cross")
866
+
867
+ if minus_df is None or len(minus_df) == 0:
868
+ return result_df
869
+
870
+ # Apply filters from the MINUS clause
871
+ for filter_clause in minus.filters:
872
+ if isinstance(filter_clause, Filter):
873
+ minus_df = self._apply_filter(minus_df, filter_clause, prefixes)
874
+
875
+ # Find shared variables between result and minus
876
+ shared_vars = set(result_df.columns) & set(minus_df.columns)
877
+ shared_vars = {c for c in shared_vars if not c.startswith("_")}
878
+
879
+ if not shared_vars:
880
+ # No shared variables - MINUS has no effect (SPARQL semantics)
881
+ return result_df
882
+
883
+ # Perform anti-join: keep rows from result_df that don't match minus_df
884
+ # We do this with a left join and then filter for nulls
885
+ shared_list = list(shared_vars)
886
+
887
+ # Add a marker column to minus_df to detect matches
888
+ minus_df = minus_df.select(shared_list).unique()
889
+ minus_df = minus_df.with_columns(pl.lit(True).alias("_minus_match"))
890
+
891
+ # Left join
892
+ result_df = result_df.join(
893
+ minus_df,
894
+ on=shared_list,
895
+ how="left"
896
+ )
897
+
898
+ # Keep only rows where there was no match
899
+ result_df = result_df.filter(pl.col("_minus_match").is_null())
900
+ result_df = result_df.drop("_minus_match")
901
+
902
+ return result_df
903
+
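The anti-join trick above in isolation (toy data): a marker column from the MINUS side goes null exactly where no compatible binding exists.

import polars as pl

left = pl.DataFrame({"s": [1, 2, 3]})
minus = pl.DataFrame({"s": [2]}).with_columns(pl.lit(True).alias("_minus_match"))

kept = (
    left.join(minus, on="s", how="left")
        .filter(pl.col("_minus_match").is_null())   # keep only unmatched rows
        .drop("_minus_match")
)
print(kept)   # s = 1 and 3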
904
+ def _apply_optional(
905
+ self,
906
+ result_df: pl.DataFrame,
907
+ optional: OptionalPattern,
908
+ prefixes: dict[str, str]
909
+ ) -> pl.DataFrame:
910
+ """
911
+ Apply an OPTIONAL pattern using left outer join.
912
+
913
+ OPTIONAL { ... } patterns add bindings when matched but keep
914
+ rows even when no match exists (with NULL for optional columns).
915
+ """
916
+ if len(result_df) == 0:
917
+ return result_df
918
+
919
+ # Execute the optional patterns
920
+ optional_df: Optional[pl.DataFrame] = None
921
+
922
+ for i, pattern in enumerate(optional.patterns):
923
+ if isinstance(pattern, QuotedTriplePattern):
924
+ pattern_df = self._execute_quoted_pattern(pattern, prefixes, 1000 + i)
925
+ elif isinstance(pattern, TriplePattern):
926
+ pattern_df = self._execute_pattern(pattern, prefixes, 1000 + i)
927
+ else:
928
+ continue # Skip nested patterns for now
929
+
930
+ if optional_df is None:
931
+ optional_df = pattern_df
932
+ else:
933
+ shared_cols = set(optional_df.columns) & set(pattern_df.columns)
934
+ shared_cols -= {"_pattern_idx"}
935
+ shared_cols = {c for c in shared_cols if not c.startswith("_")}
936
+
937
+ if shared_cols:
938
+ optional_df = optional_df.join(pattern_df, on=list(shared_cols), how="inner")
939
+ else:
940
+ optional_df = optional_df.join(pattern_df, how="cross")
941
+
942
+ if optional_df is None or len(optional_df) == 0:
943
+ return result_df
944
+
945
+ # Apply filters within the optional block
946
+ for filter_clause in optional.filters:
947
+ if isinstance(filter_clause, Filter):
948
+ optional_df = self._apply_filter(optional_df, filter_clause, prefixes)
949
+
950
+ # Remove internal columns from optional_df
951
+ internal_cols = [c for c in optional_df.columns if c.startswith("_")]
952
+ if internal_cols:
953
+ optional_df = optional_df.drop(internal_cols)
954
+
955
+ # Find shared columns for the join
956
+ shared_cols = set(result_df.columns) & set(optional_df.columns)
957
+ shared_cols = {c for c in shared_cols if not c.startswith("_")}
958
+
959
+ if shared_cols:
960
+ # Left outer join - keep all rows from result_df, add optional columns where matched
961
+ return result_df.join(optional_df, on=list(shared_cols), how="left")
962
+ else:
963
+ # No shared columns - return original
964
+ return result_df
965
+
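The left-join behaviour in one line, with toy data: every required row survives, and rows without an optional match simply carry nulls.

import polars as pl

required = pl.DataFrame({"person": [1, 2]})
optional = pl.DataFrame({"person": [1], "email": [42]})

print(required.join(optional, on="person", how="left"))   # person 2 gets a null email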
966
+ def _apply_union_standalone(
967
+ self,
968
+ union: UnionPattern,
969
+ prefixes: dict[str, str]
970
+ ) -> pl.DataFrame:
971
+ """
972
+ Apply a UNION pattern as the primary query (no prior results).
973
+
974
+ This is used when WHERE clause starts with a UNION.
975
+ """
976
+ union_results = []
977
+
978
+ for i, alternative in enumerate(union.alternatives):
979
+ # Execute each alternative as a mini WHERE clause
980
+ alt_where = WhereClause(patterns=alternative)
981
+ alt_df = self._execute_where(alt_where, prefixes)
982
+
983
+ if len(alt_df) > 0:
984
+ union_results.append(alt_df)
985
+
986
+ if not union_results:
987
+ return pl.DataFrame()
988
+
989
+ # Combine all union results
990
+ if len(union_results) == 1:
991
+ return union_results[0]
992
+
993
+ # Align schemas - add missing columns with null values
994
+ all_columns = set()
995
+ for r in union_results:
996
+ all_columns.update(r.columns)
997
+
998
+ aligned_results = []
999
+ for r in union_results:
1000
+ missing_cols = all_columns - set(r.columns)
1001
+ if missing_cols:
1002
+ # Add null columns for missing variables
1003
+ for col in missing_cols:
1004
+ r = r.with_columns(pl.lit(None).alias(col))
1005
+ aligned_results.append(r.select(sorted(all_columns)))
1006
+
1007
+ return pl.concat(aligned_results)
1008
+
1009
+ def _apply_union(
1010
+ self,
1011
+ result_df: pl.DataFrame,
1012
+ union: UnionPattern,
1013
+ prefixes: dict[str, str]
1014
+ ) -> pl.DataFrame:
1015
+ """
1016
+ Apply a UNION pattern by combining results from alternatives.
1017
+
1018
+ UNION combines results from multiple pattern groups:
1019
+ { ?s ?p ?o } UNION { ?s ?q ?r }
1020
+
1021
+ Returns all rows matching ANY of the alternatives.
1022
+ """
1023
+ union_results = []
1024
+
1025
+ for i, alternative in enumerate(union.alternatives):
1026
+ # Execute each alternative as a mini WHERE clause
1027
+ alt_where = WhereClause(patterns=alternative)
1028
+ alt_df = self._execute_where(alt_where, prefixes)
1029
+
1030
+ if len(alt_df) > 0:
1031
+ union_results.append(alt_df)
1032
+
1033
+ if not union_results:
1034
+ return result_df
1035
+
1036
+ # Combine all union results
1037
+ if len(union_results) == 1:
1038
+ union_df = union_results[0]
1039
+ else:
1040
+ # Align schemas - add missing columns with null values
1041
+ all_columns = set()
1042
+ for r in union_results:
1043
+ all_columns.update(r.columns)
1044
+
1045
+ aligned_results = []
1046
+ for r in union_results:
1047
+ missing_cols = all_columns - set(r.columns)
1048
+ if missing_cols:
1049
+ # Add null columns for missing variables
1050
+ for col in missing_cols:
1051
+ r = r.with_columns(pl.lit(None).alias(col))
1052
+ aligned_results.append(r.select(sorted(all_columns)))
1053
+
1054
+ union_df = pl.concat(aligned_results)
1055
+
1056
+ # If we have existing results, combine them with union
1057
+ if len(result_df) > 0 and len(result_df.columns) > 0:
1058
+ # Find shared columns
1059
+ shared_cols = set(result_df.columns) & set(union_df.columns)
1060
+ shared_cols = {c for c in shared_cols if not c.startswith("_")}
1061
+
1062
+ if shared_cols:
1063
+ # Join union results with existing results
1064
+ return result_df.join(union_df, on=list(shared_cols), how="inner")
1065
+ else:
1066
+ # No shared columns - cross join
1067
+ return result_df.join(union_df, how="cross")
1068
+ else:
1069
+ return union_df
1070
+
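The schema-alignment step above adds null columns by hand before concatenating; with toy frames, Polars' diagonal concat shows the same end result in one call (only an illustration of the effect, not the code path used here):

import polars as pl

a = pl.DataFrame({"s": [1], "name": [10]})
b = pl.DataFrame({"s": [2], "label": [20]})

# Union of the schemas, missing variables filled with nulls per alternative.
print(pl.concat([a, b], how="diagonal"))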
1071
+ def _execute_quoted_pattern(
1072
+ self,
1073
+ pattern: QuotedTriplePattern,
1074
+ prefixes: dict[str, str],
1075
+ pattern_idx: int
1076
+ ) -> pl.DataFrame:
1077
+ """
1078
+ Execute a quoted triple pattern (Q6: << s p o >> ?mp ?mo).
1079
+
1080
+ This is the key RDF★ expansion pattern that finds metadata
1081
+ about quoted triples.
1082
+ """
1083
+ # Get the quoted triple components
1084
+ s_term = pattern.subject
1085
+ p_term = pattern.predicate
1086
+ o_term = pattern.object
1087
+
1088
+ # Check if the quoted triple itself is concrete or has variables
1089
+ qt_s_id = None if isinstance(s_term, Variable) else self._resolve_term_id(s_term, prefixes)
1090
+ qt_p_id = None if isinstance(p_term, Variable) else self._resolve_term_id(p_term, prefixes)
1091
+ qt_o_id = None if isinstance(o_term, Variable) else self._resolve_term_id(o_term, prefixes)
1092
+
1093
+ # If all components are concrete, look up the qt_id
1094
+ if qt_s_id is not None and qt_p_id is not None and qt_o_id is not None:
1095
+ qt_id = self.qt_dict.lookup_id(qt_s_id, qt_p_id, qt_o_id)
1096
+ if qt_id is None:
1097
+ return pl.DataFrame() # Quoted triple not found
1098
+
1099
+ # Find facts where this qt_id appears as subject
1100
+ df = self.fact_store.scan_facts().filter(pl.col("s") == qt_id)
1101
+ else:
1102
+ # Need to join with qt_dict to expand
1103
+ df = self._expand_qt_metadata(qt_s_id, qt_p_id, qt_o_id)
1104
+
1105
+ # Rename predicate/object vars if they exist in outer pattern
1106
+ # (for << s p o >> ?mp ?mo patterns)
1107
+ renames = {}
1108
+ select_cols = []
1109
+
1110
+ # The quoted triple's metadata predicate/object
1111
+ if "p" in df.columns:
1112
+ renames["p"] = "mp" if pattern_idx == 0 else f"mp_{pattern_idx}"
1113
+ select_cols.append("p")
1114
+ if "o" in df.columns:
1115
+ renames["o"] = "mo" if pattern_idx == 0 else f"mo_{pattern_idx}"
1116
+ select_cols.append("o")
1117
+
1118
+ # Include base triple components if variables
1119
+ if isinstance(s_term, Variable) and "base_s" in df.columns:
1120
+ renames["base_s"] = s_term.name
1121
+ select_cols.append("base_s")
1122
+ if isinstance(p_term, Variable) and "base_p" in df.columns:
1123
+ renames["base_p"] = p_term.name
1124
+ select_cols.append("base_p")
1125
+ if isinstance(o_term, Variable) and "base_o" in df.columns:
1126
+ renames["base_o"] = o_term.name
1127
+ select_cols.append("base_o")
1128
+
1129
+ if select_cols:
1130
+ result = df.select(select_cols).rename(renames)
1131
+ else:
1132
+ result = df
1133
+
1134
+ return result
1135
+
1136
+ def _expand_qt_metadata(
1137
+ self,
1138
+ qt_s_id: Optional[TermId],
1139
+ qt_p_id: Optional[TermId],
1140
+ qt_o_id: Optional[TermId]
1141
+ ) -> pl.DataFrame:
1142
+ """
1143
+ Expand quoted triple metadata with optional filters on components.
1144
+
1145
+ This implements the RDF★ expansion join: find metadata about
1146
+ quoted triples, optionally filtering by their s/p/o components.
1147
+ """
1148
+ # Get facts about quoted triples (metadata facts)
1149
+ df = self.fact_store.scan_metadata_facts()
1150
+
1151
+ if len(df) == 0:
1152
+ return df
1153
+
1154
+ # Expand qt_id to base (s, p, o)
1155
+ df = self.fact_store.expand_qt_metadata(df, self.qt_dict)
1156
+
1157
+ # Apply filters on quoted triple components
1158
+ if qt_s_id is not None:
1159
+ df = df.filter(pl.col("base_s") == qt_s_id)
1160
+ if qt_p_id is not None:
1161
+ df = df.filter(pl.col("base_p") == qt_p_id)
1162
+ if qt_o_id is not None:
1163
+ df = df.filter(pl.col("base_o") == qt_o_id)
1164
+
1165
+ return df
1166
+
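Conceptually, the expansion is a join between metadata facts (whose subject is a quoted-triple ID) and a table mapping each qt_id to the triple it quotes. A toy sketch with hypothetical column names; the real lookup lives in FactStore.expand_qt_metadata:

import polars as pl

# Metadata facts: subject 100 is a quoted-triple ID, annotated with (p=7, o=8).
meta = pl.DataFrame({"s": [100], "p": [7], "o": [8]})

# Hypothetical qt-dictionary table: qt_id -> the base triple it quotes.
qt_table = pl.DataFrame({"qt_id": [100], "base_s": [1], "base_p": [2], "base_o": [3]})

# Expansion join: attach the quoted triple's own components to each metadata fact.
print(meta.join(qt_table, left_on="s", right_on="qt_id", how="inner"))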
1167
+ def _resolve_term_id(
1168
+ self,
1169
+ term: Term,
1170
+ prefixes: dict[str, str]
1171
+ ) -> Optional[TermId]:
1172
+ """Resolve a term to its integer ID."""
1173
+ if isinstance(term, IRI):
1174
+ iri = self._expand_iri(term.value, prefixes)
1175
+ return self.term_dict.lookup_iri(iri)
1176
+ elif isinstance(term, Literal):
1177
+ return self.term_dict.lookup_literal(
1178
+ str(term.value),
1179
+ term.datatype,
1180
+ term.language
1181
+ )
1182
+ elif isinstance(term, BlankNode):
1183
+ return self.term_dict.lookup_bnode(term.label)
1184
+ return None
1185
+
1186
+ def _expand_iri(self, iri: str, prefixes: dict[str, str]) -> str:
1187
+ """Expand prefixed IRI to full form."""
1188
+ if ":" in iri and not iri.startswith("http"):
1189
+ prefix, local = iri.split(":", 1)
1190
+ if prefix in prefixes:
1191
+ return prefixes[prefix] + local
1192
+ return iri
1193
+
1194
+ def _decode_result(self, df: pl.DataFrame) -> pl.DataFrame:
1195
+ """Decode integer term IDs back to lexical forms for output."""
1196
+ if len(df) == 0:
1197
+ return df
1198
+
1199
+ # Find columns that contain term IDs (variables and metadata predicates)
1200
+ id_columns = [c for c in df.columns if not c.startswith("_prov")]
1201
+
1202
+ for col in id_columns:
1203
+ if df.schema[col] == pl.UInt64:
1204
+ # Decode this column
1205
+ decoded = []
1206
+ for term_id in df[col].to_list():
1207
+ if term_id is None:
1208
+ decoded.append(None) # Keep NULL for OPTIONAL non-matches
1209
+ else:
1210
+ lex = self.term_dict.get_lex(term_id)
1211
+ decoded.append(lex if lex else f"<unknown:{term_id}>")
1212
+ df = df.with_columns(pl.Series(col, decoded))
1213
+
1214
+ return df
1215
+
1216
+ def _apply_filter(
1217
+ self,
1218
+ df: pl.DataFrame,
1219
+ filter_clause: Filter,
1220
+ prefixes: dict[str, str]
1221
+ ) -> pl.DataFrame:
1222
+ """
1223
+ Apply a FILTER clause.
1224
+
1225
+ For filters involving literal comparisons (especially numeric),
1226
+ we need to decode the term IDs to their actual values first,
1227
+ apply the filter, then re-encode if needed.
1228
+ """
1229
+ # Get variables used in the filter expression
1230
+ filter_vars = self._get_filter_variables(filter_clause.expression)
1231
+
1232
+ # Check if all required variables exist in the dataframe
1233
+ missing_vars = [v for v in filter_vars if v.name not in df.columns]
1234
+ if missing_vars:
1235
+ # If any filter variable is missing, no rows can match
1236
+ return df.head(0)
1237
+
1238
+ # Decode the filter-relevant columns to actual values
1239
+ decoded_df = df.clone()
1240
+ for var in filter_vars:
1241
+ if var.name in decoded_df.columns:
1242
+ col = decoded_df[var.name]
1243
+ if col.dtype == pl.UInt64:
1244
+ # Decode this column to its lexical values
1245
+ decoded_values = []
1246
+ for term_id in col.to_list():
1247
+ if term_id is None:
1248
+ decoded_values.append(None)
1249
+ else:
1250
+ lex = self.term_dict.get_lex(term_id)
1251
+ # Try to convert to numeric if possible
1252
+ if lex is not None:
1253
+ try:
1254
+ # Try int first, then float
1255
+ decoded_values.append(int(lex))
1256
+ except ValueError:
1257
+ try:
1258
+ decoded_values.append(float(lex))
1259
+ except ValueError:
1260
+ decoded_values.append(lex)
1261
+ else:
1262
+ decoded_values.append(None)
1263
+ decoded_df = decoded_df.with_columns(
1264
+ pl.Series(f"_decoded_{var.name}", decoded_values)
1265
+ )
1266
+ else:
1267
+ # Column already decoded (e.g., string type) - just alias it
1268
+ decoded_df = decoded_df.with_columns(
1269
+ pl.col(var.name).alias(f"_decoded_{var.name}")
1270
+ )
1271
+
1272
+ # Build filter expression using decoded columns
1273
+ expr = self._build_filter_expression_decoded(filter_clause.expression, prefixes)
1274
+ if expr is not None:
1275
+ # Filter using decoded values, keep original columns
1276
+ filtered = decoded_df.filter(expr)
1277
+ # Drop the decoded columns
1278
+ decoded_cols = [c for c in filtered.columns if c.startswith("_decoded_")]
1279
+ if decoded_cols:
1280
+ filtered = filtered.drop(decoded_cols)
1281
+ return filtered
1282
+
1283
+ return df
1284
+
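Why the decoding matters, in a toy example: comparing lexical forms directly gives lexicographic answers, while the decoded columns compare numerically.

import polars as pl

ages = pl.DataFrame({"age": ["9", "21", "100"]})       # lexical forms

print(ages.filter(pl.col("age") > "21"))                # string compare: keeps "9", drops "100"

decoded = ages.with_columns(pl.col("age").cast(pl.Int64).alias("_decoded_age"))
print(decoded.filter(pl.col("_decoded_age") > 21))      # numeric compare: keeps 100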
1285
+ def _get_filter_variables(self, expr) -> set:
1286
+ """Get all variables referenced in a filter expression."""
1287
+ from rdf_starbase.sparql.ast import Variable, Comparison, LogicalExpression
1288
+
1289
+ variables = set()
1290
+ if isinstance(expr, Variable):
1291
+ variables.add(expr)
1292
+ elif isinstance(expr, Comparison):
1293
+ if isinstance(expr.left, Variable):
1294
+ variables.add(expr.left)
1295
+ if isinstance(expr.right, Variable):
1296
+ variables.add(expr.right)
1297
+ elif isinstance(expr, LogicalExpression):
1298
+ for operand in expr.operands:
1299
+ variables.update(self._get_filter_variables(operand))
1300
+ return variables
1301
+
1302
+ def _build_filter_expression_decoded(
1303
+ self,
1304
+ expr: Union[Comparison, LogicalExpression, FunctionCall],
1305
+ prefixes: dict[str, str]
1306
+ ) -> Optional[pl.Expr]:
1307
+ """Build Polars filter expression using decoded column names."""
1308
+ if isinstance(expr, Comparison):
1309
+ left = self._term_to_expr_decoded(expr.left, prefixes)
1310
+ right = self._term_to_expr_decoded(expr.right, prefixes)
1311
+
1312
+ if left is None or right is None:
1313
+ return None
1314
+
1315
+ op_map = {
1316
+ ComparisonOp.EQ: lambda l, r: l == r,
1317
+ ComparisonOp.NE: lambda l, r: l != r,
1318
+ ComparisonOp.LT: lambda l, r: l < r,
1319
+ ComparisonOp.LE: lambda l, r: l <= r,
1320
+ ComparisonOp.GT: lambda l, r: l > r,
1321
+ ComparisonOp.GE: lambda l, r: l >= r,
1322
+ }
1323
+
1324
+ return op_map[expr.operator](left, right)
1325
+
1326
+ elif isinstance(expr, LogicalExpression):
1327
+ operand_exprs = [
1328
+ self._build_filter_expression_decoded(op, prefixes)
1329
+ for op in expr.operands
1330
+ ]
1331
+ valid_exprs = [e for e in operand_exprs if e is not None]
1332
+
1333
+ if not valid_exprs:
1334
+ return None
1335
+
1336
+ if expr.operator == LogicalOp.AND:
1337
+ result = valid_exprs[0]
1338
+ for e in valid_exprs[1:]:
1339
+ result = result & e
1340
+ return result
1341
+ elif expr.operator == LogicalOp.OR:
1342
+ result = valid_exprs[0]
1343
+ for e in valid_exprs[1:]:
1344
+ result = result | e
1345
+ return result
1346
+ elif expr.operator == LogicalOp.NOT:
1347
+ return ~valid_exprs[0]
1348
+
1349
+ return None
1350
+
1351
+ def _term_to_expr_decoded(
1352
+ self,
1353
+ term: Term,
1354
+ prefixes: dict[str, str]
1355
+ ) -> Optional[pl.Expr]:
1356
+ """Convert a term to a Polars expression using decoded column names."""
1357
+ if isinstance(term, Variable):
1358
+ # Use the decoded column if it exists
1359
+ return pl.col(f"_decoded_{term.name}")
1360
+ elif isinstance(term, Literal):
1361
+ # Convert literal to appropriate type
1362
+ try:
1363
+ return pl.lit(int(term.value))
1364
+ except ValueError:
1365
+ try:
1366
+ return pl.lit(float(term.value))
1367
+ except ValueError:
1368
+ return pl.lit(term.value)
1369
+ elif isinstance(term, IRI):
1370
+ # For IRI comparisons, use the full IRI string
1371
+ iri_str = self._expand_iri(term.value, prefixes)
1372
+ return pl.lit(iri_str)
1373
+ return None
1374
+
1375
+ def _apply_provenance_filter(
1376
+ self,
1377
+ df: pl.DataFrame,
1378
+ filter_clause: ProvenanceFilter
1379
+ ) -> pl.DataFrame:
1380
+ """Apply provenance FILTER (FILTER_CONFIDENCE, FILTER_SOURCE, etc.)."""
1381
+ field = filter_clause.provenance_field
1382
+ prov_cols = [c for c in df.columns if c.endswith(f"_{field}")]
1383
+
1384
+ if not prov_cols:
1385
+ return df
1386
+
1387
+ expr = filter_clause.expression
1388
+ if isinstance(expr, Comparison):
1389
+ combined_expr = None
1390
+ for col in prov_cols:
1391
+ col_expr = self._build_provenance_comparison(expr, col)
1392
+ if col_expr is not None:
1393
+ if combined_expr is None:
1394
+ combined_expr = col_expr
1395
+ else:
1396
+ combined_expr = combined_expr | col_expr
1397
+
1398
+ if combined_expr is not None:
1399
+ return df.filter(combined_expr)
1400
+
1401
+ return df
1402
+
1403
+ def _build_provenance_comparison(
1404
+ self,
1405
+ expr: Comparison,
1406
+ prov_col: str
1407
+ ) -> Optional[pl.Expr]:
1408
+ """Build comparison expression for provenance filtering."""
1409
+ if isinstance(expr.left, Variable):
1410
+ left = pl.col(prov_col)
1411
+ right = self._literal_to_polars(expr.right)
1412
+ elif isinstance(expr.right, Variable):
1413
+ left = self._literal_to_polars(expr.left)
1414
+ right = pl.col(prov_col)
1415
+ else:
1416
+ left = self._literal_to_polars(expr.left)
1417
+ right = self._literal_to_polars(expr.right)
1418
+
1419
+ if left is None or right is None:
1420
+ return None
1421
+
1422
+ op_map = {
1423
+ ComparisonOp.EQ: lambda l, r: l == r,
1424
+ ComparisonOp.NE: lambda l, r: l != r,
1425
+ ComparisonOp.LT: lambda l, r: l < r,
1426
+ ComparisonOp.LE: lambda l, r: l <= r,
1427
+ ComparisonOp.GT: lambda l, r: l > r,
1428
+ ComparisonOp.GE: lambda l, r: l >= r,
1429
+ }
1430
+
1431
+ return op_map[expr.operator](left, right)
1432
+
1433
+ def _build_filter_expression(
1434
+ self,
1435
+ expr: Union[Comparison, LogicalExpression, FunctionCall],
1436
+ prefixes: dict[str, str]
1437
+ ) -> Optional[pl.Expr]:
1438
+ """Build Polars filter expression from SPARQL filter AST."""
1439
+ if isinstance(expr, Comparison):
1440
+ left = self._term_to_expr(expr.left, prefixes)
1441
+ right = self._term_to_expr(expr.right, prefixes)
1442
+
1443
+ if left is None or right is None:
1444
+ return None
1445
+
1446
+ op_map = {
1447
+ ComparisonOp.EQ: lambda l, r: l == r,
1448
+ ComparisonOp.NE: lambda l, r: l != r,
1449
+ ComparisonOp.LT: lambda l, r: l < r,
1450
+ ComparisonOp.LE: lambda l, r: l <= r,
1451
+ ComparisonOp.GT: lambda l, r: l > r,
1452
+ ComparisonOp.GE: lambda l, r: l >= r,
1453
+ }
1454
+
1455
+ return op_map[expr.operator](left, right)
1456
+
1457
+ elif isinstance(expr, LogicalExpression):
1458
+ operand_exprs = [
1459
+ self._build_filter_expression(op, prefixes)
1460
+ for op in expr.operands
1461
+ ]
1462
+ valid_exprs = [e for e in operand_exprs if e is not None]
1463
+
1464
+ if not valid_exprs:
1465
+ return None
1466
+
1467
+ if expr.operator == LogicalOp.AND:
1468
+ result = valid_exprs[0]
1469
+ for e in valid_exprs[1:]:
1470
+ result = result & e
1471
+ return result
1472
+ elif expr.operator == LogicalOp.OR:
1473
+ result = valid_exprs[0]
1474
+ for e in valid_exprs[1:]:
1475
+ result = result | e
1476
+ return result
1477
+ elif expr.operator == LogicalOp.NOT:
1478
+ return ~valid_exprs[0]
1479
+
1480
+ return None
1481
+
1482
+ def _term_to_expr(
1483
+ self,
1484
+ term: Term,
1485
+ prefixes: dict[str, str]
1486
+ ) -> Optional[pl.Expr]:
1487
+ """Convert a term to a Polars expression."""
1488
+ if isinstance(term, Variable):
1489
+ return pl.col(term.name)
1490
+ elif isinstance(term, Literal):
1491
+ return pl.lit(term.value)
1492
+ elif isinstance(term, IRI):
1493
+ # For IRI comparisons, use the term ID
1494
+ term_id = self._resolve_term_id(term, prefixes)
1495
+ return pl.lit(term_id) if term_id is not None else None
1496
+ return None
1497
+
1498
+ def _literal_to_polars(self, term: Term) -> Optional[Any]:
1499
+ """Convert a literal term to a Polars literal."""
1500
+ if isinstance(term, Literal):
1501
+ return pl.lit(term.value)
1502
+ elif isinstance(term, Variable):
1503
+ return pl.col(term.name)
1504
+ return None
1505
+
1506
+
1507
+ # === RDF★ Expansion Query Patterns ===
1508
+ # These implement Q6-Q12 from the SPARQL-Star test suite
1509
+
1510
+ class ExpansionPatterns:
1511
+ """
1512
+ Factory for common RDF★ expansion query patterns.
1513
+
1514
+ These patterns efficiently query metadata about quoted triples
1515
+ and expand them back to (s, p, o) components.
1516
+ """
1517
+
1518
+ def __init__(
1519
+ self,
1520
+ term_dict: TermDict,
1521
+ qt_dict: QtDict,
1522
+ fact_store: FactStore
1523
+ ):
1524
+ self.term_dict = term_dict
1525
+ self.qt_dict = qt_dict
1526
+ self.fact_store = fact_store
1527
+
1528
+ def q6_metadata_for_triple(
1529
+ self,
1530
+ subject: str,
1531
+ predicate: str,
1532
+ obj: str
1533
+ ) -> pl.DataFrame:
1534
+ """
1535
+ Q6: Fetch all metadata about a specific quoted triple.
1536
+
1537
+ SELECT ?mp ?mo WHERE {
1538
+ << subject predicate object >> ?mp ?mo .
1539
+ }
1540
+ """
1541
+ # Look up term IDs
1542
+ s_id = self.term_dict.lookup_iri(subject)
1543
+ p_id = self.term_dict.lookup_iri(predicate)
1544
+ o_id = self.term_dict.lookup_iri(obj)
1545
+
1546
+ if s_id is None or p_id is None or o_id is None:
1547
+ return pl.DataFrame({"mp": [], "mo": []})
1548
+
1549
+ # Look up the quoted triple
1550
+ qt_id = self.qt_dict.get_id(s_id, p_id, o_id)
1551
+ if qt_id is None:
1552
+ return pl.DataFrame({"mp": [], "mo": []})
1553
+
1554
+ # Find facts where qt_id is the subject
1555
+ df = self.fact_store.scan_facts().filter(pl.col("s") == qt_id)
1556
+
1557
+ # Decode predicates and objects
1558
+ result = []
1559
+ for row in df.iter_rows(named=True):
1560
+ mp = self.term_dict.get_lex(row["p"])
1561
+ mo = self.term_dict.get_lex(row["o"])
1562
+ result.append({"mp": mp, "mo": mo})
1563
+
1564
+ return pl.DataFrame(result) if result else pl.DataFrame({"mp": [], "mo": []})
1565
+
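A usage sketch for the helper above; the IRIs are placeholders, and term_dict, qt_dict, fact_store are assumed to exist already.

patterns = ExpansionPatterns(term_dict, qt_dict, fact_store)
meta = patterns.q6_metadata_for_triple(
    "http://example.org/alice",      # placeholder subject IRI
    "http://example.org/worksFor",   # placeholder predicate IRI
    "http://example.org/acme",       # placeholder object IRI
)
print(meta)   # one row per (?mp, ?mo) annotation on that quoted triple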
1566
+ def q7_expand_by_source(self, source_uri: str) -> pl.DataFrame:
1567
+ """
1568
+ Q7: Given a source, find all quoted triples derived from it
1569
+ and expand them to base (s, p, o).
1570
+
1571
+ SELECT ?s ?p ?o WHERE {
1572
+ ?qt prov:wasDerivedFrom source_uri .
1573
+ # Expand the quoted triple
1574
+ }
1575
+ """
1576
+ # Look up the source term
1577
+ source_id = self.term_dict.lookup_iri(source_uri)
1578
+ if source_id is None:
1579
+ return pl.DataFrame({"s": [], "p": [], "o": []})
1580
+
1581
+ # Look up prov:wasDerivedFrom predicate
1582
+ prov_pred = self.term_dict.lookup_iri(
1583
+ "http://www.w3.org/ns/prov#wasDerivedFrom"
1584
+ )
1585
+ if prov_pred is None:
1586
+ return pl.DataFrame({"s": [], "p": [], "o": []})
1587
+
1588
+ # Find metadata facts with this predicate and source
1589
+ df = self.fact_store.scan_metadata_facts()
1590
+ df = df.filter(
1591
+ (pl.col("p") == prov_pred) &
1592
+ (pl.col("o") == source_id)
1593
+ )
1594
+
1595
+ # Expand qt_ids to (s, p, o)
1596
+ df = self.fact_store.expand_metadata_df(df)
1597
+
1598
+ # Decode to lexical forms
1599
+ result = []
1600
+ for row in df.iter_rows(named=True):
1601
+ s = self.term_dict.get_lex(row["base_s"])
1602
+ p = self.term_dict.get_lex(row["base_p"])
1603
+ o = self.term_dict.get_lex(row["base_o"])
1604
+ result.append({"s": s, "p": p, "o": o})
1605
+
1606
+ return pl.DataFrame(result) if result else pl.DataFrame({"s": [], "p": [], "o": []})
1607
+
1608
+ def q8_expand_by_activity(self, activity_uri: str) -> pl.DataFrame:
1609
+ """
1610
+ Q8: List all statements generated by a given run/activity and expand.
1611
+
1612
+ SELECT ?s ?p ?o WHERE {
1613
+ ?qt prov:wasGeneratedBy activity_uri .
1614
+ }
1615
+ """
1616
+ activity_id = self.term_dict.lookup_iri(activity_uri)
1617
+ if activity_id is None:
1618
+ return pl.DataFrame({"s": [], "p": [], "o": []})
1619
+
1620
+ gen_pred = self.term_dict.lookup_iri(
1621
+ "http://www.w3.org/ns/prov#wasGeneratedBy"
1622
+ )
1623
+ if gen_pred is None:
1624
+ return pl.DataFrame({"s": [], "p": [], "o": []})
1625
+
1626
+ df = self.fact_store.scan_metadata_facts()
1627
+ df = df.filter(
1628
+ (pl.col("p") == gen_pred) &
1629
+ (pl.col("o") == activity_id)
1630
+ )
1631
+
1632
+ df = self.fact_store.expand_metadata_df(df)
1633
+
1634
+ result = []
1635
+ for row in df.iter_rows(named=True):
1636
+ s = self.term_dict.get_lex(row["base_s"])
1637
+ p = self.term_dict.get_lex(row["base_p"])
1638
+ o = self.term_dict.get_lex(row["base_o"])
1639
+ result.append({"s": s, "p": p, "o": o})
1640
+
1641
+ return pl.DataFrame(result) if result else pl.DataFrame({"s": [], "p": [], "o": []})
1642
+
1643
+ def q9_filter_by_confidence(
1644
+ self,
1645
+ min_confidence: float,
1646
+ max_confidence: Optional[float] = None,
1647
+ expand_lex: bool = True,
1648
+ ) -> pl.DataFrame:
1649
+ """
1650
+ Q9: Filter statements by confidence and expand.
1651
+
1652
+ SELECT ?s ?p ?o ?c WHERE {
1653
+ ?qt ex:confidence ?c .
1654
+ FILTER(?c > min_confidence)
1655
+ }
1656
+
1657
+ Uses pure Polars join for vectorized performance.
1658
+
1659
+ Args:
1660
+ min_confidence: Minimum confidence threshold (exclusive)
1661
+ max_confidence: Maximum confidence threshold (inclusive, optional)
1662
+ expand_lex: If True, return lexical forms. If False, return term IDs (faster).
1663
+ """
1664
+ conf_pred = self.term_dict.lookup_iri("http://example.org/confidence")
1665
+ if conf_pred is None:
1666
+ cols = {"s": [], "p": [], "o": [], "c": []}
1667
+ return pl.DataFrame(cols)
1668
+
1669
+ # Get confidence facts
1670
+ df = self.fact_store.scan_metadata_facts()
1671
+ df = df.filter(pl.col("p") == conf_pred)
1672
+
1673
+ if df.is_empty():
1674
+ cols = {"s": [], "p": [], "o": [], "c": []}
1675
+ return pl.DataFrame(cols)
1676
+
1677
+ # Build float map as a Polars DataFrame for vectorized join
1678
+ float_map = self.term_dict.build_literal_to_float_map()
1679
+ if not float_map:
1680
+ cols = {"s": [], "p": [], "o": [], "c": []}
1681
+ return pl.DataFrame(cols)
1682
+
1683
+ # Create lookup DataFrame: term_id -> float value
1684
+ map_df = pl.DataFrame({
1685
+ "term_id": list(float_map.keys()),
1686
+ "conf_value": list(float_map.values()),
1687
+ }).cast({"term_id": pl.UInt64, "conf_value": pl.Float64})
1688
+
1689
+ # Join to get confidence values (pure Polars, no Python iteration!)
1690
+ df = df.join(map_df, left_on="o", right_on="term_id", how="inner")
1691
+
1692
+ # Filter by confidence threshold (vectorized!)
1693
+ df = df.filter(pl.col("conf_value") > min_confidence)
1694
+ if max_confidence is not None:
1695
+ df = df.filter(pl.col("conf_value") <= max_confidence)
1696
+
1697
+ if df.is_empty():
1698
+ cols = {"s": [], "p": [], "o": [], "c": []}
1699
+ return pl.DataFrame(cols)
1700
+
1701
+ # Expand to get base triple components
1702
+ df = self.fact_store.expand_metadata_df(df)
1703
+
1704
+ if not expand_lex:
1705
+ # Return term IDs directly (much faster for large results)
1706
+ return df.select([
1707
+ pl.col("base_s").alias("s"),
1708
+ pl.col("base_p").alias("p"),
1709
+ pl.col("base_o").alias("o"),
1710
+ pl.col("conf_value").alias("c"),
1711
+ ])
1712
+
1713
+ # Map term IDs to lexical forms using vectorized lookup
1714
+ s_lex = self.term_dict.get_lex_series(df["base_s"])
1715
+ p_lex = self.term_dict.get_lex_series(df["base_p"])
1716
+ o_lex = self.term_dict.get_lex_series(df["base_o"])
1717
+
1718
+ return pl.DataFrame({
1719
+ "s": s_lex,
1720
+ "p": p_lex,
1721
+ "o": o_lex,
1722
+ "c": df["conf_value"],
1723
+ })
1724
+
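+ # Hedged usage sketch for q9_filter_by_confidence, assuming confidence was stored
+ # as http://example.org/confidence literals on quoted triples (threshold values
+ # are illustrative).
+ #
+ #     high = executor.q9_filter_by_confidence(0.8)                       # ?c > 0.8
+ #     band = executor.q9_filter_by_confidence(0.5, max_confidence=0.8)   # 0.5 < ?c <= 0.8
+ #     ids = executor.q9_filter_by_confidence(0.8, expand_lex=False)      # term IDs, faster
+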
1725
+ def q9_count_by_confidence(
1726
+ self,
1727
+ min_confidence: float,
1728
+ max_confidence: Optional[float] = None,
1729
+ ) -> int:
1730
+ """
1731
+ Count statements above confidence threshold (fast).
1732
+
1733
+ SELECT (COUNT(*) as ?count) WHERE {
1734
+ ?qt ex:confidence ?c .
1735
+ FILTER(?c > min_confidence)
1736
+ }
1737
+ """
1738
+ df = self.q9_filter_by_confidence(min_confidence, max_confidence, expand_lex=False)
1739
+ return len(df)
1740
+
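+ # Hedged usage sketch for q9_count_by_confidence (threshold illustrative):
+ #
+ #     n = executor.q9_count_by_confidence(0.9)   # statements with ?c > 0.9
+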
1741
+ def q9_native_filter_by_confidence(
1742
+ self,
1743
+ min_confidence: float,
1744
+ max_confidence: Optional[float] = None,
1745
+ expand_lex: bool = True,
1746
+ ) -> pl.DataFrame:
1747
+ """
1748
+ Q9 (Native): Filter facts by confidence using native column.
1749
+
1750
+ This is the FAST version that uses the native `confidence` column
1751
+ in the FactStore schema, avoiding any string parsing or joins.
1752
+
1753
+ Use this when facts were ingested with `add_facts_with_provenance()`
1754
+ which stores confidence directly in the native column.
1755
+
1756
+ Conceptually equivalent to:
1757
+ SELECT ?s ?p ?o ?c WHERE { FILTER(?c > min_confidence) }
1758
+ but evaluated directly against the native confidence column, with no metadata join.
1759
+
1760
+ Args:
1761
+ min_confidence: Minimum confidence threshold (exclusive)
1762
+ max_confidence: Maximum confidence threshold (inclusive, optional)
1763
+ expand_lex: If True, return lexical forms. If False, return term IDs.
1764
+ """
1765
+ # Pure vectorized scan on native column - no joins!
1766
+ df = self.fact_store.scan_by_confidence(
1767
+ min_confidence,
1768
+ max_confidence,
1769
+ include_metadata=False, # Only base facts
1770
+ )
1771
+
1772
+ if df.is_empty():
1773
+ return pl.DataFrame({"s": [], "p": [], "o": [], "c": []})
1774
+
1775
+ if not expand_lex:
1776
+ return df.select([
1777
+ pl.col("s"),
1778
+ pl.col("p"),
1779
+ pl.col("o"),
1780
+ pl.col("confidence").alias("c"),
1781
+ ])
1782
+
1783
+ # Map term IDs to lexical forms
1784
+ s_lex = self.term_dict.get_lex_series(df["s"])
1785
+ p_lex = self.term_dict.get_lex_series(df["p"])
1786
+ o_lex = self.term_dict.get_lex_series(df["o"])
1787
+
1788
+ return pl.DataFrame({
1789
+ "s": s_lex,
1790
+ "p": p_lex,
1791
+ "o": o_lex,
1792
+ "c": df["confidence"],
1793
+ })
1794
+
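+ # Hedged usage sketch for q9_native_filter_by_confidence, assuming facts were
+ # ingested with add_facts_with_provenance() so the native confidence column is
+ # populated (thresholds illustrative).
+ #
+ #     df = executor.q9_native_filter_by_confidence(0.7, max_confidence=0.95)
+ #     # -> columns "s", "p", "o" (lexical forms) and "c" (float confidence)
+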
1795
+ def q9_native_count(
1796
+ self,
1797
+ min_confidence: float,
1798
+ max_confidence: Optional[float] = None,
1799
+ ) -> int:
1800
+ """
1801
+ Count facts by confidence using native column (fastest).
1802
+ """
1803
+ df = self.fact_store.scan_by_confidence(
1804
+ min_confidence,
1805
+ max_confidence,
1806
+ include_metadata=False,
1807
+ )
1808
+ return len(df)
1809
+
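+ # Hedged usage sketch for q9_native_count (threshold illustrative):
+ #
+ #     n = executor.q9_native_count(0.7)   # scans only the native column, no joins
+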
1810
+ def q10_filter_by_time_range(
1811
+ self,
1812
+ start: datetime,
1813
+ end: datetime
1814
+ ) -> pl.DataFrame:
1815
+ """
1816
+ Q10: Filter by time range on metadata.
1817
+
1818
+ SELECT ?qt ?t WHERE {
1819
+ ?qt prov:generatedAtTime ?t .
1820
+ FILTER(?t >= start && ?t < end)
1821
+ }
1822
+
+ Note: stored timestamps ending in "Z" parse as UTC-aware datetimes, so pass
+ timezone-aware start/end to avoid naive/aware comparison errors.
+ """
1823
+ time_pred = self.term_dict.lookup_iri(
1824
+ "http://www.w3.org/ns/prov#generatedAtTime"
1825
+ )
1826
+ if time_pred is None:
1827
+ return pl.DataFrame({"qt": [], "t": []})
1828
+
1829
+ df = self.fact_store.scan_metadata_facts()
1830
+ df = df.filter(pl.col("p") == time_pred)
1831
+
1832
+ result = []
1833
+ for row in df.iter_rows(named=True):
1834
+ time_lex = self.term_dict.get_lex(row["o"])
1835
+ if time_lex is None:
1836
+ continue
1837
+ try:
1838
+ # Parse ISO datetime
1839
+ t = datetime.fromisoformat(time_lex.replace("Z", "+00:00"))
1840
+ if start <= t < end:
1841
+ qt_lex = self._qt_to_string(row["s"])
1842
+ result.append({"qt": qt_lex, "t": time_lex})
1843
+ except ValueError:
1844
+ continue
1845
+
1846
+ return pl.DataFrame(result) if result else pl.DataFrame({"qt": [], "t": []})
1847
+
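+ # Hedged usage sketch for q10_filter_by_time_range, assuming stored
+ # prov:generatedAtTime literals are ISO 8601 timestamps; the dates are illustrative.
+ #
+ #     from datetime import datetime, timezone
+ #     start = datetime(2024, 1, 1, tzinfo=timezone.utc)
+ #     end = datetime(2024, 2, 1, tzinfo=timezone.utc)
+ #     recent = executor.q10_filter_by_time_range(start, end)
+ #     # -> columns "qt" (<< s p o >> strings) and "t" (lexical timestamps)
+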
1848
+ def q11_count_by_source(self) -> pl.DataFrame:
1849
+ """
1850
+ Q11: Count statements per source.
1851
+
1852
+ SELECT ?src (COUNT(?qt) AS ?n) WHERE {
1853
+ ?qt prov:wasDerivedFrom ?src .
1854
+ } GROUP BY ?src ORDER BY DESC(?n)
1855
+ """
1856
+ prov_pred = self.term_dict.lookup_iri(
1857
+ "http://www.w3.org/ns/prov#wasDerivedFrom"
1858
+ )
1859
+ if prov_pred is None:
1860
+ return pl.DataFrame({"src": [], "n": []})
1861
+
1862
+ df = self.fact_store.scan_metadata_facts()
1863
+ df = df.filter(pl.col("p") == prov_pred)
1864
+
1865
+ # Group by source (object) and count
1866
+ grouped = df.group_by("o").agg(pl.len().alias("n"))
1867
+ grouped = grouped.sort("n", descending=True)
1868
+
1869
+ # Decode source URIs
1870
+ result = []
1871
+ for row in grouped.iter_rows(named=True):
1872
+ src = self.term_dict.get_lex(row["o"])
1873
+ result.append({"src": src, "n": row["n"]})
1874
+
1875
+ return pl.DataFrame(result) if result else pl.DataFrame({"src": [], "n": []})
1876
+
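+ # Hedged usage sketch for q11_count_by_source:
+ #
+ #     per_source = executor.q11_count_by_source()
+ #     # -> DataFrame with "src" (source IRI) and "n" (count), largest sources first
+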
1877
+ def q12_count_by_run(self) -> pl.DataFrame:
1878
+ """
1879
+ Q12: Count statements per run.
1880
+
1881
+ SELECT ?run (COUNT(?qt) AS ?n) WHERE {
1882
+ ?qt prov:wasGeneratedBy ?run .
1883
+ } GROUP BY ?run ORDER BY DESC(?n)
1884
+ """
1885
+ gen_pred = self.term_dict.lookup_iri(
1886
+ "http://www.w3.org/ns/prov#wasGeneratedBy"
1887
+ )
1888
+ if gen_pred is None:
1889
+ return pl.DataFrame({"run": [], "n": []})
1890
+
1891
+ df = self.fact_store.scan_metadata_facts()
1892
+ df = df.filter(pl.col("p") == gen_pred)
1893
+
1894
+ grouped = df.group_by("o").agg(pl.len().alias("n"))
1895
+ grouped = grouped.sort("n", descending=True)
1896
+
1897
+ result = []
1898
+ for row in grouped.iter_rows(named=True):
1899
+ run = self.term_dict.get_lex(row["o"])
1900
+ result.append({"run": run, "n": row["n"]})
1901
+
1902
+ return pl.DataFrame(result) if result else pl.DataFrame({"run": [], "n": []})
1903
+
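+ # Hedged usage sketch for q12_count_by_run:
+ #
+ #     per_run = executor.q12_count_by_run()
+ #     # -> DataFrame with "run" (activity IRI) and "n" (count), largest runs first
+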
1904
+ def _qt_to_string(self, qt_id: TermId) -> str:
1905
+ """Convert a quoted triple ID to << s p o >> string form."""
1906
+ qt = self.qt_dict.lookup(qt_id)
1907
+ if qt is None:
1908
+ return f"<unknown qt:{qt_id}>"
1909
+
1910
+ s = self.term_dict.get_lex(qt.s) or f"<{qt.s}>"
1911
+ p = self.term_dict.get_lex(qt.p) or f"<{qt.p}>"
1912
+ o = self.term_dict.get_lex(qt.o) or f"<{qt.o}>"
1913
+
1914
+ return f"<< {s} {p} {o} >>"