rdf_starbase-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdf_starbase/__init__.py +57 -0
- rdf_starbase/ai_grounding.py +728 -0
- rdf_starbase/compat/__init__.py +26 -0
- rdf_starbase/compat/rdflib.py +1104 -0
- rdf_starbase/formats/__init__.py +29 -0
- rdf_starbase/formats/jsonld.py +488 -0
- rdf_starbase/formats/ntriples.py +419 -0
- rdf_starbase/formats/rdfxml.py +434 -0
- rdf_starbase/formats/turtle.py +882 -0
- rdf_starbase/models.py +92 -0
- rdf_starbase/registry.py +540 -0
- rdf_starbase/repositories.py +407 -0
- rdf_starbase/repository_api.py +739 -0
- rdf_starbase/sparql/__init__.py +35 -0
- rdf_starbase/sparql/ast.py +910 -0
- rdf_starbase/sparql/executor.py +1925 -0
- rdf_starbase/sparql/parser.py +1716 -0
- rdf_starbase/storage/__init__.py +44 -0
- rdf_starbase/storage/executor.py +1914 -0
- rdf_starbase/storage/facts.py +850 -0
- rdf_starbase/storage/lsm.py +531 -0
- rdf_starbase/storage/persistence.py +338 -0
- rdf_starbase/storage/quoted_triples.py +292 -0
- rdf_starbase/storage/reasoner.py +1035 -0
- rdf_starbase/storage/terms.py +628 -0
- rdf_starbase/store.py +1049 -0
- rdf_starbase/store_legacy.py +748 -0
- rdf_starbase/web.py +568 -0
- rdf_starbase-0.1.0.dist-info/METADATA +706 -0
- rdf_starbase-0.1.0.dist-info/RECORD +31 -0
- rdf_starbase-0.1.0.dist-info/WHEEL +4 -0
rdf_starbase/storage/executor.py
@@ -0,0 +1,1914 @@
"""
SPARQL★ Executor for the new dictionary-encoded storage layer.

Implements efficient query execution using integer-only operations
with expansion patterns for RDF-Star metadata queries (Q6-Q12).
"""

from __future__ import annotations
from typing import Optional, Union, Any
from datetime import datetime

import polars as pl

from rdf_starbase.sparql.ast import (
    Query, SelectQuery, AskQuery, ConstructQuery, DescribeQuery,
    TriplePattern, QuotedTriplePattern,
    Variable, IRI, Literal, BlankNode,
    Filter, Comparison, LogicalExpression, FunctionCall,
    ComparisonOp, LogicalOp,
    WhereClause, ProvenanceFilter,
    Term,
    # Property path types
    PropertyPath, PathIRI, PathSequence, PathAlternative,
    PathInverse, PathMod, PathNegatedPropertySet, PropertyPathModifier,
    # Pattern types
    MinusPattern, OptionalPattern, UnionPattern,
    # Aggregate types
    AggregateExpression,
)
from rdf_starbase.storage.terms import TermDict, TermId, TermKind
from rdf_starbase.storage.quoted_triples import QtDict
from rdf_starbase.storage.facts import FactStore


class StorageExecutor:
    """
    Executes SPARQL★ queries against the new dictionary-encoded storage.

    Key optimizations:
    - All comparisons use integer IDs (no string comparisons in hot path)
    - Quoted triple expansion via efficient joins
    - Predicate partitioning for scan pruning
    - Lazy evaluation for query optimization
    """

    def __init__(
        self,
        term_dict: TermDict,
        qt_dict: QtDict,
        fact_store: FactStore
    ):
        """
        Initialize executor with storage components.

        Args:
            term_dict: Dictionary mapping terms to integer IDs
            qt_dict: Dictionary mapping quoted triples to IDs
            fact_store: Integer-based fact storage
        """
        self.term_dict = term_dict
        self.qt_dict = qt_dict
        self.fact_store = fact_store
        self._var_counter = 0

    def execute(self, query: Query) -> Union[pl.DataFrame, bool, list[tuple[str, str, str]]]:
        """
        Execute a SPARQL★ query.

        Args:
            query: Parsed Query AST

        Returns:
            DataFrame for SELECT queries, bool for ASK queries,
            list of triples for CONSTRUCT/DESCRIBE queries
        """
        if isinstance(query, SelectQuery):
            return self._execute_select(query)
        elif isinstance(query, AskQuery):
            return self._execute_ask(query)
        elif isinstance(query, ConstructQuery):
            return self._execute_construct(query)
        elif isinstance(query, DescribeQuery):
            return self._execute_describe(query)
        else:
            raise NotImplementedError(f"Query type {type(query)} not yet supported")

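    # Illustrative sketch (not part of the original module): wiring the executor to
    # its storage components and dispatching a parsed query. The no-argument
    # constructors and the pre-parsed query objects are assumptions for
    # illustration; only StorageExecutor(term_dict, qt_dict, fact_store) and
    # execute() are taken from the code above.
    #
    #     from rdf_starbase.storage.terms import TermDict
    #     from rdf_starbase.storage.quoted_triples import QtDict
    #     from rdf_starbase.storage.facts import FactStore
    #
    #     executor = StorageExecutor(TermDict(), QtDict(), FactStore())
    #     rows = executor.execute(parsed_select_query)   # pl.DataFrame for SELECT
    #     exists = executor.execute(parsed_ask_query)    # bool for ASK
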
    def _execute_select(self, query: SelectQuery) -> pl.DataFrame:
        """Execute a SELECT query."""
        # Execute WHERE clause with integer IDs
        df = self._execute_where(query.where, query.prefixes)

        # Check if we have aggregates
        has_aggregates = any(isinstance(v, AggregateExpression) for v in query.variables)

        if has_aggregates or query.group_by:
            # Handle GROUP BY and aggregates
            df = self._apply_aggregates(df, query)

        # Decode term IDs back to lexical forms for output
        df = self._decode_result(df)

        # Apply DISTINCT if requested
        if query.distinct:
            df = df.unique()

        # Apply ORDER BY
        if query.order_by:
            order_cols = []
            descending = []
            for var, asc in query.order_by:
                if var.name in df.columns:
                    order_cols.append(var.name)
                    descending.append(not asc)
            if order_cols:
                df = df.sort(order_cols, descending=descending)

        # Apply LIMIT and OFFSET
        if query.offset:
            df = df.slice(query.offset, query.limit or len(df))
        elif query.limit:
            df = df.head(query.limit)

        # Select only requested variables (or all if SELECT *)
        if not query.is_select_all():
            select_cols = []
            for v in query.variables:
                if isinstance(v, Variable):
                    if v.name in df.columns:
                        select_cols.append(v.name)
                elif isinstance(v, AggregateExpression):
                    # Use the alias name
                    if v.alias and v.alias.name in df.columns:
                        select_cols.append(v.alias.name)
            if select_cols:
                df = df.select(select_cols)

        return df

    def _apply_aggregates(self, df: pl.DataFrame, query: SelectQuery) -> pl.DataFrame:
        """Apply GROUP BY and aggregate functions."""
        if len(df) == 0:
            # Create empty result with correct columns
            result_cols = {}
            for v in query.variables:
                if isinstance(v, Variable):
                    result_cols[v.name] = pl.Series([], dtype=pl.Utf8)
                elif isinstance(v, AggregateExpression) and v.alias:
                    result_cols[v.alias.name] = pl.Series([], dtype=pl.Int64)
            return pl.DataFrame(result_cols)

        # First decode the columns we'll need for grouping and aggregation
        df = self._decode_result(df)

        # Build the aggregate expressions
        agg_exprs = []
        for v in query.variables:
            if isinstance(v, AggregateExpression):
                agg_expr = self._build_aggregate_expr(v, df)
                if agg_expr is not None:
                    agg_exprs.append(agg_expr)

        if not agg_exprs:
            return df

        # Apply GROUP BY if specified
        if query.group_by:
            group_cols = [g.name for g in query.group_by if isinstance(g, Variable) and g.name in df.columns]
            if group_cols:
                result = df.group_by(group_cols).agg(agg_exprs)
            else:
                # No valid group columns - aggregate all
                result = df.select(agg_exprs)
        else:
            # No GROUP BY - aggregate the entire dataset
            result = df.select(agg_exprs)

        return result

    def _build_aggregate_expr(self, agg: AggregateExpression, df: pl.DataFrame) -> Optional[pl.Expr]:
        """Build a Polars aggregate expression from an AggregateExpression."""
        func = agg.function.upper()
        alias = agg.alias.name if agg.alias else f"_{func}"

        if agg.argument is None:
            # COUNT(*) - count all rows
            if func == "COUNT":
                return pl.len().alias(alias)
            return None

        if isinstance(agg.argument, Variable):
            col_name = agg.argument.name
            if col_name not in df.columns:
                return None

            col = pl.col(col_name)

            if func == "COUNT":
                if agg.distinct:
                    return col.n_unique().alias(alias)
                else:
                    return col.count().alias(alias)
            elif func == "SUM":
                # Need to convert to numeric first
                return col.cast(pl.Float64, strict=False).sum().alias(alias)
            elif func == "AVG":
                return col.cast(pl.Float64, strict=False).mean().alias(alias)
            elif func == "MIN":
                return col.min().alias(alias)
            elif func == "MAX":
                return col.max().alias(alias)
            elif func == "GROUP_CONCAT":
                sep = agg.separator or " "
                return col.str.concat(sep).alias(alias)
            elif func == "SAMPLE":
                return col.first().alias(alias)

        return None

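    # Illustrative mapping (not part of the original module): how a few SPARQL
    # aggregates translate to the Polars expressions built above, assuming a
    # bound variable ?age and an alias chosen by the query:
    #
    #     COUNT(*)              -> pl.len().alias(alias)
    #     COUNT(DISTINCT ?age)  -> pl.col("age").n_unique().alias(alias)
    #     AVG(?age)             -> pl.col("age").cast(pl.Float64, strict=False).mean().alias(alias)
    #     GROUP_CONCAT(?name)   -> pl.col("name").str.concat(" ").alias(alias)
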
    def _execute_ask(self, query: AskQuery) -> bool:
        """Execute an ASK query."""
        df = self._execute_where(query.where, query.prefixes)
        return len(df) > 0

    def _execute_construct(self, query: ConstructQuery) -> list[tuple[str, str, str]]:
        """
        Execute a CONSTRUCT query.

        Returns a list of triples (s, p, o) as strings, with template variables
        substituted from the WHERE clause results.
        """
        # Execute WHERE clause to get bindings
        df = self._execute_where(query.where, query.prefixes)

        if len(df) == 0:
            return []

        # Decode term IDs to strings
        df = self._decode_result(df)

        # Generate triples by substituting template with each binding
        triples = []
        for row in df.iter_rows(named=True):
            for pattern in query.template:
                s = self._substitute_term(pattern.subject, row, query.prefixes)
                p = self._substitute_term(pattern.predicate, row, query.prefixes)
                o = self._substitute_term(pattern.object, row, query.prefixes)

                if s is not None and p is not None and o is not None:
                    triples.append((s, p, o))

        # Remove duplicates while preserving order
        seen = set()
        unique_triples = []
        for t in triples:
            if t not in seen:
                seen.add(t)
                unique_triples.append(t)

        return unique_triples

    def _substitute_term(
        self,
        term: Term,
        bindings: dict[str, Any],
        prefixes: dict[str, str]
    ) -> Optional[str]:
        """Substitute a term using variable bindings."""
        if isinstance(term, Variable):
            value = bindings.get(term.name)
            return value if value is not None else None
        elif isinstance(term, IRI):
            return self._expand_iri(term.value, prefixes)
        elif isinstance(term, Literal):
            if term.language:
                return f'"{term.value}"@{term.language}'
            elif term.datatype:
                return f'"{term.value}"^^<{term.datatype}>'
            else:
                return f'"{term.value}"'
        elif isinstance(term, BlankNode):
            return f"_:{term.label}"
        return None

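    # Illustrative substitution (not part of the original module): for a CONSTRUCT
    # template triple (?s, ex:knows, "Bob"@en) and a binding row
    # {"s": "http://example.org/alice"}, the three calls above would yield
    # ("http://example.org/alice", "http://example.org/knows", '"Bob"@en'),
    # assuming the ex: prefix maps to http://example.org/.
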
    def _execute_describe(self, query: DescribeQuery) -> list[tuple[str, str, str]]:
        """
        Execute a DESCRIBE query.

        Returns all triples where the described resources appear as subject or object.
        """
        # Collect resources to describe
        resources_to_describe = set()

        if query.where:
            # Execute WHERE to get variable bindings
            df = self._execute_where(query.where, query.prefixes)
            df = self._decode_result(df)

            for resource in query.resources:
                if isinstance(resource, Variable):
                    # Get all values for this variable
                    if resource.name in df.columns:
                        resources_to_describe.update(df[resource.name].to_list())
                elif isinstance(resource, IRI):
                    resources_to_describe.add(self._expand_iri(resource.value, query.prefixes))
        else:
            # No WHERE clause - just describe the listed resources
            for resource in query.resources:
                if isinstance(resource, IRI):
                    resources_to_describe.add(self._expand_iri(resource.value, query.prefixes))

        if not resources_to_describe:
            return []

        # Get all triples about these resources
        triples = []
        df = self.fact_store.scan_facts()

        for resource in resources_to_describe:
            resource_id = self.term_dict.lookup_iri(resource)
            if resource_id is None:
                continue

            # As subject
            subj_df = df.filter(pl.col("s") == resource_id)
            for row in subj_df.iter_rows(named=True):
                s_lex = self.term_dict.get_lex(row["s"]) or resource
                p_lex = self.term_dict.get_lex(row["p"]) or f"<unknown:{row['p']}>"
                o_lex = self.term_dict.get_lex(row["o"]) or f"<unknown:{row['o']}>"
                triples.append((s_lex, p_lex, o_lex))

            # As object
            obj_df = df.filter(pl.col("o") == resource_id)
            for row in obj_df.iter_rows(named=True):
                s_lex = self.term_dict.get_lex(row["s"]) or f"<unknown:{row['s']}>"
                p_lex = self.term_dict.get_lex(row["p"]) or f"<unknown:{row['p']}>"
                o_lex = self.term_dict.get_lex(row["o"]) or resource
                triples.append((s_lex, p_lex, o_lex))

        # Remove duplicates
        return list(set(triples))

    def _execute_where(
        self,
        where: WhereClause,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """Execute WHERE clause, returning DataFrame with integer term IDs."""
        # Check if there's no work to do at all
        if not where.patterns and not where.union_patterns and not where.optional_patterns:
            return pl.DataFrame()

        result_df: Optional[pl.DataFrame] = None

        # Process basic triple patterns
        for i, pattern in enumerate(where.patterns):
            if isinstance(pattern, QuotedTriplePattern):
                # Handle quoted triple patterns (Q6 style)
                pattern_df = self._execute_quoted_pattern(pattern, prefixes, i)
            else:
                pattern_df = self._execute_pattern(pattern, prefixes, i)

            if result_df is None:
                result_df = pattern_df
            else:
                # Join on shared variables
                shared_cols = set(result_df.columns) & set(pattern_df.columns)
                shared_cols -= {"_pattern_idx"}

                if shared_cols:
                    result_df = result_df.join(
                        pattern_df,
                        on=list(shared_cols),
                        how="inner"
                    )
                else:
                    result_df = result_df.join(pattern_df, how="cross")

        # If we only have UNION patterns (no basic patterns), process UNION first
        if result_df is None and where.union_patterns:
            # Process first UNION to establish result_df
            first_union = where.union_patterns[0]
            result_df = self._apply_union_standalone(first_union, prefixes)

            # Process remaining UNION patterns
            for union in where.union_patterns[1:]:
                result_df = self._apply_union(result_df, union, prefixes)
        elif result_df is None:
            return pl.DataFrame()

        # Apply OPTIONAL patterns (left outer join) - must come before FILTER
        # so that FILTER can reference optional variables
        for optional in where.optional_patterns:
            result_df = self._apply_optional(result_df, optional, prefixes)

        # Apply MINUS patterns (anti-join)
        for minus in where.minus_patterns:
            result_df = self._apply_minus(result_df, minus, prefixes)

        # Apply FILTER clauses - after OPTIONAL so all variables are available
        for filter_clause in where.filters:
            if isinstance(filter_clause, Filter):
                result_df = self._apply_filter(result_df, filter_clause, prefixes)
            elif isinstance(filter_clause, ProvenanceFilter):
                result_df = self._apply_provenance_filter(result_df, filter_clause)

        # Apply UNION patterns
        for union in where.union_patterns:
            result_df = self._apply_union(result_df, union, prefixes)

        # Remove internal columns
        internal_cols = [c for c in result_df.columns if c.startswith("_")]
        if internal_cols:
            result_df = result_df.drop(internal_cols)

        return result_df

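    # Illustrative join behaviour (not part of the original module): two basic
    # patterns { ?s ex:worksFor ?org . ?org ex:basedIn ?city } produce pattern
    # frames with columns (s, org) and (org, city); sharing "org", they are
    # inner-joined on it, yielding (s, org, city). Patterns with no shared
    # variables fall back to the cross-join branch above.
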
    def _execute_pattern(
        self,
        pattern: TriplePattern,
        prefixes: dict[str, str],
        pattern_idx: int
    ) -> pl.DataFrame:
        """Execute a triple pattern using integer comparisons."""
        # Check if this pattern has a property path predicate
        if pattern.has_property_path():
            return self._execute_property_path_pattern(pattern, prefixes, pattern_idx)

        # Get the facts DataFrame
        df = self.fact_store.scan_facts()

        # Apply filters for concrete terms using integer IDs
        if not isinstance(pattern.subject, Variable):
            term_id = self._resolve_term_id(pattern.subject, prefixes)
            if term_id is None:
                return pl.DataFrame()  # Term not in store
            df = df.filter(pl.col("s") == term_id)

        if not isinstance(pattern.predicate, Variable):
            term_id = self._resolve_term_id(pattern.predicate, prefixes)
            if term_id is None:
                return pl.DataFrame()
            df = df.filter(pl.col("p") == term_id)

        if not isinstance(pattern.object, (Variable, QuotedTriplePattern)):
            term_id = self._resolve_term_id(pattern.object, prefixes)
            if term_id is None:
                return pl.DataFrame()
            df = df.filter(pl.col("o") == term_id)

        # Build result with variable bindings
        renames = {}
        select_cols = []

        if isinstance(pattern.subject, Variable):
            renames["s"] = pattern.subject.name
            select_cols.append("s")

        if isinstance(pattern.predicate, Variable):
            renames["p"] = pattern.predicate.name
            select_cols.append("p")

        if isinstance(pattern.object, Variable):
            renames["o"] = pattern.object.name
            select_cols.append("o")

        # Include metadata columns for provenance filters
        for col in ["source", "confidence", "t_added", "process"]:
            if col in df.columns:
                renames[col] = f"_prov_{pattern_idx}_{col}"
                select_cols.append(col)

        if select_cols:
            result = df.select(select_cols).rename(renames)
        else:
            result = pl.DataFrame({"_match": [True] * len(df)})

        return result

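    # Illustrative scan (not part of the original module): for the concrete
    # pattern <ex:alice> ex:age ?age, the subject and predicate resolve to
    # integer IDs (alice_id and age_id below are hypothetical names) and the
    # scan reduces to
    #
    #     facts.filter(pl.col("s") == alice_id).filter(pl.col("p") == age_id)
    #
    # with the surviving "o" column renamed to "age"; no string comparison
    # happens on the scan path.
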
    # =========================================================================
    # Property Path Execution
    # =========================================================================

    def _execute_property_path_pattern(
        self,
        pattern: TriplePattern,
        prefixes: dict[str, str],
        pattern_idx: int
    ) -> pl.DataFrame:
        """
        Execute a triple pattern with a property path predicate.

        Supports:
        - PathIRI: simple predicate (treated as normal pattern)
        - PathSequence: a/b/c navigation
        - PathAlternative: a|b|c any of these predicates
        - PathInverse: ^a reverse direction
        - PathMod: a*, a+, a? repetition
        - PathNegatedPropertySet: !(a|b) any predicate except these
        """
        path = pattern.predicate

        # Resolve subject/object
        subj_id = None
        if not isinstance(pattern.subject, Variable):
            subj_id = self._resolve_term_id(pattern.subject, prefixes)
            if subj_id is None:
                return pl.DataFrame()

        obj_id = None
        if not isinstance(pattern.object, Variable):
            obj_id = self._resolve_term_id(pattern.object, prefixes)
            if obj_id is None:
                return pl.DataFrame()

        # Execute path
        result = self._execute_path(path, subj_id, obj_id, prefixes)

        # Build output with variable bindings
        renames = {}
        if isinstance(pattern.subject, Variable):
            renames["start"] = pattern.subject.name
        if isinstance(pattern.object, Variable):
            renames["end"] = pattern.object.name

        if renames:
            result = result.rename(renames)

        # Select only needed columns
        select_cols = list(renames.values()) if renames else ["start", "end"]
        select_cols = [c for c in select_cols if c in result.columns]
        if select_cols:
            result = result.select(select_cols)

        return result.unique()

    def _execute_path(
        self,
        path: PropertyPath,
        start_id: Optional[int],
        end_id: Optional[int],
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Execute a property path, returning (start, end) pairs.

        Args:
            path: The property path to execute
            start_id: Fixed start node (or None for variable)
            end_id: Fixed end node (or None for variable)
            prefixes: Namespace prefixes

        Returns:
            DataFrame with 'start' and 'end' columns
        """
        if isinstance(path, PathIRI):
            return self._execute_path_iri(path, start_id, end_id, prefixes)
        elif isinstance(path, PathSequence):
            return self._execute_path_sequence(path, start_id, end_id, prefixes)
        elif isinstance(path, PathAlternative):
            return self._execute_path_alternative(path, start_id, end_id, prefixes)
        elif isinstance(path, PathInverse):
            return self._execute_path_inverse(path, start_id, end_id, prefixes)
        elif isinstance(path, PathMod):
            return self._execute_path_mod(path, start_id, end_id, prefixes)
        elif isinstance(path, PathNegatedPropertySet):
            return self._execute_path_negated(path, start_id, end_id, prefixes)
        else:
            raise NotImplementedError(f"Path type {type(path)} not implemented")

    def _execute_path_iri(
        self,
        path: PathIRI,
        start_id: Optional[int],
        end_id: Optional[int],
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """Execute a simple IRI path (single predicate)."""
        pred_id = self._resolve_term_id(path.iri, prefixes)
        if pred_id is None:
            return pl.DataFrame({"start": [], "end": []})

        df = self.fact_store.scan_facts()
        df = df.filter(pl.col("p") == pred_id)

        if start_id is not None:
            df = df.filter(pl.col("s") == start_id)
        if end_id is not None:
            df = df.filter(pl.col("o") == end_id)

        return df.select([
            pl.col("s").alias("start"),
            pl.col("o").alias("end")
        ])

    def _execute_path_sequence(
        self,
        path: PathSequence,
        start_id: Optional[int],
        end_id: Optional[int],
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """Execute a path sequence (a/b/c)."""
        if not path.paths:
            return pl.DataFrame({"start": [], "end": []})

        # Execute first path
        result = self._execute_path(path.paths[0], start_id, None, prefixes)

        # Chain through remaining paths
        for i, subpath in enumerate(path.paths[1:], 1):
            is_last = i == len(path.paths) - 1

            # Execute next path segment
            next_end = end_id if is_last else None
            next_df = self._execute_path(subpath, None, next_end, prefixes)

            # Join: result.end = next_df.start
            result = result.join(
                next_df.rename({"start": "_join_start", "end": "_next_end"}),
                left_on="end",
                right_on="_join_start",
                how="inner"
            ).select([
                pl.col("start"),
                pl.col("_next_end").alias("end")
            ])

        return result

    def _execute_path_alternative(
        self,
        path: PathAlternative,
        start_id: Optional[int],
        end_id: Optional[int],
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """Execute a path alternative (a|b|c)."""
        results = []
        for subpath in path.paths:
            df = self._execute_path(subpath, start_id, end_id, prefixes)
            if len(df) > 0:
                results.append(df)

        if not results:
            return pl.DataFrame({"start": [], "end": []})

        return pl.concat(results).unique()

    def _execute_path_inverse(
        self,
        path: PathInverse,
        start_id: Optional[int],
        end_id: Optional[int],
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """Execute an inverse path (^a) - swap start and end."""
        # For inverse, we swap the direction
        inner = self._execute_path(path.path, end_id, start_id, prefixes)

        # Swap columns
        return inner.select([
            pl.col("end").alias("start"),
            pl.col("start").alias("end")
        ])

    def _execute_path_mod(
        self,
        path: PathMod,
        start_id: Optional[int],
        end_id: Optional[int],
        prefixes: dict[str, str],
        max_depth: int = 10
    ) -> pl.DataFrame:
        """Execute a modified path (a*, a+, a?)."""
        if path.modifier == PropertyPathModifier.ZERO_OR_ONE:
            # a? = identity OR one step
            one_step = self._execute_path(path.path, start_id, end_id, prefixes)

            # Add identity (start = end) for nodes
            if start_id is not None:
                identity = pl.DataFrame({"start": [start_id], "end": [start_id]})
            elif end_id is not None:
                identity = pl.DataFrame({"start": [end_id], "end": [end_id]})
            else:
                # Get all nodes
                all_nodes = self._get_all_nodes()
                identity = pl.DataFrame({"start": all_nodes, "end": all_nodes})

            return pl.concat([one_step, identity]).unique()

        elif path.modifier == PropertyPathModifier.ZERO_OR_MORE:
            # a* = identity + transitive closure
            return self._execute_transitive_closure(
                path.path, start_id, end_id, prefixes,
                include_identity=True, max_depth=max_depth
            )

        elif path.modifier == PropertyPathModifier.ONE_OR_MORE:
            # a+ = at least one step, then transitive closure
            return self._execute_transitive_closure(
                path.path, start_id, end_id, prefixes,
                include_identity=False, max_depth=max_depth
            )

        else:
            raise NotImplementedError(f"Path modifier {path.modifier} not implemented")

    def _execute_transitive_closure(
        self,
        path: PropertyPath,
        start_id: Optional[int],
        end_id: Optional[int],
        prefixes: dict[str, str],
        include_identity: bool,
        max_depth: int = 10
    ) -> pl.DataFrame:
        """Compute transitive closure for path+ or path*."""
        # Get single-step edges
        edges = self._execute_path(path, None, None, prefixes)

        # Ensure edges have proper schema
        if len(edges) == 0:
            if include_identity:
                if start_id is not None:
                    return pl.DataFrame({
                        "start": pl.Series([start_id], dtype=pl.UInt64),
                        "end": pl.Series([start_id], dtype=pl.UInt64)
                    })
                elif end_id is not None:
                    return pl.DataFrame({
                        "start": pl.Series([end_id], dtype=pl.UInt64),
                        "end": pl.Series([end_id], dtype=pl.UInt64)
                    })
            return pl.DataFrame({
                "start": pl.Series([], dtype=pl.UInt64),
                "end": pl.Series([], dtype=pl.UInt64)
            })

        # Initialize reachable set with proper schema matching edges
        if include_identity:
            all_nodes = self._get_all_nodes()
            reachable = pl.DataFrame({
                "start": pl.Series(all_nodes, dtype=pl.UInt64),
                "end": pl.Series(all_nodes, dtype=pl.UInt64)
            })
            # Add single-step edges
            reachable = pl.concat([reachable, edges]).unique()
        else:
            # For ONE_OR_MORE, start with just the edges
            reachable = edges.clone()

        # Iteratively expand (fixed-point computation)
        for _ in range(max_depth):
            prev_len = len(reachable)

            # Join reachable with edges: (a, b) + (b, c) => (a, c)
            new_pairs = reachable.join(
                edges.rename({"start": "_mid", "end": "_new_end"}),
                left_on="end",
                right_on="_mid",
                how="inner"
            ).select([
                pl.col("start"),
                pl.col("_new_end").alias("end")
            ])

            reachable = pl.concat([reachable, new_pairs]).unique()

            if len(reachable) == prev_len:
                break  # Fixed point reached

        # Apply start/end filters
        if start_id is not None:
            reachable = reachable.filter(pl.col("start") == start_id)
        if end_id is not None:
            reachable = reachable.filter(pl.col("end") == end_id)

        return reachable

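    # Worked example (not part of the original module): with single-step edges
    # {(1, 2), (2, 3)} and include_identity=False, the first iteration joins
    # end=2 against start=2 and adds (1, 3); the second iteration adds nothing
    # new, so len(reachable) stops growing and the loop exits at the fixed
    # point {(1, 2), (2, 3), (1, 3)}. max_depth bounds the path length explored.
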
    def _execute_path_negated(
        self,
        path: PathNegatedPropertySet,
        start_id: Optional[int],
        end_id: Optional[int],
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """Execute a negated property set !(a|b) - any predicate except these."""
        # Get excluded predicate IDs
        excluded_ids = set()
        for iri in path.iris:
            pred_id = self._resolve_term_id(iri, prefixes)
            if pred_id is not None:
                excluded_ids.add(pred_id)

        df = self.fact_store.scan_facts()

        # Exclude the specified predicates
        if excluded_ids:
            df = df.filter(~pl.col("p").is_in(list(excluded_ids)))

        if start_id is not None:
            df = df.filter(pl.col("s") == start_id)
        if end_id is not None:
            df = df.filter(pl.col("o") == end_id)

        return df.select([
            pl.col("s").alias("start"),
            pl.col("o").alias("end")
        ])

    def _get_all_nodes(self) -> list[int]:
        """Get all unique node IDs (subjects and objects)."""
        df = self.fact_store.scan_facts()
        subjects = df.select("s").unique()["s"].to_list()
        objects = df.select("o").unique()["o"].to_list()
        return list(set(subjects) | set(objects))

    # =========================================================================
    # MINUS Pattern Execution
    # =========================================================================

    def _apply_minus(
        self,
        result_df: pl.DataFrame,
        minus: MinusPattern,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply a MINUS pattern to filter out matching solutions.

        MINUS implements set difference: returns rows from result_df
        that don't have compatible bindings in the minus pattern.

        SPARQL semantics: A solution µ1 is removed if there exists
        a solution µ2 in the MINUS clause such that:
        - µ1 and µ2 are compatible (agree on shared variables)
        - dom(µ1) ∩ dom(µ2) ≠ ∅ (they share at least one variable)
        """
        if len(result_df) == 0:
            return result_df

        # Execute the MINUS patterns to get solutions to exclude
        minus_df: Optional[pl.DataFrame] = None

        for i, pattern in enumerate(minus.patterns):
            if isinstance(pattern, QuotedTriplePattern):
                pattern_df = self._execute_quoted_pattern(pattern, prefixes, i)
            else:
                pattern_df = self._execute_pattern(pattern, prefixes, i)

            if minus_df is None:
                minus_df = pattern_df
            else:
                # Join on shared variables
                shared_cols = set(minus_df.columns) & set(pattern_df.columns)
                shared_cols -= {"_pattern_idx"}

                if shared_cols:
                    minus_df = minus_df.join(
                        pattern_df,
                        on=list(shared_cols),
                        how="inner"
                    )
                else:
                    minus_df = minus_df.join(pattern_df, how="cross")

        if minus_df is None or len(minus_df) == 0:
            return result_df

        # Apply filters from the MINUS clause
        for filter_clause in minus.filters:
            if isinstance(filter_clause, Filter):
                minus_df = self._apply_filter(minus_df, filter_clause, prefixes)

        # Find shared variables between result and minus
        shared_vars = set(result_df.columns) & set(minus_df.columns)
        shared_vars = {c for c in shared_vars if not c.startswith("_")}

        if not shared_vars:
            # No shared variables - MINUS has no effect (SPARQL semantics)
            return result_df

        # Perform anti-join: keep rows from result_df that don't match minus_df
        # We do this with a left join and then filter for nulls
        shared_list = list(shared_vars)

        # Add a marker column to minus_df to detect matches
        minus_df = minus_df.select(shared_list).unique()
        minus_df = minus_df.with_columns(pl.lit(True).alias("_minus_match"))

        # Left join
        result_df = result_df.join(
            minus_df,
            on=shared_list,
            how="left"
        )

        # Keep only rows where there was no match
        result_df = result_df.filter(pl.col("_minus_match").is_null())
        result_df = result_df.drop("_minus_match")

        return result_df

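    # Worked example (not part of the original module): given hypothetical bindings
    #     result_df: {person: ex:alice, age: 30}, {person: ex:bob, age: 25}
    #     minus_df:  {person: ex:alice}
    # the shared variable is "person", so the left join marks the alice row,
    # the null filter drops it, and only the bob row survives. With no shared
    # variables the method returns result_df unchanged, per SPARQL MINUS.
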
    def _apply_optional(
        self,
        result_df: pl.DataFrame,
        optional: OptionalPattern,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply an OPTIONAL pattern using left outer join.

        OPTIONAL { ... } patterns add bindings when matched but keep
        rows even when no match exists (with NULL for optional columns).
        """
        if len(result_df) == 0:
            return result_df

        # Execute the optional patterns
        optional_df: Optional[pl.DataFrame] = None

        for i, pattern in enumerate(optional.patterns):
            if isinstance(pattern, QuotedTriplePattern):
                pattern_df = self._execute_quoted_pattern(pattern, prefixes, 1000 + i)
            elif isinstance(pattern, TriplePattern):
                pattern_df = self._execute_pattern(pattern, prefixes, 1000 + i)
            else:
                continue  # Skip nested patterns for now

            if optional_df is None:
                optional_df = pattern_df
            else:
                shared_cols = set(optional_df.columns) & set(pattern_df.columns)
                shared_cols -= {"_pattern_idx"}
                shared_cols = {c for c in shared_cols if not c.startswith("_")}

                if shared_cols:
                    optional_df = optional_df.join(pattern_df, on=list(shared_cols), how="inner")
                else:
                    optional_df = optional_df.join(pattern_df, how="cross")

        if optional_df is None or len(optional_df) == 0:
            return result_df

        # Apply filters within the optional block
        for filter_clause in optional.filters:
            if isinstance(filter_clause, Filter):
                optional_df = self._apply_filter(optional_df, filter_clause, prefixes)

        # Remove internal columns from optional_df
        internal_cols = [c for c in optional_df.columns if c.startswith("_")]
        if internal_cols:
            optional_df = optional_df.drop(internal_cols)

        # Find shared columns for the join
        shared_cols = set(result_df.columns) & set(optional_df.columns)
        shared_cols = {c for c in shared_cols if not c.startswith("_")}

        if shared_cols:
            # Left outer join - keep all rows from result_df, add optional columns where matched
            return result_df.join(optional_df, on=list(shared_cols), how="left")
        else:
            # No shared columns - return original
            return result_df

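    # Worked example (not part of the original module): for
    #     { ?s ex:name ?name OPTIONAL { ?s ex:email ?email } }
    # the left join on the shared "s" column keeps every ?s/?name row and
    # fills ?email with null where the optional pattern found no match,
    # which _decode_result later preserves as an unbound value.
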
    def _apply_union_standalone(
        self,
        union: UnionPattern,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply a UNION pattern as the primary query (no prior results).

        This is used when WHERE clause starts with a UNION.
        """
        union_results = []

        for i, alternative in enumerate(union.alternatives):
            # Execute each alternative as a mini WHERE clause
            alt_where = WhereClause(patterns=alternative)
            alt_df = self._execute_where(alt_where, prefixes)

            if len(alt_df) > 0:
                union_results.append(alt_df)

        if not union_results:
            return pl.DataFrame()

        # Combine all union results
        if len(union_results) == 1:
            return union_results[0]

        # Align schemas - add missing columns with null values
        all_columns = set()
        for r in union_results:
            all_columns.update(r.columns)

        aligned_results = []
        for r in union_results:
            missing_cols = all_columns - set(r.columns)
            if missing_cols:
                # Add null columns for missing variables
                for col in missing_cols:
                    r = r.with_columns(pl.lit(None).alias(col))
            aligned_results.append(r.select(sorted(all_columns)))

        return pl.concat(aligned_results)

    def _apply_union(
        self,
        result_df: pl.DataFrame,
        union: UnionPattern,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply a UNION pattern by combining results from alternatives.

        UNION combines results from multiple pattern groups:
        { ?s ?p ?o } UNION { ?s ?q ?r }

        Returns all rows matching ANY of the alternatives.
        """
        union_results = []

        for i, alternative in enumerate(union.alternatives):
            # Execute each alternative as a mini WHERE clause
            alt_where = WhereClause(patterns=alternative)
            alt_df = self._execute_where(alt_where, prefixes)

            if len(alt_df) > 0:
                union_results.append(alt_df)

        if not union_results:
            return result_df

        # Combine all union results
        if len(union_results) == 1:
            union_df = union_results[0]
        else:
            # Align schemas - add missing columns with null values
            all_columns = set()
            for r in union_results:
                all_columns.update(r.columns)

            aligned_results = []
            for r in union_results:
                missing_cols = all_columns - set(r.columns)
                if missing_cols:
                    # Add null columns for missing variables
                    for col in missing_cols:
                        r = r.with_columns(pl.lit(None).alias(col))
                aligned_results.append(r.select(sorted(all_columns)))

            union_df = pl.concat(aligned_results)

        # If we have existing results, combine them with union
        if len(result_df) > 0 and len(result_df.columns) > 0:
            # Find shared columns
            shared_cols = set(result_df.columns) & set(union_df.columns)
            shared_cols = {c for c in shared_cols if not c.startswith("_")}

            if shared_cols:
                # Join union results with existing results
                return result_df.join(union_df, on=list(shared_cols), how="inner")
            else:
                # No shared columns - cross join
                return result_df.join(union_df, how="cross")
        else:
            return union_df

    def _execute_quoted_pattern(
        self,
        pattern: QuotedTriplePattern,
        prefixes: dict[str, str],
        pattern_idx: int
    ) -> pl.DataFrame:
        """
        Execute a quoted triple pattern (Q6: << s p o >> ?mp ?mo).

        This is the key RDF★ expansion pattern that finds metadata
        about quoted triples.
        """
        # Get the quoted triple components
        s_term = pattern.subject
        p_term = pattern.predicate
        o_term = pattern.object

        # Check if the quoted triple itself is concrete or has variables
        qt_s_id = None if isinstance(s_term, Variable) else self._resolve_term_id(s_term, prefixes)
        qt_p_id = None if isinstance(p_term, Variable) else self._resolve_term_id(p_term, prefixes)
        qt_o_id = None if isinstance(o_term, Variable) else self._resolve_term_id(o_term, prefixes)

        # If all components are concrete, look up the qt_id
        if qt_s_id is not None and qt_p_id is not None and qt_o_id is not None:
            qt_id = self.qt_dict.lookup_id(qt_s_id, qt_p_id, qt_o_id)
            if qt_id is None:
                return pl.DataFrame()  # Quoted triple not found

            # Find facts where this qt_id appears as subject
            df = self.fact_store.scan_facts().filter(pl.col("s") == qt_id)
        else:
            # Need to join with qt_dict to expand
            df = self._expand_qt_metadata(qt_s_id, qt_p_id, qt_o_id)

        # Rename predicate/object vars if they exist in outer pattern
        # (for << s p o >> ?mp ?mo patterns)
        renames = {}
        select_cols = []

        # The quoted triple's metadata predicate/object
        if "p" in df.columns:
            renames["p"] = "mp" if pattern_idx == 0 else f"mp_{pattern_idx}"
            select_cols.append("p")
        if "o" in df.columns:
            renames["o"] = "mo" if pattern_idx == 0 else f"mo_{pattern_idx}"
            select_cols.append("o")

        # Include base triple components if variables
        if isinstance(s_term, Variable) and "base_s" in df.columns:
            renames["base_s"] = s_term.name
            select_cols.append("base_s")
        if isinstance(p_term, Variable) and "base_p" in df.columns:
            renames["base_p"] = p_term.name
            select_cols.append("base_p")
        if isinstance(o_term, Variable) and "base_o" in df.columns:
            renames["base_o"] = o_term.name
            select_cols.append("base_o")

        if select_cols:
            result = df.select(select_cols).rename(renames)
        else:
            result = df

        return result

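    # Illustrative Q6 expansion (not part of the original module): for
    #     << ex:alice ex:age 30 >> ?mp ?mo
    # all three components are concrete, so qt_dict.lookup_id() yields the
    # quoted-triple ID and the fact scan returns one row per metadata fact
    # attached to it, e.g. (after decoding, with hypothetical values)
    #     {mp: ex:source, mo: ex:hr_system}, {mp: ex:confidence, mo: 0.9}
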
    def _expand_qt_metadata(
        self,
        qt_s_id: Optional[TermId],
        qt_p_id: Optional[TermId],
        qt_o_id: Optional[TermId]
    ) -> pl.DataFrame:
        """
        Expand quoted triple metadata with optional filters on components.

        This implements the RDF★ expansion join: find metadata about
        quoted triples, optionally filtering by their s/p/o components.
        """
        # Get facts about quoted triples (metadata facts)
        df = self.fact_store.scan_metadata_facts()

        if len(df) == 0:
            return df

        # Expand qt_id to base (s, p, o)
        df = self.fact_store.expand_qt_metadata(df, self.qt_dict)

        # Apply filters on quoted triple components
        if qt_s_id is not None:
            df = df.filter(pl.col("base_s") == qt_s_id)
        if qt_p_id is not None:
            df = df.filter(pl.col("base_p") == qt_p_id)
        if qt_o_id is not None:
            df = df.filter(pl.col("base_o") == qt_o_id)

        return df

    def _resolve_term_id(
        self,
        term: Term,
        prefixes: dict[str, str]
    ) -> Optional[TermId]:
        """Resolve a term to its integer ID."""
        if isinstance(term, IRI):
            iri = self._expand_iri(term.value, prefixes)
            return self.term_dict.lookup_iri(iri)
        elif isinstance(term, Literal):
            return self.term_dict.lookup_literal(
                str(term.value),
                term.datatype,
                term.language
            )
        elif isinstance(term, BlankNode):
            return self.term_dict.lookup_bnode(term.label)
        return None

    def _expand_iri(self, iri: str, prefixes: dict[str, str]) -> str:
        """Expand prefixed IRI to full form."""
        if ":" in iri and not iri.startswith("http"):
            prefix, local = iri.split(":", 1)
            if prefix in prefixes:
                return prefixes[prefix] + local
        return iri

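    # Illustrative expansion (not part of the original module): with
    # prefixes = {"foaf": "http://xmlns.com/foaf/0.1/"}, the call
    # _expand_iri("foaf:name", prefixes) returns
    # "http://xmlns.com/foaf/0.1/name", while an already-absolute
    # "http://example.org/x" is returned unchanged.
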
    def _decode_result(self, df: pl.DataFrame) -> pl.DataFrame:
        """Decode integer term IDs back to lexical forms for output."""
        if len(df) == 0:
            return df

        # Find columns that contain term IDs (variables and metadata predicates)
        id_columns = [c for c in df.columns if not c.startswith("_prov")]

        for col in id_columns:
            if df.schema[col] == pl.UInt64:
                # Decode this column
                decoded = []
                for term_id in df[col].to_list():
                    if term_id is None:
                        decoded.append(None)  # Keep NULL for OPTIONAL non-matches
                    else:
                        lex = self.term_dict.get_lex(term_id)
                        decoded.append(lex if lex else f"<unknown:{term_id}>")
                df = df.with_columns(pl.Series(col, decoded))

        return df

    def _apply_filter(
        self,
        df: pl.DataFrame,
        filter_clause: Filter,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply a FILTER clause.

        For filters involving literal comparisons (especially numeric),
        we need to decode the term IDs to their actual values first,
        apply the filter, then re-encode if needed.
        """
        # Get variables used in the filter expression
        filter_vars = self._get_filter_variables(filter_clause.expression)

        # Check if all required variables exist in the dataframe
        missing_vars = [v for v in filter_vars if v.name not in df.columns]
        if missing_vars:
            # If any filter variable is missing, no rows can match
            return df.head(0)

        # Decode the filter-relevant columns to actual values
        decoded_df = df.clone()
        for var in filter_vars:
            if var.name in decoded_df.columns:
                col = decoded_df[var.name]
                if col.dtype == pl.UInt64:
                    # Decode this column to its lexical values
                    decoded_values = []
                    for term_id in col.to_list():
                        if term_id is None:
                            decoded_values.append(None)
                        else:
                            lex = self.term_dict.get_lex(term_id)
                            # Try to convert to numeric if possible
                            if lex is not None:
                                try:
                                    # Try int first, then float
                                    decoded_values.append(int(lex))
                                except ValueError:
                                    try:
                                        decoded_values.append(float(lex))
                                    except ValueError:
                                        decoded_values.append(lex)
                            else:
                                decoded_values.append(None)
                    decoded_df = decoded_df.with_columns(
                        pl.Series(f"_decoded_{var.name}", decoded_values)
                    )
                else:
                    # Column already decoded (e.g., string type) - just alias it
                    decoded_df = decoded_df.with_columns(
                        pl.col(var.name).alias(f"_decoded_{var.name}")
                    )

        # Build filter expression using decoded columns
        expr = self._build_filter_expression_decoded(filter_clause.expression, prefixes)
        if expr is not None:
            # Filter using decoded values, keep original columns
            filtered = decoded_df.filter(expr)
            # Drop the decoded columns
            decoded_cols = [c for c in filtered.columns if c.startswith("_decoded_")]
            if decoded_cols:
                filtered = filtered.drop(decoded_cols)
            return filtered

        return df

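    # Illustrative decode-then-filter (not part of the original module): for
    # FILTER(?age > 21), the "age" column of term IDs is decoded to Python
    # ints where possible, compared as pl.col("_decoded_age") > pl.lit(21) on
    # a temporary helper column, and the helper columns are dropped afterwards
    # so the surviving rows keep their original integer-ID encoding.
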
    def _get_filter_variables(self, expr) -> set:
        """Get all variables referenced in a filter expression."""
        from rdf_starbase.sparql.ast import Variable, Comparison, LogicalExpression

        variables = set()
        if isinstance(expr, Variable):
            variables.add(expr)
        elif isinstance(expr, Comparison):
            if isinstance(expr.left, Variable):
                variables.add(expr.left)
            if isinstance(expr.right, Variable):
                variables.add(expr.right)
        elif isinstance(expr, LogicalExpression):
            for operand in expr.operands:
                variables.update(self._get_filter_variables(operand))
        return variables

    def _build_filter_expression_decoded(
        self,
        expr: Union[Comparison, LogicalExpression, FunctionCall],
        prefixes: dict[str, str]
    ) -> Optional[pl.Expr]:
        """Build Polars filter expression using decoded column names."""
        if isinstance(expr, Comparison):
            left = self._term_to_expr_decoded(expr.left, prefixes)
            right = self._term_to_expr_decoded(expr.right, prefixes)

            if left is None or right is None:
                return None

            op_map = {
                ComparisonOp.EQ: lambda l, r: l == r,
                ComparisonOp.NE: lambda l, r: l != r,
                ComparisonOp.LT: lambda l, r: l < r,
                ComparisonOp.LE: lambda l, r: l <= r,
                ComparisonOp.GT: lambda l, r: l > r,
                ComparisonOp.GE: lambda l, r: l >= r,
            }

            return op_map[expr.operator](left, right)

        elif isinstance(expr, LogicalExpression):
            operand_exprs = [
                self._build_filter_expression_decoded(op, prefixes)
                for op in expr.operands
            ]
            valid_exprs = [e for e in operand_exprs if e is not None]

            if not valid_exprs:
                return None

            if expr.operator == LogicalOp.AND:
                result = valid_exprs[0]
                for e in valid_exprs[1:]:
                    result = result & e
                return result
            elif expr.operator == LogicalOp.OR:
                result = valid_exprs[0]
                for e in valid_exprs[1:]:
                    result = result | e
                return result
            elif expr.operator == LogicalOp.NOT:
                return ~valid_exprs[0]

        return None

    def _term_to_expr_decoded(
        self,
        term: Term,
        prefixes: dict[str, str]
    ) -> Optional[pl.Expr]:
        """Convert a term to a Polars expression using decoded column names."""
        if isinstance(term, Variable):
            # Use the decoded column if it exists
            return pl.col(f"_decoded_{term.name}")
        elif isinstance(term, Literal):
            # Convert literal to appropriate type
            try:
                return pl.lit(int(term.value))
            except ValueError:
                try:
                    return pl.lit(float(term.value))
                except ValueError:
                    return pl.lit(term.value)
        elif isinstance(term, IRI):
            # For IRI comparisons, use the full IRI string
            iri_str = self._expand_iri(term.value, prefixes)
            return pl.lit(iri_str)
        return None

    def _apply_provenance_filter(
        self,
        df: pl.DataFrame,
        filter_clause: ProvenanceFilter
    ) -> pl.DataFrame:
        """Apply provenance FILTER (FILTER_CONFIDENCE, FILTER_SOURCE, etc.)."""
        field = filter_clause.provenance_field
        prov_cols = [c for c in df.columns if c.endswith(f"_{field}")]

        if not prov_cols:
            return df

        expr = filter_clause.expression
        if isinstance(expr, Comparison):
            combined_expr = None
            for col in prov_cols:
                col_expr = self._build_provenance_comparison(expr, col)
                if col_expr is not None:
                    if combined_expr is None:
                        combined_expr = col_expr
                    else:
                        combined_expr = combined_expr | col_expr

            if combined_expr is not None:
                return df.filter(combined_expr)

        return df

def _build_provenance_comparison(
|
|
1404
|
+
self,
|
|
1405
|
+
expr: Comparison,
|
|
1406
|
+
prov_col: str
|
|
1407
|
+
) -> Optional[pl.Expr]:
|
|
1408
|
+
"""Build comparison expression for provenance filtering."""
|
|
1409
|
+
if isinstance(expr.left, Variable):
|
|
1410
|
+
left = pl.col(prov_col)
|
|
1411
|
+
right = self._literal_to_polars(expr.right)
|
|
1412
|
+
elif isinstance(expr.right, Variable):
|
|
1413
|
+
left = self._literal_to_polars(expr.left)
|
|
1414
|
+
right = pl.col(prov_col)
|
|
1415
|
+
else:
|
|
1416
|
+
left = self._literal_to_polars(expr.left)
|
|
1417
|
+
right = self._literal_to_polars(expr.right)
|
|
1418
|
+
|
|
1419
|
+
if left is None or right is None:
|
|
1420
|
+
return None
|
|
1421
|
+
|
|
1422
|
+
op_map = {
|
|
1423
|
+
ComparisonOp.EQ: lambda l, r: l == r,
|
|
1424
|
+
ComparisonOp.NE: lambda l, r: l != r,
|
|
1425
|
+
ComparisonOp.LT: lambda l, r: l < r,
|
|
1426
|
+
ComparisonOp.LE: lambda l, r: l <= r,
|
|
1427
|
+
ComparisonOp.GT: lambda l, r: l > r,
|
|
1428
|
+
ComparisonOp.GE: lambda l, r: l >= r,
|
|
1429
|
+
}
|
|
1430
|
+
|
|
1431
|
+
return op_map[expr.operator](left, right)
|
|
1432
|
+
|
|
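    # Illustrative sketch (column names below are hypothetical; only the
    # "_<field>" suffix matters): with provenance_field == "confidence" and a
    # comparison like ?c >= 0.9, _apply_provenance_filter builds one expression
    # per matching column and keeps a row if ANY of them satisfies it:
    #
    #   (pl.col("stmt1_confidence") >= 0.9) | (pl.col("stmt2_confidence") >= 0.9)
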
    def _build_filter_expression(
        self,
        expr: Union[Comparison, LogicalExpression, FunctionCall],
        prefixes: dict[str, str]
    ) -> Optional[pl.Expr]:
        """Build Polars filter expression from SPARQL filter AST."""
        if isinstance(expr, Comparison):
            left = self._term_to_expr(expr.left, prefixes)
            right = self._term_to_expr(expr.right, prefixes)

            if left is None or right is None:
                return None

            op_map = {
                ComparisonOp.EQ: lambda l, r: l == r,
                ComparisonOp.NE: lambda l, r: l != r,
                ComparisonOp.LT: lambda l, r: l < r,
                ComparisonOp.LE: lambda l, r: l <= r,
                ComparisonOp.GT: lambda l, r: l > r,
                ComparisonOp.GE: lambda l, r: l >= r,
            }

            return op_map[expr.operator](left, right)

        elif isinstance(expr, LogicalExpression):
            operand_exprs = [
                self._build_filter_expression(op, prefixes)
                for op in expr.operands
            ]
            valid_exprs = [e for e in operand_exprs if e is not None]

            if not valid_exprs:
                return None

            if expr.operator == LogicalOp.AND:
                result = valid_exprs[0]
                for e in valid_exprs[1:]:
                    result = result & e
                return result
            elif expr.operator == LogicalOp.OR:
                result = valid_exprs[0]
                for e in valid_exprs[1:]:
                    result = result | e
                return result
            elif expr.operator == LogicalOp.NOT:
                return ~valid_exprs[0]

        return None

    def _term_to_expr(
        self,
        term: Term,
        prefixes: dict[str, str]
    ) -> Optional[pl.Expr]:
        """Convert a term to a Polars expression."""
        if isinstance(term, Variable):
            return pl.col(term.name)
        elif isinstance(term, Literal):
            return pl.lit(term.value)
        elif isinstance(term, IRI):
            # For IRI comparisons, use the term ID
            term_id = self._resolve_term_id(term, prefixes)
            return pl.lit(term_id) if term_id else None
        return None

    def _literal_to_polars(self, term: Term) -> Optional[Any]:
        """Convert a literal term to a Polars literal."""
        if isinstance(term, Literal):
            return pl.lit(term.value)
        elif isinstance(term, Variable):
            return pl.col(term.name)
        return None


# === RDF★ Expansion Query Patterns ===
# These implement Q6-Q12 from the SPARQL-Star test suite

class ExpansionPatterns:
    """
    Factory for common RDF★ expansion query patterns.

    These patterns efficiently query metadata about quoted triples
    and expand them back to (s, p, o) components.
    """

    def __init__(
        self,
        term_dict: TermDict,
        qt_dict: QtDict,
        fact_store: FactStore
    ):
        self.term_dict = term_dict
        self.qt_dict = qt_dict
        self.fact_store = fact_store

    def q6_metadata_for_triple(
        self,
        subject: str,
        predicate: str,
        obj: str
    ) -> pl.DataFrame:
        """
        Q6: Fetch all metadata about a specific quoted triple.

        SELECT ?mp ?mo WHERE {
            << subject predicate object >> ?mp ?mo .
        }
        """
        # Look up term IDs
        s_id = self.term_dict.lookup_iri(subject)
        p_id = self.term_dict.lookup_iri(predicate)
        o_id = self.term_dict.lookup_iri(obj)

        if s_id is None or p_id is None or o_id is None:
            return pl.DataFrame({"mp": [], "mo": []})

        # Look up the quoted triple
        qt_id = self.qt_dict.get_id(s_id, p_id, o_id)
        if qt_id is None:
            return pl.DataFrame({"mp": [], "mo": []})

        # Find facts where qt_id is the subject
        df = self.fact_store.scan_facts().filter(pl.col("s") == qt_id)

        # Decode predicates and objects
        result = []
        for row in df.iter_rows(named=True):
            mp = self.term_dict.get_lex(row["p"])
            mo = self.term_dict.get_lex(row["o"])
            result.append({"mp": mp, "mo": mo})

        return pl.DataFrame(result) if result else pl.DataFrame({"mp": [], "mo": []})

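    # Example usage (hypothetical IRIs; assumes the quoted triple was asserted
    # with metadata and that populated TermDict/QtDict/FactStore instances are
    # at hand):
    #
    #   patterns = ExpansionPatterns(term_dict, qt_dict, fact_store)
    #   meta = patterns.q6_metadata_for_triple(
    #       "http://example.org/alice",
    #       "http://example.org/worksFor",
    #       "http://example.org/acme",
    #   )
    #   # -> DataFrame with columns "mp" (metadata predicate) and "mo"
    #   #    (metadata object), one row per metadata fact about the triple.
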
    def q7_expand_by_source(self, source_uri: str) -> pl.DataFrame:
        """
        Q7: Given a source, find all quoted triples derived from it
        and expand them to base (s, p, o).

        SELECT ?s ?p ?o WHERE {
            ?qt prov:wasDerivedFrom source_uri .
            # Expand the quoted triple
        }
        """
        # Look up the source term
        source_id = self.term_dict.lookup_iri(source_uri)
        if source_id is None:
            return pl.DataFrame({"s": [], "p": [], "o": []})

        # Look up prov:wasDerivedFrom predicate
        prov_pred = self.term_dict.lookup_iri(
            "http://www.w3.org/ns/prov#wasDerivedFrom"
        )
        if prov_pred is None:
            return pl.DataFrame({"s": [], "p": [], "o": []})

        # Find metadata facts with this predicate and source
        df = self.fact_store.scan_metadata_facts()
        df = df.filter(
            (pl.col("p") == prov_pred) &
            (pl.col("o") == source_id)
        )

        # Expand qt_ids to (s, p, o)
        df = self.fact_store.expand_metadata_df(df)

        # Decode to lexical forms
        result = []
        for row in df.iter_rows(named=True):
            s = self.term_dict.get_lex(row["base_s"])
            p = self.term_dict.get_lex(row["base_p"])
            o = self.term_dict.get_lex(row["base_o"])
            result.append({"s": s, "p": p, "o": o})

        return pl.DataFrame(result) if result else pl.DataFrame({"s": [], "p": [], "o": []})

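    # Example usage (hypothetical source IRI):
    #
    #   derived = patterns.q7_expand_by_source("http://example.org/datasets/dump-1")
    #   # -> DataFrame with columns "s", "p", "o": the base triples whose quoted
    #   #    form carries prov:wasDerivedFrom pointing at that source.
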
    def q8_expand_by_activity(self, activity_uri: str) -> pl.DataFrame:
        """
        Q8: List all statements generated by a given run/activity and expand.

        SELECT ?s ?p ?o WHERE {
            ?qt prov:wasGeneratedBy activity_uri .
        }
        """
        activity_id = self.term_dict.lookup_iri(activity_uri)
        if activity_id is None:
            return pl.DataFrame({"s": [], "p": [], "o": []})

        gen_pred = self.term_dict.lookup_iri(
            "http://www.w3.org/ns/prov#wasGeneratedBy"
        )
        if gen_pred is None:
            return pl.DataFrame({"s": [], "p": [], "o": []})

        df = self.fact_store.scan_metadata_facts()
        df = df.filter(
            (pl.col("p") == gen_pred) &
            (pl.col("o") == activity_id)
        )

        df = self.fact_store.expand_metadata_df(df)

        result = []
        for row in df.iter_rows(named=True):
            s = self.term_dict.get_lex(row["base_s"])
            p = self.term_dict.get_lex(row["base_p"])
            o = self.term_dict.get_lex(row["base_o"])
            result.append({"s": s, "p": p, "o": o})

        return pl.DataFrame(result) if result else pl.DataFrame({"s": [], "p": [], "o": []})

    def q9_filter_by_confidence(
        self,
        min_confidence: float,
        max_confidence: Optional[float] = None,
        expand_lex: bool = True,
    ) -> pl.DataFrame:
        """
        Q9: Filter statements by confidence and expand.

        SELECT ?s ?p ?o ?c WHERE {
            ?qt ex:confidence ?c .
            FILTER(?c > min_confidence)
        }

        Uses pure Polars join for vectorized performance.

        Args:
            min_confidence: Minimum confidence threshold (exclusive)
            max_confidence: Maximum confidence threshold (inclusive, optional)
            expand_lex: If True, return lexical forms. If False, return term IDs (faster).
        """
        conf_pred = self.term_dict.lookup_iri("http://example.org/confidence")
        if conf_pred is None:
            cols = {"s": [], "p": [], "o": [], "c": []}
            return pl.DataFrame(cols)

        # Get confidence facts
        df = self.fact_store.scan_metadata_facts()
        df = df.filter(pl.col("p") == conf_pred)

        if df.is_empty():
            cols = {"s": [], "p": [], "o": [], "c": []}
            return pl.DataFrame(cols)

        # Build float map as a Polars DataFrame for vectorized join
        float_map = self.term_dict.build_literal_to_float_map()
        if not float_map:
            cols = {"s": [], "p": [], "o": [], "c": []}
            return pl.DataFrame(cols)

        # Create lookup DataFrame: term_id -> float value
        map_df = pl.DataFrame({
            "term_id": list(float_map.keys()),
            "conf_value": list(float_map.values()),
        }).cast({"term_id": pl.UInt64, "conf_value": pl.Float64})

        # Join to get confidence values (pure Polars, no Python iteration!)
        df = df.join(map_df, left_on="o", right_on="term_id", how="inner")

        # Filter by confidence threshold (vectorized!)
        df = df.filter(pl.col("conf_value") > min_confidence)
        if max_confidence is not None:
            df = df.filter(pl.col("conf_value") <= max_confidence)

        if df.is_empty():
            cols = {"s": [], "p": [], "o": [], "c": []}
            return pl.DataFrame(cols)

        # Expand to get base triple components
        df = self.fact_store.expand_metadata_df(df)

        if not expand_lex:
            # Return term IDs directly (much faster for large results)
            return df.select([
                pl.col("base_s").alias("s"),
                pl.col("base_p").alias("p"),
                pl.col("base_o").alias("o"),
                pl.col("conf_value").alias("c"),
            ])

        # Map term IDs to lexical forms using vectorized lookup
        s_lex = self.term_dict.get_lex_series(df["base_s"])
        p_lex = self.term_dict.get_lex_series(df["base_p"])
        o_lex = self.term_dict.get_lex_series(df["base_o"])

        return pl.DataFrame({
            "s": s_lex,
            "p": p_lex,
            "o": o_lex,
            "c": df["conf_value"],
        })

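    # The join-based idea above, in isolation (a minimal sketch with made-up
    # toy data, independent of the store schema):
    #
    #   facts = pl.DataFrame({"qt": [1, 2, 3], "o": [10, 11, 12]})   # o = literal term id
    #   conf = pl.DataFrame({"term_id": [10, 11, 12],
    #                        "conf_value": [0.4, 0.8, 0.95]})
    #   facts.join(conf, left_on="o", right_on="term_id", how="inner") \
    #        .filter(pl.col("conf_value") > 0.7)
    #   # -> keeps the rows for qt 2 and 3, entirely inside the Polars engine.
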
    def q9_count_by_confidence(
        self,
        min_confidence: float,
        max_confidence: Optional[float] = None,
    ) -> int:
        """
        Count statements above confidence threshold (fast).

        SELECT (COUNT(*) as ?count) WHERE {
            ?qt ex:confidence ?c .
            FILTER(?c > min_confidence)
        }
        """
        df = self.q9_filter_by_confidence(min_confidence, max_confidence, expand_lex=False)
        return len(df)

    def q9_native_filter_by_confidence(
        self,
        min_confidence: float,
        max_confidence: Optional[float] = None,
        expand_lex: bool = True,
    ) -> pl.DataFrame:
        """
        Q9 (Native): Filter facts by confidence using native column.

        This is the FAST version that uses the native `confidence` column
        in the FactStore schema, avoiding any string parsing or joins.

        Use this when facts were ingested with `add_facts_with_provenance()`
        which stores confidence directly in the native column.

        SELECT ?s ?p ?o ?c WHERE {
            FILTER(confidence > min_confidence)
        }

        Args:
            min_confidence: Minimum confidence threshold (exclusive)
            max_confidence: Maximum confidence threshold (inclusive, optional)
            expand_lex: If True, return lexical forms. If False, return term IDs.
        """
        # Pure vectorized scan on native column - no joins!
        df = self.fact_store.scan_by_confidence(
            min_confidence,
            max_confidence,
            include_metadata=False,  # Only base facts
        )

        if df.is_empty():
            return pl.DataFrame({"s": [], "p": [], "o": [], "c": []})

        if not expand_lex:
            return df.select([
                pl.col("s"),
                pl.col("p"),
                pl.col("o"),
                pl.col("confidence").alias("c"),
            ])

        # Map term IDs to lexical forms
        s_lex = self.term_dict.get_lex_series(df["s"])
        p_lex = self.term_dict.get_lex_series(df["p"])
        o_lex = self.term_dict.get_lex_series(df["o"])

        return pl.DataFrame({
            "s": s_lex,
            "p": p_lex,
            "o": o_lex,
            "c": df["confidence"],
        })

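    # Design note (behaviour of scan_by_confidence assumed from its use here):
    # the native path skips the literal-to-float join entirely because
    # `confidence` is already a numeric column on the fact table, so the whole
    # query reduces to a single vectorized predicate of the form
    #
    #   pl.col("confidence") > min_confidence
    #
    # Prefer the q9_native_* variants when facts were loaded via
    # add_facts_with_provenance(); the join-based q9_filter_by_confidence is
    # the fallback when confidence only exists as literal metadata terms.
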
    def q9_native_count(
        self,
        min_confidence: float,
        max_confidence: Optional[float] = None,
    ) -> int:
        """
        Count facts by confidence using native column (fastest).
        """
        df = self.fact_store.scan_by_confidence(
            min_confidence,
            max_confidence,
            include_metadata=False,
        )
        return len(df)

    def q10_filter_by_time_range(
        self,
        start: datetime,
        end: datetime
    ) -> pl.DataFrame:
        """
        Q10: Filter by time range on metadata.

        SELECT ?qt ?t WHERE {
            ?qt prov:generatedAtTime ?t .
            FILTER(?t >= start && ?t < end)
        }
        """
        time_pred = self.term_dict.lookup_iri(
            "http://www.w3.org/ns/prov#generatedAtTime"
        )
        if time_pred is None:
            return pl.DataFrame({"qt": [], "t": []})

        df = self.fact_store.scan_metadata_facts()
        df = df.filter(pl.col("p") == time_pred)

        result = []
        for row in df.iter_rows(named=True):
            time_lex = self.term_dict.get_lex(row["o"])
            if time_lex is None:
                continue
            try:
                # Parse ISO datetime
                t = datetime.fromisoformat(time_lex.replace("Z", "+00:00"))
                if start <= t < end:
                    qt_lex = self._qt_to_string(row["s"])
                    result.append({"qt": qt_lex, "t": time_lex})
            except ValueError:
                continue

        return pl.DataFrame(result) if result else pl.DataFrame({"qt": [], "t": []})

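    # Example: the "Z" handling above exists because datetime.fromisoformat()
    # only accepts a trailing "Z" from Python 3.11 onward; replacing it with
    # "+00:00" yields a timezone-aware value on older versions too:
    #
    #   datetime.fromisoformat("2024-05-01T12:00:00Z".replace("Z", "+00:00"))
    #   # -> 2024-05-01 12:00:00+00:00 (tzinfo=timezone.utc)
    #
    # Note that `start` and `end` must then also be timezone-aware, otherwise
    # the `start <= t < end` comparison raises TypeError and the row is lost.
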
    def q11_count_by_source(self) -> pl.DataFrame:
        """
        Q11: Count statements per source.

        SELECT ?src (COUNT(?qt) AS ?n) WHERE {
            ?qt prov:wasDerivedFrom ?src .
        } GROUP BY ?src ORDER BY DESC(?n)
        """
        prov_pred = self.term_dict.lookup_iri(
            "http://www.w3.org/ns/prov#wasDerivedFrom"
        )
        if prov_pred is None:
            return pl.DataFrame({"src": [], "n": []})

        df = self.fact_store.scan_metadata_facts()
        df = df.filter(pl.col("p") == prov_pred)

        # Group by source (object) and count
        grouped = df.group_by("o").agg(pl.len().alias("n"))
        grouped = grouped.sort("n", descending=True)

        # Decode source URIs
        result = []
        for row in grouped.iter_rows(named=True):
            src = self.term_dict.get_lex(row["o"])
            result.append({"src": src, "n": row["n"]})

        return pl.DataFrame(result) if result else pl.DataFrame({"src": [], "n": []})

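    # The grouping step above in isolation (toy term ids, made up for
    # illustration):
    #
    #   pl.DataFrame({"o": [7, 7, 9]}).group_by("o").agg(pl.len().alias("n"))
    #   # -> one row per source term id with its count; sort("n", descending=True)
    #   #    then puts the most frequently cited source first.
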
    def q12_count_by_run(self) -> pl.DataFrame:
        """
        Q12: Count statements per run.

        SELECT ?run (COUNT(?qt) AS ?n) WHERE {
            ?qt prov:wasGeneratedBy ?run .
        } GROUP BY ?run ORDER BY DESC(?n)
        """
        gen_pred = self.term_dict.lookup_iri(
            "http://www.w3.org/ns/prov#wasGeneratedBy"
        )
        if gen_pred is None:
            return pl.DataFrame({"run": [], "n": []})

        df = self.fact_store.scan_metadata_facts()
        df = df.filter(pl.col("p") == gen_pred)

        grouped = df.group_by("o").agg(pl.len().alias("n"))
        grouped = grouped.sort("n", descending=True)

        result = []
        for row in grouped.iter_rows(named=True):
            run = self.term_dict.get_lex(row["o"])
            result.append({"run": run, "n": row["n"]})

        return pl.DataFrame(result) if result else pl.DataFrame({"run": [], "n": []})

    def _qt_to_string(self, qt_id: TermId) -> str:
        """Convert a quoted triple ID to << s p o >> string form."""
        qt = self.qt_dict.lookup(qt_id)
        if qt is None:
            return f"<unknown qt:{qt_id}>"

        s = self.term_dict.get_lex(qt.s) or f"<{qt.s}>"
        p = self.term_dict.get_lex(qt.p) or f"<{qt.p}>"
        o = self.term_dict.get_lex(qt.o) or f"<{qt.o}>"

        return f"<< {s} {p} {o} >>"
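
# Example wiring (a minimal sketch; `term_dict`, `qt_dict` and `fact_store`
# are hypothetical names for already-populated TermDict, QtDict and FactStore
# instances, created the same way StorageExecutor receives them):
#
#   patterns = ExpansionPatterns(term_dict, qt_dict, fact_store)
#   patterns.q11_count_by_source()            # statements per source
#   patterns.q9_native_count(0.8)             # facts above 0.8 confidence
#   patterns.q6_metadata_for_triple(s, p, o)  # provenance attached to one triple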