rdf-starbase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdf_starbase/__init__.py +57 -0
- rdf_starbase/ai_grounding.py +728 -0
- rdf_starbase/compat/__init__.py +26 -0
- rdf_starbase/compat/rdflib.py +1104 -0
- rdf_starbase/formats/__init__.py +29 -0
- rdf_starbase/formats/jsonld.py +488 -0
- rdf_starbase/formats/ntriples.py +419 -0
- rdf_starbase/formats/rdfxml.py +434 -0
- rdf_starbase/formats/turtle.py +882 -0
- rdf_starbase/models.py +92 -0
- rdf_starbase/registry.py +540 -0
- rdf_starbase/repositories.py +407 -0
- rdf_starbase/repository_api.py +739 -0
- rdf_starbase/sparql/__init__.py +35 -0
- rdf_starbase/sparql/ast.py +910 -0
- rdf_starbase/sparql/executor.py +1925 -0
- rdf_starbase/sparql/parser.py +1716 -0
- rdf_starbase/storage/__init__.py +44 -0
- rdf_starbase/storage/executor.py +1914 -0
- rdf_starbase/storage/facts.py +850 -0
- rdf_starbase/storage/lsm.py +531 -0
- rdf_starbase/storage/persistence.py +338 -0
- rdf_starbase/storage/quoted_triples.py +292 -0
- rdf_starbase/storage/reasoner.py +1035 -0
- rdf_starbase/storage/terms.py +628 -0
- rdf_starbase/store.py +1049 -0
- rdf_starbase/store_legacy.py +748 -0
- rdf_starbase/web.py +568 -0
- rdf_starbase-0.1.0.dist-info/METADATA +706 -0
- rdf_starbase-0.1.0.dist-info/RECORD +31 -0
- rdf_starbase-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1925 @@
"""
SPARQL-Star Query Executor using Polars.

Translates SPARQL-Star AST to Polars operations for blazingly fast execution.

Includes internal optimizations for provenance queries that map standard
SPARQL-Star patterns like << ?s ?p ?o >> prov:value ?conf to efficient
columnar access.

Supported provenance vocabularies:
- PROV-O: W3C Provenance Ontology (prov:wasAttributedTo, prov:value, etc.)
- DQV: Data Quality Vocabulary (dqv:hasQualityMeasurement)
- PAV: Provenance, Authoring and Versioning (pav:createdBy, pav:authoredBy)
- DCAT: Data Catalog Vocabulary (dcat:accessURL, etc.)

When inserting RDF-Star annotations like:
    << ex:s ex:p ex:o >> prov:wasAttributedTo "IMDb" .
    << ex:s ex:p ex:o >> prov:value 0.95 .

The executor recognizes these predicates and maps them to internal assertion
metadata (source, confidence) rather than creating separate triples.
"""

from typing import Any, Optional, Union, TYPE_CHECKING
from datetime import datetime

import polars as pl

from rdf_starbase.sparql.ast import (
    Query, SelectQuery, AskQuery, InsertDataQuery, DeleteDataQuery,
    DeleteWhereQuery, ModifyQuery,
    DescribeQuery, ConstructQuery,
    CreateGraphQuery, DropGraphQuery, ClearGraphQuery,
    LoadQuery, CopyGraphQuery, MoveGraphQuery, AddGraphQuery,
    TriplePattern, QuotedTriplePattern,
    OptionalPattern, UnionPattern, GraphPattern,
    Variable, IRI, Literal, BlankNode,
    Filter, Comparison, LogicalExpression, FunctionCall,
    AggregateExpression, Bind, ValuesClause,
    ComparisonOp, LogicalOp,
    WhereClause,
    Term,
)
from rdf_starbase.models import ProvenanceContext

if TYPE_CHECKING:
    from rdf_starbase.store import TripleStore


# =============================================================================
# Provenance Predicate Mappings
# =============================================================================
# These predicates, when used in RDF-Star annotations, are recognized and
# mapped to internal assertion metadata fields rather than stored as
# separate triples.

# Maps predicate IRIs to internal field names
PROVENANCE_SOURCE_PREDICATES = {
    # PROV-O - W3C Provenance Ontology
    "http://www.w3.org/ns/prov#wasAttributedTo",
    "http://www.w3.org/ns/prov#wasDerivedFrom",
    "http://www.w3.org/ns/prov#wasGeneratedBy",
    "http://www.w3.org/ns/prov#hadPrimarySource",
    # PAV - Provenance, Authoring and Versioning
    "http://purl.org/pav/createdBy",
    "http://purl.org/pav/authoredBy",
    "http://purl.org/pav/importedFrom",
    "http://purl.org/pav/retrievedFrom",
    "http://purl.org/pav/sourceAccessedAt",
    # Dublin Core
    "http://purl.org/dc/terms/source",
    "http://purl.org/dc/elements/1.1/source",
    # Schema.org
    "http://schema.org/isBasedOn",
    "http://schema.org/citation",
    # Custom RDF-StarBase
    "http://rdf-starbase.io/source",
    "source",  # Short form
}

PROVENANCE_CONFIDENCE_PREDICATES = {
    # PROV-O
    "http://www.w3.org/ns/prov#value",
    # DQV - Data Quality Vocabulary
    "http://www.w3.org/ns/dqv#hasQualityMeasurement",
    "http://www.w3.org/ns/dqv#value",
    # Schema.org
    "http://schema.org/ratingValue",
    # Custom RDF-StarBase
    "http://rdf-starbase.io/confidence",
    "confidence",  # Short form
}

PROVENANCE_TIMESTAMP_PREDICATES = {
    # PROV-O
    "http://www.w3.org/ns/prov#generatedAtTime",
    "http://www.w3.org/ns/prov#invalidatedAtTime",
    # PAV
    "http://purl.org/pav/createdOn",
    "http://purl.org/pav/authoredOn",
    "http://purl.org/pav/lastRefreshedOn",
    # Dublin Core
    "http://purl.org/dc/terms/created",
    "http://purl.org/dc/terms/modified",
    # Custom
    "http://rdf-starbase.io/timestamp",
    "timestamp",
}

# Legacy map for query optimization (reading provenance)
PROV_PREDICATE_MAP = {
    "http://www.w3.org/ns/prov#value": "confidence",
    "http://www.w3.org/ns/prov#wasDerivedFrom": "source",
    "http://www.w3.org/ns/prov#generatedAtTime": "timestamp",
    "http://www.w3.org/ns/prov#wasGeneratedBy": "process",
    "prov:value": "confidence",
    "prov:wasDerivedFrom": "source",
    "prov:generatedAtTime": "timestamp",
    "prov:wasGeneratedBy": "process",
}
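
# A minimal sketch of how the legacy map above is consulted at query time:
# given an (already prefix-expanded) predicate IRI, look up the internal
# assertion-metadata column it optimizes to. The helper below is purely
# illustrative and is not referenced elsewhere in this module.
def _provenance_column_for(pred_iri: str) -> Optional[str]:
    """Illustrative sketch: return "confidence", "source", "timestamp" or
    "process" for a known provenance predicate IRI (or prefixed short form),
    else None."""
    return PROV_PREDICATE_MAP.get(pred_iri)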


class SPARQLExecutor:
    """
    Executes SPARQL-Star queries against a TripleStore.

    Translation strategy:
    - Each TriplePattern becomes a filtered view of the DataFrame
    - Variables become column selections
    - Joins are performed for patterns sharing variables
    - Filters become Polars filter expressions
    - Uses lazy evaluation for query optimization
    """

    def __init__(self, store: "TripleStore"):
        """
        Initialize executor with a triple store.

        Args:
            store: The TripleStore to query
        """
        self.store = store
        self._var_counter = 0

    def execute(
        self,
        query: Query,
        provenance: Optional[ProvenanceContext] = None
    ) -> Union[pl.DataFrame, bool, dict]:
        """
        Execute a SPARQL-Star query.

        Args:
            query: Parsed Query AST
            provenance: Optional provenance context for INSERT/DELETE operations

        Returns:
            DataFrame for SELECT queries, bool for ASK queries,
            dict with count for INSERT/DELETE operations
        """
        if isinstance(query, SelectQuery):
            return self._execute_select(query)
        elif isinstance(query, AskQuery):
            return self._execute_ask(query)
        elif isinstance(query, DescribeQuery):
            return self._execute_describe(query)
        elif isinstance(query, ConstructQuery):
            return self._execute_construct(query)
        elif isinstance(query, InsertDataQuery):
            return self._execute_insert_data(query, provenance)
        elif isinstance(query, DeleteDataQuery):
            return self._execute_delete_data(query)
        elif isinstance(query, DeleteWhereQuery):
            return self._execute_delete_where(query)
        elif isinstance(query, ModifyQuery):
            return self._execute_modify(query, provenance)
        elif isinstance(query, CreateGraphQuery):
            return self._execute_create_graph(query)
        elif isinstance(query, DropGraphQuery):
            return self._execute_drop_graph(query)
        elif isinstance(query, ClearGraphQuery):
            return self._execute_clear_graph(query)
        elif isinstance(query, LoadQuery):
            return self._execute_load(query, provenance)
        elif isinstance(query, CopyGraphQuery):
            return self._execute_copy_graph(query)
        elif isinstance(query, MoveGraphQuery):
            return self._execute_move_graph(query)
        elif isinstance(query, AddGraphQuery):
            return self._execute_add_graph(query)
        else:
            raise NotImplementedError(f"Query type {type(query)} not yet supported")

    def _execute_select(self, query: SelectQuery) -> pl.DataFrame:
        """Execute a SELECT query."""
        # Handle FROM clause - restrict to specified graphs
        from_graphs = None
        if query.from_graphs:
            # Merge all FROM graphs into default graph behavior
            from_graphs = [g.value for g in query.from_graphs]

        # Start with lazy frame for optimization
        df = self._execute_where(
            query.where,
            query.prefixes,
            as_of=query.as_of,
            from_graphs=from_graphs
        )

        # Bind provenance variables if requested (source, confidence, timestamp, process)
        # These are special variable names that map to assertion metadata
        provenance_var_mapping = {
            "source": "source",
            "confidence": "confidence",
            "timestamp": "timestamp",
            "process": "process",
        }

        for var in query.variables:
            if isinstance(var, Variable) and var.name in provenance_var_mapping:
                prov_col = provenance_var_mapping[var.name]
                # Find the first pattern's provenance column
                for col in df.columns:
                    if col.startswith("_prov_") and col.endswith(f"_{prov_col}"):
                        df = df.with_columns(pl.col(col).alias(var.name))
                        break

        # Determine columns to select before DISTINCT (DISTINCT should only apply to output columns)
        select_cols = None
        if not query.is_select_all():
            select_cols = []
            for v in query.variables:
                if isinstance(v, Variable) and v.name in df.columns:
                    select_cols.append(v.name)
                elif isinstance(v, AggregateExpression) and v.alias and v.alias.name in df.columns:
                    select_cols.append(v.alias.name)

        # Handle GROUP BY and aggregates
        if query.group_by or query.has_aggregates():
            df = self._apply_group_by_aggregates(df, query)
        else:
            # Apply DISTINCT if requested (non-aggregate)
            # Must apply DISTINCT only on the projected columns, not internal _prov_* columns
            if query.distinct:
                if select_cols:
                    df = df.unique(subset=select_cols)
                else:
                    # SELECT * - apply unique to all non-internal columns
                    non_internal = [c for c in df.columns if not c.startswith("_prov_")]
                    df = df.unique(subset=non_internal if non_internal else None)

        # Apply HAVING (filter after grouping)
        if query.having:
            df = self._apply_filter(df, Filter(expression=query.having))

        # Apply ORDER BY
        if query.order_by:
            order_cols = []
            descending = []
            for var, asc in query.order_by:
                if var.name in df.columns:
                    order_cols.append(var.name)
                    descending.append(not asc)
            if order_cols:
                df = df.sort(order_cols, descending=descending)

        # Apply LIMIT and OFFSET
        if query.offset:
            df = df.slice(query.offset, query.limit or len(df))
        elif query.limit:
            df = df.head(query.limit)

        # Select only requested variables (or all if SELECT *)
        if not query.is_select_all():
            select_cols = []
            for v in query.variables:
                if isinstance(v, Variable) and v.name in df.columns:
                    select_cols.append(v.name)
                elif isinstance(v, AggregateExpression) and v.alias and v.alias.name in df.columns:
                    select_cols.append(v.alias.name)
            if select_cols:
                df = df.select(select_cols)

        return df
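
    # A minimal sketch of the projection/DISTINCT behaviour above, assuming a
    # bindings frame with a "name" column plus an internal "_prov_0_source":
    #
    #     df.unique(subset=["name"]).select(["name"])
    #
    # i.e. duplicates are collapsed on the projected variables only, and the
    # internal _prov_* columns never reach the SELECT output.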

    def _apply_group_by_aggregates(
        self,
        df: pl.DataFrame,
        query: SelectQuery
    ) -> pl.DataFrame:
        """
        Apply GROUP BY and aggregate functions to a DataFrame.

        Supports: COUNT, SUM, AVG, MIN, MAX, GROUP_CONCAT, SAMPLE
        """
        if len(df) == 0:
            return df

        # Build aggregation expressions
        agg_exprs = []

        for var in query.variables:
            if isinstance(var, AggregateExpression):
                agg_expr = self._build_aggregate_expr(var)
                if agg_expr is not None:
                    agg_exprs.append(agg_expr)

        # If we have GROUP BY, use it; otherwise aggregate entire result
        if query.group_by:
            group_cols = [v.name for v in query.group_by if v.name in df.columns]
            if group_cols and agg_exprs:
                df = df.group_by(group_cols).agg(agg_exprs)
            elif group_cols:
                # GROUP BY without aggregates - just unique combinations
                df = df.select(group_cols).unique()
        elif agg_exprs:
            # Aggregates without GROUP BY - aggregate entire result
            df = df.select(agg_exprs)

        return df

    def _build_aggregate_expr(self, agg: AggregateExpression) -> Optional[pl.Expr]:
        """Build a Polars aggregation expression from an AggregateExpression AST."""
        # Get the column to aggregate
        if agg.argument is None:
            # COUNT(*) - count all rows
            col_name = None
        elif isinstance(agg.argument, Variable):
            col_name = agg.argument.name
        else:
            return None

        # Determine alias
        alias = agg.alias.name if agg.alias else f"{agg.function.lower()}"

        # Build the aggregation
        if agg.function == "COUNT":
            if col_name is None:
                expr = pl.len().alias(alias)
            elif agg.distinct:
                expr = pl.col(col_name).n_unique().alias(alias)
            else:
                expr = pl.col(col_name).count().alias(alias)
        elif agg.function == "SUM":
            if col_name:
                expr = pl.col(col_name).cast(pl.Float64).sum().alias(alias)
            else:
                return None
        elif agg.function == "AVG":
            if col_name:
                expr = pl.col(col_name).cast(pl.Float64).mean().alias(alias)
            else:
                return None
        elif agg.function == "MIN":
            if col_name:
                expr = pl.col(col_name).min().alias(alias)
            else:
                return None
        elif agg.function == "MAX":
            if col_name:
                expr = pl.col(col_name).max().alias(alias)
            else:
                return None
        elif agg.function == "GROUP_CONCAT":
            if col_name:
                sep = agg.separator or " "
                expr = pl.col(col_name).cast(pl.Utf8).str.concat(sep).alias(alias)
            else:
                return None
        elif agg.function == "SAMPLE":
            if col_name:
                expr = pl.col(col_name).first().alias(alias)
            else:
                return None
        else:
            return None

        return expr
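
    # A minimal sketch of the SPARQL-to-Polars mapping these two methods
    # implement, assuming bindings with columns "city" and "person":
    #
    #     SELECT ?city (COUNT(?person) AS ?n) ... GROUP BY ?city
    #
    # becomes roughly:
    #
    #     df.group_by(["city"]).agg([pl.col("person").count().alias("n")])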

    def _execute_ask(self, query: AskQuery) -> bool:
        """Execute an ASK query."""
        df = self._execute_where(query.where, query.prefixes, as_of=query.as_of)
        return len(df) > 0

    def _execute_describe(self, query: DescribeQuery) -> pl.DataFrame:
        """
        Execute a DESCRIBE query.

        Returns all triples where the resource appears as subject or object.
        """
        prefixes = query.prefixes

        # Get resource URIs to describe
        if query.where:
            # Execute WHERE clause to get bindings
            bindings = self._execute_where(query.where, prefixes, as_of=query.as_of)
            resources = set()
            for resource in query.resources:
                if isinstance(resource, Variable) and resource.name in bindings.columns:
                    resources.update(bindings[resource.name].unique().to_list())
                elif isinstance(resource, IRI):
                    resources.add(self._expand_iri(resource.value, prefixes))
        else:
            resources = {
                self._expand_iri(r.value, prefixes) if isinstance(r, IRI) else str(r)
                for r in query.resources
            }

        # Get all triples where resource is subject or object
        df = self.store._df

        # Apply time-travel filter if specified
        if query.as_of:
            df = df.filter(pl.col("timestamp") <= query.as_of)

        if len(df) == 0:
            return df

        resource_list = list(resources)
        result = df.filter(
            pl.col("subject").is_in(resource_list) |
            pl.col("object").is_in(resource_list)
        )

        return result

    def _execute_construct(self, query: ConstructQuery) -> pl.DataFrame:
        """
        Execute a CONSTRUCT query.

        Returns triples constructed from the template using WHERE bindings.
        """
        prefixes = query.prefixes
        bindings = self._execute_where(query.where, prefixes, as_of=query.as_of)

        if len(bindings) == 0:
            return pl.DataFrame({"subject": [], "predicate": [], "object": []})

        # Build result triples from template
        result_triples = []

        for row in bindings.iter_rows(named=True):
            for pattern in query.template:
                # Substitute variables with bound values
                subject = self._substitute_term(pattern.subject, row, prefixes)
                predicate = self._substitute_term(pattern.predicate, row, prefixes)
                obj = self._substitute_term(pattern.object, row, prefixes)

                if subject is not None and predicate is not None and obj is not None:
                    result_triples.append({
                        "subject": subject,
                        "predicate": predicate,
                        "object": obj,
                    })

        return pl.DataFrame(result_triples) if result_triples else pl.DataFrame({"subject": [], "predicate": [], "object": []})

    def _substitute_term(self, term: Term, row: dict, prefixes: dict) -> Optional[str]:
        """Substitute a term with a value from bindings."""
        if isinstance(term, Variable):
            return row.get(term.name)
        elif isinstance(term, IRI):
            return self._expand_iri(term.value, prefixes)
        elif isinstance(term, Literal):
            return term.value
        elif isinstance(term, BlankNode):
            return f"_:{term.label}"
        return str(term)

    def _expand_iri(self, iri: str, prefixes: dict) -> str:
        """Expand a prefixed IRI using prefix declarations."""
        if ":" in iri and not iri.startswith("http"):
            parts = iri.split(":", 1)
            if len(parts) == 2 and parts[0] in prefixes:
                return prefixes[parts[0]] + parts[1]
        return iri
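
    # A minimal sketch of the expansion rule above, assuming the query declared
    # PREFIX ex: <http://example.org/>:
    #
    #     self._expand_iri("ex:Alice", {"ex": "http://example.org/"})
    #     # -> "http://example.org/Alice"
    #
    # Full IRIs (anything starting with "http") pass through unchanged.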

    def _try_optimize_provenance_pattern(
        self,
        pattern: TriplePattern,
        prefixes: dict[str, str],
        pattern_idx: int
    ) -> Optional[tuple[str, str, QuotedTriplePattern, Optional[str]]]:
        """
        Try to optimize a provenance pattern to direct column access.

        Detects patterns like:
            << ?s ?p ?o >> prov:value ?conf (specific predicate)
            << ?s ?p ?o >> ?mp ?mo (variable predicate - get ALL)

        And maps them to the corresponding columnar provenance data
        (confidence, source, timestamp, process).

        Returns:
            Tuple of (object_var_name, column_name_or_"*", inner_pattern, predicate_var_name)
            - column_name is "*" when predicate is a variable (return all provenance)
            - predicate_var_name is set when predicate is a variable
            None if not a provenance pattern.
        """
        # Must be a triple pattern with a quoted triple as subject
        if not isinstance(pattern.subject, QuotedTriplePattern):
            return None

        # Object must be a variable to bind the provenance value
        if not isinstance(pattern.object, Variable):
            return None

        # Check if predicate is a variable - if so, return ALL provenance
        if isinstance(pattern.predicate, Variable):
            return (pattern.object.name, "*", pattern.subject, pattern.predicate.name)

        # Predicate must be a known provenance predicate IRI
        if not isinstance(pattern.predicate, IRI):
            return None

        pred_iri = self._expand_iri(pattern.predicate.value, prefixes)

        # Check if it's a provenance predicate we can optimize
        column_name = PROV_PREDICATE_MAP.get(pred_iri)
        if not column_name:
            # Also check without expansion
            column_name = PROV_PREDICATE_MAP.get(pattern.predicate.value)

        if not column_name:
            return None

        return (pattern.object.name, column_name, pattern.subject, None)

    def _execute_where(
        self,
        where: WhereClause,
        prefixes: dict[str, str],
        as_of: Optional[datetime] = None,
        from_graphs: Optional[list[str]] = None,
    ) -> pl.DataFrame:
        """
        Execute a WHERE clause and return matching bindings.

        Args:
            where: The WHERE clause to execute
            prefixes: Prefix mappings
            as_of: Optional timestamp for time-travel queries
            from_graphs: Optional list of graph URIs to restrict query to

        Includes internal optimization for provenance patterns:
        When detecting patterns like << ?s ?p ?o >> prov:value ?conf,
        we map directly to the confidence column instead of doing a join.
        Also handles << ?s ?p ?o >> ?mp ?mo to return ALL provenance.
        """
        # Handle case where UNION is the only pattern
        if not where.patterns and not where.union_patterns and not where.graph_patterns:
            return pl.DataFrame()

        # Separate regular patterns from optimizable provenance patterns
        # For provenance patterns, we execute the inner pattern and bind provenance columns
        patterns_to_execute = []  # List of (idx, pattern, prov_bindings)

        for i, pattern in enumerate(where.patterns):
            opt_result = self._try_optimize_provenance_pattern(pattern, prefixes, i)
            if opt_result:
                # This is a provenance pattern - execute inner pattern and bind column
                obj_var_name, col_name, inner_pattern, pred_var_name = opt_result
                # Create a TriplePattern from the inner QuotedTriplePattern
                inner_triple = TriplePattern(
                    subject=inner_pattern.subject,
                    predicate=inner_pattern.predicate,
                    object=inner_pattern.object
                )
                patterns_to_execute.append((i, inner_triple, (obj_var_name, col_name, pred_var_name)))
            else:
                patterns_to_execute.append((i, pattern, None))

        # Execute patterns and join results
        result_df: Optional[pl.DataFrame] = None

        for i, pattern, prov_binding in patterns_to_execute:
            pattern_df = self._execute_pattern(pattern, prefixes, i, as_of=as_of, from_graphs=from_graphs)

            # If this pattern has a provenance binding, add it as a column alias
            if prov_binding:
                obj_var_name, col_name, pred_var_name = prov_binding

                if col_name == "*":
                    # Variable predicate - unpivot ALL provenance columns into rows
                    # Map column names to their prov predicates
                    prov_col_to_pred = {
                        "source": "<http://www.w3.org/ns/prov#wasDerivedFrom>",
                        "confidence": "<http://www.w3.org/ns/prov#value>",
                        "timestamp": "<http://www.w3.org/ns/prov#generatedAtTime>",
                        "process": "<http://www.w3.org/ns/prov#wasGeneratedBy>",
                    }

                    # Find all _prov_ columns for this pattern
                    prov_cols = [c for c in pattern_df.columns if c.startswith(f"_prov_{i}_")]

                    if prov_cols:
                        # Build unpivoted dataframe - one row per provenance value
                        unpivoted_dfs = []
                        base_cols = [c for c in pattern_df.columns if not c.startswith("_prov_")]

                        for prov_col in prov_cols:
                            # Extract column type from _prov_{idx}_{type}
                            col_type = prov_col.split("_")[-1]  # e.g., "source", "confidence"
                            pred_uri = prov_col_to_pred.get(col_type)

                            if pred_uri:
                                # Create a df with this provenance column as the object
                                row_df = pattern_df.select(base_cols + [prov_col])
                                # Filter out nulls
                                row_df = row_df.filter(pl.col(prov_col).is_not_null())

                                if len(row_df) > 0:
                                    # Add predicate and rename object column
                                    row_df = row_df.with_columns([
                                        pl.lit(pred_uri).alias(pred_var_name),
                                        pl.col(prov_col).cast(pl.Utf8).alias(obj_var_name)
                                    ]).drop(prov_col)
                                    unpivoted_dfs.append(row_df)

                        if unpivoted_dfs:
                            pattern_df = pl.concat(unpivoted_dfs)
                        else:
                            # No provenance data - return empty with correct columns
                            pattern_df = pattern_df.select(base_cols).with_columns([
                                pl.lit(None).cast(pl.Utf8).alias(pred_var_name),
                                pl.lit(None).cast(pl.Utf8).alias(obj_var_name)
                            ]).head(0)
                else:
                    # Specific predicate - just alias the column
                    prov_col = f"_prov_{i}_{col_name}"
                    if prov_col in pattern_df.columns:
                        pattern_df = pattern_df.with_columns(
                            pl.col(prov_col).alias(obj_var_name)
                        )

            if result_df is None:
                result_df = pattern_df
            else:
                # Find shared variables to join on
                shared_cols = set(result_df.columns) & set(pattern_df.columns)
                shared_cols -= {"_pattern_idx"}  # Don't join on internal columns
                # Also exclude provenance internal columns from join keys
                shared_cols = {c for c in shared_cols if not c.startswith("_prov_")}

                if shared_cols:
                    result_df = result_df.join(
                        pattern_df,
                        on=list(shared_cols),
                        how="inner"
                    )
                else:
                    # Cross join if no shared variables
                    result_df = result_df.join(pattern_df, how="cross")

        # Handle GRAPH patterns
        if where.graph_patterns:
            for graph_pattern in where.graph_patterns:
                graph_df = self._execute_graph_pattern(graph_pattern, prefixes, as_of=as_of)
                if result_df is None:
                    result_df = graph_df
                elif len(graph_df) > 0:
                    # Join with existing results
                    shared_cols = set(result_df.columns) & set(graph_df.columns)
                    shared_cols -= {"_pattern_idx"}
                    shared_cols = {c for c in shared_cols if not c.startswith("_prov_")}

                    if shared_cols:
                        result_df = result_df.join(graph_df, on=list(shared_cols), how="inner")
                    else:
                        result_df = result_df.join(graph_df, how="cross")

        # Handle UNION patterns - these can be standalone or combined with other patterns
        if where.union_patterns:
            for union in where.union_patterns:
                if result_df is None or len(result_df) == 0:
                    # UNION is the primary pattern - execute it directly
                    result_df = self._execute_union_standalone(union, prefixes)
                else:
                    # Combine UNION results with existing patterns
                    result_df = self._apply_union(result_df, union, prefixes)

        if result_df is None:
            return pl.DataFrame()

        # Apply standard FILTER clauses
        for filter_clause in where.filters:
            result_df = self._apply_filter(result_df, filter_clause)

        # Apply OPTIONAL patterns with left outer joins
        for optional in where.optional_patterns:
            result_df = self._apply_optional(result_df, optional, prefixes)

        # Apply BIND clauses - add new columns with computed values
        for bind in where.binds:
            result_df = self._apply_bind(result_df, bind, prefixes)

        # Apply VALUES clause - filter/join with inline data
        if where.values:
            result_df = self._apply_values(result_df, where.values, prefixes)

        # Check if we have matches before removing internal columns
        has_matches = len(result_df) > 0

        # Remove internal columns EXCEPT provenance columns (keep _prov_*)
        internal_cols = [c for c in result_df.columns if c.startswith("_") and not c.startswith("_prov_")]
        if internal_cols:
            result_df = result_df.drop(internal_cols)

        # If we had matches but now have no columns (all terms were concrete),
        # return a DataFrame with a single row to indicate a match exists
        if has_matches and len(result_df.columns) == 0:
            result_df = pl.DataFrame({"_matched": [True] * has_matches})
            # Actually just need count, not the values
            result_df = pl.DataFrame({"_matched": [True]})

        return result_df
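
    # A minimal sketch of the join strategy used above, assuming two pattern
    # frames that share only the "s" variable:
    #
    #     pattern_a.join(pattern_b, on=["s"], how="inner")   # shared variable
    #     pattern_a.join(pattern_b, how="cross")              # no shared variable
    #
    # Internal _pattern_idx/_prov_* columns are excluded from the join keys.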

    def _execute_pattern(
        self,
        pattern: TriplePattern,
        prefixes: dict[str, str],
        pattern_idx: int,
        as_of: Optional[datetime] = None,
        from_graphs: Optional[list[str]] = None,
    ) -> pl.DataFrame:
        """
        Execute a single triple pattern against the store.

        Args:
            pattern: The triple pattern to match
            prefixes: Prefix mappings
            pattern_idx: Index of this pattern (for internal column naming)
            as_of: Optional timestamp for time-travel queries
            from_graphs: Optional list of graph URIs to restrict query to

        Returns a DataFrame with columns for each variable in the pattern.
        """
        # Start with all assertions
        df = self.store._df.lazy()

        # Apply time-travel filter if specified
        if as_of is not None:
            df = df.filter(pl.col("timestamp") <= as_of)

        # Apply FROM graph restriction
        if from_graphs is not None:
            # Match triples in specified graphs (None for default graph)
            graph_conditions = []
            for g in from_graphs:
                if g is None or g == "":
                    graph_conditions.append(pl.col("graph").is_null())
                else:
                    graph_conditions.append(pl.col("graph") == g)
            if graph_conditions:
                combined = graph_conditions[0]
                for cond in graph_conditions[1:]:
                    combined = combined | cond
                df = df.filter(combined)

        # Apply time-travel filter if specified
        if as_of is not None:
            df = df.filter(pl.col("timestamp") <= as_of)

        # Apply filters for concrete terms
        if not isinstance(pattern.subject, Variable):
            value = self._resolve_term(pattern.subject, prefixes)
            # Match both with and without angle brackets for URIs
            if value.startswith("http"):
                df = df.filter(
                    (pl.col("subject") == value) |
                    (pl.col("subject") == f"<{value}>")
                )
            else:
                df = df.filter(pl.col("subject") == value)

        if not isinstance(pattern.predicate, Variable):
            value = self._resolve_term(pattern.predicate, prefixes)
            # Match both with and without angle brackets for URIs
            if value.startswith("http"):
                df = df.filter(
                    (pl.col("predicate") == value) |
                    (pl.col("predicate") == f"<{value}>")
                )
            else:
                df = df.filter(pl.col("predicate") == value)

        if not isinstance(pattern.object, (Variable, QuotedTriplePattern)):
            value = self._resolve_term(pattern.object, prefixes)
            str_value = str(value)
            # Match both with and without angle brackets for URIs
            if str_value.startswith("http"):
                df = df.filter(
                    (pl.col("object") == str_value) |
                    (pl.col("object") == f"<{str_value}>")
                )
            else:
                df = df.filter(pl.col("object") == str_value)

        # Exclude deprecated by default
        df = df.filter(~pl.col("deprecated"))

        # Collect results
        result = df.collect()

        # Rename columns to variable names and select relevant columns
        renames = {}
        select_cols = []

        if isinstance(pattern.subject, Variable):
            renames["subject"] = pattern.subject.name
            select_cols.append("subject")

        if isinstance(pattern.predicate, Variable):
            renames["predicate"] = pattern.predicate.name
            select_cols.append("predicate")

        if isinstance(pattern.object, Variable):
            renames["object"] = pattern.object.name
            select_cols.append("object")
            # Also include typed object_value for numeric FILTER comparisons
            # Rename it to the variable name with "_value" suffix
            if "object_value" in result.columns:
                renames["object_value"] = f"{pattern.object.name}_value"
                select_cols.append("object_value")

        # Always include provenance columns for provenance filters
        provenance_cols = ["source", "confidence", "timestamp", "process"]
        for col in provenance_cols:
            renames[col] = f"_prov_{pattern_idx}_{col}"
            select_cols.append(col)

        # Select and rename
        if select_cols:
            result = result.select(select_cols)
            result = result.rename(renames)
        else:
            # Pattern has no variables - just return count
            result = pl.DataFrame({"_match": [True] * len(result)})

        return result

    def _execute_graph_pattern(
        self,
        graph_pattern: "GraphPattern",
        prefixes: dict[str, str],
        as_of: Optional[datetime] = None,
    ) -> pl.DataFrame:
        """
        Execute a GRAPH pattern: GRAPH <uri> { patterns }.

        Args:
            graph_pattern: The GRAPH pattern to execute
            prefixes: Prefix mappings
            as_of: Optional timestamp for time-travel queries

        Returns:
            DataFrame with matching bindings from the specified graph
        """
        # Resolve the graph reference
        if isinstance(graph_pattern.graph, IRI):
            graph_uri = self._resolve_term(graph_pattern.graph, prefixes)
            graph_filter = [graph_uri]
        elif isinstance(graph_pattern.graph, Variable):
            # Variable graph - match all named graphs and bind the variable
            graph_filter = None  # Will filter manually
            graph_var_name = graph_pattern.graph.name
        else:
            return pl.DataFrame()

        # Execute each pattern in the graph
        result_df: Optional[pl.DataFrame] = None

        for i, pattern in enumerate(graph_pattern.patterns):
            pattern_df = self._execute_pattern(
                pattern,
                prefixes,
                1000 + i,  # Use high pattern idx to avoid conflicts
                as_of=as_of,
                from_graphs=graph_filter
            )

            # If graph is a variable, add the graph column as a binding
            if isinstance(graph_pattern.graph, Variable):
                # Need to also get graph column from store
                df = self.store._df.lazy()
                if as_of is not None:
                    df = df.filter(pl.col("timestamp") <= as_of)
                df = df.filter(~pl.col("deprecated"))
                df = df.filter(pl.col("graph").is_not_null())  # Only named graphs

                # Re-execute pattern with graph column
                graph_df = self._execute_pattern_with_graph(
                    pattern, prefixes, 1000 + i, as_of=as_of
                )
                if graph_var_name not in graph_df.columns and "graph" in graph_df.columns:
                    graph_df = graph_df.rename({"graph": graph_var_name})
                pattern_df = graph_df

            if result_df is None:
                result_df = pattern_df
            else:
                # Join on shared variables
                shared_cols = set(result_df.columns) & set(pattern_df.columns)
                shared_cols = {c for c in shared_cols if not c.startswith("_prov_")}
                if shared_cols:
                    result_df = result_df.join(pattern_df, on=list(shared_cols), how="inner")
                else:
                    result_df = result_df.join(pattern_df, how="cross")

        return result_df if result_df is not None else pl.DataFrame()

    def _execute_pattern_with_graph(
        self,
        pattern: TriplePattern,
        prefixes: dict[str, str],
        pattern_idx: int,
        as_of: Optional[datetime] = None,
    ) -> pl.DataFrame:
        """Execute a pattern and include the graph column in results."""
        # Start with all assertions
        df = self.store._df.lazy()

        if as_of is not None:
            df = df.filter(pl.col("timestamp") <= as_of)

        # Only named graphs
        df = df.filter(pl.col("graph").is_not_null())

        # Apply filters for concrete terms
        if not isinstance(pattern.subject, Variable):
            value = self._resolve_term(pattern.subject, prefixes)
            if value.startswith("http"):
                df = df.filter(
                    (pl.col("subject") == value) |
                    (pl.col("subject") == f"<{value}>")
                )
            else:
                df = df.filter(pl.col("subject") == value)

        if not isinstance(pattern.predicate, Variable):
            value = self._resolve_term(pattern.predicate, prefixes)
            if value.startswith("http"):
                df = df.filter(
                    (pl.col("predicate") == value) |
                    (pl.col("predicate") == f"<{value}>")
                )
            else:
                df = df.filter(pl.col("predicate") == value)

        if not isinstance(pattern.object, (Variable, QuotedTriplePattern)):
            value = self._resolve_term(pattern.object, prefixes)
            str_value = str(value)
            if str_value.startswith("http"):
                df = df.filter(
                    (pl.col("object") == str_value) |
                    (pl.col("object") == f"<{str_value}>")
                )
            else:
                df = df.filter(pl.col("object") == str_value)

        df = df.filter(~pl.col("deprecated"))
        result = df.collect()

        # Rename and select columns
        renames = {}
        select_cols = ["graph"]  # Always include graph

        if isinstance(pattern.subject, Variable):
            renames["subject"] = pattern.subject.name
            select_cols.append("subject")

        if isinstance(pattern.predicate, Variable):
            renames["predicate"] = pattern.predicate.name
            select_cols.append("predicate")

        if isinstance(pattern.object, Variable):
            renames["object"] = pattern.object.name
            select_cols.append("object")

        if select_cols:
            result = result.select(select_cols)
            result = result.rename(renames)

        return result

    def _resolve_term(self, term: Term, prefixes: dict[str, str]) -> str:
        """Resolve a term to its string value for matching against store."""
        if isinstance(term, IRI):
            value = term.value
            # Expand prefixed names
            if ":" in value and not value.startswith("http"):
                prefix, local = value.split(":", 1)
                if prefix in prefixes:
                    value = prefixes[prefix] + local
            # Return without angle brackets - store has mixed formats
            # The _execute_pattern will try both with/without brackets
            return value
        elif isinstance(term, Literal):
            return str(term.value)
        elif isinstance(term, BlankNode):
            return f"_:{term.label}"
        else:
            return str(term)

    def _apply_filter(self, df: pl.DataFrame, filter_clause: Filter) -> pl.DataFrame:
        """Apply a standard FILTER to the DataFrame."""
        expr = self._build_filter_expression(filter_clause.expression)
        if expr is not None:
            return df.filter(expr)
        return df

    def _apply_optional(
        self,
        df: pl.DataFrame,
        optional: OptionalPattern,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply an OPTIONAL pattern using left outer join.

        OPTIONAL { ... } patterns add bindings when matched but keep
        rows even when no match exists (with NULL for optional columns).
        """
        # Execute the optional patterns
        optional_df: Optional[pl.DataFrame] = None

        for i, pattern in enumerate(optional.patterns):
            if isinstance(pattern, (TriplePattern, QuotedTriplePattern)):
                pattern_df = self._execute_pattern(pattern, prefixes, 1000 + i)

                if optional_df is None:
                    optional_df = pattern_df
                else:
                    shared_cols = set(optional_df.columns) & set(pattern_df.columns)
                    shared_cols -= {"_pattern_idx"}

                    if shared_cols:
                        optional_df = optional_df.join(pattern_df, on=list(shared_cols), how="inner")
                    else:
                        optional_df = optional_df.join(pattern_df, how="cross")

        if optional_df is None or len(optional_df) == 0:
            return df

        # Apply filters within the optional block
        for filter_clause in optional.filters:
            optional_df = self._apply_filter(optional_df, filter_clause)

        # Remove internal columns from optional_df
        internal_cols = [c for c in optional_df.columns if c.startswith("_")]
        if internal_cols:
            optional_df = optional_df.drop(internal_cols)

        # Find shared columns for the join
        shared_cols = set(df.columns) & set(optional_df.columns)

        if shared_cols:
            # Left outer join - keep all rows from df, add optional columns where matched
            return df.join(optional_df, on=list(shared_cols), how="left")
        else:
            # No shared columns - this is unusual for OPTIONAL, but handle it
            return df
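
    # A minimal sketch of the OPTIONAL semantics above, assuming the required
    # part bound ?person and the optional part binds ?email:
    #
    #     required_df.join(optional_df, on=["person"], how="left")
    #
    # Rows without a matching email keep their ?person binding and get a null
    # ?email column, which is exactly the SPARQL OPTIONAL behaviour.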

    def _apply_union(
        self,
        df: pl.DataFrame,
        union: UnionPattern,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply a UNION pattern by combining results from alternatives.

        UNION combines results from multiple pattern groups:
            { ?s ?p ?o } UNION { ?s ?q ?r }

        Returns all rows matching ANY of the alternatives.
        """
        union_results = []

        for i, alternative in enumerate(union.alternatives):
            # Execute each alternative as a mini WHERE clause
            alt_where = WhereClause(patterns=alternative)
            alt_df = self._execute_where(alt_where, prefixes)

            if len(alt_df) > 0:
                union_results.append(alt_df)

        if not union_results:
            return df

        # Combine all union results
        if len(union_results) == 1:
            union_df = union_results[0]
        else:
            # Align schemas - add missing columns with null values
            all_columns = set()
            for r in union_results:
                all_columns.update(r.columns)

            aligned_results = []
            for r in union_results:
                missing = all_columns - set(r.columns)
                if missing:
                    for col in missing:
                        r = r.with_columns(pl.lit(None).alias(col))
                aligned_results.append(r.select(sorted(all_columns)))

            union_df = pl.concat(aligned_results, how="vertical")

        # If we have existing results, join with them
        if len(df) > 0 and len(df.columns) > 0:
            shared_cols = set(df.columns) & set(union_df.columns)
            if shared_cols:
                return df.join(union_df, on=list(shared_cols), how="inner")
            else:
                return df.join(union_df, how="cross")

        return union_df

    def _execute_union_standalone(
        self,
        union: UnionPattern,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Execute a UNION pattern as a standalone query (no prior patterns).

        Returns combined results from all alternatives.
        """
        union_results = []

        for alternative in union.alternatives:
            # Execute each alternative as a mini WHERE clause
            alt_where = WhereClause(patterns=alternative)
            alt_df = self._execute_where(alt_where, prefixes)

            if len(alt_df) > 0:
                union_results.append(alt_df)

        if not union_results:
            return pl.DataFrame()

        # Combine all union results
        if len(union_results) == 1:
            return union_results[0]

        # Align schemas - add missing columns with null values
        all_columns = set()
        for r in union_results:
            all_columns.update(r.columns)

        aligned_results = []
        for r in union_results:
            missing = all_columns - set(r.columns)
            if missing:
                for col in missing:
                    r = r.with_columns(pl.lit(None).alias(col))
            aligned_results.append(r.select(sorted(all_columns)))

        return pl.concat(aligned_results, how="vertical")

    def _apply_bind(
        self,
        df: pl.DataFrame,
        bind: Bind,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply a BIND clause, adding a new column with the computed value.

        BIND(?price * 1.1 AS ?taxed_price)
        BIND("default" AS ?label)
        """
        var_name = bind.variable.name

        # Handle different expression types
        if isinstance(bind.expression, Variable):
            # BIND(?x AS ?y) - copy column
            src_name = bind.expression.name
            if src_name in df.columns:
                df = df.with_columns(pl.col(src_name).alias(var_name))
        elif isinstance(bind.expression, Literal):
            # BIND("value" AS ?var) - add constant
            df = df.with_columns(pl.lit(bind.expression.value).alias(var_name))
        elif isinstance(bind.expression, IRI):
            # BIND(<uri> AS ?var) - add constant IRI
            value = self._resolve_term(bind.expression, prefixes)
            df = df.with_columns(pl.lit(value).alias(var_name))
        elif isinstance(bind.expression, Comparison):
            # BIND(?x > 5 AS ?flag) - boolean expression
            expr = self._build_filter_expression(bind.expression)
            if expr is not None:
                df = df.with_columns(expr.alias(var_name))
        elif isinstance(bind.expression, FunctionCall):
            # BIND(CONCAT(?a, ?b) AS ?c) - function call
            expr = self._build_function_call(bind.expression)
            if expr is not None:
                df = df.with_columns(expr.alias(var_name))

        return df

    def _apply_values(
        self,
        df: pl.DataFrame,
        values: ValuesClause,
        prefixes: dict[str, str]
    ) -> pl.DataFrame:
        """
        Apply a VALUES clause, joining with inline data.

        VALUES ?x { 1 2 3 }
        VALUES (?x ?y) { (1 2) (3 4) }
        """
        # Build a DataFrame from the VALUES data
        var_names = [v.name for v in values.variables]

        # Convert bindings to column data
        columns = {name: [] for name in var_names}

        for row in values.bindings:
            for i, val in enumerate(row):
                if i < len(var_names):
                    if val is None:
                        columns[var_names[i]].append(None)
                    elif isinstance(val, Literal):
                        columns[var_names[i]].append(val.value)
                    elif isinstance(val, IRI):
                        columns[var_names[i]].append(self._resolve_term(val, prefixes))
                    else:
                        columns[var_names[i]].append(str(val))

        values_df = pl.DataFrame(columns)

        if len(df) == 0 or len(df.columns) == 0:
            # VALUES is the only source - return it directly
            return values_df

        # Join with existing results
        shared_cols = set(df.columns) & set(values_df.columns)

        if shared_cols:
            # Inner join on shared columns - filter to matching values
            return df.join(values_df, on=list(shared_cols), how="inner")
        else:
            # Cross join - add all value combinations
            return df.join(values_df, how="cross")
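
    # A minimal sketch of the VALUES handling above, assuming VALUES ?x { 1 2 }
    # against bindings that already contain an "x" column:
    #
    #     values_df = pl.DataFrame({"x": ["1", "2"]})
    #     df.join(values_df, on=["x"], how="inner")
    #
    # i.e. inline data acts as a filter when variables overlap, and as a cross
    # join (all combinations) when they do not.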
|
|
1248
|
+
|
|
1249
|
+
def _build_filter_expression(
|
|
1250
|
+
self,
|
|
1251
|
+
expr: Union[Comparison, LogicalExpression, FunctionCall]
|
|
1252
|
+
) -> Optional[pl.Expr]:
|
|
1253
|
+
"""Build a Polars filter expression from SPARQL filter AST."""
|
|
1254
|
+
|
|
1255
|
+
if isinstance(expr, Comparison):
|
|
1256
|
+
# Handle type coercion for variable vs literal comparisons
|
|
1257
|
+
left, right = self._build_comparison_operands(expr.left, expr.right)
|
|
1258
|
+
|
|
1259
|
+
if left is None or right is None:
|
|
1260
|
+
return None
|
|
1261
|
+
|
|
1262
|
+
op_map = {
|
|
1263
|
+
ComparisonOp.EQ: lambda l, r: l == r,
|
|
1264
|
+
ComparisonOp.NE: lambda l, r: l != r,
|
|
1265
|
+
ComparisonOp.LT: lambda l, r: l < r,
|
|
1266
|
+
ComparisonOp.LE: lambda l, r: l <= r,
|
|
1267
|
+
ComparisonOp.GT: lambda l, r: l > r,
|
|
1268
|
+
ComparisonOp.GE: lambda l, r: l >= r,
|
|
1269
|
+
}
|
|
1270
|
+
|
|
1271
|
+
return op_map[expr.operator](left, right)
|
|
1272
|
+
|
|
1273
|
+
elif isinstance(expr, LogicalExpression):
|
|
1274
|
+
operand_exprs = [
|
|
1275
|
+
self._build_filter_expression(op) for op in expr.operands
|
|
1276
|
+
]
|
|
1277
|
+
operand_exprs = [e for e in operand_exprs if e is not None]
|
|
1278
|
+
|
|
1279
|
+
if not operand_exprs:
|
|
1280
|
+
return None
|
|
1281
|
+
|
|
1282
|
+
if expr.operator == LogicalOp.NOT:
|
|
1283
|
+
return ~operand_exprs[0]
|
|
1284
|
+
elif expr.operator == LogicalOp.AND:
|
|
1285
|
+
result = operand_exprs[0]
|
|
1286
|
+
for e in operand_exprs[1:]:
|
|
1287
|
+
result = result & e
|
|
1288
|
+
return result
|
|
1289
|
+
elif expr.operator == LogicalOp.OR:
|
|
1290
|
+
result = operand_exprs[0]
|
|
1291
|
+
for e in operand_exprs[1:]:
|
|
1292
|
+
result = result | e
|
|
1293
|
+
return result
|
|
1294
|
+
|
|
1295
|
+
elif isinstance(expr, FunctionCall):
|
|
1296
|
+
return self._build_function_call(expr)
|
|
1297
|
+
|
|
1298
|
+
return None
|
|
1299
|
+
|
|
1300
|
+
    def _build_comparison_operands(
        self,
        left_term: Union[Variable, Literal, IRI, FunctionCall],
        right_term: Union[Variable, Literal, IRI, FunctionCall]
    ) -> tuple[Optional[pl.Expr], Optional[pl.Expr]]:
        """
        Build comparison operands with proper type coercion.

        When comparing a variable (column) with a typed literal, uses the
        pre-computed typed value column (e.g., age_value) if available.
        """
        left = self._term_to_expr(left_term)
        right = self._term_to_expr(right_term)

        if left is None or right is None:
            return left, right

        # Use typed _value column for numeric comparisons with variables
        if isinstance(left_term, Variable) and isinstance(right_term, Literal):
            if right_term.datatype and self._is_numeric_datatype(right_term.datatype):
                # Use the pre-computed typed value column
                left = pl.col(f"{left_term.name}_value")
        elif isinstance(right_term, Variable) and isinstance(left_term, Literal):
            if left_term.datatype and self._is_numeric_datatype(left_term.datatype):
                # Use the pre-computed typed value column
                right = pl.col(f"{right_term.name}_value")

        return left, right

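    # Illustrative note (not part of the original module): the "_value" suffix
    # assumes the pattern-matching stage materializes a typed companion column
    # next to each lexical binding, e.g. ?age yielding both an "age" column
    # (lexical form) and an "age_value" column (native numeric dtype), so that
    # numeric FILTERs compare numbers rather than strings.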
    def _is_numeric_datatype(self, datatype: str) -> bool:
        """Check if a datatype is numeric (integer, decimal, double, float, boolean)."""
        numeric_indicators = ["integer", "int", "decimal", "float", "double", "boolean"]
        datatype_lower = datatype.lower()
        return any(ind in datatype_lower for ind in numeric_indicators)

    def _cast_column_for_comparison(self, col_expr: pl.Expr, datatype: str) -> pl.Expr:
        """Cast a column expression based on the datatype of the comparison literal."""
        if "integer" in datatype or "int" in datatype:
            return col_expr.cast(pl.Int64, strict=False)
        elif "decimal" in datatype or "float" in datatype or "double" in datatype:
            return col_expr.cast(pl.Float64, strict=False)
        elif "boolean" in datatype:
            return col_expr.cast(pl.Boolean, strict=False)
        return col_expr

    def _term_to_expr(
        self,
        term: Union[Variable, Literal, IRI, FunctionCall]
    ) -> Optional[pl.Expr]:
        """Convert a term to a Polars expression."""
        if isinstance(term, Variable):
            return pl.col(term.name)
        elif isinstance(term, Literal):
            # Convert typed literals to appropriate Python types
            value = term.value
            if term.datatype:
                value = self._convert_typed_value(value, term.datatype)
            return pl.lit(value)
        elif isinstance(term, IRI):
            return pl.lit(term.value)
        elif isinstance(term, FunctionCall):
            return self._build_function_call(term)
        return None

    def _convert_typed_value(self, value: Any, datatype: str) -> Any:
        """Convert a literal value based on its XSD datatype."""
        if isinstance(value, (int, float, bool)):
            return value  # Already native type

        # XSD numeric types
        if "integer" in datatype or "int" in datatype:
            try:
                return int(value)
            except (ValueError, TypeError):
                return value
        elif "decimal" in datatype or "float" in datatype or "double" in datatype:
            try:
                return float(value)
            except (ValueError, TypeError):
                return value
        elif "boolean" in datatype:
            if isinstance(value, str):
                return value.lower() == "true"
            return bool(value)

        return value

    def _build_function_call(self, func: FunctionCall) -> Optional[pl.Expr]:
        """Build a Polars expression for a SPARQL function."""
        name = func.name.upper()

        if name == "BOUND":
            if func.arguments and isinstance(func.arguments[0], Variable):
                return pl.col(func.arguments[0].name).is_not_null()

        elif name in ("ISIRI", "ISURI"):
            if func.arguments and isinstance(func.arguments[0], Variable):
                col = pl.col(func.arguments[0].name)
                return col.str.starts_with("http")

        elif name == "ISLITERAL":
            if func.arguments and isinstance(func.arguments[0], Variable):
                col = pl.col(func.arguments[0].name)
                return ~col.str.starts_with("http") & ~col.str.starts_with("_:")

        elif name == "ISBLANK":
            if func.arguments and isinstance(func.arguments[0], Variable):
                col = pl.col(func.arguments[0].name)
                return col.str.starts_with("_:")

        elif name == "STR":
            if func.arguments and isinstance(func.arguments[0], Variable):
                return pl.col(func.arguments[0].name).cast(pl.Utf8)

        # Add more functions as needed

        return None

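    # Illustrative note (not part of the original module): FILTER(BOUND(?x))
    # maps to pl.col("x").is_not_null(), ISBLANK(?x) to
    # pl.col("x").str.starts_with("_:"), and ISIRI(?x) to
    # pl.col("x").str.starts_with("http"), i.e. term kinds are inferred from
    # the lexical form of the bound value rather than from a separate type
    # column.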
    def _execute_insert_data(
        self,
        query: InsertDataQuery,
        provenance: Optional[ProvenanceContext] = None
    ) -> dict:
        """
        Execute an INSERT DATA query with RDF-Star provenance recognition.

        This method intelligently handles RDF-Star annotations:
        - Regular triples are inserted with default provenance
        - Quoted triple annotations like << s p o >> prov:wasAttributedTo "source"
          are recognized and applied to the base triple's metadata

        Args:
            query: The InsertDataQuery AST
            provenance: Optional default provenance context

        Returns:
            Dict with 'count' of inserted triples
        """
        if provenance is None:
            provenance = ProvenanceContext(source="SPARQL_INSERT", confidence=1.0)

        prefixes = query.prefixes

        # First pass: collect provenance annotations for quoted triples
        # Key: (subject, predicate, object) tuple of the base triple
        # Value: dict with 'source', 'confidence', 'timestamp' overrides
        provenance_annotations: dict[tuple[str, str, str], dict[str, Any]] = {}

        # Separate regular triples from provenance annotations
        regular_triples = []

        for triple in query.triples:
            # Check if this is a provenance annotation (subject is a quoted triple)
            if isinstance(triple.subject, QuotedTriplePattern):
                # This is an RDF-Star annotation like:
                # << ex:s ex:p ex:o >> prov:wasAttributedTo "IMDb" .
                quoted = triple.subject
                predicate_iri = self._resolve_term_value(triple.predicate, prefixes)
                obj_value = self._resolve_term_value(triple.object, prefixes)

                # Get the base triple key
                base_s = self._resolve_term_value(quoted.subject, prefixes)
                base_p = self._resolve_term_value(quoted.predicate, prefixes)
                base_o = self._resolve_term_value(quoted.object, prefixes)
                base_key = (base_s, base_p, base_o)

                # Initialize annotations dict for this triple if needed
                if base_key not in provenance_annotations:
                    provenance_annotations[base_key] = {}

                # Check if this predicate maps to a provenance field
                if predicate_iri in PROVENANCE_SOURCE_PREDICATES:
                    provenance_annotations[base_key]['source'] = str(obj_value)
                elif predicate_iri in PROVENANCE_CONFIDENCE_PREDICATES:
                    try:
                        conf_val = float(obj_value)
                        provenance_annotations[base_key]['confidence'] = conf_val
                    except (ValueError, TypeError):
                        # If can't parse as float, store as-is (will be ignored)
                        pass
                elif predicate_iri in PROVENANCE_TIMESTAMP_PREDICATES:
                    provenance_annotations[base_key]['timestamp'] = str(obj_value)
                else:
                    # Not a recognized provenance predicate - treat as regular triple
                    # (This creates an actual RDF-Star triple about the quoted triple)
                    regular_triples.append(triple)
            else:
                # Regular triple
                regular_triples.append(triple)

        # Second pass: insert regular triples with their provenance
        count = 0

        for triple in regular_triples:
            subject = self._resolve_term_value(triple.subject, prefixes)
            predicate = self._resolve_term_value(triple.predicate, prefixes)
            obj = self._resolve_term_value(triple.object, prefixes)

            # Check if we have provenance annotations for this triple
            triple_key = (subject, predicate, obj)
            if triple_key in provenance_annotations:
                annotations = provenance_annotations[triple_key]
                # Create provenance context with overrides
                triple_prov = ProvenanceContext(
                    source=annotations.get('source', provenance.source),
                    confidence=annotations.get('confidence', provenance.confidence),
                    timestamp=provenance.timestamp,
                )
            else:
                triple_prov = provenance

            self.store.add_triple(subject, predicate, obj, triple_prov)
            count += 1

        # Also insert any base triples that only had annotations (no regular triple)
        # This handles the case where annotations come first:
        # << ex:s ex:p ex:o >> prov:wasAttributedTo "source" .
        # (but no explicit ex:s ex:p ex:o . triple)
        inserted_keys = {
            (self._resolve_term_value(t.subject, prefixes),
             self._resolve_term_value(t.predicate, prefixes),
             self._resolve_term_value(t.object, prefixes))
            for t in regular_triples
            if not isinstance(t.subject, QuotedTriplePattern)
        }

        for base_key, annotations in provenance_annotations.items():
            if base_key not in inserted_keys:
                # This triple was only defined via annotations, insert it
                subject, predicate, obj = base_key
                triple_prov = ProvenanceContext(
                    source=annotations.get('source', provenance.source),
                    confidence=annotations.get('confidence', provenance.confidence),
                    timestamp=provenance.timestamp,
                )
                self.store.add_triple(subject, predicate, obj, triple_prov)
                count += 1

        return {"count": count, "operation": "INSERT DATA"}

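    # Illustrative sketch (not part of the original module): the update
    #
    #   PREFIX ex:   <http://example.org/>
    #   PREFIX prov: <http://www.w3.org/ns/prov#>
    #   INSERT DATA {
    #       ex:film ex:rating 8.7 .
    #       << ex:film ex:rating 8.7 >> prov:wasAttributedTo "IMDb" .
    #       << ex:film ex:rating 8.7 >> prov:value 0.95 .
    #   }
    #
    # ends up as a single base triple whose ProvenanceContext carries
    # source="IMDb" and confidence=0.95, assuming prov:wasAttributedTo and
    # prov:value are listed in PROVENANCE_SOURCE_PREDICATES and
    # PROVENANCE_CONFIDENCE_PREDICATES respectively.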
    def _execute_delete_data(self, query: DeleteDataQuery) -> dict:
        """
        Execute a DELETE DATA query.

        DELETE DATA {
            <subject> <predicate> <object> .
        }

        Deletes the specified concrete triples from the store.
        """
        prefixes = query.prefixes
        count = 0

        for triple in query.triples:
            subject = self._resolve_term_value(triple.subject, prefixes)
            predicate = self._resolve_term_value(triple.predicate, prefixes)
            obj = self._resolve_term_value(triple.object, prefixes)

            # Mark the triple as deleted
            deleted = self.store.mark_deleted(s=subject, p=predicate, o=obj)
            count += deleted

        return {"count": count, "operation": "DELETE DATA"}

    def _execute_delete_where(self, query: DeleteWhereQuery) -> dict:
        """
        Execute a DELETE WHERE query.

        DELETE WHERE { ?s ?p ?o }

        Finds all matching triples and deletes them.
        """
        # First, execute the WHERE clause to find matching bindings
        where = query.where
        prefixes = query.prefixes

        if not where.patterns:
            return {"count": 0, "operation": "DELETE WHERE", "error": "No patterns in WHERE clause"}

        # Execute WHERE to get bindings
        bindings = self._execute_where(where, prefixes)

        if bindings is None or bindings.height == 0:
            return {"count": 0, "operation": "DELETE WHERE"}

        # Build delete patterns from WHERE patterns
        count = 0
        for i in range(bindings.height):
            row = bindings.row(i, named=True)
            for pattern in where.patterns:
                if isinstance(pattern, TriplePattern):
                    # Resolve each component using bindings
                    subject = self._resolve_pattern_term(pattern.subject, row, query.prefixes)
                    predicate = self._resolve_pattern_term(pattern.predicate, row, query.prefixes)
                    obj = self._resolve_pattern_term(pattern.object, row, query.prefixes)

                    if subject and predicate and obj:
                        # Mark as deleted
                        deleted = self.store.mark_deleted(s=subject, p=predicate, o=obj)
                        count += deleted

        return {"count": count, "operation": "DELETE WHERE"}

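    # Illustrative sketch (not part of the original module):
    #   DELETE WHERE { ?s <http://example.org/deprecated> ?o }
    # first solves the pattern, then calls store.mark_deleted() for each
    # concrete (s, p, o) produced by the bindings and sums the reported
    # deletions into the returned count.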
    def _execute_modify(
        self,
        query: ModifyQuery,
        provenance: Optional[ProvenanceContext] = None
    ) -> dict:
        """
        Execute a DELETE/INSERT WHERE (modify) query.

        DELETE { <patterns> }
        INSERT { <patterns> }
        WHERE { <patterns> }

        1. Execute WHERE to get variable bindings
        2. For each binding, delete matching patterns from DELETE clause
        3. For each binding, insert patterns from INSERT clause
        """
        where = query.where
        prefixes = query.prefixes

        # Execute WHERE to get bindings
        bindings = self._execute_where(where, prefixes)

        if bindings is None or bindings.height == 0:
            # No matches - nothing to delete or insert
            return {
                "deleted": 0,
                "inserted": 0,
                "operation": "MODIFY"
            }

        deleted_count = 0
        inserted_count = 0

        # Process each row of bindings
        for i in range(bindings.height):
            row = bindings.row(i, named=True)

            # Delete patterns
            for pattern in query.delete_patterns:
                subject = self._resolve_pattern_term(pattern.subject, row, query.prefixes)
                predicate = self._resolve_pattern_term(pattern.predicate, row, query.prefixes)
                obj = self._resolve_pattern_term(pattern.object, row, query.prefixes)

                if subject and predicate and obj:
                    deleted = self.store.mark_deleted(s=subject, p=predicate, o=obj)
                    deleted_count += deleted

            # Insert patterns
            for pattern in query.insert_patterns:
                subject = self._resolve_pattern_term(pattern.subject, row, query.prefixes)
                predicate = self._resolve_pattern_term(pattern.predicate, row, query.prefixes)
                obj = self._resolve_pattern_term(pattern.object, row, query.prefixes)

                if subject and predicate and obj:
                    prov = provenance or ProvenanceContext(source="SPARQL_UPDATE", confidence=1.0)
                    self.store.add_triple(subject, predicate, obj, prov)
                    inserted_count += 1

        return {
            "deleted": deleted_count,
            "inserted": inserted_count,
            "operation": "MODIFY"
        }

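    # Illustrative sketch (not part of the original module): a predicate
    # rename such as
    #   DELETE { ?s <http://example.org/oldName> ?o }
    #   INSERT { ?s <http://example.org/newName> ?o }
    #   WHERE  { ?s <http://example.org/oldName> ?o }
    # runs row by row over the WHERE bindings: the old triple is marked
    # deleted, then the rewritten one is added with SPARQL_UPDATE provenance
    # unless an explicit ProvenanceContext is supplied by the caller.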
    # =================================================================
    # Graph Management Execution Methods
    # =================================================================

    def _execute_create_graph(self, query: CreateGraphQuery) -> dict:
        """Execute a CREATE GRAPH query."""
        graph_uri = self._resolve_term_value(query.graph_uri, query.prefixes)
        try:
            self.store.create_graph(graph_uri)
            return {"operation": "CREATE GRAPH", "graph": graph_uri, "success": True}
        except ValueError as e:
            if query.silent:
                return {"operation": "CREATE GRAPH", "graph": graph_uri, "success": False, "reason": str(e)}
            raise

    def _execute_drop_graph(self, query: DropGraphQuery) -> dict:
        """Execute a DROP GRAPH query."""
        if query.target == "default":
            # Drop the default graph (clear triples with empty graph)
            self.store.clear_graph(None, silent=query.silent)
            return {"operation": "DROP", "target": "DEFAULT", "success": True}
        elif query.target == "named":
            # Drop all named graphs
            graphs = self.store.list_graphs()
            for g in graphs:
                if g:  # Skip default graph
                    self.store.drop_graph(g, silent=query.silent)
            return {"operation": "DROP", "target": "NAMED", "graphs_dropped": len([g for g in graphs if g]), "success": True}
        elif query.target == "all":
            # Drop all graphs including default
            graphs = self.store.list_graphs()
            for g in graphs:
                if g:
                    self.store.drop_graph(g, silent=query.silent)
            self.store.clear_graph(None, silent=query.silent)
            return {"operation": "DROP", "target": "ALL", "success": True}
        else:
            # Drop specific graph
            graph_uri = self._resolve_term_value(query.graph_uri, query.prefixes)
            try:
                self.store.drop_graph(graph_uri, silent=query.silent)
                return {"operation": "DROP GRAPH", "graph": graph_uri, "success": True}
            except ValueError as e:
                if query.silent:
                    return {"operation": "DROP GRAPH", "graph": graph_uri, "success": False, "reason": str(e)}
                raise

    def _execute_clear_graph(self, query: ClearGraphQuery) -> dict:
        """Execute a CLEAR GRAPH query."""
        if query.target == "default":
            count = self.store.clear_graph(None, silent=query.silent)
            return {"operation": "CLEAR", "target": "DEFAULT", "triples_cleared": count, "success": True}
        elif query.target == "named":
            total_cleared = 0
            graphs = self.store.list_graphs()
            for g in graphs:
                if g:  # Skip default graph
                    count = self.store.clear_graph(g, silent=query.silent)
                    total_cleared += count
            return {"operation": "CLEAR", "target": "NAMED", "triples_cleared": total_cleared, "success": True}
        elif query.target == "all":
            total_cleared = 0
            graphs = self.store.list_graphs()
            for g in graphs:
                count = self.store.clear_graph(g if g else None, silent=query.silent)
                total_cleared += count
            return {"operation": "CLEAR", "target": "ALL", "triples_cleared": total_cleared, "success": True}
        else:
            # Clear specific graph
            graph_uri = self._resolve_term_value(query.graph_uri, query.prefixes)
            try:
                count = self.store.clear_graph(graph_uri, silent=query.silent)
                return {"operation": "CLEAR GRAPH", "graph": graph_uri, "triples_cleared": count, "success": True}
            except ValueError as e:
                if query.silent:
                    return {"operation": "CLEAR GRAPH", "graph": graph_uri, "success": False, "reason": str(e)}
                raise

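    # Illustrative sketch (not part of the original module):
    #   CLEAR SILENT GRAPH <http://example.org/staging>
    # returns {"operation": "CLEAR GRAPH", "graph": "...", "triples_cleared": n,
    # "success": True} on success; with SILENT, a missing graph produces
    # success=False plus a "reason" string instead of propagating the error.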
    def _execute_load(self, query: LoadQuery, provenance: Optional[ProvenanceContext] = None) -> dict:
        """Execute a LOAD query."""
        source_uri = self._resolve_term_value(query.source_uri, query.prefixes)
        graph_uri = None
        if query.graph_uri:
            graph_uri = self._resolve_term_value(query.graph_uri, query.prefixes)

        try:
            count = self.store.load_graph(source_uri, graph_uri, silent=query.silent)
            return {
                "operation": "LOAD",
                "source": source_uri,
                "graph": graph_uri,
                "triples_loaded": count,
                "success": True
            }
        except Exception as e:
            if query.silent:
                return {
                    "operation": "LOAD",
                    "source": source_uri,
                    "graph": graph_uri,
                    "success": False,
                    "reason": str(e)
                }
            raise

    def _execute_copy_graph(self, query: CopyGraphQuery) -> dict:
        """Execute a COPY graph query."""
        source = None
        if not query.source_is_default and query.source_graph:
            source = self._resolve_term_value(query.source_graph, query.prefixes)

        dest = None
        if query.dest_graph:
            dest = self._resolve_term_value(query.dest_graph, query.prefixes)

        try:
            count = self.store.copy_graph(source, dest, silent=query.silent)
            return {
                "operation": "COPY",
                "source": source or "DEFAULT",
                "destination": dest or "DEFAULT",
                "triples_copied": count,
                "success": True
            }
        except ValueError as e:
            if query.silent:
                return {
                    "operation": "COPY",
                    "source": source or "DEFAULT",
                    "destination": dest or "DEFAULT",
                    "success": False,
                    "reason": str(e)
                }
            raise

    def _execute_move_graph(self, query: MoveGraphQuery) -> dict:
        """Execute a MOVE graph query."""
        source = None
        if not query.source_is_default and query.source_graph:
            source = self._resolve_term_value(query.source_graph, query.prefixes)

        dest = None
        if query.dest_graph:
            dest = self._resolve_term_value(query.dest_graph, query.prefixes)

        try:
            count = self.store.move_graph(source, dest, silent=query.silent)
            return {
                "operation": "MOVE",
                "source": source or "DEFAULT",
                "destination": dest or "DEFAULT",
                "triples_moved": count,
                "success": True
            }
        except ValueError as e:
            if query.silent:
                return {
                    "operation": "MOVE",
                    "source": source or "DEFAULT",
                    "destination": dest or "DEFAULT",
                    "success": False,
                    "reason": str(e)
                }
            raise

    def _execute_add_graph(self, query: AddGraphQuery) -> dict:
        """Execute an ADD graph query."""
        source = None
        if not query.source_is_default and query.source_graph:
            source = self._resolve_term_value(query.source_graph, query.prefixes)

        dest = None
        if query.dest_graph:
            dest = self._resolve_term_value(query.dest_graph, query.prefixes)

        try:
            count = self.store.add_graph(source, dest, silent=query.silent)
            return {
                "operation": "ADD",
                "source": source or "DEFAULT",
                "destination": dest or "DEFAULT",
                "triples_added": count,
                "success": True
            }
        except ValueError as e:
            if query.silent:
                return {
                    "operation": "ADD",
                    "source": source or "DEFAULT",
                    "destination": dest or "DEFAULT",
                    "success": False,
                    "reason": str(e)
                }
            raise

    def _resolve_pattern_term(
        self,
        term: Term,
        bindings: dict[str, Any],
        prefixes: dict[str, str]
    ) -> Optional[str]:
        """
        Resolve a pattern term using variable bindings.

        Args:
            term: The term (Variable, IRI, Literal, etc.)
            bindings: Variable bindings from WHERE execution
            prefixes: Prefix mappings

        Returns:
            The resolved value or None if variable not bound
        """
        if isinstance(term, Variable):
            value = bindings.get(term.name)
            if value is None:
                return None
            return str(value)
        else:
            return self._resolve_term_value(term, prefixes)

    def _resolve_term_value(self, term: Term, prefixes: dict[str, str]) -> Any:
        """Resolve a term to its actual value, expanding prefixes."""
        if isinstance(term, IRI):
            iri = term.value
            # Check if it's a prefixed name
            if ":" in iri and not iri.startswith("http"):
                prefix, local = iri.split(":", 1)
                if prefix in prefixes:
                    return prefixes[prefix] + local
            return iri
        elif isinstance(term, Literal):
            return term.value
        elif isinstance(term, BlankNode):
            return f"_:{term.id}"
        else:
            return str(term)
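    # Illustrative note (not part of the original module): with
    # prefixes = {"ex": "http://example.org/"}, the prefixed name ex:alice
    # resolves to http://example.org/alice, while absolute IRIs and prefixed
    # names whose prefix is not declared are returned unchanged.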


def execute_sparql(
    store: "TripleStore",
    query_string: str,
    provenance: Optional[ProvenanceContext] = None
) -> Union[pl.DataFrame, bool, dict]:
    """
    Convenience function to parse and execute a SPARQL-Star query.

    Args:
        store: The TripleStore to query
        query_string: SPARQL-Star query string
        provenance: Optional provenance for INSERT/DELETE operations

    Returns:
        Query results (DataFrame for SELECT, bool for ASK, dict for UPDATE)
    """
    from rdf_starbase.sparql.parser import parse_query

    query = parse_query(query_string)
    executor = SPARQLExecutor(store)
    return executor.execute(query, provenance)
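
# Illustrative usage sketch (not part of the original module): how the
# convenience wrapper above is typically called. Results follow the contract
# documented in execute_sparql(): a dict for updates, a Polars DataFrame for
# SELECT. Obtaining a TripleStore instance is left to the caller, since its
# constructor lives elsewhere in the package.
def _example_usage(store: "TripleStore") -> pl.DataFrame:
    # Insert a triple together with an RDF-Star provenance annotation.
    execute_sparql(store, """
        PREFIX ex:   <http://example.org/>
        PREFIX prov: <http://www.w3.org/ns/prov#>
        INSERT DATA {
            ex:alice ex:knows ex:bob .
            << ex:alice ex:knows ex:bob >> prov:wasAttributedTo "demo" .
        }
    """)
    # Query it back; the result is a Polars DataFrame of bindings.
    return execute_sparql(store, """
        PREFIX ex: <http://example.org/>
        SELECT ?s ?o WHERE { ?s ex:knows ?o }
    """)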