rdf-starbase 0.1.0__py3-none-any.whl

@@ -0,0 +1,739 @@
+ """
+ Repository Management API Router.
+
+ Provides REST endpoints for managing multiple repositories:
+ - Create/delete repositories
+ - List repositories
+ - Get repository info
+ - Scoped SPARQL queries per repository
+ """
+
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Optional
+ import os
+
+ from fastapi import APIRouter, HTTPException, Query
+ from pydantic import BaseModel, Field
+ import polars as pl
+
+ from rdf_starbase.repositories import RepositoryManager, RepositoryInfo
+ from rdf_starbase import execute_sparql
+
+
+ # =============================================================================
+ # Pydantic Models
+ # =============================================================================
+
+ class CreateRepositoryRequest(BaseModel):
+     """Request to create a new repository."""
+     name: str = Field(..., description="Unique repository name (alphanumeric, hyphens, underscores)")
+     description: str = Field(default="", description="Human-readable description")
+     tags: list[str] = Field(default_factory=list, description="Optional tags")
+
+
+ class UpdateRepositoryRequest(BaseModel):
+     """Request to update repository metadata."""
+     description: Optional[str] = Field(None, description="New description")
+     tags: Optional[list[str]] = Field(None, description="New tags")
+
+
+ class RenameRepositoryRequest(BaseModel):
+     """Request to rename a repository."""
+     new_name: str = Field(..., description="New repository name")
+
+
+ class SPARQLQueryRequest(BaseModel):
+     """SPARQL query for a specific repository."""
+     query: str = Field(..., description="SPARQL-Star query string")
+
+
+ class RepositoryResponse(BaseModel):
+     """Response containing repository info."""
+     name: str
+     description: str
+     tags: list[str]
+     created_at: str
+     triple_count: int
+     subject_count: int
+     predicate_count: int
+
+     @classmethod
+     def from_info(cls, info: RepositoryInfo) -> "RepositoryResponse":
+         return cls(
+             name=info.name,
+             description=info.description,
+             tags=info.tags,
+             created_at=info.created_at.isoformat(),
+             triple_count=info.triple_count,
+             subject_count=info.subject_count,
+             predicate_count=info.predicate_count,
+         )
+
+
+ def dataframe_to_records(df: pl.DataFrame) -> list[dict]:
+     """Convert a Polars DataFrame to a list of dicts for JSON serialization."""
+     records = []
+     for row in df.iter_rows(named=True):
+         record = {}
+         for k, v in row.items():
+             # Datetimes are not JSON-serializable; everything else passes through.
+             if isinstance(v, datetime):
+                 record[k] = v.isoformat()
+             else:
+                 record[k] = v
+         records.append(record)
+     return records
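+
+ # For example (illustrative values): a row like
+ #     {"s": "http://example.org/a", "ts": datetime(2024, 1, 1)}
+ # serializes to
+ #     {"s": "http://example.org/a", "ts": "2024-01-01T00:00:00"}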
+
+
+ def create_repository_router(
+     workspace_path: Optional[str | Path] = None
+ ) -> tuple[APIRouter, RepositoryManager]:
+     """
+     Create the repository management API router.
+
+     Args:
+         workspace_path: Path to store repositories (default: ./data/repositories)
+
+     Returns:
+         Tuple of (router, manager)
+     """
+     # Default workspace path
+     if workspace_path is None:
+         workspace_path = os.environ.get(
+             "RDFSTARBASE_REPOSITORY_PATH",
+             "./data/repositories"
+         )
+
+     manager = RepositoryManager(workspace_path)
+     router = APIRouter(prefix="/repositories", tags=["Repositories"])
+
+     # =========================================================================
+     # Repository CRUD
+     # =========================================================================
+
+     @router.get("")
+     async def list_repositories():
+         """List all repositories."""
+         repos = manager.list_repositories()
+         return {
+             "count": len(repos),
+             "repositories": [RepositoryResponse.from_info(r).model_dump() for r in repos]
+         }
+
+     @router.post("")
+     async def create_repository(request: CreateRepositoryRequest):
+         """Create a new repository."""
+         try:
+             info = manager.create(
+                 name=request.name,
+                 description=request.description,
+                 tags=request.tags,
+             )
+             # Auto-save after creation
+             manager.save(request.name)
+             return {
+                 "success": True,
+                 "message": f"Repository '{request.name}' created",
+                 "repository": RepositoryResponse.from_info(info).model_dump()
+             }
+         except ValueError as e:
+             raise HTTPException(status_code=400, detail=str(e))
+
+     @router.get("/{name}")
+     async def get_repository(name: str):
+         """Get repository info."""
+         try:
+             info = manager.get_info(name)
+             return RepositoryResponse.from_info(info).model_dump()
+         except ValueError as e:
+             raise HTTPException(status_code=404, detail=str(e))
+
+     @router.patch("/{name}")
+     async def update_repository(name: str, request: UpdateRepositoryRequest):
+         """Update repository metadata."""
+         try:
+             info = manager.update_info(
+                 name=name,
+                 description=request.description,
+                 tags=request.tags,
+             )
+             return RepositoryResponse.from_info(info).model_dump()
+         except ValueError as e:
+             raise HTTPException(status_code=404, detail=str(e))
+
+     @router.delete("/{name}")
+     async def delete_repository(
+         name: str,
+         force: bool = Query(False, description="Force delete even if repository has data")
+     ):
+         """Delete a repository."""
+         try:
+             manager.delete(name, force=force)
+             return {
+                 "success": True,
+                 "message": f"Repository '{name}' deleted"
+             }
+         except ValueError as e:
+             if "does not exist" in str(e):
+                 raise HTTPException(status_code=404, detail=str(e))
+             raise HTTPException(status_code=400, detail=str(e))
+
+     @router.post("/{name}/rename")
+     async def rename_repository(name: str, request: RenameRepositoryRequest):
+         """Rename a repository."""
+         try:
+             info = manager.rename(name, request.new_name)
+             return {
+                 "success": True,
+                 "message": f"Repository renamed from '{name}' to '{request.new_name}'",
+                 "repository": RepositoryResponse.from_info(info).model_dump()
+             }
+         except ValueError as e:
+             if "does not exist" in str(e):
+                 raise HTTPException(status_code=404, detail=str(e))
+             raise HTTPException(status_code=400, detail=str(e))
+
+     # =========================================================================
+     # Repository SPARQL
+     # =========================================================================
+
+     @router.post("/{name}/sparql")
+     async def repository_sparql(name: str, request: SPARQLQueryRequest):
+         """Execute a SPARQL query against a specific repository."""
+         try:
+             store = manager.get_store(name)
+         except ValueError as e:
+             raise HTTPException(status_code=404, detail=str(e))
+
+         try:
+             result = execute_sparql(store, request.query)
+
+             if isinstance(result, bool):
+                 # ASK query
+                 return {"type": "ask", "result": result}
+             elif isinstance(result, dict):
+                 # UPDATE operation
+                 # Auto-save after update
+                 manager.save(name)
+                 return {
+                     "type": "update",
+                     "operation": result.get("operation", "unknown"),
+                     "count": result.get("count", 0),
+                     "success": True,
+                 }
+             elif isinstance(result, pl.DataFrame):
+                 # SELECT query
+                 return {
+                     "type": "select",
+                     "count": len(result),
+                     "columns": result.columns,
+                     "results": dataframe_to_records(result),
+                 }
+             else:
+                 return {"type": "unknown", "result": str(result)}
+
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=f"Query error: {str(e)}")
+
+     # =========================================================================
+     # Repository Triple Management
+     # =========================================================================
+
+     @router.get("/{name}/triples")
+     async def get_repository_triples(
+         name: str,
+         subject: Optional[str] = Query(None, description="Filter by subject"),
+         predicate: Optional[str] = Query(None, description="Filter by predicate"),
+         object: Optional[str] = Query(None, description="Filter by object"),
+         limit: int = Query(100, ge=1, le=10000, description="Maximum results"),
+     ):
+         """Get triples from a specific repository."""
+         try:
+             store = manager.get_store(name)
+         except ValueError as e:
+             raise HTTPException(status_code=404, detail=str(e))
+
+         df = store.get_triples(
+             subject=subject,
+             predicate=predicate,
+             obj=object,
+         )
+
+         df = df.head(limit)
+
+         return {
+             "count": len(df),
+             "triples": dataframe_to_records(df),
+         }
+
+     @router.post("/{name}/triples/batch")
+     async def add_repository_triples_batch(
+         name: str,
+         triples: list[dict]
+     ):
+         """Add multiple triples to a specific repository."""
+         try:
+             store = manager.get_store(name)
+         except ValueError as e:
+             raise HTTPException(status_code=404, detail=str(e))
+
+         try:
+             count = store.add_triples_batch(triples)
+             # Auto-save after batch insert
+             manager.save(name)
+             return {
+                 "success": True,
+                 "count": count,
+                 "message": f"Added {count} triples to repository '{name}'",
+             }
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=str(e))
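+
+     # Example request body for POST /repositories/{name}/triples/batch
+     # (illustrative values; the accepted keys mirror the dataset helpers
+     # defined at the bottom of this module):
+     #
+     #     [
+     #         {
+     #             "subject": "http://example.org/s",
+     #             "predicate": "http://example.org/p",
+     #             "object": "o",
+     #             "source": "manual",
+     #             "confidence": 0.9
+     #         }
+     #     ]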
+
+     # =========================================================================
+     # Repository Stats
+     # =========================================================================
+
+     @router.get("/{name}/stats")
+     async def get_repository_stats(name: str):
+         """Get detailed statistics for a repository."""
+         try:
+             store = manager.get_store(name)
+             info = manager.get_info(name)
+         except ValueError as e:
+             raise HTTPException(status_code=404, detail=str(e))
+
+         return {
+             "name": name,
+             "description": info.description,
+             "created_at": info.created_at.isoformat(),
+             "stats": store.stats(),
+         }
+
+     # =========================================================================
+     # Persistence
+     # =========================================================================
+
+     @router.post("/{name}/save")
+     async def save_repository(name: str):
+         """Explicitly save a repository to disk."""
+         try:
+             manager.save(name)
+             return {
+                 "success": True,
+                 "message": f"Repository '{name}' saved"
+             }
+         except ValueError as e:
+             raise HTTPException(status_code=404, detail=str(e))
+
+     @router.post("/save-all")
+     async def save_all_repositories():
+         """Save all loaded repositories to disk."""
+         manager.save_all()
+         return {
+             "success": True,
+             "message": "All repositories saved"
+         }
+
+     # =========================================================================
+     # Example Datasets
+     # =========================================================================
+
+     @router.get("/examples/datasets")
+     async def list_example_datasets():
+         """List available example datasets."""
+         return {
+             "datasets": [
+                 {
+                     "id": "movies",
+                     "name": "Movies & Directors",
+                     "description": "Sample movie data with directors, actors, and genres. Great for learning SPARQL.",
+                     "triple_count": 47,
+                     "tags": ["movies", "entertainment", "schema.org"]
+                 },
+                 {
+                     "id": "techcorp",
+                     "name": "TechCorp Customer Service",
+                     "description": "Customer service scenario with tickets, products, and customer data. Includes conflicting data from multiple sources.",
+                     "triple_count": 47,
+                     "tags": ["enterprise", "CRM", "support"]
+                 },
+                 {
+                     "id": "knowledge_graph",
+                     "name": "Simple Knowledge Graph",
+                     "description": "Basic knowledge graph with people, organizations, and relationships.",
+                     "triple_count": 42,
+                     "tags": ["people", "organizations", "relationships"]
+                 },
+                 {
+                     "id": "rdf_star_demo",
+                     "name": "RDF-Star Demo",
+                     "description": "Demonstrates RDF-Star features with quoted triples, annotations, and provenance metadata.",
+                     "triple_count": 24,
+                     "tags": ["rdf-star", "annotations", "provenance"]
+                 }
+             ]
+         }
+
+     @router.post("/{name}/load-example/{dataset_id}")
+     async def load_example_dataset(name: str, dataset_id: str):
+         """Load an example dataset into a repository."""
+         try:
+             store = manager.get_store(name)
+         except ValueError as e:
+             raise HTTPException(status_code=404, detail=str(e))
+
+         datasets = {
+             "movies": get_movies_dataset_triples,
+             "techcorp": get_techcorp_dataset_triples,
+             "knowledge_graph": get_knowledge_graph_dataset_triples,
+             "rdf_star_demo": get_rdf_star_demo_dataset_triples,
+         }
+
+         if dataset_id not in datasets:
+             raise HTTPException(status_code=404, detail=f"Unknown dataset: {dataset_id}")
+
+         triples = datasets[dataset_id]()
+
+         try:
+             count = store.add_triples_batch(triples)
+             manager.save(name)
+             return {
+                 "success": True,
+                 "dataset": dataset_id,
+                 "count": count,
+                 "message": f"Loaded example dataset '{dataset_id}' into repository '{name}'",
+                 "stats": store.stats()
+             }
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=f"Failed to load dataset: {str(e)}")
+
+     return router, manager
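+
+
+ # Illustrative client sketch (assumes a FastAPI app serving this router on
+ # localhost:8000; host, port, and the repository name "demo" are assumptions):
+ #
+ #     import httpx
+ #
+ #     httpx.post("http://localhost:8000/repositories", json={"name": "demo"})
+ #     httpx.post(
+ #         "http://localhost:8000/repositories/demo/sparql",
+ #         json={"query": "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10"},
+ #     )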
+
+
+ # =============================================================================
+ # Example Datasets with Provenance
+ # =============================================================================
+
+ def get_movies_dataset_triples() -> list[dict]:
+     """Movies & Directors dataset with varied sources and confidence."""
+     RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+     SCHEMA = "http://schema.org/"
+     EX = "http://example.org/"
+
+     triples = []
+
+     # Helper to add a triple with provenance
+     def add(s, p, o, source, confidence):
+         triples.append({
+             "subject": s,
+             "predicate": p,
+             "object": o,
+             "source": source,
+             "confidence": confidence
+         })
+
+     # Directors - from Wikipedia (high confidence)
+     add(f"{EX}person/nolan", RDF_TYPE, f"{SCHEMA}Person", "Wikipedia", 0.98)
+     add(f"{EX}person/nolan", f"{SCHEMA}name", "Christopher Nolan", "Wikipedia", 0.99)
+     add(f"{EX}person/nolan", f"{SCHEMA}birthDate", "1970-07-30", "Wikipedia", 0.95)
+     add(f"{EX}person/nolan", f"{SCHEMA}nationality", "British-American", "Wikipedia", 0.92)
+
+     add(f"{EX}person/spielberg", RDF_TYPE, f"{SCHEMA}Person", "Wikipedia", 0.98)
+     add(f"{EX}person/spielberg", f"{SCHEMA}name", "Steven Spielberg", "Wikipedia", 0.99)
+     add(f"{EX}person/spielberg", f"{SCHEMA}birthDate", "1946-12-18", "Wikipedia", 0.97)
+
+     add(f"{EX}person/greta", RDF_TYPE, f"{SCHEMA}Person", "IMDb", 0.94)
+     add(f"{EX}person/greta", f"{SCHEMA}name", "Greta Gerwig", "IMDb", 0.96)
+     add(f"{EX}person/greta", f"{SCHEMA}birthDate", "1983-08-04", "IMDb", 0.90)
+
+     # Actors - from IMDb (good confidence)
+     add(f"{EX}person/dicaprio", RDF_TYPE, f"{SCHEMA}Person", "IMDb", 0.97)
+     add(f"{EX}person/dicaprio", f"{SCHEMA}name", "Leonardo DiCaprio", "IMDb", 0.99)
+     add(f"{EX}person/dicaprio", f"{SCHEMA}birthDate", "1974-11-11", "IMDb", 0.95)
+
+     add(f"{EX}person/cillian", RDF_TYPE, f"{SCHEMA}Person", "IMDb", 0.96)
+     add(f"{EX}person/cillian", f"{SCHEMA}name", "Cillian Murphy", "IMDb", 0.98)
+
+     add(f"{EX}person/margot", RDF_TYPE, f"{SCHEMA}Person", "IMDb", 0.97)
+     add(f"{EX}person/margot", f"{SCHEMA}name", "Margot Robbie", "IMDb", 0.99)
+
+     # Movies - mixed sources
+     # Inception - from multiple sources
+     add(f"{EX}movie/inception", RDF_TYPE, f"{SCHEMA}Movie", "IMDb", 0.99)
+     add(f"{EX}movie/inception", f"{SCHEMA}name", "Inception", "IMDb", 0.99)
+     add(f"{EX}movie/inception", f"{SCHEMA}datePublished", "2010", "IMDb", 0.98)
+     add(f"{EX}movie/inception", f"{SCHEMA}director", f"{EX}person/nolan", "IMDb", 0.99)
+     add(f"{EX}movie/inception", f"{SCHEMA}actor", f"{EX}person/dicaprio", "IMDb", 0.99)
+     add(f"{EX}movie/inception", f"{SCHEMA}genre", "Sci-Fi", "RottenTomatoes", 0.85)
+     add(f"{EX}movie/inception", f"{SCHEMA}duration", "PT2H28M", "IMDb", 0.97)
+
+     # Oppenheimer - recent film, high confidence
+     add(f"{EX}movie/oppenheimer", RDF_TYPE, f"{SCHEMA}Movie", "IMDb", 0.99)
+     add(f"{EX}movie/oppenheimer", f"{SCHEMA}name", "Oppenheimer", "IMDb", 0.99)
+     add(f"{EX}movie/oppenheimer", f"{SCHEMA}datePublished", "2023", "BoxOfficeMojo", 0.98)
+     add(f"{EX}movie/oppenheimer", f"{SCHEMA}director", f"{EX}person/nolan", "Wikipedia", 0.99)
+     add(f"{EX}movie/oppenheimer", f"{SCHEMA}actor", f"{EX}person/cillian", "IMDb", 0.98)
+     add(f"{EX}movie/oppenheimer", f"{SCHEMA}genre", "Drama", "RottenTomatoes", 0.88)
+     add(f"{EX}movie/oppenheimer", f"{SCHEMA}genre", "Biography", "Wikipedia", 0.82)  # Multiple genres!
+
+     # Interstellar
+     add(f"{EX}movie/interstellar", RDF_TYPE, f"{SCHEMA}Movie", "IMDb", 0.99)
+     add(f"{EX}movie/interstellar", f"{SCHEMA}name", "Interstellar", "IMDb", 0.99)
+     add(f"{EX}movie/interstellar", f"{SCHEMA}datePublished", "2014", "IMDb", 0.98)
+     add(f"{EX}movie/interstellar", f"{SCHEMA}director", f"{EX}person/nolan", "Wikipedia", 0.99)
+     add(f"{EX}movie/interstellar", f"{SCHEMA}genre", "Sci-Fi", "IMDb", 0.92)
+
+     # Jurassic Park - classic film
+     add(f"{EX}movie/jurassic_park", RDF_TYPE, f"{SCHEMA}Movie", "Wikipedia", 0.99)
+     add(f"{EX}movie/jurassic_park", f"{SCHEMA}name", "Jurassic Park", "Wikipedia", 0.99)
+     add(f"{EX}movie/jurassic_park", f"{SCHEMA}datePublished", "1993", "Wikipedia", 0.99)
+     add(f"{EX}movie/jurassic_park", f"{SCHEMA}director", f"{EX}person/spielberg", "Wikipedia", 0.99)
+     add(f"{EX}movie/jurassic_park", f"{SCHEMA}genre", "Adventure", "IMDb", 0.90)
+
+     # Barbie - from entertainment news (slightly lower confidence)
+     add(f"{EX}movie/barbie", RDF_TYPE, f"{SCHEMA}Movie", "BoxOfficeMojo", 0.97)
+     add(f"{EX}movie/barbie", f"{SCHEMA}name", "Barbie", "BoxOfficeMojo", 0.99)
+     add(f"{EX}movie/barbie", f"{SCHEMA}datePublished", "2023", "BoxOfficeMojo", 0.98)
+     add(f"{EX}movie/barbie", f"{SCHEMA}director", f"{EX}person/greta", "IMDb", 0.97)
+     add(f"{EX}movie/barbie", f"{SCHEMA}actor", f"{EX}person/margot", "IMDb", 0.98)
+     add(f"{EX}movie/barbie", f"{SCHEMA}genre", "Comedy", "RottenTomatoes", 0.80)
+
+     return triples
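+
+ # A basic SPARQL query over this dataset (illustrative sketch; assumes the
+ # store exposes the "subject"/"predicate"/"object" keys as plain triples):
+ #
+ #     SELECT ?movie ?title WHERE {
+ #         ?movie <http://schema.org/director> <http://example.org/person/nolan> .
+ #         ?movie <http://schema.org/name> ?title .
+ #     }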
+
+
+ def get_techcorp_dataset_triples() -> list[dict]:
+     """TechCorp Customer Service dataset with conflicting data from multiple sources."""
+     RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+     TC = "http://techcorp.com/"
+     FOAF = "http://xmlns.com/foaf/0.1/"
+     RDFS = "http://www.w3.org/2000/01/rdf-schema#"
+
+     triples = []
+
+     def add(s, p, o, source, confidence):
+         triples.append({
+             "subject": s,
+             "predicate": p,
+             "object": o,
+             "source": source,
+             "confidence": confidence
+         })
+
+     # Customer C001 - Alice Johnson (CONFLICTING DATA from different systems!)
+     add(f"{TC}customer/C001", RDF_TYPE, f"{TC}Customer", "CRM_System", 0.99)
+     add(f"{TC}customer/C001", f"{FOAF}name", "Alice Johnson", "CRM_System", 0.98)
+     add(f"{TC}customer/C001", f"{TC}email", "alice@example.com", "CRM_System", 0.95)
+     add(f"{TC}customer/C001", f"{TC}tier", "Premium", "CRM_System", 0.90)
+     add(f"{TC}customer/C001", f"{TC}since", "2020-01-15", "CRM_System", 0.99)
+
+     # CONFLICT: Billing system says different tier!
+     add(f"{TC}customer/C001", f"{TC}tier", "Enterprise", "Billing_System", 0.85)
+     # CONFLICT: Support portal has different email
+     add(f"{TC}customer/C001", f"{TC}email", "a.johnson@corp.example.com", "Support_Portal", 0.75)
+
+     # Customer C002 - Bob Smith
+     add(f"{TC}customer/C002", RDF_TYPE, f"{TC}Customer", "CRM_System", 0.99)
+     add(f"{TC}customer/C002", f"{FOAF}name", "Bob Smith", "CRM_System", 0.97)
+     add(f"{TC}customer/C002", f"{TC}email", "bob@example.com", "CRM_System", 0.96)
+     add(f"{TC}customer/C002", f"{TC}tier", "Standard", "CRM_System", 0.92)
+     add(f"{TC}customer/C002", f"{TC}since", "2021-06-20", "CRM_System", 0.99)
+
+     # Customer C003 - Carol White (from legacy system)
+     add(f"{TC}customer/C003", RDF_TYPE, f"{TC}Customer", "Legacy_DB", 0.88)
+     add(f"{TC}customer/C003", f"{FOAF}name", "Carol White", "Legacy_DB", 0.85)
+     add(f"{TC}customer/C003", f"{TC}email", "carol@example.com", "Legacy_DB", 0.80)
+     add(f"{TC}customer/C003", f"{TC}tier", "Enterprise", "Billing_System", 0.95)
+
+     # Products - from Product Catalog (high confidence)
+     add(f"{TC}product/P001", RDF_TYPE, f"{TC}Product", "Product_Catalog", 0.99)
+     add(f"{TC}product/P001", f"{RDFS}label", "CloudSync Pro", "Product_Catalog", 0.99)
+     add(f"{TC}product/P001", f"{TC}category", "Software", "Product_Catalog", 0.98)
+     add(f"{TC}product/P001", f"{TC}price", "299.99", "Product_Catalog", 0.97)
+     # CONFLICT: Sales team has different price!
+     add(f"{TC}product/P001", f"{TC}price", "279.99", "Sales_Team", 0.70)
+
+     add(f"{TC}product/P002", RDF_TYPE, f"{TC}Product", "Product_Catalog", 0.99)
+     add(f"{TC}product/P002", f"{RDFS}label", "DataVault", "Product_Catalog", 0.99)
+     add(f"{TC}product/P002", f"{TC}category", "Storage", "Product_Catalog", 0.98)
+     add(f"{TC}product/P002", f"{TC}price", "499.99", "Product_Catalog", 0.97)
+
+     add(f"{TC}product/P003", RDF_TYPE, f"{TC}Product", "Product_Catalog", 0.99)
+     add(f"{TC}product/P003", f"{RDFS}label", "SecureNet", "Product_Catalog", 0.99)
+     add(f"{TC}product/P003", f"{TC}category", "Security", "Product_Catalog", 0.98)
+     add(f"{TC}product/P003", f"{TC}price", "199.99", "Product_Catalog", 0.97)
+
+     # Support Tickets - from different support channels
+     add(f"{TC}ticket/T001", RDF_TYPE, f"{TC}SupportTicket", "Support_Portal", 0.99)
+     add(f"{TC}ticket/T001", f"{TC}customer", f"{TC}customer/C001", "Support_Portal", 0.99)
+     add(f"{TC}ticket/T001", f"{TC}product", f"{TC}product/P001", "Support_Portal", 0.98)
+     add(f"{TC}ticket/T001", f"{TC}status", "Open", "Support_Portal", 0.95)
+     add(f"{TC}ticket/T001", f"{TC}priority", "High", "Support_Portal", 0.90)
+     add(f"{TC}ticket/T001", f"{TC}description", "Sync failing intermittently", "Support_Portal", 0.99)
+
+     add(f"{TC}ticket/T002", RDF_TYPE, f"{TC}SupportTicket", "Email_Integration", 0.95)
+     add(f"{TC}ticket/T002", f"{TC}customer", f"{TC}customer/C002", "Email_Integration", 0.92)
+     add(f"{TC}ticket/T002", f"{TC}product", f"{TC}product/P002", "Email_Integration", 0.90)
+     add(f"{TC}ticket/T002", f"{TC}status", "Resolved", "Support_Portal", 0.98)
+     add(f"{TC}ticket/T002", f"{TC}priority", "Medium", "Email_Integration", 0.85)
+     add(f"{TC}ticket/T002", f"{TC}description", "Storage quota question", "Email_Integration", 0.88)
+
+     add(f"{TC}ticket/T003", RDF_TYPE, f"{TC}SupportTicket", "Security_Ops", 0.99)
+     add(f"{TC}ticket/T003", f"{TC}customer", f"{TC}customer/C003", "Security_Ops", 0.98)
+     add(f"{TC}ticket/T003", f"{TC}product", f"{TC}product/P003", "Security_Ops", 0.99)
+     add(f"{TC}ticket/T003", f"{TC}status", "Open", "Security_Ops", 0.99)
+     add(f"{TC}ticket/T003", f"{TC}priority", "Critical", "Security_Ops", 0.99)
+     add(f"{TC}ticket/T003", f"{TC}description", "Security alert investigation", "Security_Ops", 0.99)
+
+     return triples
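+
+ # The deliberate conflicts above can be surfaced with a query along these
+ # lines (illustrative sketch):
+ #
+ #     SELECT ?tier WHERE {
+ #         <http://techcorp.com/customer/C001> <http://techcorp.com/tier> ?tier .
+ #     }
+ #
+ # which should return both "Premium" (CRM_System, 0.90) and "Enterprise"
+ # (Billing_System, 0.85).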
+
+
+ def get_knowledge_graph_dataset_triples() -> list[dict]:
+     """Simple Knowledge Graph with people and organizations."""
+     RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+     FOAF = "http://xmlns.com/foaf/0.1/"
+     ORG = "http://www.w3.org/ns/org#"
+     EX = "http://example.org/"
+
+     triples = []
+
+     def add(s, p, o, source, confidence):
+         triples.append({
+             "subject": s,
+             "predicate": p,
+             "object": o,
+             "source": source,
+             "confidence": confidence
+         })
+
+     # People - from LinkedIn (professional network)
+     add(f"{EX}person/jane", RDF_TYPE, f"{FOAF}Person", "LinkedIn", 0.95)
+     add(f"{EX}person/jane", f"{FOAF}name", "Jane Doe", "LinkedIn", 0.98)
+     add(f"{EX}person/jane", f"{FOAF}age", "32", "LinkedIn", 0.75)  # Age less reliable
+     add(f"{EX}person/jane", f"{FOAF}knows", f"{EX}person/john", "LinkedIn", 0.90)
+     add(f"{EX}person/jane", f"{FOAF}knows", f"{EX}person/alice", "LinkedIn", 0.92)
+     add(f"{EX}person/jane", f"{ORG}memberOf", f"{EX}org/acme", "LinkedIn", 0.97)
+
+     add(f"{EX}person/john", RDF_TYPE, f"{FOAF}Person", "LinkedIn", 0.95)
+     add(f"{EX}person/john", f"{FOAF}name", "John Smith", "LinkedIn", 0.97)
+     add(f"{EX}person/john", f"{FOAF}age", "28", "Facebook", 0.70)  # Different source
+     add(f"{EX}person/john", f"{FOAF}knows", f"{EX}person/jane", "LinkedIn", 0.90)
+     add(f"{EX}person/john", f"{ORG}memberOf", f"{EX}org/globex", "CompanyWebsite", 0.99)
+
+     add(f"{EX}person/alice", RDF_TYPE, f"{FOAF}Person", "LinkedIn", 0.96)
+     add(f"{EX}person/alice", f"{FOAF}name", "Alice Chen", "LinkedIn", 0.98)
+     add(f"{EX}person/alice", f"{FOAF}age", "35", "LinkedIn", 0.72)
+     add(f"{EX}person/alice", f"{FOAF}knows", f"{EX}person/jane", "LinkedIn", 0.92)
+     add(f"{EX}person/alice", f"{FOAF}knows", f"{EX}person/bob", "Email_Analysis", 0.65)
+     add(f"{EX}person/alice", f"{ORG}headOf", f"{EX}org/acme", "CompanyWebsite", 0.99)
+
+     add(f"{EX}person/bob", RDF_TYPE, f"{FOAF}Person", "HR_System", 0.98)
+     add(f"{EX}person/bob", f"{FOAF}name", "Bob Williams", "HR_System", 0.99)
+     add(f"{EX}person/bob", f"{FOAF}age", "42", "HR_System", 0.95)
+     add(f"{EX}person/bob", f"{ORG}memberOf", f"{EX}org/initech", "HR_System", 0.98)
+
+     # Organizations - from company registries
+     add(f"{EX}org/acme", RDF_TYPE, f"{ORG}Organization", "SEC_Filings", 0.99)
+     add(f"{EX}org/acme", f"{FOAF}name", "Acme Corp", "SEC_Filings", 0.99)
+     add(f"{EX}org/acme", f"{ORG}hasSite", f"{EX}location/sf", "CompanyWebsite", 0.95)
+     add(f"{EX}org/acme", f"{EX}industry", "Technology", "CrunchBase", 0.88)
+
+     add(f"{EX}org/globex", RDF_TYPE, f"{ORG}Organization", "SEC_Filings", 0.99)
+     add(f"{EX}org/globex", f"{FOAF}name", "Globex Inc", "SEC_Filings", 0.99)
+     add(f"{EX}org/globex", f"{ORG}hasSite", f"{EX}location/nyc", "CompanyWebsite", 0.96)
+     add(f"{EX}org/globex", f"{EX}industry", "Finance", "Bloomberg", 0.95)
+
+     add(f"{EX}org/initech", RDF_TYPE, f"{ORG}Organization", "State_Registry", 0.97)
+     add(f"{EX}org/initech", f"{FOAF}name", "Initech", "State_Registry", 0.98)
+     add(f"{EX}org/initech", f"{ORG}hasSite", f"{EX}location/austin", "GoogleMaps", 0.85)
+     add(f"{EX}org/initech", f"{EX}industry", "Software", "LinkedIn", 0.80)
+
+     # Locations - from geographic databases
+     add(f"{EX}location/sf", RDF_TYPE, f"{EX}Location", "GeoNames", 0.99)
+     add(f"{EX}location/sf", f"{FOAF}name", "San Francisco", "GeoNames", 0.99)
+     add(f"{EX}location/sf", f"{EX}country", "USA", "GeoNames", 0.99)
+
+     add(f"{EX}location/nyc", RDF_TYPE, f"{EX}Location", "GeoNames", 0.99)
+     add(f"{EX}location/nyc", f"{FOAF}name", "New York City", "GeoNames", 0.99)
+     add(f"{EX}location/nyc", f"{EX}country", "USA", "GeoNames", 0.99)
+
+     add(f"{EX}location/austin", RDF_TYPE, f"{EX}Location", "GeoNames", 0.99)
+     add(f"{EX}location/austin", f"{FOAF}name", "Austin", "GeoNames", 0.99)
+     add(f"{EX}location/austin", f"{EX}country", "USA", "GeoNames", 0.99)
+
+     return triples
+
+
+ def get_rdf_star_demo_dataset_triples() -> list[dict]:
+     """RDF-Star Demo showing statement-level metadata."""
+     RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
+     EX = "http://example.org/"
+
+     triples = []
+
+     def add(s, p, o, source, confidence):
+         triples.append({
+             "subject": s,
+             "predicate": p,
+             "object": o,
+             "source": source,
+             "confidence": confidence
+         })
+
+     # Employee data from HR System (high confidence)
+     add(f"{EX}alice", f"{EX}name", "Alice", "HR_System", 0.99)
+     add(f"{EX}alice", f"{EX}worksAt", f"{EX}acme", "HR_System", 0.98)
+     add(f"{EX}alice", f"{EX}department", "Engineering", "HR_System", 0.97)
+     add(f"{EX}alice", f"{EX}startDate", "2020-03-15", "HR_System", 0.99)
+
+     # Salary from Payroll (very high confidence, sensitive)
+     add(f"{EX}alice", f"{EX}salary", "95000", "Payroll_System", 0.999)
+
+     # Manager relationship
+     add(f"{EX}bob", f"{EX}name", "Bob", "HR_System", 0.99)
+     add(f"{EX}bob", f"{EX}worksAt", f"{EX}acme", "HR_System", 0.98)
+     add(f"{EX}bob", f"{EX}manages", f"{EX}alice", "HR_System", 0.95)
+     add(f"{EX}bob", f"{EX}title", "Engineering Manager", "HR_System", 0.97)
+
+     # Company info from different sources
+     add(f"{EX}acme", f"{EX}name", "Acme Corporation", "SEC_Filings", 0.99)
+     add(f"{EX}acme", f"{EX}founded", "2010", "CrunchBase", 0.92)
+     add(f"{EX}acme", f"{EX}headquarters", "San Francisco", "CompanyWebsite", 0.95)
+     add(f"{EX}acme", f"{EX}employees", "500", "LinkedIn", 0.75)  # Less reliable
+     add(f"{EX}acme", f"{EX}employees", "487", "SEC_Filings", 0.98)  # Conflicting!
+
+     # Performance reviews (varying confidence)
+     add(f"{EX}alice", f"{EX}performanceRating", "Exceeds Expectations", "Performance_System", 0.90)
+     add(f"{EX}alice", f"{EX}lastReview", "2024-12-01", "Performance_System", 0.99)
+
+     # Skills from different sources
+     add(f"{EX}alice", f"{EX}skill", "Python", "LinkedIn", 0.85)
+     add(f"{EX}alice", f"{EX}skill", "Machine Learning", "LinkedIn", 0.80)
+     add(f"{EX}alice", f"{EX}skill", "Distributed Systems", "Manager_Assessment", 0.92)
+
+     # Project assignments
+     add(f"{EX}project/alpha", f"{EX}name", "Project Alpha", "Jira", 0.99)
+     add(f"{EX}project/alpha", f"{EX}status", "Active", "Jira", 0.98)
+     add(f"{EX}project/alpha", f"{EX}lead", f"{EX}alice", "Jira", 0.97)
+     add(f"{EX}alice", f"{EX}assignedTo", f"{EX}project/alpha", "Jira", 0.96)
+     add(f"{EX}bob", f"{EX}sponsors", f"{EX}project/alpha", "Executive_Dashboard", 0.88)
+
+     return triples
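+
+ # Statement-level metadata like the conflicting employee counts above is what
+ # RDF-Star annotation syntax targets. A sketch of such a lookup (the
+ # <http://example.org/confidence> predicate is an assumption; the actual
+ # annotation vocabulary depends on how the store models provenance):
+ #
+ #     SELECT ?conf WHERE {
+ #         << <http://example.org/acme> <http://example.org/employees> "500" >>
+ #             <http://example.org/confidence> ?conf .
+ #     }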
+
+
+ # Keep old SPARQL functions for backwards compatibility (unused)
+ def get_movies_dataset() -> str:
+     """Deprecated: Use get_movies_dataset_triples instead."""
+     return ""
+
+ def get_techcorp_dataset() -> str:
+     """Deprecated: Use get_techcorp_dataset_triples instead."""
+     return ""
+
+ def get_knowledge_graph_dataset() -> str:
+     """Deprecated: Use get_knowledge_graph_dataset_triples instead."""
+     return ""
+
+ def get_rdf_star_demo_dataset() -> str:
+     """Deprecated: Use get_rdf_star_demo_dataset_triples instead."""
+     return ""