rdf_starbase-0.1.0-py3-none-any.whl

rdf_starbase/models.py ADDED
@@ -0,0 +1,92 @@
"""
Core data models for RDF-StarBase.

Defines the fundamental structures for RDF-Star triples, quoted triples,
and provenance metadata.
"""

from datetime import datetime, timezone
from typing import Optional, Any, Union
from uuid import uuid4, UUID
from pydantic import BaseModel, Field, ConfigDict


def _utc_now() -> datetime:
    """Get current UTC time in a timezone-aware way."""
    return datetime.now(timezone.utc)


class ProvenanceContext(BaseModel):
    """
    Provenance metadata for an assertion.

    Tracks who made the assertion, when, how, and with what confidence.
    """
    model_config = ConfigDict(frozen=True)

    source: str = Field(description="System, API, or person that asserted this")
    timestamp: datetime = Field(default_factory=_utc_now, description="When the assertion was made")
    confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence score [0,1]")
    process: Optional[str] = Field(default=None, description="Process or method that generated this")
    version: Optional[str] = Field(default=None, description="Version of the asserting system")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional context")

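ProvenanceContext is frozen, so instances are immutable (and hashable) once created. A minimal usage sketch; the source and process names below are made up:

    from rdf_starbase.models import ProvenanceContext

    prov = ProvenanceContext(
        source="crm-sync",         # hypothetical source name
        confidence=0.9,
        process="nightly_import",  # hypothetical process name
    )
    # frozen=True makes the instance immutable:
    # assigning prov.confidence = 1.0 raises a validation error.
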
class Triple(BaseModel):
    """
    A standard RDF triple: subject-predicate-object.

    Forms the foundation of knowledge representation in RDF-StarBase.
    """
    model_config = ConfigDict(frozen=True)

    subject: str = Field(description="Subject URI or blank node")
    predicate: str = Field(description="Predicate URI")
    object: Union[str, int, float, bool] = Field(description="Object URI, literal, or value")
    graph: Optional[str] = Field(default=None, description="Named graph (optional quad)")

    def __str__(self) -> str:
        graph_str = f" [{self.graph}]" if self.graph else ""
        return f"<{self.subject}> <{self.predicate}> {self._format_object()}{graph_str}"

    def _format_object(self) -> str:
        # Heuristic: strings starting with "http" render as URIs,
        # other strings as quoted literals, non-strings as-is.
        if isinstance(self.object, str) and self.object.startswith("http"):
            return f"<{self.object}>"
        elif isinstance(self.object, str):
            return f'"{self.object}"'
        return str(self.object)

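For illustration, the three rendering paths of _format_object (a sketch; the URIs and values are arbitrary):

    alice_knows_bob = Triple(
        subject="http://ex.org/alice",
        predicate="http://xmlns.com/foaf/0.1/knows",
        object="http://ex.org/bob",
    )
    print(alice_knows_bob)
    # <http://ex.org/alice> <http://xmlns.com/foaf/0.1/knows> <http://ex.org/bob>

    print(Triple(subject="http://ex.org/alice", predicate="http://ex.org/name", object="Alice"))
    # <http://ex.org/alice> <http://ex.org/name> "Alice"

    print(Triple(subject="http://ex.org/alice", predicate="http://ex.org/age", object=42))
    # <http://ex.org/alice> <http://ex.org/age> 42
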
class QuotedTriple(BaseModel):
    """
    An RDF-Star quoted triple: a triple that can be used as a subject or object.

    This is the key innovation of RDF-Star: making statements about statements.
    """
    model_config = ConfigDict(frozen=True)

    id: UUID = Field(default_factory=uuid4, description="Unique identifier for this quoted triple")
    triple: Triple = Field(description="The quoted triple")
    provenance: ProvenanceContext = Field(description="Provenance of this assertion")

    def __str__(self) -> str:
        return f"<<{self.triple}>> [from: {self.provenance.source}]"

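Quoting the triple from the previous sketch, reusing the same illustrative names:

    qt = QuotedTriple(triple=alice_knows_bob, provenance=prov)
    print(qt)
    # <<<http://ex.org/alice> <http://xmlns.com/foaf/0.1/knows> <http://ex.org/bob>>> [from: crm-sync]
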
class Assertion(BaseModel):
    """
    A complete assertion: triple + provenance.

    This is the atomic unit of knowledge in RDF-StarBase.
    """
    model_config = ConfigDict(frozen=True)

    id: UUID = Field(default_factory=uuid4, description="Unique assertion ID")
    triple: Triple = Field(description="The asserted triple")
    provenance: ProvenanceContext = Field(description="Who/when/how this was asserted")
    superseded_by: Optional[UUID] = Field(default=None, description="ID of assertion that supersedes this")
    deprecated: bool = Field(default=False, description="Whether this assertion is deprecated")

    def __str__(self) -> str:
        status = " [DEPRECATED]" if self.deprecated else ""
        return f"{self.triple} (by {self.provenance.source} at {self.provenance.timestamp}){status}"
@@ -0,0 +1,540 @@
"""
Assertion Registry for RDF-StarBase.

Tracks datasets, APIs, mappings, and materialization runs as first-class
entities with their own provenance. This enables answering questions like:
- Which datasets contributed to this assertion?
- When was this API last synced?
- What mappings transformed this data?
"""

from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Optional, Any
from uuid import UUID, uuid4
import json

import polars as pl

from rdf_starbase.models import ProvenanceContext


class SourceType(str, Enum):
    """Types of data sources in the registry."""
    DATASET = "dataset"
    API = "api"
    MAPPING = "mapping"
    PROCESS = "process"
    MANUAL = "manual"


class SourceStatus(str, Enum):
    """Status of a registered source."""
    ACTIVE = "active"
    DEPRECATED = "deprecated"
    ERROR = "error"
    SYNCING = "syncing"

@dataclass
class RegisteredSource:
    """
    A data source registered in the Assertion Registry.

    Represents datasets, APIs, mappings, or processes that contribute
    assertions to the knowledge graph.
    """
    id: UUID
    name: str
    source_type: SourceType
    uri: Optional[str] = None
    description: Optional[str] = None
    status: SourceStatus = SourceStatus.ACTIVE
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    last_sync: Optional[datetime] = None
    sync_frequency: Optional[str] = None  # e.g., "daily", "hourly", "manual"
    owner: Optional[str] = None
    schema_uri: Optional[str] = None
    config: dict[str, Any] = field(default_factory=dict)
    tags: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for storage."""
        return {
            "id": str(self.id),
            "name": self.name,
            "source_type": self.source_type.value,
            "uri": self.uri,
            "description": self.description,
            "status": self.status.value,
            "created_at": self.created_at,
            "last_sync": self.last_sync,
            "sync_frequency": self.sync_frequency,
            "owner": self.owner,
            "schema_uri": self.schema_uri,
            "config": json.dumps(self.config),
            "tags": json.dumps(self.tags),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "RegisteredSource":
        """Create from dictionary."""
        return cls(
            id=UUID(data["id"]),
            name=data["name"],
            source_type=SourceType(data["source_type"]),
            uri=data.get("uri"),
            description=data.get("description"),
            status=SourceStatus(data.get("status", "active")),
            created_at=data.get("created_at", datetime.now(timezone.utc)),
            last_sync=data.get("last_sync"),
            sync_frequency=data.get("sync_frequency"),
            owner=data.get("owner"),
            schema_uri=data.get("schema_uri"),
            config=json.loads(data.get("config", "{}")),
            tags=json.loads(data.get("tags", "[]")),
        )

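to_dict flattens the nested fields (config, tags) to JSON strings so every value fits a flat column in the backing DataFrame; from_dict reverses that. A quick round trip, with made-up values:

    src = RegisteredSource(
        id=uuid4(),
        name="hr-exports",               # hypothetical dataset name
        source_type=SourceType.DATASET,
        tags=["hr", "pii"],
    )
    row = src.to_dict()                  # row["tags"] == '["hr", "pii"]'
    assert RegisteredSource.from_dict(row).tags == ["hr", "pii"]
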
@dataclass
class SyncRun:
    """
    A record of a synchronization run from a source.

    Tracks when data was pulled, how many assertions were created,
    and any errors encountered.
    """
    id: UUID
    source_id: UUID
    started_at: datetime
    completed_at: Optional[datetime] = None
    status: str = "running"  # running, success, failed, partial
    assertions_created: int = 0
    assertions_updated: int = 0
    assertions_deprecated: int = 0
    errors: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for storage."""
        return {
            "id": str(self.id),
            "source_id": str(self.source_id),
            "started_at": self.started_at,
            "completed_at": self.completed_at,
            "status": self.status,
            "assertions_created": self.assertions_created,
            "assertions_updated": self.assertions_updated,
            "assertions_deprecated": self.assertions_deprecated,
            "errors": json.dumps(self.errors),
            "metadata": json.dumps(self.metadata),
        }

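As with RegisteredSource, the list and dict fields are JSON-encoded for columnar storage. A failed run might serialize like this (illustrative values):

    run = SyncRun(
        id=uuid4(),
        source_id=uuid4(),
        started_at=datetime.now(timezone.utc),
        status="failed",
        errors=["timeout after 30s"],    # hypothetical error message
    )
    run.to_dict()["errors"]              # '["timeout after 30s"]' -- a plain string column value
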
class AssertionRegistry:
    """
    Registry for tracking data sources and their sync history.

    The Assertion Registry answers critical governance questions:
    - Where did this data come from?
    - When was it last updated?
    - Which systems contribute to this knowledge graph?
    - What's the lineage of this assertion?

    Example:
        >>> registry = AssertionRegistry()
        >>>
        >>> # Register a CRM API
        >>> crm = registry.register_source(
        ...     name="Salesforce CRM",
        ...     source_type=SourceType.API,
        ...     uri="https://api.salesforce.com/v52",
        ...     owner="sales-team",
        ...     sync_frequency="hourly"
        ... )
        >>>
        >>> # Start a sync run
        >>> run = registry.start_sync(crm.id)
        >>> # ... perform sync ...
        >>> registry.complete_sync(run.id, assertions_created=150)
        >>>
        >>> # Query sources
        >>> apis = registry.get_sources(source_type=SourceType.API)
    """

    def __init__(self):
        """Initialize an empty registry."""
        self._sources_df = pl.DataFrame({
            "id": pl.Series([], dtype=pl.Utf8),
            "name": pl.Series([], dtype=pl.Utf8),
            "source_type": pl.Series([], dtype=pl.Utf8),
            "uri": pl.Series([], dtype=pl.Utf8),
            "description": pl.Series([], dtype=pl.Utf8),
            "status": pl.Series([], dtype=pl.Utf8),
            "created_at": pl.Series([], dtype=pl.Datetime("us", "UTC")),
            "last_sync": pl.Series([], dtype=pl.Datetime("us", "UTC")),
            "sync_frequency": pl.Series([], dtype=pl.Utf8),
            "owner": pl.Series([], dtype=pl.Utf8),
            "schema_uri": pl.Series([], dtype=pl.Utf8),
            "config": pl.Series([], dtype=pl.Utf8),
            "tags": pl.Series([], dtype=pl.Utf8),
        })

        self._syncs_df = pl.DataFrame({
            "id": pl.Series([], dtype=pl.Utf8),
            "source_id": pl.Series([], dtype=pl.Utf8),
            "started_at": pl.Series([], dtype=pl.Datetime("us", "UTC")),
            "completed_at": pl.Series([], dtype=pl.Datetime("us", "UTC")),
            "status": pl.Series([], dtype=pl.Utf8),
            "assertions_created": pl.Series([], dtype=pl.Int64),
            "assertions_updated": pl.Series([], dtype=pl.Int64),
            "assertions_deprecated": pl.Series([], dtype=pl.Int64),
            "errors": pl.Series([], dtype=pl.Utf8),
            "metadata": pl.Series([], dtype=pl.Utf8),
        })

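The empty Series carry explicit dtypes so both frames start with a fixed schema; rows appended later via pl.concat then have concrete column types to conform to. For example (assuming the code above):

    registry = AssertionRegistry()
    registry._sources_df.schema["created_at"]  # Datetime(time_unit='us', time_zone='UTC')
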
    def register_source(
        self,
        name: str,
        source_type: SourceType,
        uri: Optional[str] = None,
        description: Optional[str] = None,
        owner: Optional[str] = None,
        sync_frequency: Optional[str] = None,
        schema_uri: Optional[str] = None,
        config: Optional[dict[str, Any]] = None,
        tags: Optional[list[str]] = None,
    ) -> RegisteredSource:
        """
        Register a new data source in the registry.

        Args:
            name: Human-readable name for the source
            source_type: Type of source (dataset, api, mapping, process)
            uri: URI or connection string for the source
            description: Optional description
            owner: Team or person responsible for this source
            sync_frequency: How often this source syncs (daily, hourly, etc.)
            schema_uri: URI to schema definition
            config: Additional configuration
            tags: Tags for categorization

        Returns:
            The registered source with assigned ID
        """
        source = RegisteredSource(
            id=uuid4(),
            name=name,
            source_type=source_type,
            uri=uri,
            description=description,
            owner=owner,
            sync_frequency=sync_frequency,
            schema_uri=schema_uri,
            config=config or {},
            tags=tags or [],
        )

        new_row = pl.DataFrame([source.to_dict()])
        self._sources_df = pl.concat([self._sources_df, new_row], how="vertical")

        return source

    def get_source(self, source_id: UUID) -> Optional[RegisteredSource]:
        """Get a source by ID."""
        filtered = self._sources_df.filter(pl.col("id") == str(source_id))
        if len(filtered) == 0:
            return None
        return RegisteredSource.from_dict(filtered.row(0, named=True))

    def get_source_by_name(self, name: str) -> Optional[RegisteredSource]:
        """Get a source by name."""
        filtered = self._sources_df.filter(pl.col("name") == name)
        if len(filtered) == 0:
            return None
        return RegisteredSource.from_dict(filtered.row(0, named=True))

    def get_sources(
        self,
        source_type: Optional[SourceType] = None,
        status: Optional[SourceStatus] = None,
        owner: Optional[str] = None,
        tag: Optional[str] = None,
    ) -> list[RegisteredSource]:
        """
        Query registered sources with optional filters.

        Args:
            source_type: Filter by source type
            status: Filter by status
            owner: Filter by owner
            tag: Filter by tag (sources containing this tag)

        Returns:
            List of matching sources
        """
        df = self._sources_df

        if source_type is not None:
            df = df.filter(pl.col("source_type") == source_type.value)

        if status is not None:
            df = df.filter(pl.col("status") == status.value)

        if owner is not None:
            df = df.filter(pl.col("owner") == owner)

        if tag is not None:
            # Tags are stored as a JSON array string, so match the quoted
            # value literally rather than as a regex.
            df = df.filter(pl.col("tags").str.contains(f'"{tag}"', literal=True))

        return [RegisteredSource.from_dict(row) for row in df.iter_rows(named=True)]

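Because tags live in the frame as a JSON array string, the tag filter is a substring match on the quoted value. Continuing with the made-up source from earlier:

    registry.register_source(name="hr-exports", source_type=SourceType.DATASET, tags=["hr", "pii"])
    pii_sources = registry.get_sources(tag="pii")  # matches '"pii"' inside '["hr", "pii"]'
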
    def update_source_status(
        self,
        source_id: UUID,
        status: SourceStatus,
    ) -> None:
        """Update the status of a source."""
        self._sources_df = self._sources_df.with_columns(
            pl.when(pl.col("id") == str(source_id))
            .then(pl.lit(status.value))
            .otherwise(pl.col("status"))
            .alias("status")
        )

    def deprecate_source(self, source_id: UUID) -> None:
        """Mark a source as deprecated."""
        self.update_source_status(source_id, SourceStatus.DEPRECATED)

    def start_sync(
        self,
        source_id: UUID,
        metadata: Optional[dict[str, Any]] = None,
    ) -> SyncRun:
        """
        Start a new synchronization run for a source.

        Args:
            source_id: ID of the source being synced
            metadata: Optional metadata about the sync

        Returns:
            The created sync run
        """
        run = SyncRun(
            id=uuid4(),
            source_id=source_id,
            started_at=datetime.now(timezone.utc),
            metadata=metadata or {},
        )

        new_row = pl.DataFrame([run.to_dict()])
        self._syncs_df = pl.concat([self._syncs_df, new_row], how="vertical")

        # Update source status
        self.update_source_status(source_id, SourceStatus.SYNCING)

        return run

    def complete_sync(
        self,
        run_id: UUID,
        assertions_created: int = 0,
        assertions_updated: int = 0,
        assertions_deprecated: int = 0,
        errors: Optional[list[str]] = None,
        status: str = "success",
    ) -> None:
        """
        Complete a synchronization run.

        Args:
            run_id: ID of the sync run
            assertions_created: Number of new assertions
            assertions_updated: Number of updated assertions
            assertions_deprecated: Number of deprecated assertions
            errors: Any errors encountered
            status: Final status (success, failed, partial)
        """
        now = datetime.now(timezone.utc)
        run_id_str = str(run_id)

        # Get source_id before updating
        run_row = self._syncs_df.filter(pl.col("id") == run_id_str)
        if len(run_row) == 0:
            raise ValueError(f"Sync run {run_id} not found")

        source_id = run_row["source_id"][0]

        # Update sync run
        self._syncs_df = self._syncs_df.with_columns([
            pl.when(pl.col("id") == run_id_str)
            .then(pl.lit(now))
            .otherwise(pl.col("completed_at"))
            .alias("completed_at"),

            pl.when(pl.col("id") == run_id_str)
            .then(pl.lit(status))
            .otherwise(pl.col("status"))
            .alias("status"),

            pl.when(pl.col("id") == run_id_str)
            .then(pl.lit(assertions_created))
            .otherwise(pl.col("assertions_created"))
            .alias("assertions_created"),

            pl.when(pl.col("id") == run_id_str)
            .then(pl.lit(assertions_updated))
            .otherwise(pl.col("assertions_updated"))
            .alias("assertions_updated"),

            pl.when(pl.col("id") == run_id_str)
            .then(pl.lit(assertions_deprecated))
            .otherwise(pl.col("assertions_deprecated"))
            .alias("assertions_deprecated"),

            pl.when(pl.col("id") == run_id_str)
            .then(pl.lit(json.dumps(errors or [])))
            .otherwise(pl.col("errors"))
            .alias("errors"),
        ])

        # Update source last_sync and status
        final_status = SourceStatus.ERROR if status == "failed" else SourceStatus.ACTIVE

        self._sources_df = self._sources_df.with_columns([
            pl.when(pl.col("id") == source_id)
            .then(pl.lit(now))
            .otherwise(pl.col("last_sync"))
            .alias("last_sync"),

            pl.when(pl.col("id") == source_id)
            .then(pl.lit(final_status.value))
            .otherwise(pl.col("status"))
            .alias("status"),
        ])

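Each column update above is the same when/then/otherwise idiom: Polars frames are immutable, so a conditional per-row update is expressed as a full-column expression that rewrites matching rows and passes the rest through. The pattern in isolation, as a standalone sketch:

    import polars as pl

    df = pl.DataFrame({"id": ["a", "b"], "status": ["running", "running"]})
    df = df.with_columns(
        pl.when(pl.col("id") == "a")
        .then(pl.lit("success"))
        .otherwise(pl.col("status"))   # non-matching rows keep their value
        .alias("status")
    )
    # id "a" is now "success"; id "b" is untouched
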
    def get_sync_history(
        self,
        source_id: Optional[UUID] = None,
        limit: int = 100,
    ) -> pl.DataFrame:
        """
        Get synchronization history.

        Args:
            source_id: Filter by source (None for all)
            limit: Maximum number of records

        Returns:
            DataFrame with sync history
        """
        df = self._syncs_df

        if source_id is not None:
            df = df.filter(pl.col("source_id") == str(source_id))

        return df.sort("started_at", descending=True).head(limit)

    def get_last_sync(self, source_id: UUID) -> Optional[SyncRun]:
        """Get the most recent sync run for a source."""
        history = self.get_sync_history(source_id, limit=1)
        if len(history) == 0:
            return None

        row = history.row(0, named=True)
        return SyncRun(
            id=UUID(row["id"]),
            source_id=UUID(row["source_id"]),
            started_at=row["started_at"],
            completed_at=row["completed_at"],
            status=row["status"],
            assertions_created=row["assertions_created"],
            assertions_updated=row["assertions_updated"],
            assertions_deprecated=row["assertions_deprecated"],
            errors=json.loads(row["errors"]) if row["errors"] else [],
            metadata=json.loads(row["metadata"]) if row["metadata"] else {},
        )

    def create_provenance_context(
        self,
        source_id: UUID,
        confidence: float = 1.0,
        process: Optional[str] = None,
    ) -> ProvenanceContext:
        """
        Create a ProvenanceContext linked to a registered source.

        This bridges the registry with the triple store, allowing
        assertions to reference their source.

        Args:
            source_id: ID of the registered source
            confidence: Confidence level for assertions
            process: Optional process name

        Returns:
            ProvenanceContext that can be used with TripleStore.add_triple
        """
        source = self.get_source(source_id)
        if source is None:
            raise ValueError(f"Source {source_id} not found")

        return ProvenanceContext(
            source=source.name,
            confidence=confidence,
            process=process or f"{source.source_type.value}_sync",
            metadata={"source_id": str(source_id), "source_uri": source.uri},
        )

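Continuing the class docstring's example, the bridge might be used like this (TripleStore itself is referenced in the docstring but not part of this diff):

    prov = registry.create_provenance_context(crm.id, confidence=0.95)
    prov.source                 # "Salesforce CRM"
    prov.metadata["source_id"]  # links any assertion carrying this context back to the registry
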
    def get_stats(self) -> dict[str, Any]:
        """Get registry statistics."""
        sources_by_type = (
            self._sources_df
            .group_by("source_type")
            .agg(pl.len().alias("count"))
        )

        sources_by_status = (
            self._sources_df
            .group_by("status")
            .agg(pl.len().alias("count"))
        )

        total_syncs = len(self._syncs_df)
        successful_syncs = len(self._syncs_df.filter(pl.col("status") == "success"))

        return {
            "total_sources": len(self._sources_df),
            "sources_by_type": {
                row["source_type"]: row["count"]
                for row in sources_by_type.iter_rows(named=True)
            },
            "sources_by_status": {
                row["status"]: row["count"]
                for row in sources_by_status.iter_rows(named=True)
            },
            "total_sync_runs": total_syncs,
            "successful_sync_runs": successful_syncs,
            "sync_success_rate": successful_syncs / total_syncs if total_syncs > 0 else 0,
        }

    def save(self, path: str) -> None:
        """
        Save registry to Parquet files.

        Creates two files:
        - {path}_sources.parquet
        - {path}_syncs.parquet
        """
        self._sources_df.write_parquet(f"{path}_sources.parquet")
        self._syncs_df.write_parquet(f"{path}_syncs.parquet")

    @classmethod
    def load(cls, path: str) -> "AssertionRegistry":
        """Load registry from Parquet files."""
        registry = cls()
        registry._sources_df = pl.read_parquet(f"{path}_sources.parquet")
        registry._syncs_df = pl.read_parquet(f"{path}_syncs.parquet")
        return registry
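A save/load round trip, assuming a writable path:

    registry.save("/tmp/registry")  # writes /tmp/registry_sources.parquet and /tmp/registry_syncs.parquet
    restored = AssertionRegistry.load("/tmp/registry")
    assert restored.get_stats()["total_sources"] == registry.get_stats()["total_sources"]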