sutradb 0.3.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sutradb/__init__.py +5 -0
- sutradb/_version.py +1 -0
- sutradb/client.py +285 -0
- sutradb/jupyter.py +124 -0
- sutradb/langchain.py +150 -0
- sutradb/owl.py +304 -0
- sutradb-0.3.2.dist-info/METADATA +7 -0
- sutradb-0.3.2.dist-info/RECORD +9 -0
- sutradb-0.3.2.dist-info/WHEEL +5 -0
sutradb/__init__.py
ADDED
sutradb/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.2"
|
sutradb/client.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"""SutraDB Python client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SutraError(Exception):
    """Exception raised by the SutraDB client on request failures.

    Attributes:
        status_code: HTTP status code of the failed response, or ``None``
            when the failure happened before any response was received
            (e.g. a connection error).
    """

    def __init__(self, message: str, status_code: int | None = None) -> None:
        # Record the status first, then hand the message to Exception so
        # str(exc) reproduces it unchanged.
        self.status_code = status_code
        super().__init__(message)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SutraClient:
    """Client for interacting with a SutraDB server.

    Args:
        endpoint: Base URL of the SutraDB HTTP server.
            Defaults to ``http://localhost:3030``.
        owl_validation: Enable client-side OWL constraint validation.
            When True (default), inserts are checked against OWL axioms
            stored in the database before being sent. Raises OWLViolation
            on constraint violations. The database itself always accepts
            all triples regardless of this setting.
    """

    # Sent with every request. Keep the version in sync with _version.py
    # (bug fix: this previously advertised 0.1.0 while the package is 0.3.2).
    _USER_AGENT = "sutradb-python/0.3.2"

    def __init__(
        self,
        endpoint: str = "http://localhost:3030",
        owl_validation: bool = True,
    ) -> None:
        self.endpoint = endpoint.rstrip("/")
        self._session = requests.Session()
        self._session.headers.update({"User-Agent": self._USER_AGENT})
        self._owl_validation = owl_validation
        # Lazily created by _ensure_owl_loaded on the first validated insert.
        self._owl_validator = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _url(self, path: str) -> str:
        """Join *path* onto the (already slash-stripped) endpoint."""
        return f"{self.endpoint}{path}"

    def _request(
        self,
        method: str,
        path: str,
        *,
        params: dict[str, Any] | None = None,
        json: Any | None = None,
        data: str | None = None,
        headers: dict[str, str] | None = None,
    ) -> requests.Response:
        """Send an HTTP request and raise :class:`SutraError` on failure.

        Connection-level problems are wrapped in a SutraError without a
        status code; HTTP error responses carry ``status_code``.
        """
        try:
            resp = self._session.request(
                method,
                self._url(path),
                params=params,
                json=json,
                data=data,
                headers=headers,
            )
        except requests.RequestException as exc:
            raise SutraError(f"Connection error: {exc}") from exc

        if not resp.ok:
            raise SutraError(
                f"HTTP {resp.status_code}: {resp.text}",
                status_code=resp.status_code,
            )
        return resp

    # ------------------------------------------------------------------
    # OWL validation
    # ------------------------------------------------------------------

    def _ensure_owl_loaded(self) -> None:
        """Lazy-load OWL ontology from the database on first validation."""
        if self._owl_validator is not None:
            return
        try:
            from .owl import OWLValidator

            self._owl_validator = OWLValidator()
            self._owl_validator.load_from_client(self)
        except Exception:
            # Deliberate best-effort: if the ontology can't be loaded
            # (server down, no OWL data), skip validation silently rather
            # than blocking inserts.
            self._owl_validator = None

    def reload_owl(self) -> None:
        """Force reload of OWL ontology from the database."""
        self._owl_validator = None
        self._ensure_owl_loaded()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def health(self) -> bool:
        """Check whether the server is reachable.

        Returns:
            ``True`` if the server responds to ``GET /health`` with a 2xx
            status code, ``False`` otherwise.
        """
        try:
            self._request("GET", "/health")
            return True
        except SutraError:
            return False

    def sparql(self, query: str) -> dict:
        """Execute a SPARQL query and return the parsed JSON result.

        Args:
            query: A SPARQL 1.1 query string.

        Returns:
            The JSON response body as a Python dict (SPARQL JSON Results
            format for SELECT/ASK, or a status dict for UPDATE).

        Raises:
            SutraError: If the server returns a non-2xx status code.

        NOTE(review): the query is sent via GET query-string, so very long
        queries may exceed proxy/server URL limits — confirm whether the
        server also accepts POST before switching.
        """
        resp = self._request(
            "GET",
            "/sparql",
            params={"query": query},
            headers={"Accept": "application/sparql-results+json"},
        )
        return resp.json()

    def insert_triples(
        self, ntriples: str, batch_size: int = 5000
    ) -> dict[str, Any]:
        """Insert triples in N-Triples format, optionally in batches.

        Args:
            ntriples: One or more triples in N-Triples syntax (one per line).
            batch_size: Maximum number of triples to send per HTTP request.

        Returns:
            A dict ``{"inserted": int, "errors": list[str]}`` summarising the
            outcome across all batches.

        Raises:
            OWLViolation: If client-side validation is enabled and a triple
                violates a loaded OWL constraint (only the first violation
                is raised; nothing is sent in that case).
        """
        # OWL validation (client-side, before sending to database)
        if self._owl_validation:
            self._ensure_owl_loaded()
            if self._owl_validator and self._owl_validator.has_constraints():
                from .owl import OWLViolation

                violations = self._owl_validator.validate_ntriples(ntriples)
                if violations:
                    raise violations[0]  # Raise first violation

        # Drop blank lines so batch boundaries count real triples only.
        lines = [line for line in ntriples.splitlines() if line.strip()]

        total_inserted = 0
        errors: list[str] = []

        for start in range(0, len(lines), batch_size):
            batch = "\n".join(lines[start : start + batch_size])
            try:
                resp = self._request(
                    "POST",
                    "/triples",
                    data=batch,
                    headers={"Content-Type": "application/n-triples"},
                )
                body = resp.json()
                total_inserted += body.get("inserted", 0)
                batch_errors = body.get("errors", [])
                if batch_errors:
                    errors.extend(batch_errors)
            except SutraError as exc:
                # A failed batch is recorded but later batches still run.
                errors.append(str(exc))

        return {"inserted": total_inserted, "errors": errors}

    def declare_vector(
        self,
        predicate: str,
        dimensions: int,
        m: int = 16,
        ef_construction: int = 200,
        metric: str = "cosine",
    ) -> dict:
        """Declare an HNSW-indexed vector predicate.

        Args:
            predicate: The IRI of the vector predicate (e.g.
                ``"http://example.org/hasEmbedding"``).
            dimensions: The fixed dimensionality of vectors for this predicate.
            m: HNSW ``M`` parameter (max connections per node per layer).
            ef_construction: HNSW ``ef_construction`` beam width.
            metric: Distance metric (``"cosine"``, ``"euclidean"``, or
                ``"dot"``).

        Returns:
            The server response as a dict, typically containing ``status`` and
            ``predicate_id`` keys.

        Raises:
            SutraError: If the server rejects the declaration.
        """
        resp = self._request(
            "POST",
            "/vectors/declare",
            json={
                "predicate": predicate,
                "dimensions": dimensions,
                "m": m,
                "ef_construction": ef_construction,
                "metric": metric,
            },
        )
        return resp.json()

    def insert_vector(
        self, predicate: str, subject: str, vector: list[float]
    ) -> dict:
        """Insert a single vector embedding.

        Args:
            predicate: The IRI of the vector predicate.
            subject: The IRI of the subject node.
            vector: The embedding as a list of floats.

        Returns:
            The server response as a dict, typically containing ``status`` and
            ``triple_id`` keys.

        Raises:
            SutraError: If the server rejects the insert.
        """
        resp = self._request(
            "POST",
            "/vectors",
            json={
                "predicate": predicate,
                "subject": subject,
                "vector": vector,
            },
        )
        return resp.json()

    def insert_vectors_batch(
        self,
        predicate: str,
        entries: list[tuple[str, list[float]]],
        batch_size: int = 100,
    ) -> dict[str, Any]:
        """Insert multiple vectors.

        Args:
            predicate: The IRI of the vector predicate.
            entries: A list of ``(subject_iri, vector)`` tuples.
            batch_size: Retained for backward compatibility; it has no
                effect because each vector is sent as its own HTTP request
                (the server exposes no bulk-vector endpoint here).

        Returns:
            A dict ``{"inserted": int, "errors": list[str]}`` summarising
            the outcome.
        """
        total_inserted = 0
        errors: list[str] = []

        # The previous implementation sliced `entries` into batch_size
        # chunks and then still posted each vector individually, so the
        # chunking had no observable effect; iterate directly instead.
        for subject, vector in entries:
            try:
                result = self.insert_vector(predicate, subject, vector)
                if result.get("status") == "ok":
                    total_inserted += 1
            except SutraError as exc:
                errors.append(f"{subject}: {exc}")

        return {"inserted": total_inserted, "errors": errors}
|
sutradb/jupyter.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Jupyter integration for SutraDB.
|
|
2
|
+
|
|
3
|
+
Provides %%sparql cell magic for executing SPARQL queries inline in
|
|
4
|
+
Jupyter notebooks with tabular result display.
|
|
5
|
+
|
|
6
|
+
Usage in a Jupyter notebook:
|
|
7
|
+
|
|
8
|
+
# First, load the extension
|
|
9
|
+
%load_ext sutradb.jupyter
|
|
10
|
+
|
|
11
|
+
# Then use the magic
|
|
12
|
+
%%sparql
|
|
13
|
+
SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10
|
|
14
|
+
|
|
15
|
+
# Or with a custom endpoint
|
|
16
|
+
%%sparql http://localhost:8080
|
|
17
|
+
SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from IPython.core.magic import register_cell_magic, needs_local_scope
|
|
23
|
+
import requests
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
_default_endpoint = "http://localhost:3030"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _shorten(iri: str) -> str:
|
|
30
|
+
"""Shorten an IRI for display."""
|
|
31
|
+
prefixes = {
|
|
32
|
+
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
|
|
33
|
+
"http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
|
|
34
|
+
"http://www.w3.org/2002/07/owl#": "owl:",
|
|
35
|
+
"http://www.w3.org/2001/XMLSchema#": "xsd:",
|
|
36
|
+
"http://www.wikidata.org/entity/": "wd:",
|
|
37
|
+
"http://www.wikidata.org/prop/direct/": "wdt:",
|
|
38
|
+
"http://sutra.dev/": "sutra:",
|
|
39
|
+
"http://schema.org/": "schema:",
|
|
40
|
+
}
|
|
41
|
+
for full, short in prefixes.items():
|
|
42
|
+
if iri.startswith(full):
|
|
43
|
+
return short + iri[len(full):]
|
|
44
|
+
return iri
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@register_cell_magic
def sparql(line, cell):
    """Execute a SPARQL query against SutraDB.

    The magic line may carry an alternate endpoint; the cell body is the
    query, POSTed as-is to ``{endpoint}/sparql``. Results are rendered as
    a pandas DataFrame when pandas is installed, otherwise as a plain
    text table. All errors are printed, never raised (notebook-friendly).

    Usage:
        %%sparql [endpoint]
        SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10
    """
    # An empty magic line falls back to the module-wide default endpoint.
    endpoint = line.strip() if line.strip() else _default_endpoint

    try:
        # NOTE(review): this magic POSTs the raw query while
        # SutraClient.sparql uses GET — presumably the server accepts
        # both; confirm.
        resp = requests.post(
            f"{endpoint}/sparql",
            data=cell,
            headers={"Accept": "application/sparql-results+json"},
            timeout=30,
        )
        if resp.status_code != 200:
            print(f"Error: HTTP {resp.status_code}")
            print(resp.text)
            return

        # SPARQL JSON Results format: head.vars lists columns,
        # results.bindings holds one dict per row.
        data = resp.json()
        columns = data.get("head", {}).get("vars", [])
        bindings = data.get("results", {}).get("bindings", [])

        if not bindings:
            print("No results.")
            return

        # Try to use pandas for nice display
        try:
            import pandas as pd

            rows = []
            for b in bindings:
                row = {}
                for col in columns:
                    # Unbound variables render as empty strings.
                    val = b.get(col, {}).get("value", "")
                    row[col] = _shorten(val)
                rows.append(row)
            df = pd.DataFrame(rows, columns=columns)
            from IPython.display import display

            display(df)
        except ImportError:
            # Fallback: plain text table
            # Column widths: at least the header (min 10), grown from the
            # first 20 rows only (cheap sample), capped at 50 chars.
            widths = {c: max(len(c), 10) for c in columns}
            for b in bindings[:20]:
                for c in columns:
                    v = _shorten(b.get(c, {}).get("value", ""))
                    widths[c] = max(widths[c], min(len(v), 50))

            header = " | ".join(c.ljust(widths[c]) for c in columns)
            separator = "-+-".join("-" * widths[c] for c in columns)
            print(header)
            print(separator)
            for b in bindings:
                # Values are truncated to the column width computed above.
                row = " | ".join(
                    _shorten(b.get(c, {}).get("value", ""))[:widths[c]].ljust(
                        widths[c]
                    )
                    for c in columns
                )
                print(row)
            print(f"\n{len(bindings)} rows")

    except requests.ConnectionError:
        print(f"Error: Could not connect to SutraDB at {endpoint}")
    except Exception as e:
        # Notebook context: report anything unexpected instead of raising.
        print(f"Error: {e}")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def load_ipython_extension(ipython):
    """Entry point invoked by ``%load_ext sutradb.jupyter``.

    Nothing to do here: the ``@register_cell_magic`` decorator already
    registered the ``%%sparql`` magic when this module was imported.
    """
|
sutradb/langchain.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""LangChain integration for SutraDB.
|
|
2
|
+
|
|
3
|
+
Provides SutraDB as both a VectorStore and a knowledge graph for
|
|
4
|
+
Retrieval-Augmented Generation (RAG) pipelines.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from sutradb.langchain import SutraVectorStore
|
|
8
|
+
|
|
9
|
+
vectorstore = SutraVectorStore(
|
|
10
|
+
endpoint="http://localhost:3030",
|
|
11
|
+
predicate="http://sutra.dev/hasEmbedding",
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Use with LangChain
|
|
15
|
+
retriever = vectorstore.as_retriever()
|
|
16
|
+
docs = retriever.get_relevant_documents("What is a transformer?")
|
|
17
|
+
|
|
18
|
+
Requires: pip install langchain-core
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from typing import Any, Iterable, Optional
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
from langchain_core.documents import Document
|
|
27
|
+
from langchain_core.vectorstores import VectorStore
|
|
28
|
+
from langchain_core.embeddings import Embeddings
|
|
29
|
+
except ImportError:
|
|
30
|
+
raise ImportError(
|
|
31
|
+
"langchain-core is required for LangChain integration. "
|
|
32
|
+
"Install it with: pip install langchain-core"
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
from .client import SutraClient
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SutraVectorStore(VectorStore):
    """LangChain VectorStore backed by SutraDB.

    Uses SutraDB's HNSW vector index for similarity search and
    the RDF triple store for metadata/knowledge graph queries.
    """

    def __init__(
        self,
        endpoint: str = "http://localhost:3030",
        predicate: str = "http://sutra.dev/hasEmbedding",
        embedding: Optional[Embeddings] = None,
        dimensions: int = 1024,
        **kwargs: Any,
    ):
        """Connect to SutraDB and ensure the vector predicate exists.

        Args:
            endpoint: Base URL of the SutraDB HTTP server.
            predicate: IRI under which document embeddings are stored.
            embedding: LangChain embeddings model (required for
                ``add_texts`` / ``similarity_search``).
            dimensions: Dimensionality used when declaring the predicate.
        """
        # OWL validation disabled: this store writes generated doc triples,
        # not ontology-governed data.
        self._client = SutraClient(endpoint, owl_validation=False)
        self._predicate = predicate
        self._embedding = embedding
        self._dimensions = dimensions

        # Ensure vector predicate is declared
        try:
            self._client.declare_vector(predicate, dimensions)
        except Exception:
            pass  # May already exist

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """The embeddings model used to encode texts/queries (may be None)."""
        return self._embedding

    @staticmethod
    def _escape_literal(value: str) -> str:
        """Escape a string for an N-Triples double-quoted literal.

        Backslash MUST be escaped first; otherwise the backslashes
        introduced for quotes/newlines would themselves be doubled.
        (Bug fix: the previous code never escaped backslashes and left
        newlines unescaped in metadata values, producing invalid
        N-Triples and broken line-based batching.)
        """
        return (
            value.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[list[dict]] = None,
        **kwargs: Any,
    ) -> list[str]:
        """Add texts with embeddings to SutraDB.

        Each text gets a content-addressed subject IRI, its embedding under
        the vector predicate, the text under ``http://sutra.dev/text``, and
        one ``http://sutra.dev/meta/<key>`` triple per metadata entry.

        Returns:
            The list of subject IRIs created, in input order.

        Raises:
            ValueError: If no embeddings model was configured.
        """
        if self._embedding is None:
            raise ValueError("Embeddings model required for add_texts")

        # Hoisted out of the loop (was re-imported per text).
        import hashlib

        texts_list = list(texts)
        vectors = self._embedding.embed_documents(texts_list)
        ids: list[str] = []

        for i, (text, vector) in enumerate(zip(texts_list, vectors)):
            # Content-addressed IRI; md5 is used for identity, not security.
            text_hash = hashlib.md5(text.encode()).hexdigest()[:12]
            subject = f"http://sutra.dev/doc/{text_hash}"

            # Insert vector
            self._client.insert_vector(self._predicate, subject, vector)

            # Insert text as a triple
            escaped = self._escape_literal(text)
            ntriples = f'<{subject}> <http://sutra.dev/text> "{escaped}" .'

            # Insert metadata
            if metadatas and i < len(metadatas):
                for key, value in metadatas[i].items():
                    # NOTE(review): `key` is interpolated into the IRI
                    # unescaped — assumes simple alphanumeric keys; confirm.
                    escaped_val = self._escape_literal(str(value))
                    ntriples += f'\n<{subject}> <http://sutra.dev/meta/{key}> "{escaped_val}" .'

            self._client.insert_triples(ntriples)
            ids.append(subject)

        return ids

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> list[Document]:
        """Search for the ``k`` documents most similar to *query*.

        Raises:
            ValueError: If no embeddings model was configured.
        """
        if self._embedding is None:
            raise ValueError("Embeddings model required for similarity_search")

        query_vector = self._embedding.embed_query(query)
        vec_str = " ".join(f"{v:.6f}" for v in query_vector)

        # SutraDB extension syntax: VECTOR_SIMILAR with a f32vec literal.
        sparql = (
            f'SELECT ?doc ?text WHERE {{\n'
            f'  VECTOR_SIMILAR(?doc <{self._predicate}> '
            f'"{vec_str}"^^<http://sutra.dev/f32vec>, 0.5, k:={k})\n'
            f'  OPTIONAL {{ ?doc <http://sutra.dev/text> ?text }}\n'
            f'}}'
        )

        result = self._client.sparql(sparql)
        docs = []
        for row in result.get("results", {}).get("bindings", []):
            doc_uri = row.get("doc", {}).get("value", "")
            text = row.get("text", {}).get("value", "")
            # Fall back to the IRI when the text triple is missing.
            docs.append(Document(
                page_content=text or doc_uri,
                metadata={"source": doc_uri},
            ))

        return docs

    @classmethod
    def from_texts(
        cls,
        texts: list[str],
        embedding: Embeddings,
        metadatas: Optional[list[dict]] = None,
        **kwargs: Any,
    ) -> "SutraVectorStore":
        """Create a SutraVectorStore from texts (LangChain convention)."""
        store = cls(embedding=embedding, **kwargs)
        store.add_texts(texts, metadatas=metadatas)
        return store
|
sutradb/owl.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""Client-side OWL validation for SutraDB Python SDK.
|
|
2
|
+
|
|
3
|
+
The database accepts all triples unconditionally. OWL validation
|
|
4
|
+
happens here in the SDK before sending data to the server. This
|
|
5
|
+
follows the "lean store, smart client" principle.
|
|
6
|
+
|
|
7
|
+
OWL validation is ENABLED by default. Disable it with:
|
|
8
|
+
client = SutraClient(owl_validation=False)
|
|
9
|
+
|
|
10
|
+
The validator loads OWL ontology triples from the database on first use,
|
|
11
|
+
caches them locally, and checks inserts against:
|
|
12
|
+
- rdfs:domain (property domain constraints)
|
|
13
|
+
- rdfs:range (property range constraints)
|
|
14
|
+
- rdfs:subClassOf (type hierarchy)
|
|
15
|
+
- owl:FunctionalProperty (max one value)
|
|
16
|
+
- owl:disjointWith (classes that can't overlap)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import Optional
|
|
22
|
+
|
|
23
|
+
# Core RDF/RDFS vocabulary consulted by the validator.
RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
RDFS_DOMAIN = "http://www.w3.org/2000/01/rdf-schema#domain"
RDFS_RANGE = "http://www.w3.org/2000/01/rdf-schema#range"
RDFS_SUBCLASS_OF = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
# OWL constraint vocabulary.
OWL_FUNCTIONAL = "http://www.w3.org/2002/07/owl#FunctionalProperty"
OWL_DISJOINT = "http://www.w3.org/2002/07/owl#disjointWith"
OWL_EQUIVALENT_CLASS = "http://www.w3.org/2002/07/owl#equivalentClass"
OWL_SAME_AS = "http://www.w3.org/2002/07/owl#sameAs"
OWL_INVERSE_OF = "http://www.w3.org/2002/07/owl#inverseOf"
# Restriction vocabulary (someValuesFrom/allValuesFrom/onProperty) —
# NOTE(review): these constants are declared but restrictions do not
# appear to be loaded by OWLValidator yet; confirm intended coverage.
OWL_SOME_VALUES_FROM = "http://www.w3.org/2002/07/owl#someValuesFrom"
OWL_ALL_VALUES_FROM = "http://www.w3.org/2002/07/owl#allValuesFrom"
OWL_ON_PROPERTY = "http://www.w3.org/2002/07/owl#onProperty"
RDFS_SUB_PROPERTY_OF = "http://www.w3.org/2000/01/rdf-schema#subPropertyOf"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class OWLViolation(Exception):
    """Signals that a triple breaks an OWL constraint.

    Attributes:
        constraint_type: Short tag naming the violated constraint
            (e.g. ``"domain"``, ``"range"``, ``"disjoint"``).
        triple: The offending ``(subject, predicate, object)`` tuple.
    """

    def __init__(self, message: str, constraint_type: str, triple: tuple):
        # Stash the structured details, then let Exception own the message.
        self.constraint_type = constraint_type
        self.triple = triple
        super().__init__(message)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class OWLValidator:
|
|
48
|
+
"""Client-side OWL constraint validator.
|
|
49
|
+
|
|
50
|
+
Loads ontology axioms from SutraDB and validates triples before insert.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
def __init__(self):
|
|
54
|
+
self.domains: dict[str, str] = {} # property -> domain class
|
|
55
|
+
self.ranges: dict[str, str] = {} # property -> range class
|
|
56
|
+
self.subclass_of: dict[str, set[str]] = {} # class -> set of parent classes
|
|
57
|
+
self.sub_property_of: dict[str, set[str]] = {} # property -> parent properties
|
|
58
|
+
self.functional: set[str] = set() # functional properties
|
|
59
|
+
self.disjoint: dict[str, set[str]] = {} # class -> disjoint classes
|
|
60
|
+
self.equivalent_classes: dict[str, set[str]] = {} # class -> equivalent classes
|
|
61
|
+
self.same_as: dict[str, set[str]] = {} # entity -> same-as entities
|
|
62
|
+
self.inverse_of: dict[str, str] = {} # property -> inverse property
|
|
63
|
+
self.restrictions: list[dict] = [] # OWL restrictions (someValues, allValues)
|
|
64
|
+
self.entity_types: dict[str, set[str]] = {} # entity -> set of types
|
|
65
|
+
self._loaded = False
|
|
66
|
+
|
|
67
|
+
def load_from_client(self, client) -> None:
|
|
68
|
+
"""Load OWL ontology triples from a SutraDB client."""
|
|
69
|
+
# Load domain constraints
|
|
70
|
+
result = client.sparql(
|
|
71
|
+
f'SELECT ?p ?d WHERE {{ ?p <{RDFS_DOMAIN}> ?d }}'
|
|
72
|
+
)
|
|
73
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
74
|
+
p = row.get("p", {}).get("value", "")
|
|
75
|
+
d = row.get("d", {}).get("value", "")
|
|
76
|
+
if p and d:
|
|
77
|
+
self.domains[p] = d
|
|
78
|
+
|
|
79
|
+
# Load range constraints
|
|
80
|
+
result = client.sparql(
|
|
81
|
+
f'SELECT ?p ?r WHERE {{ ?p <{RDFS_RANGE}> ?r }}'
|
|
82
|
+
)
|
|
83
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
84
|
+
p = row.get("p", {}).get("value", "")
|
|
85
|
+
r = row.get("r", {}).get("value", "")
|
|
86
|
+
if p and r:
|
|
87
|
+
self.ranges[p] = r
|
|
88
|
+
|
|
89
|
+
# Load subclass hierarchy
|
|
90
|
+
result = client.sparql(
|
|
91
|
+
f'SELECT ?c ?parent WHERE {{ ?c <{RDFS_SUBCLASS_OF}> ?parent }}'
|
|
92
|
+
)
|
|
93
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
94
|
+
c = row.get("c", {}).get("value", "")
|
|
95
|
+
parent = row.get("parent", {}).get("value", "")
|
|
96
|
+
if c and parent:
|
|
97
|
+
self.subclass_of.setdefault(c, set()).add(parent)
|
|
98
|
+
|
|
99
|
+
# Load functional properties
|
|
100
|
+
result = client.sparql(
|
|
101
|
+
f'SELECT ?p WHERE {{ ?p <{RDF_TYPE}> <{OWL_FUNCTIONAL}> }}'
|
|
102
|
+
)
|
|
103
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
104
|
+
p = row.get("p", {}).get("value", "")
|
|
105
|
+
if p:
|
|
106
|
+
self.functional.add(p)
|
|
107
|
+
|
|
108
|
+
# Load property hierarchy
|
|
109
|
+
result = client.sparql(
|
|
110
|
+
f'SELECT ?p ?parent WHERE {{ ?p <{RDFS_SUB_PROPERTY_OF}> ?parent }}'
|
|
111
|
+
)
|
|
112
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
113
|
+
p = row.get("p", {}).get("value", "")
|
|
114
|
+
parent = row.get("parent", {}).get("value", "")
|
|
115
|
+
if p and parent:
|
|
116
|
+
self.sub_property_of.setdefault(p, set()).add(parent)
|
|
117
|
+
|
|
118
|
+
# Load equivalent classes
|
|
119
|
+
result = client.sparql(
|
|
120
|
+
f'SELECT ?a ?b WHERE {{ ?a <{OWL_EQUIVALENT_CLASS}> ?b }}'
|
|
121
|
+
)
|
|
122
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
123
|
+
a = row.get("a", {}).get("value", "")
|
|
124
|
+
b = row.get("b", {}).get("value", "")
|
|
125
|
+
if a and b:
|
|
126
|
+
self.equivalent_classes.setdefault(a, set()).add(b)
|
|
127
|
+
self.equivalent_classes.setdefault(b, set()).add(a)
|
|
128
|
+
|
|
129
|
+
# Load owl:sameAs
|
|
130
|
+
result = client.sparql(
|
|
131
|
+
f'SELECT ?a ?b WHERE {{ ?a <{OWL_SAME_AS}> ?b }}'
|
|
132
|
+
)
|
|
133
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
134
|
+
a = row.get("a", {}).get("value", "")
|
|
135
|
+
b = row.get("b", {}).get("value", "")
|
|
136
|
+
if a and b:
|
|
137
|
+
self.same_as.setdefault(a, set()).add(b)
|
|
138
|
+
self.same_as.setdefault(b, set()).add(a)
|
|
139
|
+
|
|
140
|
+
# Load owl:inverseOf
|
|
141
|
+
result = client.sparql(
|
|
142
|
+
f'SELECT ?p ?inv WHERE {{ ?p <{OWL_INVERSE_OF}> ?inv }}'
|
|
143
|
+
)
|
|
144
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
145
|
+
p = row.get("p", {}).get("value", "")
|
|
146
|
+
inv = row.get("inv", {}).get("value", "")
|
|
147
|
+
if p and inv:
|
|
148
|
+
self.inverse_of[p] = inv
|
|
149
|
+
self.inverse_of[inv] = p
|
|
150
|
+
|
|
151
|
+
# Load entity types (for validation)
|
|
152
|
+
result = client.sparql(
|
|
153
|
+
f'SELECT ?e ?t WHERE {{ ?e <{RDF_TYPE}> ?t }} LIMIT 10000'
|
|
154
|
+
)
|
|
155
|
+
for row in result.get("results", {}).get("bindings", []):
|
|
156
|
+
e = row.get("e", {}).get("value", "")
|
|
157
|
+
t = row.get("t", {}).get("value", "")
|
|
158
|
+
if e and t:
|
|
159
|
+
self.entity_types.setdefault(e, set()).add(t)
|
|
160
|
+
|
|
161
|
+
self._loaded = True
|
|
162
|
+
|
|
163
|
+
def is_loaded(self) -> bool:
|
|
164
|
+
"""Whether the ontology has been loaded."""
|
|
165
|
+
return self._loaded
|
|
166
|
+
|
|
167
|
+
def has_constraints(self) -> bool:
|
|
168
|
+
"""Whether any OWL constraints exist in the database."""
|
|
169
|
+
return bool(
|
|
170
|
+
self.domains or self.ranges or self.functional or self.disjoint
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
def get_all_types(self, class_iri: str) -> set[str]:
|
|
174
|
+
"""Get a class and all its ancestors via rdfs:subClassOf."""
|
|
175
|
+
result = {class_iri}
|
|
176
|
+
queue = [class_iri]
|
|
177
|
+
while queue:
|
|
178
|
+
current = queue.pop()
|
|
179
|
+
for parent in self.subclass_of.get(current, set()):
|
|
180
|
+
if parent not in result:
|
|
181
|
+
result.add(parent)
|
|
182
|
+
queue.append(parent)
|
|
183
|
+
return result
|
|
184
|
+
|
|
185
|
+
def validate_triple(
|
|
186
|
+
self, subject: str, predicate: str, obj: str
|
|
187
|
+
) -> Optional[OWLViolation]:
|
|
188
|
+
"""Validate a single triple against OWL constraints.
|
|
189
|
+
|
|
190
|
+
Returns None if valid, or an OWLViolation if invalid.
|
|
191
|
+
"""
|
|
192
|
+
triple = (subject, predicate, obj)
|
|
193
|
+
|
|
194
|
+
# Domain check
|
|
195
|
+
if predicate in self.domains:
|
|
196
|
+
expected_domain = self.domains[predicate]
|
|
197
|
+
subject_types = self.entity_types.get(subject, set())
|
|
198
|
+
if subject_types:
|
|
199
|
+
all_types = set()
|
|
200
|
+
for t in subject_types:
|
|
201
|
+
all_types |= self.get_all_types(t)
|
|
202
|
+
if expected_domain not in all_types:
|
|
203
|
+
return OWLViolation(
|
|
204
|
+
f"Domain violation: {predicate} requires subject of type "
|
|
205
|
+
f"{expected_domain}, but {subject} has types {subject_types}",
|
|
206
|
+
"domain",
|
|
207
|
+
triple,
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
# Range check
|
|
211
|
+
if predicate in self.ranges and not obj.startswith('"'):
|
|
212
|
+
expected_range = self.ranges[predicate]
|
|
213
|
+
object_types = self.entity_types.get(obj, set())
|
|
214
|
+
if object_types:
|
|
215
|
+
all_types = set()
|
|
216
|
+
for t in object_types:
|
|
217
|
+
all_types |= self.get_all_types(t)
|
|
218
|
+
if expected_range not in all_types:
|
|
219
|
+
return OWLViolation(
|
|
220
|
+
f"Range violation: {predicate} requires object of type "
|
|
221
|
+
f"{expected_range}, but {obj} has types {object_types}",
|
|
222
|
+
"range",
|
|
223
|
+
triple,
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
# Disjoint class check (when assigning a type)
|
|
227
|
+
if predicate == RDF_TYPE:
|
|
228
|
+
existing_types = self.entity_types.get(subject, set())
|
|
229
|
+
for existing_type in existing_types:
|
|
230
|
+
disjoint = self.disjoint.get(existing_type, set())
|
|
231
|
+
if obj in disjoint:
|
|
232
|
+
return OWLViolation(
|
|
233
|
+
f"Disjoint violation: {subject} is already type "
|
|
234
|
+
f"{existing_type}, which is disjoint with {obj}",
|
|
235
|
+
"disjoint",
|
|
236
|
+
triple,
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
return None # Valid
|
|
240
|
+
|
|
241
|
+
def generate_verification_queries(self) -> list[tuple[str, str]]:
|
|
242
|
+
"""Generate SPARQL queries that check for OWL constraint violations.
|
|
243
|
+
|
|
244
|
+
Returns a list of (description, sparql_query) tuples.
|
|
245
|
+
Each query returns rows that represent violations.
|
|
246
|
+
"""
|
|
247
|
+
queries = []
|
|
248
|
+
|
|
249
|
+
# Domain violations
|
|
250
|
+
for prop, domain_class in self.domains.items():
|
|
251
|
+
queries.append((
|
|
252
|
+
f"Domain violation: {prop} requires subject of type {domain_class}",
|
|
253
|
+
f"SELECT ?s WHERE {{ ?s <{prop}> ?o . "
|
|
254
|
+
f"FILTER NOT EXISTS {{ ?s <{RDF_TYPE}> <{domain_class}> }} }}"
|
|
255
|
+
))
|
|
256
|
+
|
|
257
|
+
# Range violations
|
|
258
|
+
for prop, range_class in self.ranges.items():
|
|
259
|
+
queries.append((
|
|
260
|
+
f"Range violation: {prop} requires object of type {range_class}",
|
|
261
|
+
f"SELECT ?o WHERE {{ ?s <{prop}> ?o . "
|
|
262
|
+
f"FILTER NOT EXISTS {{ ?o <{RDF_TYPE}> <{range_class}> }} }}"
|
|
263
|
+
))
|
|
264
|
+
|
|
265
|
+
# Functional property violations (more than one value)
|
|
266
|
+
for prop in self.functional:
|
|
267
|
+
queries.append((
|
|
268
|
+
f"Functional violation: {prop} should have at most one value per subject",
|
|
269
|
+
f"SELECT ?s WHERE {{ ?s <{prop}> ?o1 . ?s <{prop}> ?o2 . "
|
|
270
|
+
f"FILTER(?o1 != ?o2) }}"
|
|
271
|
+
))
|
|
272
|
+
|
|
273
|
+
# Disjoint class violations
|
|
274
|
+
for cls, disjoint_set in self.disjoint.items():
|
|
275
|
+
for other in disjoint_set:
|
|
276
|
+
queries.append((
|
|
277
|
+
f"Disjoint violation: {cls} and {other} cannot overlap",
|
|
278
|
+
f"SELECT ?x WHERE {{ ?x <{RDF_TYPE}> <{cls}> . "
|
|
279
|
+
f"?x <{RDF_TYPE}> <{other}> }}"
|
|
280
|
+
))
|
|
281
|
+
|
|
282
|
+
return queries
|
|
283
|
+
|
|
284
|
+
def validate_ntriples(self, ntriples: str) -> list[OWLViolation]:
|
|
285
|
+
"""Validate a block of N-Triples. Returns list of violations."""
|
|
286
|
+
violations = []
|
|
287
|
+
for line in ntriples.splitlines():
|
|
288
|
+
line = line.strip()
|
|
289
|
+
if not line or line.startswith("#"):
|
|
290
|
+
continue
|
|
291
|
+
# Simple N-Triples parsing (subject predicate object .)
|
|
292
|
+
parts = line.split(None, 2)
|
|
293
|
+
if len(parts) < 3:
|
|
294
|
+
continue
|
|
295
|
+
s = parts[0].strip("<>")
|
|
296
|
+
p = parts[1].strip("<>")
|
|
297
|
+
o_raw = parts[2].rstrip(" .")
|
|
298
|
+
o = o_raw.strip("<>") if o_raw.startswith("<") else o_raw
|
|
299
|
+
|
|
300
|
+
violation = self.validate_triple(s, p, o)
|
|
301
|
+
if violation:
|
|
302
|
+
violations.append(violation)
|
|
303
|
+
|
|
304
|
+
return violations
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
sutradb/__init__.py,sha256=L5l3MVIjH2JVgYMQU70_YrrAJyIIEp-sAyS3XsHapSg,157
|
|
2
|
+
sutradb/_version.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
|
|
3
|
+
sutradb/client.py,sha256=wziV1VyT9zRJs9NbE9u0Gu-DzGYiYWRx6jAzvjGgass,9420
|
|
4
|
+
sutradb/jupyter.py,sha256=3mkpibI3pONYJmLHmn4VLpMFFPcvvCGGkLxrRjyufYU,3746
|
|
5
|
+
sutradb/langchain.py,sha256=qlKnXkk1U54KrSO84s48LxgzuEl7SWInDiRdOGNQgdQ,4770
|
|
6
|
+
sutradb/owl.py,sha256=alOZu6f57EGGVVTaO2q7r2H2kOsqYu1XJ9n5Q_s8tK4,12382
|
|
7
|
+
sutradb-0.3.2.dist-info/METADATA,sha256=V1b3Z_KXtELm6ksDTnwYm01MWQFMBq2xQhTk-V-lLps,223
|
|
8
|
+
sutradb-0.3.2.dist-info/WHEEL,sha256=e22IIVjxDyt0lABi4WpktFIGsmO_ebSDXLnPUbPK0E0,105
|
|
9
|
+
sutradb-0.3.2.dist-info/RECORD,,
|