knowledge-graph-rdbms 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/CLAUDE.md +3 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/PKG-INFO +6 -2
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/README.md +5 -1
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/__init__.py +1 -1
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/base.py +2 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/postgres.py +67 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/cli.py +43 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/graph.py +102 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/mcp_server.py +26 -1
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/pyproject.toml +1 -1
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/tests/test_cli.py +20 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/tests/test_mcp_server.py +13 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/tests/test_postgres.py +22 -0
- knowledge_graph_rdbms-0.1.4/tests/test_schema.py +97 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/.claude/skills/kg-compose/SKILL.md +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/.github/workflows/ci.yml +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/.github/workflows/publish.yml +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/.gitignore +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/CODE_OF_CONDUCT.md +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/CONTRIBUTING.md +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/LICENSE +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/SECURITY.md +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/assets/crossover.png +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/assets/read_latency.png +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/assets/runtimes.png +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/assets/write_throughput.png +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/README.md +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/benchmark.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/charts.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/neo4j/README.md +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/neo4j/headtohead.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/postgres/README.md +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/postgres/benchmark.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/postgres/charts.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/runtimes/compare.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/runtimes/run_bun.js +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/runtimes/run_node.mjs +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/bench/runtimes/run_python.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/__init__.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/neo4j.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/sqlite.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/events.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/invariants.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/policy.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/rdf.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/resolver.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/kgrdbms/service.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/tests/test_bulk.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/tests/test_events.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/tests/test_graph.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/tests/test_policy.py +0 -0
- {knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/tests/test_rdf.py +0 -0
|
@@ -21,6 +21,8 @@ python bench/charts.py # render assets/*.png from bench data (
|
|
|
21
21
|
python bench/runtimes/compare.py # CPython vs Node vs Bun SQLite comparison
|
|
22
22
|
|
|
23
23
|
kg stats # default ontology (~/.kgrdbms/graph.db)
|
|
24
|
+
kg schema # observed vocabulary: kinds, edge types, labels, keys-per-kind
|
|
25
|
+
kg schema --samples # + example ids and enum-like property values per kind
|
|
24
26
|
kg ontology list # the registry (the "db of dbs")
|
|
25
27
|
kg ontology create coffee --stance inferential # register a named ontology
|
|
26
28
|
kg --ontology coffee node add drink:latte --kind Drink # route to it (resolver)
|
|
@@ -107,4 +109,5 @@ Node ids follow `prefix:reference` — `person:ada-lovelace`, `company:apple`, `
|
|
|
107
109
|
- **Properties round-trip as JSON.** Storage is `value_json`; ints/bools/lists/objects come back as their JSON type. CLI `--prop key=value` parses value as JSON when possible, else keeps it as a string.
|
|
108
110
|
- **Per-call writes each commit (one fsync).** Wrapping work in `batch()` / using `add_nodes` / `add_edges` collapses to one transaction (~10× faster). Don't add a per-row commit inside a bulk loop.
|
|
109
111
|
- **Reads are not in `service.py`** by design — callers hit `Graph` directly. Don't route reads through the gate.
|
|
112
|
+
- **`schema()` is the discovery primitive.** It returns the *observed* vocabulary (kinds, edge types, labels, and property keys per kind, with counts; `samples=True` adds example ids + enum-like values) so a consumer can query by real values instead of guessing. It's a read like any other (`GraphBackend` method, Postgres port, stub line), and the MCP server instructions tell models to call `kg_schema` first. The schema is *observed, not enforced* — the graph stays schemaless; this just profiles what's there.
|
|
110
113
|
- **CLI exit codes are contractual:** `0` ok, `1` not found / bad input, `2` policy denial, `3` invariant violation. Preserve these.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: knowledge-graph-rdbms
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: A label property graph on an RDBMS (SQLite): nodes, typed edges, an append-only event log, and an optional MCP server.
|
|
5
5
|
Project-URL: Homepage, https://github.com/cunicopia-dev/knowledge-graph-rdbms
|
|
6
6
|
Project-URL: Repository, https://github.com/cunicopia-dev/knowledge-graph-rdbms
|
|
@@ -536,6 +536,8 @@ kg out person:ada # outbound edges
|
|
|
536
536
|
kg path person:ada field:cs # shortest path
|
|
537
537
|
kg nodes-by-kind Person
|
|
538
538
|
kg stats
|
|
539
|
+
kg schema # observed vocabulary — kinds, edge types, labels, keys-per-kind
|
|
540
|
+
kg schema --samples # + example ids and enum-like property values per kind
|
|
539
541
|
kg --json node get person:ada # machine-readable output for piping
|
|
540
542
|
|
|
541
543
|
kg events -n 10 # tail the event log
|
|
@@ -570,7 +572,8 @@ Or hand-edit a client config (e.g. Claude Desktop):
|
|
|
570
572
|
{ "mcpServers": { "kgrdbms": { "command": "kgrdbms-mcp" } } }
|
|
571
573
|
```
|
|
572
574
|
|
|
573
|
-
It exposes `kg_`-prefixed tools for reads (`kg_node_get`, `kg_nodes_by_kind`,
|
|
575
|
+
It exposes `kg_`-prefixed tools for reads (`kg_schema` — the vocabulary, meant to
|
|
576
|
+
be called first; `kg_node_get`, `kg_nodes_by_kind`,
|
|
574
577
|
`kg_neighborhood`, `kg_shortest_path`, `kg_descendants`, …), gated writes
|
|
575
578
|
(`kg_node_upsert`, `kg_edge_add`, `kg_node_delete`, …), bulk composition
|
|
576
579
|
(`kg_import` — a whole `{nodes, edges}` batch in one call, so an agent populates
|
|
@@ -743,6 +746,7 @@ replayable.
|
|
|
743
746
|
| Command | What it does |
|
|
744
747
|
| ------------------------------- | --------------------------------------------- |
|
|
745
748
|
| `kg stats` | node/edge counts and db path |
|
|
749
|
+
| `kg schema [--samples]` | observed vocabulary: kinds, edge types, labels, keys-per-kind |
|
|
746
750
|
| `kg node add ID --kind K …` | create or update a node (gated + logged) |
|
|
747
751
|
| `kg node get ID` | fetch a node |
|
|
748
752
|
| `kg node del ID` | delete a node (cascades edges) |
|
|
@@ -505,6 +505,8 @@ kg out person:ada # outbound edges
|
|
|
505
505
|
kg path person:ada field:cs # shortest path
|
|
506
506
|
kg nodes-by-kind Person
|
|
507
507
|
kg stats
|
|
508
|
+
kg schema # observed vocabulary — kinds, edge types, labels, keys-per-kind
|
|
509
|
+
kg schema --samples # + example ids and enum-like property values per kind
|
|
508
510
|
kg --json node get person:ada # machine-readable output for piping
|
|
509
511
|
|
|
510
512
|
kg events -n 10 # tail the event log
|
|
@@ -539,7 +541,8 @@ Or hand-edit a client config (e.g. Claude Desktop):
|
|
|
539
541
|
{ "mcpServers": { "kgrdbms": { "command": "kgrdbms-mcp" } } }
|
|
540
542
|
```
|
|
541
543
|
|
|
542
|
-
It exposes `kg_`-prefixed tools for reads (`kg_node_get`, `kg_nodes_by_kind`,
|
|
544
|
+
It exposes `kg_`-prefixed tools for reads (`kg_schema` — the vocabulary, meant to
|
|
545
|
+
be called first; `kg_node_get`, `kg_nodes_by_kind`,
|
|
543
546
|
`kg_neighborhood`, `kg_shortest_path`, `kg_descendants`, …), gated writes
|
|
544
547
|
(`kg_node_upsert`, `kg_edge_add`, `kg_node_delete`, …), bulk composition
|
|
545
548
|
(`kg_import` — a whole `{nodes, edges}` batch in one call, so an agent populates
|
|
@@ -712,6 +715,7 @@ replayable.
|
|
|
712
715
|
| Command | What it does |
|
|
713
716
|
| ------------------------------- | --------------------------------------------- |
|
|
714
717
|
| `kg stats` | node/edge counts and db path |
|
|
718
|
+
| `kg schema [--samples]` | observed vocabulary: kinds, edge types, labels, keys-per-kind |
|
|
715
719
|
| `kg node add ID --kind K …` | create or update a node (gated + logged) |
|
|
716
720
|
| `kg node get ID` | fetch a node |
|
|
717
721
|
| `kg node del ID` | delete a node (cascades edges) |
|
|
@@ -47,6 +47,7 @@ class GraphBackend(Protocol):
|
|
|
47
47
|
def count_edges_by_type(self) -> dict[str, int]: ...
|
|
48
48
|
def total_nodes(self) -> int: ...
|
|
49
49
|
def total_edges(self) -> int: ...
|
|
50
|
+
def schema(self, *, samples: bool = ..., sample_limit: int = ...) -> dict: ...
|
|
50
51
|
# bulk: a context manager that defers commits to one transaction
|
|
51
52
|
def batch(self) -> Any: ...
|
|
52
53
|
def close(self) -> None: ...
|
|
@@ -93,6 +94,7 @@ class _StubBackend:
|
|
|
93
94
|
def count_edges_by_type(self, *a: Any, **k: Any) -> dict[str, int]: return self._todo("count_edges_by_type")
|
|
94
95
|
def total_nodes(self, *a: Any, **k: Any) -> int: return self._todo("total_nodes")
|
|
95
96
|
def total_edges(self, *a: Any, **k: Any) -> int: return self._todo("total_edges")
|
|
97
|
+
def schema(self, *a: Any, **k: Any) -> dict: return self._todo("schema")
|
|
96
98
|
|
|
97
99
|
@contextmanager
|
|
98
100
|
def batch(self) -> Iterator["_StubBackend"]:
|
|
@@ -450,6 +450,73 @@ class PostgresGraph:
|
|
|
450
450
|
def total_edges(self) -> int:
|
|
451
451
|
return self.conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"]
|
|
452
452
|
|
|
453
|
+
def schema(self, *, samples: bool = False, sample_limit: int = 20) -> dict:
|
|
454
|
+
"""Observed schema (kinds, edge types, labels, property keys per kind),
|
|
455
|
+
mirroring Graph.schema. Pure aggregates over the same five tables; jsonb
|
|
456
|
+
values arrive already parsed, so no json.loads on the sampling path."""
|
|
457
|
+
kinds = self.count_nodes_by_kind()
|
|
458
|
+
edge_types = self.count_edges_by_type()
|
|
459
|
+
labels = {
|
|
460
|
+
r["label"]: r["c"]
|
|
461
|
+
for r in self.conn.execute(
|
|
462
|
+
"SELECT label, COUNT(*) AS c FROM node_labels GROUP BY label ORDER BY c DESC"
|
|
463
|
+
).fetchall()
|
|
464
|
+
}
|
|
465
|
+
node_keys_by_kind: dict[str, dict[str, int]] = {}
|
|
466
|
+
for r in self.conn.execute(
|
|
467
|
+
"SELECT n.kind AS kind, p.key AS key, COUNT(*) AS c "
|
|
468
|
+
"FROM node_properties p JOIN nodes n ON n.id = p.node_id "
|
|
469
|
+
"GROUP BY n.kind, p.key ORDER BY n.kind, c DESC"
|
|
470
|
+
).fetchall():
|
|
471
|
+
node_keys_by_kind.setdefault(r["kind"], {})[r["key"]] = r["c"]
|
|
472
|
+
edge_keys = {
|
|
473
|
+
r["key"]: r["c"]
|
|
474
|
+
for r in self.conn.execute(
|
|
475
|
+
"SELECT key, COUNT(*) AS c FROM edge_properties GROUP BY key ORDER BY c DESC"
|
|
476
|
+
).fetchall()
|
|
477
|
+
}
|
|
478
|
+
result: dict[str, Any] = {
|
|
479
|
+
"nodes_total": self.total_nodes(),
|
|
480
|
+
"edges_total": self.total_edges(),
|
|
481
|
+
"kinds": kinds,
|
|
482
|
+
"edge_types": edge_types,
|
|
483
|
+
"labels": labels,
|
|
484
|
+
"node_keys_by_kind": node_keys_by_kind,
|
|
485
|
+
"edge_keys": edge_keys,
|
|
486
|
+
}
|
|
487
|
+
if samples:
|
|
488
|
+
result["samples"] = self._schema_samples(kinds, node_keys_by_kind, sample_limit)
|
|
489
|
+
return result
|
|
490
|
+
|
|
491
|
+
def _schema_samples(
|
|
492
|
+
self, kinds: dict[str, int], node_keys_by_kind: dict[str, dict[str, int]], sample_limit: int
|
|
493
|
+
) -> dict[str, dict]:
|
|
494
|
+
from kgrdbms.graph import _scalar_samples
|
|
495
|
+
|
|
496
|
+
samples: dict[str, dict] = {}
|
|
497
|
+
for kind in kinds:
|
|
498
|
+
example_ids = [
|
|
499
|
+
r["id"]
|
|
500
|
+
for r in self.conn.execute(
|
|
501
|
+
"SELECT id FROM nodes WHERE kind=%s ORDER BY id LIMIT 5", (kind,)
|
|
502
|
+
).fetchall()
|
|
503
|
+
]
|
|
504
|
+
values: dict[str, list] = {}
|
|
505
|
+
for key in node_keys_by_kind.get(kind, {}):
|
|
506
|
+
rows = self.conn.execute(
|
|
507
|
+
"SELECT DISTINCT p.value_json FROM node_properties p "
|
|
508
|
+
"JOIN nodes n ON n.id = p.node_id "
|
|
509
|
+
"WHERE n.kind=%s AND p.key=%s LIMIT %s",
|
|
510
|
+
(kind, key, sample_limit + 1),
|
|
511
|
+
).fetchall()
|
|
512
|
+
if len(rows) > sample_limit:
|
|
513
|
+
continue # open-ended / free-text field — don't enumerate it
|
|
514
|
+
vals = _scalar_samples(r["value_json"] for r in rows) # jsonb already parsed
|
|
515
|
+
if vals is not None:
|
|
516
|
+
values[key] = vals
|
|
517
|
+
samples[kind] = {"example_ids": example_ids, "values": values}
|
|
518
|
+
return samples
|
|
519
|
+
|
|
453
520
|
# ---- hydration (jsonb returns parsed values — no json.loads) --------
|
|
454
521
|
|
|
455
522
|
def _hydrate_node(self, row: dict) -> Node:
|
|
@@ -173,6 +173,43 @@ def cmd_stats(app: App, args) -> int:
|
|
|
173
173
|
return 0
|
|
174
174
|
|
|
175
175
|
|
|
176
|
+
def cmd_schema(app: App, args) -> int:
|
|
177
|
+
"""The observed schema: kinds, edge types, labels, and property keys per kind.
|
|
178
|
+
|
|
179
|
+
The map to read *before* querying an unfamiliar ontology — so you query by a
|
|
180
|
+
kind/label/key that actually exists instead of guessing.
|
|
181
|
+
"""
|
|
182
|
+
res = app.graph.schema(samples=args.samples)
|
|
183
|
+
if res.get("ontology") is None and app.ontology:
|
|
184
|
+
res = {"ontology": app.ontology, **res}
|
|
185
|
+
|
|
186
|
+
lines: list[str] = []
|
|
187
|
+
if app.ontology:
|
|
188
|
+
lines.append(f"ontology: {app.ontology}")
|
|
189
|
+
lines.append(f"nodes: {res['nodes_total']:,} edges: {res['edges_total']:,}")
|
|
190
|
+
lines.append("")
|
|
191
|
+
lines.append("kinds (node count) and their property keys:")
|
|
192
|
+
for kind, n in res["kinds"].items():
|
|
193
|
+
keys = res["node_keys_by_kind"].get(kind, {})
|
|
194
|
+
keystr = ", ".join(f"{k}×{c}" for k, c in keys.items()) or "(no properties)"
|
|
195
|
+
lines.append(f" {kind} ×{n}")
|
|
196
|
+
lines.append(f" keys: {keystr}")
|
|
197
|
+
if args.samples:
|
|
198
|
+
samp = res.get("samples", {}).get(kind, {})
|
|
199
|
+
ex = ", ".join(samp.get("example_ids", []))
|
|
200
|
+
if ex:
|
|
201
|
+
lines.append(f" e.g. {ex}")
|
|
202
|
+
for k, vals in samp.get("values", {}).items():
|
|
203
|
+
lines.append(f" {k} ∈ {{{', '.join(str(v) for v in vals)}}}")
|
|
204
|
+
lines.append("")
|
|
205
|
+
lines.append("edge types: " + (", ".join(f"{t}×{c}" for t, c in res["edge_types"].items()) or "(none)"))
|
|
206
|
+
lines.append("labels: " + (", ".join(f"{l}×{c}" for l, c in res["labels"].items()) or "(none)"))
|
|
207
|
+
if res["edge_keys"]:
|
|
208
|
+
lines.append("edge keys: " + ", ".join(f"{k}×{c}" for k, c in res["edge_keys"].items()))
|
|
209
|
+
app.emit(res, "\n".join(lines))
|
|
210
|
+
return 0
|
|
211
|
+
|
|
212
|
+
|
|
176
213
|
# ---- registry handlers (the control plane / db-of-dbs) ---------------
|
|
177
214
|
|
|
178
215
|
|
|
@@ -432,6 +469,12 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
432
469
|
sp = sub.add_parser("stats", help="node/edge counts, db path, active ontology")
|
|
433
470
|
sp.set_defaults(func=cmd_stats)
|
|
434
471
|
|
|
472
|
+
sp = sub.add_parser("schema", help="observed schema: kinds, edge types, labels, "
|
|
473
|
+
"property keys per kind (read this before querying)")
|
|
474
|
+
sp.add_argument("--samples", action="store_true",
|
|
475
|
+
help="also show example node ids and enum-like property values per kind")
|
|
476
|
+
sp.set_defaults(func=cmd_schema)
|
|
477
|
+
|
|
435
478
|
# ---- ontology registry (the db-of-dbs) ----
|
|
436
479
|
ont = sub.add_parser("ontology", help="manage the ontology registry").add_subparsers(dest="action", required=True)
|
|
437
480
|
a = ont.add_parser("list", help="list registered ontologies")
|
|
@@ -187,6 +187,26 @@ def _normalize_edge(spec: "Edge | dict | tuple | list") -> tuple[str, str, str,
|
|
|
187
187
|
raise TypeError("edge spec must be an Edge, dict, or (from, to, type[, properties]) tuple")
|
|
188
188
|
|
|
189
189
|
|
|
190
|
+
def _scalar_samples(values: Iterable[Any], *, max_str: int = 80) -> list | None:
|
|
191
|
+
"""Bounded distinct scalar values for schema sampling.
|
|
192
|
+
|
|
193
|
+
Returns the values sorted, or None to signal "not an enumerable vocabulary"
|
|
194
|
+
— i.e. some value is a list/object or an over-long string, so showing it as a
|
|
195
|
+
closed set would mislead. Keeps `schema(samples=True)` from dumping free-text.
|
|
196
|
+
"""
|
|
197
|
+
out: list = []
|
|
198
|
+
for v in values:
|
|
199
|
+
if isinstance(v, str):
|
|
200
|
+
if len(v) > max_str:
|
|
201
|
+
return None
|
|
202
|
+
out.append(v)
|
|
203
|
+
elif isinstance(v, (int, float)): # bool is a subclass of int — included
|
|
204
|
+
out.append(v)
|
|
205
|
+
else: # list / dict / None
|
|
206
|
+
return None
|
|
207
|
+
return sorted(out, key=str) if out else None
|
|
208
|
+
|
|
209
|
+
|
|
190
210
|
# ---- graph ----------------------------------------------------------
|
|
191
211
|
|
|
192
212
|
|
|
@@ -622,6 +642,88 @@ class Graph:
|
|
|
622
642
|
def total_edges(self) -> int:
|
|
623
643
|
return self.conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"]
|
|
624
644
|
|
|
645
|
+
# ---- schema (observed TBox: the map of what's in the graph) ----
|
|
646
|
+
|
|
647
|
+
def schema(self, *, samples: bool = False, sample_limit: int = 20) -> dict:
|
|
648
|
+
"""The observed schema of the graph — the vocabulary needed to query it
|
|
649
|
+
without guessing.
|
|
650
|
+
|
|
651
|
+
Returns kinds, edge types, labels, and property keys *per kind*, each
|
|
652
|
+
with counts. This is a profile of what actually occurs (the graph is
|
|
653
|
+
schemaless — nothing is enforced), the property-graph analogue of an
|
|
654
|
+
ontology's TBox derived from its ABox.
|
|
655
|
+
|
|
656
|
+
`samples=True` additionally returns, per kind, a few example node ids
|
|
657
|
+
(revealing the id/CURIE convention) and — for enum-like properties — the
|
|
658
|
+
bounded set of distinct scalar values a key takes, turning "there is a
|
|
659
|
+
key `status`" into "status is one of {active, archived}". A key whose
|
|
660
|
+
distinct values exceed `sample_limit` (a free-text field) is left
|
|
661
|
+
un-enumerated rather than dumped.
|
|
662
|
+
|
|
663
|
+
Read-only; pure GROUP BY aggregates. The intended first call for any
|
|
664
|
+
consumer dropped into an unfamiliar ontology.
|
|
665
|
+
"""
|
|
666
|
+
kinds = self.count_nodes_by_kind()
|
|
667
|
+
edge_types = self.count_edges_by_type()
|
|
668
|
+
labels = {
|
|
669
|
+
r["label"]: r["c"]
|
|
670
|
+
for r in self.conn.execute(
|
|
671
|
+
"SELECT label, COUNT(*) AS c FROM node_labels GROUP BY label ORDER BY c DESC"
|
|
672
|
+
).fetchall()
|
|
673
|
+
}
|
|
674
|
+
node_keys_by_kind: dict[str, dict[str, int]] = {}
|
|
675
|
+
for r in self.conn.execute(
|
|
676
|
+
"SELECT n.kind AS kind, p.key AS key, COUNT(*) AS c "
|
|
677
|
+
"FROM node_properties p JOIN nodes n ON n.id = p.node_id "
|
|
678
|
+
"GROUP BY n.kind, p.key ORDER BY n.kind, c DESC"
|
|
679
|
+
).fetchall():
|
|
680
|
+
node_keys_by_kind.setdefault(r["kind"], {})[r["key"]] = r["c"]
|
|
681
|
+
edge_keys = {
|
|
682
|
+
r["key"]: r["c"]
|
|
683
|
+
for r in self.conn.execute(
|
|
684
|
+
"SELECT key, COUNT(*) AS c FROM edge_properties GROUP BY key ORDER BY c DESC"
|
|
685
|
+
).fetchall()
|
|
686
|
+
}
|
|
687
|
+
result: dict[str, Any] = {
|
|
688
|
+
"nodes_total": self.total_nodes(),
|
|
689
|
+
"edges_total": self.total_edges(),
|
|
690
|
+
"kinds": kinds,
|
|
691
|
+
"edge_types": edge_types,
|
|
692
|
+
"labels": labels,
|
|
693
|
+
"node_keys_by_kind": node_keys_by_kind,
|
|
694
|
+
"edge_keys": edge_keys,
|
|
695
|
+
}
|
|
696
|
+
if samples:
|
|
697
|
+
result["samples"] = self._schema_samples(kinds, node_keys_by_kind, sample_limit)
|
|
698
|
+
return result
|
|
699
|
+
|
|
700
|
+
def _schema_samples(
|
|
701
|
+
self, kinds: dict[str, int], node_keys_by_kind: dict[str, dict[str, int]], sample_limit: int
|
|
702
|
+
) -> dict[str, dict]:
|
|
703
|
+
samples: dict[str, dict] = {}
|
|
704
|
+
for kind in kinds:
|
|
705
|
+
example_ids = [
|
|
706
|
+
r["id"]
|
|
707
|
+
for r in self.conn.execute(
|
|
708
|
+
"SELECT id FROM nodes WHERE kind=? ORDER BY id LIMIT 5", (kind,)
|
|
709
|
+
).fetchall()
|
|
710
|
+
]
|
|
711
|
+
values: dict[str, list] = {}
|
|
712
|
+
for key in node_keys_by_kind.get(kind, {}):
|
|
713
|
+
rows = self.conn.execute(
|
|
714
|
+
"SELECT DISTINCT p.value_json FROM node_properties p "
|
|
715
|
+
"JOIN nodes n ON n.id = p.node_id "
|
|
716
|
+
"WHERE n.kind=? AND p.key=? LIMIT ?",
|
|
717
|
+
(kind, key, sample_limit + 1),
|
|
718
|
+
).fetchall()
|
|
719
|
+
if len(rows) > sample_limit:
|
|
720
|
+
continue # open-ended / free-text field — don't enumerate it
|
|
721
|
+
vals = _scalar_samples(json.loads(r["value_json"]) for r in rows)
|
|
722
|
+
if vals is not None:
|
|
723
|
+
values[key] = vals
|
|
724
|
+
samples[kind] = {"example_ids": example_ids, "values": values}
|
|
725
|
+
return samples
|
|
726
|
+
|
|
625
727
|
# ---- hydration -------------------------------------------------
|
|
626
728
|
|
|
627
729
|
def _hydrate_node(self, row: sqlite3.Row) -> Node:
|
|
@@ -21,6 +21,8 @@ Tool surface (all prefixed kg_):
|
|
|
21
21
|
|
|
22
22
|
reads
|
|
23
23
|
kg_stats — node/edge counts, the db path, the active ontology
|
|
24
|
+
kg_schema — observed vocabulary (kinds, edge types, labels, keys);
|
|
25
|
+
read this FIRST to query without guessing
|
|
24
26
|
kg_node_get — fetch a node by id
|
|
25
27
|
kg_nodes_by_kind — list all nodes of a kind
|
|
26
28
|
kg_nodes_by_label — list all nodes carrying a label
|
|
@@ -90,7 +92,10 @@ mcp = FastMCP(
|
|
|
90
92
|
"prefixed kg_ read or mutate nodes (id, kind, name, labels, properties) "
|
|
91
93
|
"and typed directed edges. Every tool takes an optional `ontology` name "
|
|
92
94
|
"(omit for the default); use kg_ontologies_list to discover them and "
|
|
93
|
-
"kg_ontology_create to add one. Writes are gated by compiled-in "
|
|
95
|
+
"kg_ontology_create to add one. When working with an ontology whose "
|
|
96
|
+
"contents you don't already know, call kg_schema FIRST — it returns the "
|
|
97
|
+
"exact kinds, edge types, labels, and property keys so you can query by "
|
|
98
|
+
"real values instead of guessing. Writes are gated by compiled-in "
|
|
94
99
|
"invariants and a configurable policy, and recorded to an append-only, "
|
|
95
100
|
"replayable event log."
|
|
96
101
|
),
|
|
@@ -197,6 +202,26 @@ def kg_stats(ontology: str | None = None) -> dict:
|
|
|
197
202
|
}
|
|
198
203
|
|
|
199
204
|
|
|
205
|
+
@mcp.tool()
|
|
206
|
+
def kg_schema(samples: bool = False, ontology: str | None = None) -> dict:
|
|
207
|
+
"""The observed schema of an ontology — CALL THIS FIRST when you don't already
|
|
208
|
+
know what an ontology contains, before kg_nodes_by_kind / kg_nodes_by_label /
|
|
209
|
+
kg_node_get. It tells you the exact vocabulary so you never have to guess.
|
|
210
|
+
|
|
211
|
+
Returns:
|
|
212
|
+
- kinds — every node `kind` and its count
|
|
213
|
+
- edge_types — every edge `type` and its count
|
|
214
|
+
- labels — every label and its count
|
|
215
|
+
- node_keys_by_kind— for each kind, which property keys its nodes carry (+counts)
|
|
216
|
+
- edge_keys — property keys that appear on edges
|
|
217
|
+
|
|
218
|
+
With samples=True, also returns per kind a few example node ids (showing the
|
|
219
|
+
id/CURIE convention) and, for enum-like properties, the set of distinct values
|
|
220
|
+
a key takes (free-text keys are left un-enumerated). Read-only; cheap.
|
|
221
|
+
"""
|
|
222
|
+
return _bundle(ontology).backend.schema(samples=samples)
|
|
223
|
+
|
|
224
|
+
|
|
200
225
|
@mcp.tool()
|
|
201
226
|
def kg_node_get(id: str, ontology: str | None = None) -> dict | None:
|
|
202
227
|
"""Fetch a single node by id."""
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "knowledge-graph-rdbms"
|
|
7
|
-
version = "0.1.3"
|
|
7
|
+
version = "0.1.4"
|
|
8
8
|
description = "A label property graph on an RDBMS (SQLite): nodes, typed edges, an append-only event log, and an optional MCP server."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -211,3 +211,23 @@ def test_edge_add_missing_endpoint_exits_1(db, capsys):
|
|
|
211
211
|
assert run(db, "edge", "add", "x:1", "y:1", "LINK") == 1
|
|
212
212
|
err = capsys.readouterr().err
|
|
213
213
|
assert "to node 'y:1' does not exist" in err and "Traceback" not in err
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def test_schema_json_lists_kinds_and_keys(db, capsys):
|
|
217
|
+
run(db, "node", "add", "person:ada", "--kind", "Person", "--prop", "role=analyst")
|
|
218
|
+
run(db, "node", "add", "memory:m1", "--kind", "Memory", "--prop", "importance=high")
|
|
219
|
+
capsys.readouterr()
|
|
220
|
+
assert run(db, "schema", as_json=True) == 0
|
|
221
|
+
payload = json.loads(capsys.readouterr().out)
|
|
222
|
+
assert payload["kinds"] == {"Person": 1, "Memory": 1}
|
|
223
|
+
assert payload["node_keys_by_kind"]["Person"] == {"role": 1}
|
|
224
|
+
assert payload["node_keys_by_kind"]["Memory"] == {"importance": 1}
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def test_schema_samples_human_shows_enum_values(db, capsys):
|
|
228
|
+
run(db, "node", "add", "memory:m1", "--kind", "Memory", "--prop", "importance=high")
|
|
229
|
+
run(db, "node", "add", "memory:m2", "--kind", "Memory", "--prop", "importance=low")
|
|
230
|
+
capsys.readouterr()
|
|
231
|
+
assert run(db, "schema", "--samples") == 0
|
|
232
|
+
out = capsys.readouterr().out
|
|
233
|
+
assert "importance" in out and "high" in out and "low" in out
|
|
@@ -62,6 +62,19 @@ def test_nodes_by_kind_and_label(mcp_mod):
|
|
|
62
62
|
assert tagged == {"a:1", "b:1"}
|
|
63
63
|
|
|
64
64
|
|
|
65
|
+
def test_schema_exposes_vocabulary(mcp_mod):
|
|
66
|
+
mcp_mod.kg_node_upsert(id="a:1", kind="A", name="1", labels=["Tagged"],
|
|
67
|
+
properties={"status": "active"})
|
|
68
|
+
mcp_mod.kg_node_upsert(id="a:2", kind="A", name="2", properties={"status": "archived"})
|
|
69
|
+
s = mcp_mod.kg_schema()
|
|
70
|
+
assert s["kinds"] == {"A": 2}
|
|
71
|
+
assert s["labels"] == {"Tagged": 1}
|
|
72
|
+
assert s["node_keys_by_kind"]["A"] == {"status": 2}
|
|
73
|
+
# samples enumerate the enum-like status values
|
|
74
|
+
s2 = mcp_mod.kg_schema(samples=True)
|
|
75
|
+
assert s2["samples"]["A"]["values"]["status"] == ["active", "archived"]
|
|
76
|
+
|
|
77
|
+
|
|
65
78
|
def test_edges_out_and_shortest_path(mcp_mod):
|
|
66
79
|
for nid in ("x:1", "x:2", "x:3"):
|
|
67
80
|
mcp_mod.kg_node_upsert(id=nid, kind="X", name=nid)
|
|
@@ -107,6 +107,28 @@ def test_bulk_add_nodes_and_edges(pg):
|
|
|
107
107
|
assert g.out("b:1")[0][0].properties == {"w": 9}
|
|
108
108
|
|
|
109
109
|
|
|
110
|
+
def test_schema_on_postgres_mirrors_sqlite(pg):
|
|
111
|
+
g = pg.backend
|
|
112
|
+
g.add_nodes([
|
|
113
|
+
{"id": "person:ada", "kind": "Person", "name": "Ada",
|
|
114
|
+
"labels": ["important"], "properties": {"role": "analyst"}},
|
|
115
|
+
{"id": "person:alan", "kind": "Person", "name": "Alan",
|
|
116
|
+
"properties": {"role": "logician"}},
|
|
117
|
+
{"id": "memory:m1", "kind": "Memory", "name": "m1",
|
|
118
|
+
"properties": {"importance": "high", "content": "x" * 200}},
|
|
119
|
+
])
|
|
120
|
+
g.add_edges([("person:ada", "memory:m1", "WROTE", {"year": 1843})])
|
|
121
|
+
s = g.schema(samples=True)
|
|
122
|
+
assert s["kinds"] == {"Person": 2, "Memory": 1}
|
|
123
|
+
assert s["edge_types"] == {"WROTE": 1}
|
|
124
|
+
assert s["labels"] == {"important": 1}
|
|
125
|
+
assert s["node_keys_by_kind"]["Person"]["role"] == 2
|
|
126
|
+
assert s["edge_keys"] == {"year": 1}
|
|
127
|
+
# enum enumerated, long free-text content omitted
|
|
128
|
+
assert s["samples"]["Memory"]["values"]["importance"] == ["high"]
|
|
129
|
+
assert "content" not in s["samples"]["Memory"]["values"]
|
|
130
|
+
|
|
131
|
+
|
|
110
132
|
def test_replay_rebuilds_postgres_from_sqlite_log(pg):
|
|
111
133
|
service.upsert_node(pg.backend, pg.events, id="p:1", kind="Person", name="One", actor="t")
|
|
112
134
|
service.upsert_node(pg.backend, pg.events, id="p:2", kind="Person", name="Two", actor="t")
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""schema() — the observed TBox an LLM reads before querying, so it never guesses."""
|
|
2
|
+
|
|
3
|
+
from kgrdbms.graph import Graph, _scalar_samples
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _seed(g: Graph) -> None:
|
|
7
|
+
g.add_node("person:ada", kind="Person", name="Ada Lovelace",
|
|
8
|
+
labels={"Person", "important"}, properties={"role": "analyst", "born": 1815})
|
|
9
|
+
g.add_node("person:alan", kind="Person", name="Alan Turing",
|
|
10
|
+
labels={"Person"}, properties={"role": "logician", "born": 1912})
|
|
11
|
+
g.add_node("memory:m1", kind="Memory", name="note one",
|
|
12
|
+
properties={"content": "a free-text body well over eighty characters long so the "
|
|
13
|
+
"schema sampler treats it as prose, not an enumerable value set",
|
|
14
|
+
"importance": "high"})
|
|
15
|
+
g.add_node("memory:m2", kind="Memory", name="note two",
|
|
16
|
+
properties={"content": "another distinct free-text body, also comfortably past the "
|
|
17
|
+
"eighty-character cap that marks a property as un-enumerable prose",
|
|
18
|
+
"importance": "low"})
|
|
19
|
+
g.add_edge("person:ada", "memory:m1", "WROTE", properties={"year": 1843})
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_schema_reports_kinds_edge_types_labels(tmp_path):
    """schema() reports node/edge totals and counts per kind, edge type, label and edge key."""
    g = Graph(path=tmp_path / "s.db")
    try:
        _seed(g)
        s = g.schema()
        assert s["nodes_total"] == 4
        assert s["edges_total"] == 1
        assert s["kinds"] == {"Person": 2, "Memory": 2}
        assert s["edge_types"] == {"WROTE": 1}
        assert s["labels"] == {"Person": 2, "important": 1}
        assert s["edge_keys"] == {"year": 1}
    finally:
        # Close even when an assertion fails so the database handle is not leaked.
        g.close()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_schema_property_keys_are_grouped_by_kind(tmp_path):
    """schema() groups observed node property keys under the kind they appear on."""
    g = Graph(path=tmp_path / "k.db")
    try:
        _seed(g)
        s = g.schema()
        assert set(s["node_keys_by_kind"]["Person"]) == {"role", "born"}
        assert s["node_keys_by_kind"]["Person"]["role"] == 2
        assert set(s["node_keys_by_kind"]["Memory"]) == {"content", "importance"}
    finally:
        # Close even when an assertion fails so the database handle is not leaked.
        g.close()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_schema_kind_with_no_properties_still_listed(tmp_path):
    """A kind whose nodes have no properties still appears in ``kinds`` with no keys recorded."""
    g = Graph(path=tmp_path / "np.db")
    try:
        g.add_node("tag:x", kind="Tag", name="x")  # no properties at all
        s = g.schema()
        assert s["kinds"] == {"Tag": 1}
        # The kind may be absent from node_keys_by_kind or mapped to an empty dict.
        assert s["node_keys_by_kind"].get("Tag", {}) == {}
    finally:
        # Close even when an assertion fails so the database handle is not leaked.
        g.close()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_schema_samples_enumerate_enum_keys_but_not_freetext(tmp_path):
    """schema(samples=True) lists example ids and enum-like values but omits free-text keys."""
    g = Graph(path=tmp_path / "samp.db")
    try:
        _seed(g)
        s = g.schema(samples=True)
        mem = s["samples"]["Memory"]
        # example ids reveal the CURIE convention
        assert mem["example_ids"] == ["memory:m1", "memory:m2"]
        # importance is enum-like → enumerated; content is free-text → omitted
        assert mem["values"]["importance"] == ["high", "low"]
        assert "content" not in mem["values"]
        # numeric enum on Person too
        assert s["samples"]["Person"]["values"]["born"] == [1815, 1912]
    finally:
        # Close even when an assertion fails so the database handle is not leaked.
        g.close()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_schema_samples_respects_sample_limit(tmp_path):
    """A key with more distinct values than ``sample_limit`` is left out of the samples."""
    g = Graph(path=tmp_path / "lim.db")
    try:
        for i in range(30):
            g.add_node(f"n:{i}", kind="K", name=str(i), properties={"v": i})
        s = g.schema(samples=True, sample_limit=10)
        # 30 distinct values > limit 10 → not enumerated
        assert "v" not in s["samples"]["K"]["values"]
    finally:
        # Close even when an assertion fails so the database handle is not leaked.
        g.close()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_scalar_samples_helper_rejects_nonscalar_and_longstrings():
    """_scalar_samples returns the expected list for scalars and None for everything else."""
    accepted = (
        ([1, 2, 3], [1, 2, 3]),
        (["b", "a"], ["a", "b"]),
        ([True, False], [False, True]),
    )
    for values, expected in accepted:
        assert _scalar_samples(values) == expected

    rejected = (
        [["a", "b"]],   # list value
        [{"k": "v"}],   # object value
        ["x" * 200],    # over-long string
        [],             # nothing to show
    )
    for values in rejected:
        assert _scalar_samples(values) is None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_schema_empty_graph(tmp_path):
    """On a graph with no nodes or edges, schema() returns zero totals and empty mappings."""
    g = Graph(path=tmp_path / "empty.db")
    try:
        s = g.schema()
        assert s == {
            "nodes_total": 0, "edges_total": 0,
            "kinds": {}, "edge_types": {}, "labels": {},
            "node_keys_by_kind": {}, "edge_keys": {},
        }
    finally:
        # Close even when an assertion fails so the database handle is not leaked.
        g.close()
|
{knowledge_graph_rdbms-0.1.3 → knowledge_graph_rdbms-0.1.4}/.claude/skills/kg-compose/SKILL.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|