knowledge-graph-rdbms 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/CLAUDE.md +3 -0
  2. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/PKG-INFO +7 -3
  3. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/README.md +6 -2
  4. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/__init__.py +1 -1
  5. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/base.py +2 -0
  6. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/postgres.py +67 -0
  7. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/cli.py +49 -0
  8. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/events.py +19 -1
  9. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/graph.py +102 -0
  10. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/mcp_server.py +26 -1
  11. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/rdf.py +26 -8
  12. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/service.py +11 -1
  13. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/pyproject.toml +1 -1
  14. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/tests/test_cli.py +42 -0
  15. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/tests/test_events.py +49 -0
  16. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/tests/test_mcp_server.py +13 -0
  17. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/tests/test_postgres.py +22 -0
  18. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/tests/test_rdf.py +41 -0
  19. knowledge_graph_rdbms-0.1.4/tests/test_schema.py +97 -0
  20. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/.claude/skills/kg-compose/SKILL.md +0 -0
  21. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/.github/workflows/ci.yml +0 -0
  22. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/.github/workflows/publish.yml +0 -0
  23. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/.gitignore +0 -0
  24. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/CODE_OF_CONDUCT.md +0 -0
  25. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/CONTRIBUTING.md +0 -0
  26. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/LICENSE +0 -0
  27. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/SECURITY.md +0 -0
  28. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/assets/crossover.png +0 -0
  29. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/assets/read_latency.png +0 -0
  30. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/assets/runtimes.png +0 -0
  31. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/assets/write_throughput.png +0 -0
  32. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/README.md +0 -0
  33. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/benchmark.py +0 -0
  34. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/charts.py +0 -0
  35. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/neo4j/README.md +0 -0
  36. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/neo4j/headtohead.py +0 -0
  37. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/postgres/README.md +0 -0
  38. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/postgres/benchmark.py +0 -0
  39. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/postgres/charts.py +0 -0
  40. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/runtimes/compare.py +0 -0
  41. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/runtimes/run_bun.js +0 -0
  42. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/runtimes/run_node.mjs +0 -0
  43. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/bench/runtimes/run_python.py +0 -0
  44. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/__init__.py +0 -0
  45. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/neo4j.py +0 -0
  46. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/backends/sqlite.py +0 -0
  47. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/invariants.py +0 -0
  48. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/policy.py +0 -0
  49. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/kgrdbms/resolver.py +0 -0
  50. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/tests/test_bulk.py +0 -0
  51. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/tests/test_graph.py +0 -0
  52. {knowledge_graph_rdbms-0.1.2 → knowledge_graph_rdbms-0.1.4}/tests/test_policy.py +0 -0
@@ -21,6 +21,8 @@ python bench/charts.py # render assets/*.png from bench data (
21
21
  python bench/runtimes/compare.py # CPython vs Node vs Bun SQLite comparison
22
22
 
23
23
  kg stats # default ontology (~/.kgrdbms/graph.db)
24
+ kg schema # observed vocabulary: kinds, edge types, labels, keys-per-kind
25
+ kg schema --samples # + example ids and enum-like property values per kind
24
26
  kg ontology list # the registry (the "db of dbs")
25
27
  kg ontology create coffee --stance inferential # register a named ontology
26
28
  kg --ontology coffee node add drink:latte --kind Drink # route to it (resolver)
@@ -107,4 +109,5 @@ Node ids follow `prefix:reference` — `person:ada-lovelace`, `company:apple`, `
107
109
  - **Properties round-trip as JSON.** Storage is `value_json`; ints/bools/lists/objects come back as their JSON type. CLI `--prop key=value` parses value as JSON when possible, else keeps it as a string.
108
110
  - **Per-call writes each commit (one fsync).** Wrapping work in `batch()` / using `add_nodes` / `add_edges` collapses to one transaction (~10× faster). Don't add a per-row commit inside a bulk loop.
109
111
  - **Reads are not in `service.py`** by design — callers hit `Graph` directly. Don't route reads through the gate.
112
+ - **`schema()` is the discovery primitive.** It returns the *observed* vocabulary (kinds, edge types, labels, and property keys per kind, with counts; `samples=True` adds example ids + enum-like values) so a consumer can query by real values instead of guessing. It's a read like any other (`GraphBackend` method, Postgres port, stub line), and the MCP server instructions tell models to call `kg_schema` first. The schema is *observed, not enforced* — the graph stays schemaless; this just profiles what's there.
110
113
  - **CLI exit codes are contractual:** `0` ok, `1` not found / bad input, `2` policy denial, `3` invariant violation. Preserve these.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowledge-graph-rdbms
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: A label property graph on an RDBMS (SQLite): nodes, typed edges, an append-only event log, and an optional MCP server.
5
5
  Project-URL: Homepage, https://github.com/cunicopia-dev/knowledge-graph-rdbms
6
6
  Project-URL: Repository, https://github.com/cunicopia-dev/knowledge-graph-rdbms
@@ -34,7 +34,7 @@ Description-Content-Type: text/markdown
34
34
  ![Python](https://img.shields.io/badge/python-3.10%2B-3776AB?logo=python&logoColor=white)
35
35
  ![License: MIT](https://img.shields.io/badge/license-MIT-green)
36
36
  ![core dependencies: 0](https://img.shields.io/badge/core_dependencies-0-success)
37
- ![tests: 79 passing](https://img.shields.io/badge/tests-79_passing-brightgreen)
37
+ ![tests: 87 passing](https://img.shields.io/badge/tests-87_passing-brightgreen)
38
38
  ![storage: SQLite](https://img.shields.io/badge/storage-SQLite-003B57?logo=sqlite&logoColor=white)
39
39
  ![MCP](https://img.shields.io/badge/MCP-ready-FF6F00)
40
40
 
@@ -536,6 +536,8 @@ kg out person:ada # outbound edges
536
536
  kg path person:ada field:cs # shortest path
537
537
  kg nodes-by-kind Person
538
538
  kg stats
539
+ kg schema # observed vocabulary — kinds, edge types, labels, keys-per-kind
540
+ kg schema --samples # + example ids and enum-like property values per kind
539
541
  kg --json node get person:ada # machine-readable output for piping
540
542
 
541
543
  kg events -n 10 # tail the event log
@@ -570,7 +572,8 @@ Or hand-edit a client config (e.g. Claude Desktop):
570
572
  { "mcpServers": { "kgrdbms": { "command": "kgrdbms-mcp" } } }
571
573
  ```
572
574
 
573
- It exposes `kg_`-prefixed tools for reads (`kg_node_get`, `kg_nodes_by_kind`,
575
+ It exposes `kg_`-prefixed tools for reads (`kg_schema` — the vocabulary, meant to
576
+ be called first; `kg_node_get`, `kg_nodes_by_kind`,
574
577
  `kg_neighborhood`, `kg_shortest_path`, `kg_descendants`, …), gated writes
575
578
  (`kg_node_upsert`, `kg_edge_add`, `kg_node_delete`, …), bulk composition
576
579
  (`kg_import` — a whole `{nodes, edges}` batch in one call, so an agent populates
@@ -743,6 +746,7 @@ replayable.
743
746
  | Command | What it does |
744
747
  | ------------------------------- | --------------------------------------------- |
745
748
  | `kg stats` | node/edge counts and db path |
749
+ | `kg schema [--samples]` | observed vocabulary: kinds, edge types, labels, keys-per-kind |
746
750
  | `kg node add ID --kind K …` | create or update a node (gated + logged) |
747
751
  | `kg node get ID` | fetch a node |
748
752
  | `kg node del ID` | delete a node (cascades edges) |
@@ -3,7 +3,7 @@
3
3
  ![Python](https://img.shields.io/badge/python-3.10%2B-3776AB?logo=python&logoColor=white)
4
4
  ![License: MIT](https://img.shields.io/badge/license-MIT-green)
5
5
  ![core dependencies: 0](https://img.shields.io/badge/core_dependencies-0-success)
6
- ![tests: 79 passing](https://img.shields.io/badge/tests-79_passing-brightgreen)
6
+ ![tests: 87 passing](https://img.shields.io/badge/tests-87_passing-brightgreen)
7
7
  ![storage: SQLite](https://img.shields.io/badge/storage-SQLite-003B57?logo=sqlite&logoColor=white)
8
8
  ![MCP](https://img.shields.io/badge/MCP-ready-FF6F00)
9
9
 
@@ -505,6 +505,8 @@ kg out person:ada # outbound edges
505
505
  kg path person:ada field:cs # shortest path
506
506
  kg nodes-by-kind Person
507
507
  kg stats
508
+ kg schema # observed vocabulary — kinds, edge types, labels, keys-per-kind
509
+ kg schema --samples # + example ids and enum-like property values per kind
508
510
  kg --json node get person:ada # machine-readable output for piping
509
511
 
510
512
  kg events -n 10 # tail the event log
@@ -539,7 +541,8 @@ Or hand-edit a client config (e.g. Claude Desktop):
539
541
  { "mcpServers": { "kgrdbms": { "command": "kgrdbms-mcp" } } }
540
542
  ```
541
543
 
542
- It exposes `kg_`-prefixed tools for reads (`kg_node_get`, `kg_nodes_by_kind`,
544
+ It exposes `kg_`-prefixed tools for reads (`kg_schema` — the vocabulary, meant to
545
+ be called first; `kg_node_get`, `kg_nodes_by_kind`,
543
546
  `kg_neighborhood`, `kg_shortest_path`, `kg_descendants`, …), gated writes
544
547
  (`kg_node_upsert`, `kg_edge_add`, `kg_node_delete`, …), bulk composition
545
548
  (`kg_import` — a whole `{nodes, edges}` batch in one call, so an agent populates
@@ -712,6 +715,7 @@ replayable.
712
715
  | Command | What it does |
713
716
  | ------------------------------- | --------------------------------------------- |
714
717
  | `kg stats` | node/edge counts and db path |
718
+ | `kg schema [--samples]` | observed vocabulary: kinds, edge types, labels, keys-per-kind |
715
719
  | `kg node add ID --kind K …` | create or update a node (gated + logged) |
716
720
  | `kg node get ID` | fetch a node |
717
721
  | `kg node del ID` | delete a node (cascades edges) |
@@ -12,7 +12,7 @@ A small, dependency-free knowledge-graph core:
12
12
 
13
13
  from __future__ import annotations
14
14
 
15
- __version__ = "0.1.2"
15
+ __version__ = "0.1.4"
16
16
 
17
17
  from kgrdbms.graph import Edge, Graph, Node, default_graph_path, slug
18
18
  from kgrdbms.events import (
@@ -47,6 +47,7 @@ class GraphBackend(Protocol):
47
47
  def count_edges_by_type(self) -> dict[str, int]: ...
48
48
  def total_nodes(self) -> int: ...
49
49
  def total_edges(self) -> int: ...
50
+ def schema(self, *, samples: bool = ..., sample_limit: int = ...) -> dict: ...
50
51
  # bulk: a context manager that defers commits to one transaction
51
52
  def batch(self) -> Any: ...
52
53
  def close(self) -> None: ...
@@ -93,6 +94,7 @@ class _StubBackend:
93
94
  def count_edges_by_type(self, *a: Any, **k: Any) -> dict[str, int]: return self._todo("count_edges_by_type")
94
95
  def total_nodes(self, *a: Any, **k: Any) -> int: return self._todo("total_nodes")
95
96
  def total_edges(self, *a: Any, **k: Any) -> int: return self._todo("total_edges")
97
+ def schema(self, *a: Any, **k: Any) -> dict: return self._todo("schema")
96
98
 
97
99
  @contextmanager
98
100
  def batch(self) -> Iterator["_StubBackend"]:
@@ -450,6 +450,73 @@ class PostgresGraph:
450
450
  def total_edges(self) -> int:
451
451
  return self.conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"]
452
452
 
453
+ def schema(self, *, samples: bool = False, sample_limit: int = 20) -> dict:
454
+ """Observed schema (kinds, edge types, labels, property keys per kind),
455
+ mirroring Graph.schema. Pure aggregates over the same five tables; jsonb
456
+ values arrive already parsed, so no json.loads on the sampling path."""
457
+ kinds = self.count_nodes_by_kind()
458
+ edge_types = self.count_edges_by_type()
459
+ labels = {
460
+ r["label"]: r["c"]
461
+ for r in self.conn.execute(
462
+ "SELECT label, COUNT(*) AS c FROM node_labels GROUP BY label ORDER BY c DESC"
463
+ ).fetchall()
464
+ }
465
+ node_keys_by_kind: dict[str, dict[str, int]] = {}
466
+ for r in self.conn.execute(
467
+ "SELECT n.kind AS kind, p.key AS key, COUNT(*) AS c "
468
+ "FROM node_properties p JOIN nodes n ON n.id = p.node_id "
469
+ "GROUP BY n.kind, p.key ORDER BY n.kind, c DESC"
470
+ ).fetchall():
471
+ node_keys_by_kind.setdefault(r["kind"], {})[r["key"]] = r["c"]
472
+ edge_keys = {
473
+ r["key"]: r["c"]
474
+ for r in self.conn.execute(
475
+ "SELECT key, COUNT(*) AS c FROM edge_properties GROUP BY key ORDER BY c DESC"
476
+ ).fetchall()
477
+ }
478
+ result: dict[str, Any] = {
479
+ "nodes_total": self.total_nodes(),
480
+ "edges_total": self.total_edges(),
481
+ "kinds": kinds,
482
+ "edge_types": edge_types,
483
+ "labels": labels,
484
+ "node_keys_by_kind": node_keys_by_kind,
485
+ "edge_keys": edge_keys,
486
+ }
487
+ if samples:
488
+ result["samples"] = self._schema_samples(kinds, node_keys_by_kind, sample_limit)
489
+ return result
490
+
491
+ def _schema_samples(
492
+ self, kinds: dict[str, int], node_keys_by_kind: dict[str, dict[str, int]], sample_limit: int
493
+ ) -> dict[str, dict]:
494
+ from kgrdbms.graph import _scalar_samples
495
+
496
+ samples: dict[str, dict] = {}
497
+ for kind in kinds:
498
+ example_ids = [
499
+ r["id"]
500
+ for r in self.conn.execute(
501
+ "SELECT id FROM nodes WHERE kind=%s ORDER BY id LIMIT 5", (kind,)
502
+ ).fetchall()
503
+ ]
504
+ values: dict[str, list] = {}
505
+ for key in node_keys_by_kind.get(kind, {}):
506
+ rows = self.conn.execute(
507
+ "SELECT DISTINCT p.value_json FROM node_properties p "
508
+ "JOIN nodes n ON n.id = p.node_id "
509
+ "WHERE n.kind=%s AND p.key=%s LIMIT %s",
510
+ (kind, key, sample_limit + 1),
511
+ ).fetchall()
512
+ if len(rows) > sample_limit:
513
+ continue # open-ended / free-text field — don't enumerate it
514
+ vals = _scalar_samples(r["value_json"] for r in rows) # jsonb already parsed
515
+ if vals is not None:
516
+ values[key] = vals
517
+ samples[kind] = {"example_ids": example_ids, "values": values}
518
+ return samples
519
+
453
520
  # ---- hydration (jsonb returns parsed values — no json.loads) --------
454
521
 
455
522
  def _hydrate_node(self, row: dict) -> Node:
@@ -21,6 +21,7 @@ from __future__ import annotations
21
21
 
22
22
  import argparse
23
23
  import json
24
+ import sqlite3
24
25
  import sys
25
26
  from typing import Any
26
27
 
@@ -172,6 +173,43 @@ def cmd_stats(app: App, args) -> int:
172
173
  return 0
173
174
 
174
175
 
176
+ def cmd_schema(app: App, args) -> int:
177
+ """The observed schema: kinds, edge types, labels, and property keys per kind.
178
+
179
+ The map to read *before* querying an unfamiliar ontology — so you query by a
180
+ kind/label/key that actually exists instead of guessing.
181
+ """
182
+ res = app.graph.schema(samples=args.samples)
183
+ if res.get("ontology") is None and app.ontology:
184
+ res = {"ontology": app.ontology, **res}
185
+
186
+ lines: list[str] = []
187
+ if app.ontology:
188
+ lines.append(f"ontology: {app.ontology}")
189
+ lines.append(f"nodes: {res['nodes_total']:,} edges: {res['edges_total']:,}")
190
+ lines.append("")
191
+ lines.append("kinds (node count) and their property keys:")
192
+ for kind, n in res["kinds"].items():
193
+ keys = res["node_keys_by_kind"].get(kind, {})
194
+ keystr = ", ".join(f"{k}×{c}" for k, c in keys.items()) or "(no properties)"
195
+ lines.append(f" {kind} ×{n}")
196
+ lines.append(f" keys: {keystr}")
197
+ if args.samples:
198
+ samp = res.get("samples", {}).get(kind, {})
199
+ ex = ", ".join(samp.get("example_ids", []))
200
+ if ex:
201
+ lines.append(f" e.g. {ex}")
202
+ for k, vals in samp.get("values", {}).items():
203
+ lines.append(f" {k} ∈ {{{', '.join(str(v) for v in vals)}}}")
204
+ lines.append("")
205
+ lines.append("edge types: " + (", ".join(f"{t}×{c}" for t, c in res["edge_types"].items()) or "(none)"))
206
+ lines.append("labels: " + (", ".join(f"{l}×{c}" for l, c in res["labels"].items()) or "(none)"))
207
+ if res["edge_keys"]:
208
+ lines.append("edge keys: " + ", ".join(f"{k}×{c}" for k, c in res["edge_keys"].items()))
209
+ app.emit(res, "\n".join(lines))
210
+ return 0
211
+
212
+
175
213
  # ---- registry handlers (the control plane / db-of-dbs) ---------------
176
214
 
177
215
 
@@ -431,6 +469,12 @@ def build_parser() -> argparse.ArgumentParser:
431
469
  sp = sub.add_parser("stats", help="node/edge counts, db path, active ontology")
432
470
  sp.set_defaults(func=cmd_stats)
433
471
 
472
+ sp = sub.add_parser("schema", help="observed schema: kinds, edge types, labels, "
473
+ "property keys per kind (read this before querying)")
474
+ sp.add_argument("--samples", action="store_true",
475
+ help="also show example node ids and enum-like property values per kind")
476
+ sp.set_defaults(func=cmd_schema)
477
+
434
478
  # ---- ontology registry (the db-of-dbs) ----
435
479
  ont = sub.add_parser("ontology", help="manage the ontology registry").add_subparsers(dest="action", required=True)
436
480
  a = ont.add_parser("list", help="list registered ontologies")
@@ -600,6 +644,11 @@ def main(argv: list[str] | None = None) -> int:
600
644
  except (KeyError, ValueError, FileNotFoundError) as e:
601
645
  print(f"error: {e}", file=sys.stderr)
602
646
  return 1
647
+ except sqlite3.IntegrityError as e:
648
+ # Safety net for any FK/constraint path not pre-checked in service.py
649
+ # (e.g. restoring a deleted node whose edge endpoint is since gone).
650
+ print(f"error: {e}", file=sys.stderr)
651
+ return 1
603
652
  finally:
604
653
  app.close()
605
654
 
@@ -53,6 +53,7 @@ OP_NODE_DEL_PROPERTY = "NODE_DEL_PROPERTY"
53
53
  OP_EDGE_ADD = "EDGE_ADD"
54
54
  OP_EDGE_REMOVE = "EDGE_REMOVE"
55
55
  OP_RESTORE = "RESTORE" # re-create a captured node + its edges (used to undo a delete)
56
+ OP_NODE_RESTORE_STATE = "NODE_RESTORE_STATE" # exact-restore a node to a prior snapshot (undo of an upsert)
56
57
  OP_BATCH = "BATCH" # add many nodes + edges in one event
57
58
  OP_GENESIS = "GENESIS"
58
59
 
@@ -226,7 +227,11 @@ class EventLog:
226
227
  prior = p.get("prior")
227
228
  if prior is None:
228
229
  return OP_NODE_DELETE, {"node": p["after"], "edges": []}
229
- return OP_NODE_UPSERT, {"after": prior, "prior": p["after"]}
230
+ # A plain re-upsert of `prior` would MERGE, so it cannot remove the
231
+ # labels/properties this upsert *added* — leaving a non-inverse. The
232
+ # exact-restore op carries the prior snapshot plus the `added` delta,
233
+ # so it can strip those additions and rebuild `prior` precisely.
234
+ return OP_NODE_RESTORE_STATE, {"node": prior, "added": p["after"]}
230
235
  if op == OP_NODE_DELETE:
231
236
  return OP_RESTORE, {"node": p["node"], "edges": p.get("edges", [])}
232
237
  if op == OP_NODE_SET_LABEL:
@@ -261,6 +266,19 @@ def apply_event(graph: "Graph", ev: GraphEvent) -> None:
261
266
  labels=spec.get("labels", []), properties=spec.get("properties", {}))
262
267
  for e in p.get("edges", []):
263
268
  graph.add_edge(e["from"], e["to"], e["type"], e.get("properties", {}))
269
+ elif op == OP_NODE_RESTORE_STATE:
270
+ # Exact-restore a node to `node`, removing the labels/properties that the
271
+ # reverted upsert added (`added` is that upsert's delta). add_node is a
272
+ # merge, so we must strip the additions first, then rebuild the snapshot.
273
+ target = p["node"]
274
+ added = p.get("added", {})
275
+ nid = target["id"]
276
+ for label in set(added.get("labels", [])) - set(target.get("labels", [])):
277
+ graph.remove_label(nid, label)
278
+ for key in set(added.get("properties", {}).keys()) - set(target.get("properties", {}).keys()):
279
+ graph.del_property(nid, key)
280
+ graph.add_node(nid, target["kind"], target["name"],
281
+ labels=target.get("labels", []), properties=target.get("properties", {}))
264
282
  elif op == OP_NODE_SET_LABEL:
265
283
  graph.add_label(p["id"], p["label"])
266
284
  elif op == OP_NODE_REMOVE_LABEL:
@@ -187,6 +187,26 @@ def _normalize_edge(spec: "Edge | dict | tuple | list") -> tuple[str, str, str,
187
187
  raise TypeError("edge spec must be an Edge, dict, or (from, to, type[, properties]) tuple")
188
188
 
189
189
 
190
+ def _scalar_samples(values: Iterable[Any], *, max_str: int = 80) -> list | None:
191
+ """Bounded distinct scalar values for schema sampling.
192
+
193
+ Returns the values sorted, or None to signal "not an enumerable vocabulary"
194
+ — i.e. some value is a list/object or an over-long string, so showing it as a
195
+ closed set would mislead. Keeps `schema(samples=True)` from dumping free-text.
196
+ """
197
+ out: list = []
198
+ for v in values:
199
+ if isinstance(v, str):
200
+ if len(v) > max_str:
201
+ return None
202
+ out.append(v)
203
+ elif isinstance(v, (int, float)): # bool is a subclass of int — included
204
+ out.append(v)
205
+ else: # list / dict / None
206
+ return None
207
+ return sorted(out, key=str) if out else None
208
+
209
+
190
210
  # ---- graph ----------------------------------------------------------
191
211
 
192
212
 
@@ -622,6 +642,88 @@ class Graph:
622
642
  def total_edges(self) -> int:
623
643
  return self.conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"]
624
644
 
645
+ # ---- schema (observed TBox: the map of what's in the graph) ----
646
+
647
+ def schema(self, *, samples: bool = False, sample_limit: int = 20) -> dict:
648
+ """The observed schema of the graph — the vocabulary needed to query it
649
+ without guessing.
650
+
651
+ Returns kinds, edge types, labels, and property keys *per kind*, each
652
+ with counts. This is a profile of what actually occurs (the graph is
653
+ schemaless — nothing is enforced), the property-graph analogue of an
654
+ ontology's TBox derived from its ABox.
655
+
656
+ `samples=True` additionally returns, per kind, a few example node ids
657
+ (revealing the id/CURIE convention) and — for enum-like properties — the
658
+ bounded set of distinct scalar values a key takes, turning "there is a
659
+ key `status`" into "status is one of {active, archived}". A key whose
660
+ distinct values exceed `sample_limit` (a free-text field) is left
661
+ un-enumerated rather than dumped.
662
+
663
+ Read-only; pure GROUP BY aggregates. The intended first call for any
664
+ consumer dropped into an unfamiliar ontology.
665
+ """
666
+ kinds = self.count_nodes_by_kind()
667
+ edge_types = self.count_edges_by_type()
668
+ labels = {
669
+ r["label"]: r["c"]
670
+ for r in self.conn.execute(
671
+ "SELECT label, COUNT(*) AS c FROM node_labels GROUP BY label ORDER BY c DESC"
672
+ ).fetchall()
673
+ }
674
+ node_keys_by_kind: dict[str, dict[str, int]] = {}
675
+ for r in self.conn.execute(
676
+ "SELECT n.kind AS kind, p.key AS key, COUNT(*) AS c "
677
+ "FROM node_properties p JOIN nodes n ON n.id = p.node_id "
678
+ "GROUP BY n.kind, p.key ORDER BY n.kind, c DESC"
679
+ ).fetchall():
680
+ node_keys_by_kind.setdefault(r["kind"], {})[r["key"]] = r["c"]
681
+ edge_keys = {
682
+ r["key"]: r["c"]
683
+ for r in self.conn.execute(
684
+ "SELECT key, COUNT(*) AS c FROM edge_properties GROUP BY key ORDER BY c DESC"
685
+ ).fetchall()
686
+ }
687
+ result: dict[str, Any] = {
688
+ "nodes_total": self.total_nodes(),
689
+ "edges_total": self.total_edges(),
690
+ "kinds": kinds,
691
+ "edge_types": edge_types,
692
+ "labels": labels,
693
+ "node_keys_by_kind": node_keys_by_kind,
694
+ "edge_keys": edge_keys,
695
+ }
696
+ if samples:
697
+ result["samples"] = self._schema_samples(kinds, node_keys_by_kind, sample_limit)
698
+ return result
699
+
700
+ def _schema_samples(
701
+ self, kinds: dict[str, int], node_keys_by_kind: dict[str, dict[str, int]], sample_limit: int
702
+ ) -> dict[str, dict]:
703
+ samples: dict[str, dict] = {}
704
+ for kind in kinds:
705
+ example_ids = [
706
+ r["id"]
707
+ for r in self.conn.execute(
708
+ "SELECT id FROM nodes WHERE kind=? ORDER BY id LIMIT 5", (kind,)
709
+ ).fetchall()
710
+ ]
711
+ values: dict[str, list] = {}
712
+ for key in node_keys_by_kind.get(kind, {}):
713
+ rows = self.conn.execute(
714
+ "SELECT DISTINCT p.value_json FROM node_properties p "
715
+ "JOIN nodes n ON n.id = p.node_id "
716
+ "WHERE n.kind=? AND p.key=? LIMIT ?",
717
+ (kind, key, sample_limit + 1),
718
+ ).fetchall()
719
+ if len(rows) > sample_limit:
720
+ continue # open-ended / free-text field — don't enumerate it
721
+ vals = _scalar_samples(json.loads(r["value_json"]) for r in rows)
722
+ if vals is not None:
723
+ values[key] = vals
724
+ samples[kind] = {"example_ids": example_ids, "values": values}
725
+ return samples
726
+
625
727
  # ---- hydration -------------------------------------------------
626
728
 
627
729
  def _hydrate_node(self, row: sqlite3.Row) -> Node:
@@ -21,6 +21,8 @@ Tool surface (all prefixed kg_):
21
21
 
22
22
  reads
23
23
  kg_stats — node/edge counts, the db path, the active ontology
24
+ kg_schema — observed vocabulary (kinds, edge types, labels, keys);
25
+ read this FIRST to query without guessing
24
26
  kg_node_get — fetch a node by id
25
27
  kg_nodes_by_kind — list all nodes of a kind
26
28
  kg_nodes_by_label — list all nodes carrying a label
@@ -90,7 +92,10 @@ mcp = FastMCP(
90
92
  "prefixed kg_ read or mutate nodes (id, kind, name, labels, properties) "
91
93
  "and typed directed edges. Every tool takes an optional `ontology` name "
92
94
  "(omit for the default); use kg_ontologies_list to discover them and "
93
- "kg_ontology_create to add one. Writes are gated by compiled-in "
95
+ "kg_ontology_create to add one. When working with an ontology whose "
96
+ "contents you don't already know, call kg_schema FIRST — it returns the "
97
+ "exact kinds, edge types, labels, and property keys so you can query by "
98
+ "real values instead of guessing. Writes are gated by compiled-in "
94
99
  "invariants and a configurable policy, and recorded to an append-only, "
95
100
  "replayable event log."
96
101
  ),
@@ -197,6 +202,26 @@ def kg_stats(ontology: str | None = None) -> dict:
197
202
  }
198
203
 
199
204
 
205
+ @mcp.tool()
206
+ def kg_schema(samples: bool = False, ontology: str | None = None) -> dict:
207
+ """The observed schema of an ontology — CALL THIS FIRST when you don't already
208
+ know what an ontology contains, before kg_nodes_by_kind / kg_nodes_by_label /
209
+ kg_node_get. It tells you the exact vocabulary so you never have to guess.
210
+
211
+ Returns:
212
+ - kinds — every node `kind` and its count
213
+ - edge_types — every edge `type` and its count
214
+ - labels — every label and its count
215
+ - node_keys_by_kind— for each kind, which property keys its nodes carry (+counts)
216
+ - edge_keys — property keys that appear on edges
217
+
218
+ With samples=True, also returns per kind a few example node ids (showing the
219
+ id/CURIE convention) and, for enum-like properties, the set of distinct values
220
+ a key takes (free-text keys are left un-enumerated). Read-only; cheap.
221
+ """
222
+ return _bundle(ontology).backend.schema(samples=samples)
223
+
224
+
200
225
  @mcp.tool()
201
226
  def kg_node_get(id: str, ontology: str | None = None) -> dict | None:
202
227
  """Fetch a single node by id."""
@@ -29,6 +29,7 @@ import json
29
29
  import re
30
30
  from dataclasses import dataclass, field
31
31
  from typing import Any, Iterator
32
+ from urllib.parse import quote, unquote
32
33
 
33
34
  from kgrdbms.backends.base import GraphBackend
34
35
  from kgrdbms.graph import Edge, Node
@@ -78,6 +79,22 @@ XSD = "http://www.w3.org/2001/XMLSchema#"
78
79
  KG = "https://kg.local/vocab#"
79
80
 
80
81
 
82
+ def _enc(segment: str) -> str:
83
+ """Percent-encode an arbitrary string into a valid IRI path/fragment segment.
84
+
85
+ Node references are slug-safe, but `kind`, edge `type`, and property `key`
86
+ are free-form user text — a space or '%' there would otherwise emit an IRI
87
+ no conformant RDF store will parse. `safe=""` also encodes '/', so a key like
88
+ 'a/b' can't masquerade as a path boundary. Inverted by `_dec` on import.
89
+ """
90
+ return quote(segment, safe="")
91
+
92
+
93
+ def _dec(segment: str) -> str:
94
+ """Invert `_enc` — percent-decode an IRI segment back to its stored value."""
95
+ return unquote(segment)
96
+
97
+
81
98
  # ---- IRI context: the CURIE -> IRI expansion table -------------------
82
99
 
83
100
 
@@ -106,13 +123,13 @@ class IriContext:
106
123
  base = self.prefix_bases.get(prefix, f"{self.default_base}{prefix}/")
107
124
  else:
108
125
  prefix, ref, base = "", node_id, self.default_base
109
- return Iri(f"{base}{ref}")
126
+ return Iri(f"{base}{_enc(ref)}")
110
127
 
111
128
  def prop_predicate(self, key: str) -> Iri:
112
- return Iri(f"{self.prop_base}{key}")
129
+ return Iri(f"{self.prop_base}{_enc(key)}")
113
130
 
114
131
  def edge_predicate(self, edge_type: str) -> Iri:
115
- return Iri(f"{self.edge_base}{edge_type}")
132
+ return Iri(f"{self.edge_base}{_enc(edge_type)}")
116
133
 
117
134
 
118
135
  # ---- value -> literal typing -----------------------------------------
@@ -163,7 +180,7 @@ def node_to_triples(node: Node, ctx: IriContext) -> list[Triple]:
163
180
  s = ctx.expand_node(node.id)
164
181
  triples: list[Triple] = [
165
182
  # kind -> rdf:type, pointing at a class IRI under the kg vocab.
166
- (s, Iri(f"{RDF}type"), Iri(f"{KG}{node.kind}")),
183
+ (s, Iri(f"{RDF}type"), Iri(f"{KG}{_enc(node.kind)}")),
167
184
  ]
168
185
  if node.name:
169
186
  triples.append((s, Iri(f"{KG}name"), Literal(node.name)))
@@ -494,18 +511,19 @@ def contract_iri(iri: str, ctx: IriContext) -> str:
494
511
  # Explicit prefix bindings win (longest base first to avoid prefix overlap).
495
512
  for prefix, base in sorted(ctx.prefix_bases.items(), key=lambda kv: -len(kv[1])):
496
513
  if iri.startswith(base):
497
- return f"{prefix}:{iri[len(base):]}"
514
+ return f"{prefix}:{_dec(iri[len(base):])}"
498
515
  if iri.startswith(ctx.default_base):
499
516
  rest = iri[len(ctx.default_base):]
500
517
  if "/" in rest:
501
518
  prefix, ref = rest.split("/", 1)
502
- return f"{prefix}:{ref}"
503
- return rest
519
+ return f"{prefix}:{_dec(ref)}"
520
+ return _dec(rest)
504
521
  return iri # foreign IRI — keep verbatim
505
522
 
506
523
 
507
524
  def _local_after(iri: str, base: str) -> str | None:
508
- return iri[len(base):] if iri.startswith(base) else None
525
+ """Strip `base` and percent-decode the remaining segment (kind/type/key)."""
526
+ return _dec(iri[len(base):]) if iri.startswith(base) else None
509
527
 
510
528
 
511
529
  def triples_to_graph(triples: list[Triple], ctx: IriContext | None = None) -> tuple[list[dict], list[dict]]:
@@ -95,6 +95,11 @@ def upsert_node(
95
95
 
96
96
  def set_label(graph: Graph, events: EventLog, id: str, label: str, actor: str = "anonymous") -> Node | None:
97
97
  guard(graph, _node_ctx(graph, id, "node_set_label"))
98
+ node = graph.node(id)
99
+ if node is None:
100
+ raise ValueError(f"node {id!r} does not exist")
101
+ if label in node.labels:
102
+ return node # already present: a true no-op, so don't log a non-invertible event
98
103
  graph.add_label(id, label)
99
104
  events.record(actor, OP_NODE_SET_LABEL, {"id": id, "label": label})
100
105
  return graph.node(id)
@@ -107,7 +112,9 @@ def set_property(
107
112
  ctx.property_key = key
108
113
  guard(graph, ctx)
109
114
  prior_node = graph.node(id)
110
- prior_value = prior_node.properties.get(key, _MISSING) if prior_node else _MISSING
115
+ if prior_node is None:
116
+ raise ValueError(f"node {id!r} does not exist")
117
+ prior_value = prior_node.properties.get(key, _MISSING)
111
118
  graph.set_property(id, key, value)
112
119
  events.record(actor, OP_NODE_SET_PROPERTY, {"id": id, "key": key, "value": value, "prior": prior_value})
113
120
  return graph.node(id)
@@ -139,6 +146,9 @@ def add_edge(
139
146
  ) -> Edge:
140
147
  ctx = MutationContext(operation="edge_add", edge_type=type, from_node_id=from_id, to_node_id=to_id)
141
148
  guard(graph, ctx)
149
+ for endpoint, role in ((from_id, "from"), (to_id, "to")):
150
+ if graph.node(endpoint) is None:
151
+ raise ValueError(f"{role} node {endpoint!r} does not exist")
142
152
  edge = graph.add_edge(from_node=from_id, to_node=to_id, type=type, properties=properties or {})
143
153
  events.record(actor, OP_EDGE_ADD, {"edge": edge_spec(edge)})
144
154
  return edge
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "knowledge-graph-rdbms"
7
- version = "0.1.2"
7
+ version = "0.1.4"
8
8
  description = "A label property graph on an RDBMS (SQLite): nodes, typed edges, an append-only event log, and an optional MCP server."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -189,3 +189,45 @@ def test_rdf_export_lossy_reports_dropped(db, capsys):
189
189
  assert "rel/influences" in captured.out # bare edge present
190
190
  assert "prop/since" not in captured.out # property dropped
191
191
  assert "dropped" in captured.err # but loudly, not silently
192
+
193
+
194
+ # ---- regression: FK violations exit 1 cleanly, no traceback ---------
195
+
196
+
197
+ def test_set_label_missing_node_exits_1(db, capsys):
198
+ assert run(db, "node", "add-label", "ghost:1", "L") == 1
199
+ err = capsys.readouterr().err
200
+ assert "does not exist" in err and "Traceback" not in err
201
+
202
+
203
+ def test_set_prop_missing_node_exits_1(db, capsys):
204
+ assert run(db, "node", "set-prop", "ghost:1", "k", "1") == 1
205
+ assert "does not exist" in capsys.readouterr().err
206
+
207
+
208
+ def test_edge_add_missing_endpoint_exits_1(db, capsys):
209
+ run(db, "node", "add", "x:1", "--kind", "T")
210
+ capsys.readouterr()
211
+ assert run(db, "edge", "add", "x:1", "y:1", "LINK") == 1
212
+ err = capsys.readouterr().err
213
+ assert "to node 'y:1' does not exist" in err and "Traceback" not in err
214
+
215
+
216
+ def test_schema_json_lists_kinds_and_keys(db, capsys):
217
+ run(db, "node", "add", "person:ada", "--kind", "Person", "--prop", "role=analyst")
218
+ run(db, "node", "add", "memory:m1", "--kind", "Memory", "--prop", "importance=high")
219
+ capsys.readouterr()
220
+ assert run(db, "schema", as_json=True) == 0
221
+ payload = json.loads(capsys.readouterr().out)
222
+ assert payload["kinds"] == {"Person": 1, "Memory": 1}
223
+ assert payload["node_keys_by_kind"]["Person"] == {"role": 1}
224
+ assert payload["node_keys_by_kind"]["Memory"] == {"importance": 1}
225
+
226
+
227
+ def test_schema_samples_human_shows_enum_values(db, capsys):
228
+ run(db, "node", "add", "memory:m1", "--kind", "Memory", "--prop", "importance=high")
229
+ run(db, "node", "add", "memory:m2", "--kind", "Memory", "--prop", "importance=low")
230
+ capsys.readouterr()
231
+ assert run(db, "schema", "--samples") == 0
232
+ out = capsys.readouterr().out
233
+ assert "importance" in out and "high" in out and "low" in out
@@ -154,3 +154,52 @@ def test_batch_op_is_replayable(tmp_path):
154
154
  assert g.node("n:1") is not None
155
155
  assert g.out("n:1", "LINK")
156
156
  g.close()
157
+
158
+
159
+ # ---- regression: revert of an upsert is a TRUE inverse --------------
160
+
161
+
162
+ def test_revert_upsert_removes_added_labels_and_props(tmp_path):
163
+ """An upsert that ADDS labels/props must, on revert, restore the node
164
+ exactly to its prior state — not merely overwrite changed values.
165
+ Regression for: compensation merged instead of replacing."""
166
+ from kgrdbms import service
167
+
168
+ g, log = _fresh(tmp_path)
169
+ service.upsert_node(g, log, id="t:1", kind="T", labels=["A"], properties={"color": "red"})
170
+ service.upsert_node(g, log, id="t:1", kind="T", labels=["B"],
171
+ properties={"color": "blue", "size": "big"})
172
+ ev2 = log.tail(1)[0].id
173
+
174
+ service.revert_event(log, ev2)
175
+ n = g.node("t:1")
176
+ assert sorted(n.labels) == ["A"] # added label B removed
177
+ assert n.properties == {"color": "red"} # added prop dropped, value restored
178
+
179
+
180
+ def test_replay_after_revert_is_consistent(tmp_path):
181
+ """The new restore-state op must itself be replayable: rebuilding from the
182
+ log reproduces the post-revert state."""
183
+ from kgrdbms import service
184
+
185
+ g, log = _fresh(tmp_path)
186
+ service.upsert_node(g, log, id="t:1", kind="T", labels=["A"], properties={"color": "red"})
187
+ service.upsert_node(g, log, id="t:1", kind="T", labels=["B"], properties={"color": "blue"})
188
+ service.revert_event(log, log.tail(1)[0].id)
189
+
190
+ service.replay_log(g, log)
191
+ n = g.node("t:1")
192
+ assert sorted(n.labels) == ["A"] and n.properties == {"color": "red"}
193
+
194
+
195
+ def test_resetting_existing_label_is_noop_not_logged(tmp_path):
196
+ """Re-adding a label a node already has must not log an event whose revert
197
+ would then remove the pre-existing label."""
198
+ from kgrdbms import service
199
+
200
+ g, log = _fresh(tmp_path)
201
+ service.upsert_node(g, log, id="t:1", kind="T", labels=["A"])
202
+ before = log.count()
203
+ service.set_label(g, log, "t:1", "A") # already present -> no-op
204
+ assert log.count() == before # nothing logged
205
+ assert "A" in g.node("t:1").labels
@@ -62,6 +62,19 @@ def test_nodes_by_kind_and_label(mcp_mod):
62
62
  assert tagged == {"a:1", "b:1"}
63
63
 
64
64
 
65
+ def test_schema_exposes_vocabulary(mcp_mod):
66
+ mcp_mod.kg_node_upsert(id="a:1", kind="A", name="1", labels=["Tagged"],
67
+ properties={"status": "active"})
68
+ mcp_mod.kg_node_upsert(id="a:2", kind="A", name="2", properties={"status": "archived"})
69
+ s = mcp_mod.kg_schema()
70
+ assert s["kinds"] == {"A": 2}
71
+ assert s["labels"] == {"Tagged": 1}
72
+ assert s["node_keys_by_kind"]["A"] == {"status": 2}
73
+ # samples enumerate the enum-like status values
74
+ s2 = mcp_mod.kg_schema(samples=True)
75
+ assert s2["samples"]["A"]["values"]["status"] == ["active", "archived"]
76
+
77
+
65
78
  def test_edges_out_and_shortest_path(mcp_mod):
66
79
  for nid in ("x:1", "x:2", "x:3"):
67
80
  mcp_mod.kg_node_upsert(id=nid, kind="X", name=nid)
@@ -107,6 +107,28 @@ def test_bulk_add_nodes_and_edges(pg):
107
107
  assert g.out("b:1")[0][0].properties == {"w": 9}
108
108
 
109
109
 
110
+ def test_schema_on_postgres_mirrors_sqlite(pg):
111
+ g = pg.backend
112
+ g.add_nodes([
113
+ {"id": "person:ada", "kind": "Person", "name": "Ada",
114
+ "labels": ["important"], "properties": {"role": "analyst"}},
115
+ {"id": "person:alan", "kind": "Person", "name": "Alan",
116
+ "properties": {"role": "logician"}},
117
+ {"id": "memory:m1", "kind": "Memory", "name": "m1",
118
+ "properties": {"importance": "high", "content": "x" * 200}},
119
+ ])
120
+ g.add_edges([("person:ada", "memory:m1", "WROTE", {"year": 1843})])
121
+ s = g.schema(samples=True)
122
+ assert s["kinds"] == {"Person": 2, "Memory": 1}
123
+ assert s["edge_types"] == {"WROTE": 1}
124
+ assert s["labels"] == {"important": 1}
125
+ assert s["node_keys_by_kind"]["Person"]["role"] == 2
126
+ assert s["edge_keys"] == {"year": 1}
127
+ # enum enumerated, long free-text content omitted
128
+ assert s["samples"]["Memory"]["values"]["importance"] == ["high"]
129
+ assert "content" not in s["samples"]["Memory"]["values"]
130
+
131
+
110
132
  def test_replay_rebuilds_postgres_from_sqlite_log(pg):
111
133
  service.upsert_node(pg.backend, pg.events, id="p:1", kind="Person", name="One", actor="t")
112
134
  service.upsert_node(pg.backend, pg.events, id="p:2", kind="Person", name="Two", actor="t")
@@ -97,6 +97,47 @@ def test_rdf_star_annotates_the_quoted_triple(populated):
97
97
  assert "<https://kg.local/prop/since>" in nt
98
98
 
99
99
 
100
+ def test_special_chars_produce_valid_iris(populated):
101
+ """Regression: kind / edge-type / property-key with spaces or punctuation
102
+ must percent-encode into valid IRIs, not emit a raw space a store rejects."""
103
+ g = _fresh_graph()
104
+ g.add_node("topic:ml", kind="Knowledge Area", name="ML",
105
+ properties={"first name": "Ada", "rate %": 50, "a/b": 1})
106
+ g.add_node("topic:cs", kind="Knowledge Area", name="CS")
107
+ g.add_edge("topic:ml", "topic:cs", "is part of", properties={"note": "x"})
108
+
109
+ nt = rdf.export(g, "ntriples")
110
+ assert "Knowledge%20Area" in nt
111
+ assert "prop/first%20name" in nt
112
+ assert "rel/is%20part%20of" in nt
113
+ assert "prop/a%2Fb" in nt # '/' encoded so it can't fake a path boundary
114
+ # No IRI reference may contain a raw space (would break any conformant parser).
115
+ import re
116
+ for iri in re.findall(r"<([^<>]+)>", nt):
117
+ assert " " not in iri, f"raw space in IRI: {iri!r}"
118
+
119
+
120
+ def test_special_chars_round_trip_and_rdflib_accepts(populated):
121
+ rdflib = pytest.importorskip("rdflib")
122
+ g = _fresh_graph()
123
+ g.add_node("topic:ml", kind="Knowledge Area", name="ML",
124
+ properties={"first name": "Ada", "rate %": 50, "a/b": 1})
125
+ g.add_node("topic:cs", kind="Knowledge Area", name="CS")
126
+ g.add_edge("topic:ml", "topic:cs", "is part of", properties={"note": "x"})
127
+
128
+ # A conformant store accepts the export (reification — rdflib is RDF 1.1).
129
+ ctx = rdf.IriContext(edge_strategy="reification")
130
+ rdflib.Graph().parse(data=rdf.export(g, "ntriples", ctx), format="nt")
131
+
132
+ # And the values survive a full star round-trip unchanged.
133
+ dst = _reimport(rdf.export(g, "ntriples"), "ntriples")
134
+ n = dst.node("topic:ml")
135
+ assert n.kind == "Knowledge Area"
136
+ assert n.properties == {"first name": "Ada", "rate %": 50, "a/b": 1}
137
+ e = dst.out("topic:ml")[0][0]
138
+ assert e.type == "is part of" and e.properties == {"note": "x"}
139
+
140
+
100
141
  def test_reification_emits_statement_node(populated):
101
142
  ctx = rdf.IriContext(edge_strategy="reification")
102
143
  triples = rdf.export_graph(populated, ctx)
@@ -0,0 +1,97 @@
1
+ """schema() — the observed TBox an LLM reads before querying, so it never guesses."""
2
+
3
+ from kgrdbms.graph import Graph, _scalar_samples
4
+
5
+
6
+ def _seed(g: Graph) -> None:
7
+ g.add_node("person:ada", kind="Person", name="Ada Lovelace",
8
+ labels={"Person", "important"}, properties={"role": "analyst", "born": 1815})
9
+ g.add_node("person:alan", kind="Person", name="Alan Turing",
10
+ labels={"Person"}, properties={"role": "logician", "born": 1912})
11
+ g.add_node("memory:m1", kind="Memory", name="note one",
12
+ properties={"content": "a free-text body well over eighty characters long so the "
13
+ "schema sampler treats it as prose, not an enumerable value set",
14
+ "importance": "high"})
15
+ g.add_node("memory:m2", kind="Memory", name="note two",
16
+ properties={"content": "another distinct free-text body, also comfortably past the "
17
+ "eighty-character cap that marks a property as un-enumerable prose",
18
+ "importance": "low"})
19
+ g.add_edge("person:ada", "memory:m1", "WROTE", properties={"year": 1843})
20
+
21
+
22
+ def test_schema_reports_kinds_edge_types_labels(tmp_path):
23
+ g = Graph(path=tmp_path / "s.db")
24
+ _seed(g)
25
+ s = g.schema()
26
+ assert s["nodes_total"] == 4
27
+ assert s["edges_total"] == 1
28
+ assert s["kinds"] == {"Person": 2, "Memory": 2}
29
+ assert s["edge_types"] == {"WROTE": 1}
30
+ assert s["labels"] == {"Person": 2, "important": 1}
31
+ assert s["edge_keys"] == {"year": 1}
32
+ g.close()
33
+
34
+
35
+ def test_schema_property_keys_are_grouped_by_kind(tmp_path):
36
+ g = Graph(path=tmp_path / "k.db")
37
+ _seed(g)
38
+ s = g.schema()
39
+ assert set(s["node_keys_by_kind"]["Person"]) == {"role", "born"}
40
+ assert s["node_keys_by_kind"]["Person"]["role"] == 2
41
+ assert set(s["node_keys_by_kind"]["Memory"]) == {"content", "importance"}
42
+ g.close()
43
+
44
+
45
+ def test_schema_kind_with_no_properties_still_listed(tmp_path):
46
+ g = Graph(path=tmp_path / "np.db")
47
+ g.add_node("tag:x", kind="Tag", name="x") # no properties at all
48
+ s = g.schema()
49
+ assert s["kinds"] == {"Tag": 1}
50
+ assert s["node_keys_by_kind"].get("Tag", {}) == {}
51
+ g.close()
52
+
53
+
54
+ def test_schema_samples_enumerate_enum_keys_but_not_freetext(tmp_path):
55
+ g = Graph(path=tmp_path / "samp.db")
56
+ _seed(g)
57
+ s = g.schema(samples=True)
58
+ mem = s["samples"]["Memory"]
59
+ # example ids reveal the CURIE convention
60
+ assert mem["example_ids"] == ["memory:m1", "memory:m2"]
61
+ # importance is enum-like → enumerated; content is free-text → omitted
62
+ assert mem["values"]["importance"] == ["high", "low"]
63
+ assert "content" not in mem["values"]
64
+ # numeric enum on Person too
65
+ assert s["samples"]["Person"]["values"]["born"] == [1815, 1912]
66
+ g.close()
67
+
68
+
69
+ def test_schema_samples_respects_sample_limit(tmp_path):
70
+ g = Graph(path=tmp_path / "lim.db")
71
+ for i in range(30):
72
+ g.add_node(f"n:{i}", kind="K", name=str(i), properties={"v": i})
73
+ s = g.schema(samples=True, sample_limit=10)
74
+ # 30 distinct values > limit 10 → not enumerated
75
+ assert "v" not in s["samples"]["K"]["values"]
76
+ g.close()
77
+
78
+
79
+ def test_scalar_samples_helper_rejects_nonscalar_and_longstrings():
80
+ assert _scalar_samples([1, 2, 3]) == [1, 2, 3]
81
+ assert _scalar_samples(["b", "a"]) == ["a", "b"]
82
+ assert _scalar_samples([True, False]) == [False, True]
83
+ assert _scalar_samples([["a", "b"]]) is None # list value
84
+ assert _scalar_samples([{"k": "v"}]) is None # object value
85
+ assert _scalar_samples(["x" * 200]) is None # over-long string
86
+ assert _scalar_samples([]) is None # nothing to show
87
+
88
+
89
+ def test_schema_empty_graph(tmp_path):
90
+ g = Graph(path=tmp_path / "empty.db")
91
+ s = g.schema()
92
+ assert s == {
93
+ "nodes_total": 0, "edges_total": 0,
94
+ "kinds": {}, "edge_types": {}, "labels": {},
95
+ "node_keys_by_kind": {}, "edge_keys": {},
96
+ }
97
+ g.close()