knowledge-worker 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mygraph/owl_io.py ADDED
@@ -0,0 +1,202 @@
1
+ """
2
+ owl_io.py — Turtle (OWL) sibling serialization for the graph.
3
+
4
+ JSON stays canonical. mygraph.ttl is generated from JSON at any time and can be
5
+ re-imported losslessly (round-trip on node + edge counts is a hard test).
6
+
7
+ Mapping follows the graph model in SPEC.md and the pipeline in DESIGN.md:
8
+
9
+ - Node type → owl:Class under rb:Concept
10
+ - Edge type → owl:ObjectProperty
11
+ - Node id → IRI <http://mygraph.local/{id}>
12
+ - Node label / body → rdfs:label / rdfs:comment
13
+ - Edge metadata → rb:Assertion (reified) with rb:confidence,
14
+ rb:excerpt, rb:sourceId, rb:createdAt, rb:lastSeen
15
+ - Source node → rb:Source (subclass of dcterms:ProvenanceEntity)
16
+
17
+ Requires `rdflib` (`pip install -e ".[rdf]"`).
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import sys
23
+ from pathlib import Path
24
+
25
+ from mygraph import Graph, Node, Edge, NODE_TYPES, EDGE_TYPES, resolve_graph_path
26
+
27
+ NS = "http://mygraph.local/"
28
+ RB = "http://mygraph.local/schema#"
29
+ DCTERMS = "http://purl.org/dc/terms/"
30
+
31
+
32
def _require_rdflib():
    """Import the optional ``rdflib`` dependency and return the module.

    Raises:
        SystemExit: rdflib cannot be imported; the message carries the
            install command, and the ImportError is chained as the cause.
    """
    try:
        import rdflib  # type: ignore
    except ImportError as exc:
        hint = (
            "owl_io: `rdflib` is not installed. Run:\n"
            ' pip install -e ".[rdf]"'
        )
        raise SystemExit(hint) from exc
    else:
        return rdflib
41
+
42
+
43
def _iri(rdflib, suffix: str):
    """Return the ``URIRef`` for *suffix* under the ``NS`` base.

    Every ``:`` in *suffix* becomes ``/``; all other characters are used
    verbatim (the ids are presumably already URL-safe slugs -- verify
    against the id format enforced by the producer).
    """
    path = "/".join(suffix.split(":"))
    return rdflib.URIRef(f"{NS}{path}")
47
+
48
+
49
def to_turtle(g: Graph) -> str:
    """Render *g* as a Turtle document.

    The output has three layers:

    - ontology: ``rb:Concept`` and ``rb:Source``, one ``owl:Class`` per node
      type and one ``owl:ObjectProperty`` per edge type (the declared
      ``NODE_TYPES`` / ``EDGE_TYPES`` merged with the types actually present
      in the graph);
    - nodes: type, label, optional body, confidence, creation time and the
      original id under ``rb:nodeId``;
    - edges: the direct ``src --type--> dst`` triple plus an ``rb:Assertion``
      resource, numbered by edge position, that carries the edge metadata.

    Exits through ``_require_rdflib`` when rdflib is not installed.
    """
    rdflib = _require_rdflib()
    from rdflib import Graph as RG, Literal, RDF, RDFS, OWL, Namespace

    out = RG()
    schema = Namespace(RB)
    dcterms = Namespace(DCTERMS)
    for prefix, namespace in (("rb", schema), ("dcterms", dcterms), ("mg", Namespace(NS))):
        out.bind(prefix, namespace)

    # Ontology: public schema plus any legacy/private types already in the graph.
    out.add((schema.Concept, RDF.type, OWL.Class))
    out.add((schema.Source, RDF.type, OWL.Class))
    out.add((schema.Source, RDFS.subClassOf, dcterms.ProvenanceEntity))
    all_node_types = NODE_TYPES | {node.type for node in g.nodes.values()}
    all_edge_types = EDGE_TYPES | {edge.type for edge in g.edges}
    for type_name in sorted(all_node_types):
        klass = schema[type_name.capitalize()]
        parent = schema.Source if type_name == "source" else schema.Concept
        out.add((klass, RDF.type, OWL.Class))
        out.add((klass, RDFS.subClassOf, parent))
    for type_name in sorted(all_edge_types):
        out.add((schema[type_name], RDF.type, OWL.ObjectProperty))

    # Nodes: one resource per node id.
    for node_id, node in g.nodes.items():
        subject = _iri(rdflib, node_id)
        facts = [
            (RDF.type, schema[node.type.capitalize()]),
            (RDFS.label, Literal(node.label)),
        ]
        if node.body:
            facts.append((RDFS.comment, Literal(node.body)))
        facts.append((schema.confidence, Literal(node.confidence)))
        facts.append((schema.createdAt, Literal(node.created_at)))
        facts.append((schema.nodeId, Literal(node_id)))
        for prop, value in facts:
            out.add((subject, prop, value))

    # Edges: direct triple + reified rb:Assertion holding the metadata.
    for index, edge in enumerate(g.edges):
        subject = _iri(rdflib, edge.src)
        target = _iri(rdflib, edge.dst)
        predicate = schema[edge.type]
        out.add((subject, predicate, target))

        assertion = rdflib.URIRef(f"{NS}_assertion/{index}")
        facts = [
            (RDF.type, schema.Assertion),
            (RDF.subject, subject),
            (RDF.predicate, predicate),
            (RDF.object, target),
            (schema.sourceId, Literal(edge.source_id)),
            (schema.confidence, Literal(edge.confidence)),
            (schema.createdAt, Literal(edge.created_at)),
            (schema.lastSeen, Literal(edge.last_seen)),
            # The raw ids and type are repeated as literals so the importer
            # does not have to reverse the IRI mapping.
            (schema.edgeType, Literal(edge.type)),
            (schema.srcId, Literal(edge.src)),
            (schema.dstId, Literal(edge.dst)),
        ]
        if edge.excerpt:
            facts.append((schema.excerpt, Literal(edge.excerpt)))
        for prop, value in facts:
            out.add((assertion, prop, value))

    return out.serialize(format="turtle")
110
+
111
+
112
def from_turtle(path: Path) -> Graph:
    """Rebuild a ``Graph`` from the Turtle file at *path*.

    Nodes are the subjects that carry ``rb:nodeId``. Edges are read from the
    ``rb:Assertion`` resources, which hold the full edge metadata; the direct
    edge triples are redundant and are not consulted. Assertions without an
    ``rb:edgeType`` are ignored. Missing optional values fall back to ``""``,
    ``"medium"`` confidence, the Unix epoch, and (for ``lastSeen``) the
    edge's creation time.
    """
    _require_rdflib()
    from rdflib import Graph as RG, RDF, RDFS, Namespace

    parsed = RG()
    parsed.parse(str(path), format="turtle")
    schema = Namespace(RB)
    epoch = "1970-01-01T00:00:00+00:00"

    def first(subject, predicate, fallback: str = "") -> str:
        # Text of the first matching object, or *fallback* when there is none.
        return str(next(parsed.objects(subject, predicate), fallback))

    nodes: dict[str, Node] = {}
    for subject, _, id_literal in parsed.triples((None, schema.nodeId, None)):
        node_id = str(id_literal)
        # The class IRI fragment (e.g. "...#Idea") maps back to the lowercase type.
        class_iri = next(parsed.objects(subject, RDF.type), None)
        local_name = str(class_iri).rsplit("#", 1)[-1] if class_iri else "idea"
        nodes[node_id] = Node(
            id=node_id,
            type=local_name.lower(),
            label=first(subject, RDFS.label),
            body=first(subject, RDFS.comment),
            confidence=first(subject, schema.confidence, "medium"),
            created_at=first(subject, schema.createdAt, epoch),
        )

    edges: list[Edge] = []
    for assertion, _, _ in parsed.triples((None, RDF.type, schema.Assertion)):
        src = first(assertion, schema.srcId)
        dst = first(assertion, schema.dstId)
        edge_type = first(assertion, schema.edgeType)
        if not edge_type:
            continue
        source_id = first(assertion, schema.sourceId)
        confidence = first(assertion, schema.confidence, "medium")
        created = first(assertion, schema.createdAt, epoch)
        last_seen = first(assertion, schema.lastSeen, created)
        excerpt = first(assertion, schema.excerpt)
        edges.append(Edge(src=src, dst=dst, type=edge_type, source_id=source_id,
                          excerpt=excerpt, confidence=confidence,
                          created_at=created, last_seen=last_seen))
    return Graph(nodes=nodes, edges=edges)
153
+
154
+
155
def round_trip_test(graph_path: Path | None = None) -> tuple[bool, str]:
    """Export a graph to Turtle, re-import it, and compare node/edge counts.

    Args:
        graph_path: graph file to load; when omitted (or falsy) the path
            returned by ``resolve_graph_path()`` is used.

    Returns:
        ``(ok, message)`` where *ok* is True only when both the node count
        and the edge count survive the round trip, and *message* is the
        human-readable result.
    """
    import tempfile

    active_path = graph_path or Path(resolve_graph_path())
    g = Graph.load(str(active_path))
    ttl = to_turtle(g)

    # Turtle documents are UTF-8; write the scratch file with an explicit
    # encoding (as run_export does) so non-ASCII labels survive on platforms
    # whose locale encoding is something else.
    handle = tempfile.NamedTemporaryFile(
        "w", suffix=".ttl", delete=False, encoding="utf-8"
    )
    tmp = Path(handle.name)
    try:
        with handle:
            handle.write(ttl)
        g2 = from_turtle(tmp)
    finally:
        # Best-effort cleanup: a leftover scratch file must not mask the
        # real outcome of the round trip.
        try:
            tmp.unlink()
        except OSError:
            pass

    orig_counts = (len(g.nodes), len(g.edges))
    new_counts = (len(g2.nodes), len(g2.edges))
    if orig_counts == new_counts:
        return True, f"OK: {len(g.nodes)} nodes / {len(g.edges)} edges round-tripped"
    return False, (f"MISMATCH: orig {len(g.nodes)}/{len(g.edges)} "
                   f"vs reimport {len(g2.nodes)}/{len(g2.edges)}")
176
+
177
+
178
def run_export(args: list[str]) -> int:
    """Handle ``mykg export --ttl [--graph <path>] [--out <path>] [--round-trip]``.

    Args:
        args: command-line arguments after the sub-command name.

    Returns:
        0 on success; 1 when ``--ttl`` is missing or an option that needs a
        path (``--graph`` / ``--out``) is given without one; 2 when
        ``--round-trip`` was requested and the counts did not match.
    """
    usage = "Usage: mykg export --ttl [--graph <path>] [--out <path>] [--round-trip]"
    if "--ttl" not in args:
        print(usage)
        return 1

    # Parse every path option before touching the graph so that a malformed
    # command line fails with the usage text instead of an IndexError.
    paths: dict[str, Path] = {}
    for flag in ("--graph", "--out"):
        if flag not in args:
            continue
        value_at = args.index(flag) + 1
        if value_at >= len(args):
            print(f"export: {flag} requires a path")
            print(usage)
            return 1
        paths[flag] = Path(args[value_at]).expanduser().resolve()

    if "--graph" in paths:
        graph_path = paths["--graph"]
    else:
        graph_path = Path(resolve_graph_path())
    # Default output sits next to the graph file, with a .ttl suffix.
    out = paths["--out"] if "--out" in paths else graph_path.with_suffix(".ttl")

    g = Graph.load(str(graph_path))
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(to_turtle(g), encoding="utf-8")
    print(f"export: wrote {out}")

    if "--round-trip" in args:
        ok, msg = round_trip_test(graph_path)
        print(f"round-trip: {msg}")
        return 0 if ok else 2
    return 0


if __name__ == "__main__":
    sys.exit(run_export(sys.argv[1:]))
mygraph/review.py ADDED
@@ -0,0 +1,151 @@
1
+ """
2
+ review.py — Stage 3 of the v1 ingest pipeline.
3
+
4
+ Interactive terminal loop over validated candidates. Keys:
5
+ [a]ccept merge into the graph
6
+ [r]eject skip; record the rejection
7
+ [e]dit pop $EDITOR on the candidate JSON, then re-validate this candidate
8
+ [s]kip defer to next session
9
+ [q]uit stop reviewing (already-merged stay merged)
10
+
11
+ Idempotent: re-running on the same source_id skips already-merged node IDs.
12
+
13
+ Non-interactive modes (for headless testing / dispatch):
14
+ --auto-accept-high accept everything with confidence == "high"
15
+ --auto-accept-all accept every validated candidate, regardless of confidence
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import os
22
+ import subprocess
23
+ import sys
24
+ import tempfile
25
+ from pathlib import Path
26
+ from typing import Iterable
27
+
28
+ from mygraph import Graph
29
+ try:
30
+ from .validator import validate
31
+ from .eval_log import append as eval_append
32
+ except ImportError: # direct script execution
33
+ from validator import validate
34
+ from eval_log import append as eval_append
35
+
36
+
37
def _print_node(node: dict) -> None:
    """Print a candidate node for review.

    Always shows type, id, label and confidence; body and excerpt are shown
    only when present and non-empty.
    """
    print("\n[{}] {}".format(node["type"], node["id"]))
    print(" label : {}".format(node["label"]))
    body = node.get("body")
    if body:
        print(f" body : {body}")
    print(" confidence: {}".format(node.get("confidence")))
    excerpt = node.get("excerpt")
    if excerpt:
        print(f' excerpt : "{excerpt}"')
45
+
46
+
47
def _print_edge(edge: dict) -> None:
    """Print a candidate edge: endpoints and type, confidence, optional excerpt."""
    print("\n[edge] {} --{}--> {}".format(edge["src"], edge["type"], edge["dst"]))
    print(" confidence: {}".format(edge.get("confidence")))
    excerpt = edge.get("excerpt")
    if excerpt:
        print(f' excerpt : "{excerpt}"')
52
+
53
+
54
def _edit_in_editor(payload: dict) -> dict:
    """Open *payload* as JSON in the user's editor and return the edited value.

    The payload is written to a temporary ``.json`` file, ``$EDITOR``
    (default ``vi``) is run on it, and the file is parsed again afterwards.
    The temporary file is always removed.

    Raises:
        json.JSONDecodeError: the saved file is not valid JSON.
        OSError: the editor command could not be started.
    """
    import shlex

    editor = os.environ.get("EDITOR") or "vi"  # an empty $EDITOR also means "vi"
    if os.name == "nt":
        # POSIX-style splitting would eat the backslashes in Windows paths,
        # so the value is passed through unchanged there.
        command = [editor]
    else:
        # $EDITOR is a command line ("code --wait", "emacs -nw"), not just
        # an executable name; split it so the arguments reach the editor.
        command = shlex.split(editor) or ["vi"]

    handle = tempfile.NamedTemporaryFile(
        "w+", suffix=".json", delete=False, encoding="utf-8"
    )
    path = handle.name
    try:
        # The file is closed before the editor runs so the editor sees the
        # complete content.
        with handle:
            json.dump(payload, handle, indent=2)
        subprocess.run([*command, path], check=False)
        with open(path, encoding="utf-8") as edited:
            return json.load(edited)
    finally:
        os.unlink(path)
65
+
66
+
67
def _already_merged_ids(g: Graph, source_id: str) -> set[str]:
    """Collect the ids linked to *source_id* by a MENTIONED_IN edge.

    The edge may point either way: the id on the opposite end from the
    source is the one recorded.
    """
    merged = set()
    for edge in g.edges:
        if edge.type != "MENTIONED_IN":
            continue
        if edge.dst == source_id:
            merged.add(edge.src)
        elif edge.src == source_id:
            merged.add(edge.dst)
    return merged
76
+
77
+
78
def _ask(prompt: str, valid: set[str]) -> str:
    """Prompt until the trimmed, lower-cased reply is one of *valid*.

    End-of-input is treated as a request to quit and returns ``"q"``,
    whether or not ``"q"`` is in *valid*.
    """
    while True:
        try:
            reply = input(prompt)
        except EOFError:
            return "q"
        reply = reply.strip().lower()
        if reply not in valid:
            print(f" ? choose one of: {sorted(valid)}")
            continue
        return reply
87
+
88
+
89
def review(validated: dict, source_text: str,
           auto_accept_high: bool = False,
           auto_accept_all: bool = False) -> dict:
    """
    Review validated candidates and return the approved subset (same shape).

    Args:
        validated: payload with ``source``, ``nodes`` and ``edges``.
        source_text: source document text, used to re-validate an edited
            candidate.
        auto_accept_high: non-interactive; accept nodes whose confidence is
            ``"high"`` and skip the rest.
        auto_accept_all: non-interactive; accept every node.

    Nodes already linked to this source by a MENTIONED_IN edge are skipped.
    Edges are kept only when both endpoints are approved in this run, already
    in the graph, or the source itself. Every decision is appended to the
    eval log once the loop has finished.
    """
    g = Graph.load()
    src = validated["source"]
    already = _already_merged_ids(g, src["id"])

    approved_nodes: list[dict] = []
    decisions: list[dict] = []  # for eval_log

    auto = auto_accept_all or auto_accept_high
    for node in validated.get("nodes", []):
        if node["id"] in already:
            decisions.append({"kind": "review", "verdict": "skip_already_merged",
                              "candidate_id": node["id"], "source_id": src["id"],
                              "extractor_confidence": node.get("confidence")})
            continue

        user_edit = None
        if auto:
            accept = auto_accept_all or node.get("confidence") == "high"
            verdict = "accept" if accept else "skip"
        else:
            _print_node(node)
            choice = _ask(" [a]ccept [r]eject [e]dit [s]kip [q]uit > ",
                          {"a", "r", "e", "s", "q"})
            if choice == "q":
                break
            if choice == "e":
                revalidated = None
                try:
                    edited = _edit_in_editor(node)
                except (ValueError, OSError) as exc:
                    # Unparseable JSON (JSONDecodeError is a ValueError) or an
                    # editor that could not be started: treat it as a failed
                    # edit instead of aborting the session and losing the
                    # decisions made so far.
                    print(f" (edit could not be read: {exc})")
                    edited = None
                if isinstance(edited, dict):
                    # re-validate just this candidate against the source
                    subset = {"source": src, "nodes": [edited], "edges": []}
                    v_payload, _ = validate(subset, source_text)
                    if v_payload["nodes"]:
                        revalidated = v_payload["nodes"][0]
                if revalidated is not None:
                    node = revalidated
                    user_edit = edited
                    _print_node(node)
                    choice = _ask(" After edit: [a]ccept [r]eject [s]kip > ", {"a", "r", "s"})
                else:
                    print(" (edit failed validation, skipping)")
                    choice = "s"
            verdict = {"a": "accept", "r": "reject", "s": "skip"}.get(choice, "skip")

        decisions.append({"kind": "review", "verdict": verdict,
                          "candidate_id": node["id"], "source_id": src["id"],
                          "extractor_confidence": node.get("confidence"),
                          "user_edit": user_edit})
        if verdict == "accept":
            approved_nodes.append(node)

    # An edge survives only if both endpoints will exist after the merge.
    approved_node_ids = {n["id"] for n in approved_nodes} | set(g.nodes.keys()) | {src["id"]}
    approved_edges: list[dict] = [
        edge for edge in validated.get("edges", [])
        if edge["src"] in approved_node_ids and edge["dst"] in approved_node_ids
    ]

    for d in decisions:
        eval_append(d)

    return {"source": src, "nodes": approved_nodes, "edges": approved_edges,
            "_meta": validated.get("_meta", {})}
mygraph/validator.py ADDED
@@ -0,0 +1,149 @@
1
+ """
2
+ validator.py — Stage 2 of the v1 ingest pipeline.
3
+
4
+ Takes the extractor's candidates.json and runs deterministic checks:
5
+ - shape (per the extractor's tool_schema)
6
+ - excerpt verification (`high` confidence MUST substring-match the source markdown,
7
+ after whitespace normalization)
8
+ - orphan edge check (src/dst must resolve to existing node OR another candidate)
9
+ - ID format (`^[a-z]+:[a-z0-9-]+$`)
10
+
11
+ Output: a validated payload (same shape as input) plus a manifest of
12
+ accepted / demoted-with-reason / rejected-with-reason.
13
+
14
+ No external deps. Lightweight schema check (we control both producer and consumer).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ from dataclasses import dataclass, field
21
+ from pathlib import Path
22
+ from typing import Any
23
+
24
+ from mygraph import Graph, NODE_TYPES, EDGE_TYPES
25
+
26
ID_RE = re.compile(r"^[a-z]+:[a-z0-9-]+$")
WS_RE = re.compile(r"\s+")


def _norm(s: str) -> str:
    """Normalize text for comparison.

    Each run of whitespace becomes a single space, the ends are trimmed,
    and the result is lower-cased.
    """
    collapsed = WS_RE.sub(" ", s)
    return collapsed.strip().lower()
32
+
33
+
34
@dataclass
class Manifest:
    """Per-run record of what the validator accepted, demoted and rejected."""

    # Candidates that passed validation.
    accepted_nodes: list[dict] = field(default_factory=list)
    accepted_edges: list[dict] = field(default_factory=list)
    # (candidate, reason) pairs.
    demoted_nodes: list[tuple[dict, str]] = field(default_factory=list)
    rejected_nodes: list[tuple[dict, str]] = field(default_factory=list)
    rejected_edges: list[tuple[dict, str]] = field(default_factory=list)

    def summary(self) -> str:
        """Return a three-line count summary (accepted / demoted / rejected)."""
        rows = (
            f" accepted : {len(self.accepted_nodes)} nodes / {len(self.accepted_edges)} edges",
            f" demoted : {len(self.demoted_nodes)} nodes",
            f" rejected : {len(self.rejected_nodes)} nodes / {len(self.rejected_edges)} edges",
        )
        return "\n".join(rows)
48
+
49
+
50
def _check_shape(payload: dict) -> list[str]:
    """Return the structural problems found in *payload* (empty when fine).

    Checks that the payload is a dict, that ``source`` is a dict with
    ``id``, ``label`` and ``body``, and that ``nodes`` and ``edges`` are
    lists. A non-dict payload short-circuits with a single error.
    """
    if not isinstance(payload, dict):
        return ["payload is not a dict"]

    source = payload.get("source")
    if isinstance(source, dict):
        problems = [
            f"source missing field: {name}"
            for name in ("id", "label", "body")
            if name not in source
        ]
    else:
        problems = ["missing source object"]

    problems.extend(
        f"missing {name} list"
        for name in ("nodes", "edges")
        if not isinstance(payload.get(name), list)
    )
    return problems
64
+
65
+
66
def _is_valid_id(value: object) -> bool:
    """True when *value* is a string that matches ``ID_RE`` in its entirety."""
    # fullmatch, not match: with match() the pattern's trailing `$` also
    # accepts an id that ends in a newline.
    return isinstance(value, str) and ID_RE.fullmatch(value) is not None


def validate(payload: dict, source_text: str) -> tuple[dict, Manifest]:
    """Run the deterministic checks and return ``(validated_payload, manifest)``.

    ``validated_payload`` is a shallow copy of *payload* whose ``nodes`` and
    ``edges`` contain only the candidates that passed. Node dicts are mutated
    in place: a ``high``-confidence node whose excerpt is missing, or is not
    found in the whitespace-normalized source text, is demoted to ``low``
    (and still accepted).

    Malformed candidates (wrong JSON type for an entry, id, type or
    confidence) are rejected with a reason in the manifest instead of
    raising.

    Raises:
        ValueError: the payload fails the shape check, or the source id is
            not a valid ``source:`` id.
    """
    shape_errs = _check_shape(payload)
    if shape_errs:
        raise ValueError("validator: malformed payload → " + "; ".join(shape_errs))

    g = Graph.load()
    manifest = Manifest()
    src_norm = _norm(source_text)
    # A tuple, so membership tests also work for unhashable JSON values.
    confidences = ("high", "medium", "low")

    # validate Source
    src = payload["source"]
    if not _is_valid_id(src["id"]) or not src["id"].startswith("source:"):
        raise ValueError(f"validator: invalid source id: {src['id']!r}")

    # validate nodes
    valid_nodes: list[dict] = []
    candidate_ids: set[str] = {src["id"]}
    for node in payload.get("nodes", []):
        if not isinstance(node, dict):
            manifest.rejected_nodes.append((node, "not_an_object"))
            continue
        nid = node.get("id", "")
        if not _is_valid_id(nid):
            manifest.rejected_nodes.append((node, "id_format"))
            continue
        node_type = node.get("type")
        if not isinstance(node_type, str) or node_type not in NODE_TYPES:
            manifest.rejected_nodes.append((node, f"bad_type:{node.get('type')}"))
            continue
        if node.get("confidence") not in confidences:
            manifest.rejected_nodes.append((node, "bad_confidence"))
            continue
        # provenance-or-bust: high → must have excerpt + must substring-match source
        raw_excerpt = node.get("excerpt")
        excerpt = raw_excerpt.strip() if isinstance(raw_excerpt, str) else ""
        if node["confidence"] == "high":
            if not excerpt:
                node["confidence"] = "low"
                manifest.demoted_nodes.append((node, "no_excerpt"))
            elif _norm(excerpt) not in src_norm:
                node["confidence"] = "low"
                manifest.demoted_nodes.append((node, "excerpt_not_in_source"))
        candidate_ids.add(nid)
        valid_nodes.append(node)
        manifest.accepted_nodes.append(node)

    # validate edges
    valid_edges: list[dict] = []
    for edge in payload.get("edges", []):
        if not isinstance(edge, dict):
            manifest.rejected_edges.append((edge, "not_an_object"))
            continue
        edge_type = edge.get("type")
        if not isinstance(edge_type, str) or edge_type not in EDGE_TYPES:
            manifest.rejected_edges.append((edge, f"bad_type:{edge.get('type')}"))
            continue
        if edge.get("confidence") not in confidences:
            manifest.rejected_edges.append((edge, "bad_confidence"))
            continue
        for endpoint_key in ("src", "dst"):
            ep = edge.get(endpoint_key, "")
            if not _is_valid_id(ep):
                manifest.rejected_edges.append((edge, f"{endpoint_key}_id_format"))
                break
            # An endpoint must be an existing graph node, another accepted
            # candidate, or the source itself.
            if ep not in g.nodes and ep not in candidate_ids:
                manifest.rejected_edges.append((edge, f"orphan_{endpoint_key}:{ep}"))
                break
        else:
            # Reached only when neither endpoint triggered a rejection.
            valid_edges.append(edge)
            manifest.accepted_edges.append(edge)

    validated = dict(payload)
    validated["nodes"] = valid_nodes
    validated["edges"] = valid_edges
    return validated, manifest
133
+
134
+
135
def main():
    """Command-line entry point: validate a candidates file against its source.

    Expects ``<candidates.json> <source.md>`` as arguments and prints the
    manifest summary. Returns 1 after printing usage when fewer than two
    arguments are given, 0 otherwise.
    """
    import json
    import sys

    argv = sys.argv
    if len(argv) < 3:
        print("Usage: python validator.py <candidates.json> <source.md>")
        return 1

    candidates_file = Path(argv[1])
    source_file = Path(argv[2])
    payload = json.loads(candidates_file.read_text(encoding="utf-8"))
    src_text = source_file.read_text(encoding="utf-8")
    _, manifest = validate(payload, src_text)
    print(manifest.summary())
    return 0


if __name__ == "__main__":
    raise SystemExit(main())