knowledge-worker 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge_worker-0.6.0.dist-info/METADATA +365 -0
- knowledge_worker-0.6.0.dist-info/RECORD +27 -0
- knowledge_worker-0.6.0.dist-info/WHEEL +5 -0
- knowledge_worker-0.6.0.dist-info/entry_points.txt +3 -0
- knowledge_worker-0.6.0.dist-info/licenses/LICENSE +21 -0
- knowledge_worker-0.6.0.dist-info/top_level.txt +2 -0
- mygraph/__init__.py +23 -0
- mygraph/anthropic_client.py +199 -0
- mygraph/audit.py +137 -0
- mygraph/check.py +273 -0
- mygraph/discover.py +654 -0
- mygraph/eval_log.py +36 -0
- mygraph/export_context.py +124 -0
- mygraph/extractor.py +243 -0
- mygraph/extractor_openai.py +165 -0
- mygraph/ingest.py +170 -0
- mygraph/memory_audit.py +1094 -0
- mygraph/merge.py +133 -0
- mygraph/mygraph.py +773 -0
- mygraph/owl_io.py +202 -0
- mygraph/review.py +151 -0
- mygraph/validator.py +149 -0
- mygraph/viz.py +409 -0
- ollama_proxy/eval_compare.py +185 -0
- ollama_proxy/extractor_adapter.py +168 -0
- ollama_proxy/proxy.py +143 -0
- ollama_proxy/server.py +194 -0
mygraph/owl_io.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""
|
|
2
|
+
owl_io.py — Turtle (OWL) sibling serialization for the graph.
|
|
3
|
+
|
|
4
|
+
JSON stays canonical. mygraph.ttl is generated from JSON at any time and can be
|
|
5
|
+
re-imported losslessly (round-trip on node + edge counts is a hard test).
|
|
6
|
+
|
|
7
|
+
Mapping follows the graph model in SPEC.md and the pipeline in DESIGN.md:
|
|
8
|
+
|
|
9
|
+
- Node type → owl:Class under rb:Concept
|
|
10
|
+
- Edge type → owl:ObjectProperty
|
|
11
|
+
- Node id → IRI <http://mygraph.local/{id}> (with `:` in the id replaced by `/`)
|
|
12
|
+
- Node label / body → rdfs:label / rdfs:comment
|
|
13
|
+
- Edge metadata → rb:Assertion (reified) with rb:confidence,
|
|
14
|
+
rb:excerpt, rb:sourceId, rb:createdAt, rb:lastSeen
|
|
15
|
+
- Source node → rb:Source (subclass of dcterms:ProvenanceEntity)
|
|
16
|
+
|
|
17
|
+
Requires `rdflib` (`pip install -e ".[rdf]"`).
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import sys
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
from mygraph import Graph, Node, Edge, NODE_TYPES, EDGE_TYPES, resolve_graph_path
|
|
26
|
+
|
|
27
|
+
# Base IRI that node ids and assertion ids are appended to.
NS = "http://mygraph.local/"
# Namespace holding the schema terms (classes and properties) used in the export.
RB = "http://mygraph.local/schema#"
# Dublin Core terms namespace; rb:Source is declared a subclass of one of its classes.
DCTERMS = "http://purl.org/dc/terms/"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _require_rdflib():
    """Import and return the ``rdflib`` module.

    Raises:
        SystemExit: with an install hint when ``rdflib`` cannot be imported;
            the original ImportError is chained as the cause.
    """
    try:
        import rdflib  # type: ignore
    except ImportError as exc:
        hint = (
            "owl_io: `rdflib` is not installed. Run:\n"
            ' pip install -e ".[rdf]"'
        )
        raise SystemExit(hint) from exc
    return rdflib
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _iri(rdflib, suffix: str):
    """Return the ``URIRef`` for *suffix* under ``NS``.

    Every ``:`` in *suffix* is rewritten to ``/`` before it is appended to the
    base IRI; no other escaping is applied.

    Args:
        rdflib: the imported rdflib module (provides ``URIRef``).
        suffix: the identifier to append to ``NS``.
    """
    path_part = "/".join(suffix.split(":"))
    return rdflib.URIRef(NS + path_part)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def to_turtle(g: Graph) -> str:
    """Serialize *g* as a Turtle document and return the text.

    The output contains three layers:
      * an ontology header declaring one owl:Class per node type and one
        owl:ObjectProperty per edge type (the public NODE_TYPES / EDGE_TYPES
        plus any type actually present in the graph);
      * one resource per node carrying label, optional comment, confidence,
        creation time and the original node id;
      * for every edge, a direct triple plus a reified rb:Assertion resource
        that carries the edge metadata.

    Raises:
        SystemExit: if ``rdflib`` is not installed.
    """
    rdflib = _require_rdflib()
    from rdflib import Graph as RG, Literal, RDF, RDFS, OWL, Namespace

    out = RG()
    rb_ns = Namespace(RB)
    dc_ns = Namespace(DCTERMS)
    for prefix, namespace in (("rb", rb_ns), ("dcterms", dc_ns), ("mg", Namespace(NS))):
        out.bind(prefix, namespace)

    # Ontology header: fixed root classes first.
    out.add((rb_ns.Concept, RDF.type, OWL.Class))
    out.add((rb_ns.Source, RDF.type, OWL.Class))
    out.add((rb_ns.Source, RDFS.subClassOf, dc_ns.ProvenanceEntity))

    # Include legacy/private types already present in the graph, not just the public schema.
    all_node_types = NODE_TYPES | {node.type for node in g.nodes.values()}
    all_edge_types = EDGE_TYPES | {edge.type for edge in g.edges}

    for type_name in sorted(all_node_types):
        cls = rb_ns[type_name.capitalize()]
        parent = rb_ns.Source if type_name == "source" else rb_ns.Concept
        out.add((cls, RDF.type, OWL.Class))
        out.add((cls, RDFS.subClassOf, parent))
    for type_name in sorted(all_edge_types):
        out.add((rb_ns[type_name], RDF.type, OWL.ObjectProperty))

    # Nodes.
    for node_id, node in g.nodes.items():
        subject = _iri(rdflib, node_id)
        out.add((subject, RDF.type, rb_ns[node.type.capitalize()]))
        out.add((subject, RDFS.label, Literal(node.label)))
        if node.body:
            out.add((subject, RDFS.comment, Literal(node.body)))
        out.add((subject, rb_ns.confidence, Literal(node.confidence)))
        out.add((subject, rb_ns.createdAt, Literal(node.created_at)))
        # The verbatim id is stored as a literal so re-import does not have to reverse the IRI mapping.
        out.add((subject, rb_ns.nodeId, Literal(node_id)))

    # Edges: direct triple + reified rb:Assertion holding the metadata.
    for index, edge in enumerate(g.edges):
        subj = _iri(rdflib, edge.src)
        obj = _iri(rdflib, edge.dst)
        pred = rb_ns[edge.type]
        out.add((subj, pred, obj))

        assertion = rdflib.URIRef(f"{NS}_assertion/{index}")
        facts = [
            (RDF.type, rb_ns.Assertion),
            (RDF.subject, subj),
            (RDF.predicate, pred),
            (RDF.object, obj),
            (rb_ns.sourceId, Literal(edge.source_id)),
            (rb_ns.confidence, Literal(edge.confidence)),
            (rb_ns.createdAt, Literal(edge.created_at)),
            (rb_ns.lastSeen, Literal(edge.last_seen)),
            (rb_ns.edgeType, Literal(edge.type)),
            (rb_ns.srcId, Literal(edge.src)),
            (rb_ns.dstId, Literal(edge.dst)),
        ]
        if edge.excerpt:
            facts.append((rb_ns.excerpt, Literal(edge.excerpt)))
        for fact_pred, fact_obj in facts:
            out.add((assertion, fact_pred, fact_obj))

    return out.serialize(format="turtle")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def from_turtle(path: Path) -> Graph:
    """Rebuild a Graph from the Turtle file at *path*.

    Nodes are every subject that carries an rb:nodeId literal. Edges are read
    only from the rb:Assertion resources, which hold the full edge metadata;
    the direct subject/predicate/object triples are ignored as redundant.
    Missing optional values fall back to defaults (empty string, "medium"
    confidence, the Unix epoch timestamp, type "idea").

    Raises:
        SystemExit: if ``rdflib`` is not installed.
    """
    _require_rdflib()
    from rdflib import Graph as RG, Literal, RDF, RDFS, Namespace

    parsed = RG()
    parsed.parse(str(path), format="turtle")
    rb_ns = Namespace(RB)
    epoch = "1970-01-01T00:00:00+00:00"

    def _first(subject, predicate, fallback: str) -> str:
        # First object for (subject, predicate) as text, or the fallback when there is none.
        return str(next(parsed.objects(subject, predicate), Literal(fallback)))

    nodes: dict[str, Node] = {}
    for subject, _, id_literal in parsed.triples((None, rb_ns.nodeId, None)):
        nid = str(id_literal)
        # The class IRI fragment is the capitalized node type; lowercase it back.
        class_iri = next(parsed.objects(subject, RDF.type), None)
        if class_iri:
            type_str = str(class_iri).rsplit("#", 1)[-1].lower()
        else:
            type_str = "idea"
        nodes[nid] = Node(
            id=nid,
            type=type_str,
            label=_first(subject, RDFS.label, ""),
            body=_first(subject, RDFS.comment, ""),
            confidence=_first(subject, rb_ns.confidence, "medium"),
            created_at=_first(subject, rb_ns.createdAt, epoch),
        )

    edges: list[Edge] = []
    for assertion, _, _ in parsed.triples((None, RDF.type, rb_ns.Assertion)):
        etype = _first(assertion, rb_ns.edgeType, "")
        if not etype:
            # An assertion without an edge type cannot be turned back into an edge.
            continue
        created = _first(assertion, rb_ns.createdAt, epoch)
        edges.append(Edge(
            src=_first(assertion, rb_ns.srcId, ""),
            dst=_first(assertion, rb_ns.dstId, ""),
            type=etype,
            source_id=_first(assertion, rb_ns.sourceId, ""),
            excerpt=_first(assertion, rb_ns.excerpt, ""),
            confidence=_first(assertion, rb_ns.confidence, "medium"),
            created_at=created,
            # last_seen defaults to the creation time when absent.
            last_seen=_first(assertion, rb_ns.lastSeen, created),
        ))
    return Graph(nodes=nodes, edges=edges)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def round_trip_test(graph_path: Path | None = None) -> tuple[bool, str]:
    """Export the graph to Turtle, re-import it, and compare node/edge counts.

    Args:
        graph_path: graph file to load; when omitted, the path returned by
            ``resolve_graph_path()`` is used.

    Returns:
        ``(ok, message)`` where *ok* is True only if both the node count and
        the edge count survive the round trip, and *message* is a
        human-readable summary of the counts.
    """
    import tempfile

    active_path = graph_path or Path(resolve_graph_path())
    g = Graph.load(str(active_path))
    ttl = to_turtle(g)

    # Pin UTF-8 (the Turtle encoding, and what run_export writes) instead of
    # relying on the locale default, which can reject or corrupt non-ASCII text.
    handle = tempfile.NamedTemporaryFile(
        "w", suffix=".ttl", delete=False, encoding="utf-8"
    )
    tmp = Path(handle.name)
    try:
        # The write lives inside the try so the temp file is removed even if it fails.
        with handle:
            handle.write(ttl)
        g2 = from_turtle(tmp)
    finally:
        try:
            tmp.unlink()
        except OSError:
            # Best-effort cleanup: a leftover temp file must not mask the real result.
            pass

    n_match = len(g.nodes) == len(g2.nodes)
    e_match = len(g.edges) == len(g2.edges)
    if n_match and e_match:
        return True, f"OK: {len(g.nodes)} nodes / {len(g.edges)} edges round-tripped"
    return False, (f"MISMATCH: orig {len(g.nodes)}/{len(g.edges)} "
                   f"vs reimport {len(g2.nodes)}/{len(g2.edges)}")
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def run_export(args: list[str]) -> int:
    """Handle ``mykg export --ttl [--graph <path>] [--out <path>] [--round-trip]``.

    Writes the Turtle serialization of the graph next to the graph file (same
    name, ``.ttl`` suffix) unless ``--out`` names another destination, and
    optionally runs the round-trip check.

    Args:
        args: command-line arguments after the ``export`` sub-command.

    Returns:
        0 on success, 1 on a usage error (missing ``--ttl`` or an option
        without its path value), 2 when ``--round-trip`` reports a mismatch.
    """
    usage = "Usage: mykg export --ttl [--graph <path>] [--out <path>] [--round-trip]"
    if "--ttl" not in args:
        print(usage)
        return 1

    # Validate option values up front: a trailing `--graph`/`--out` used to
    # raise IndexError, and a following flag was silently taken as the path.
    known_flags = {"--ttl", "--graph", "--out", "--round-trip"}
    overrides: dict[str, Path] = {}
    for flag in ("--graph", "--out"):
        if flag not in args:
            continue
        value_at = args.index(flag) + 1
        if value_at >= len(args) or args[value_at] in known_flags:
            print(f"export: {flag} requires a <path> value")
            print(usage)
            return 1
        overrides[flag] = Path(args[value_at]).expanduser().resolve()

    graph_path = Path(resolve_graph_path())
    if "--graph" in overrides:
        graph_path = overrides["--graph"]
    out = overrides.get("--out", graph_path.with_suffix(".ttl"))

    g = Graph.load(str(graph_path))
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(to_turtle(g), encoding="utf-8")
    print(f"export: wrote {out}")

    if "--round-trip" in args:
        ok, msg = round_trip_test(graph_path)
        print(f"round-trip: {msg}")
        return 0 if ok else 2
    return 0
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
if __name__ == "__main__":
    # Script entry point: forward the CLI arguments and exit with the returned status code.
    raise SystemExit(run_export(sys.argv[1:]))
|
mygraph/review.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""
|
|
2
|
+
review.py — Stage 3 of the v1 ingest pipeline.
|
|
3
|
+
|
|
4
|
+
Interactive terminal loop over validated candidates. Keys:
|
|
5
|
+
[a]ccept merge into the graph
|
|
6
|
+
[r]eject skip; record the rejection
|
|
7
|
+
[e]dit pop $EDITOR on the candidate JSON, then re-validate this candidate
|
|
8
|
+
[s]kip defer to next session
|
|
9
|
+
[q]uit stop reviewing (already-merged stay merged)
|
|
10
|
+
|
|
11
|
+
Idempotent: re-running on the same source_id skips already-merged node IDs.
|
|
12
|
+
|
|
13
|
+
Non-interactive modes (for headless testing / dispatch):
|
|
14
|
+
--auto-accept-high accept everything with confidence == "high"
|
|
15
|
+
--auto-accept-all accept every candidate that passed validation
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import subprocess
|
|
23
|
+
import sys
|
|
24
|
+
import tempfile
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Iterable
|
|
27
|
+
|
|
28
|
+
from mygraph import Graph
|
|
29
|
+
try:
|
|
30
|
+
from .validator import validate
|
|
31
|
+
from .eval_log import append as eval_append
|
|
32
|
+
except ImportError: # direct script execution
|
|
33
|
+
from validator import validate
|
|
34
|
+
from eval_log import append as eval_append
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _print_node(node: dict) -> None:
    """Print a candidate node for review.

    Always shows type, id, label and confidence; body and excerpt lines are
    shown only when those keys hold a truthy value.
    """
    kind, ident = node["type"], node["id"]
    print(f"\n[{kind}] {ident}")
    label = node["label"]
    print(f" label : {label}")
    body = node.get("body")
    if body:
        print(f" body : {body}")
    confidence = node.get("confidence")
    print(f" confidence: {confidence}")
    excerpt = node.get("excerpt")
    if excerpt:
        print(f" excerpt : \"{excerpt}\"")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _print_edge(edge: dict) -> None:
    """Print a candidate edge for review: endpoints and type, confidence, and the excerpt if present."""
    head, relation, tail = edge["src"], edge["type"], edge["dst"]
    print(f"\n[edge] {head} --{relation}--> {tail}")
    confidence = edge.get("confidence")
    print(f" confidence: {confidence}")
    excerpt = edge.get("excerpt")
    if excerpt:
        print(f" excerpt : \"{excerpt}\"")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _edit_in_editor(payload: dict) -> dict:
    """Let the user edit *payload* as JSON in ``$EDITOR`` and return the parsed result.

    The payload is dumped to a temporary ``.json`` file, the editor is run on
    it, and the file is parsed again afterwards. The temporary file is always
    removed.

    ``$EDITOR`` may carry arguments (for example ``code -w``). A value that
    names an executable as-is is used verbatim; otherwise it is split
    shell-style. An unset or empty ``$EDITOR`` falls back to ``vi``.

    Raises:
        OSError: if the editor cannot be started.
        json.JSONDecodeError: if the saved file is not valid JSON.
    """
    import shlex
    import shutil

    # `or` rather than a .get() default so that EDITOR="" also falls back to vi.
    editor = os.environ.get("EDITOR") or "vi"
    if shutil.which(editor):
        # Plain executable name/path (possibly containing spaces): keep verbatim.
        command = [editor]
    else:
        try:
            command = shlex.split(editor)
        except ValueError:
            # Unbalanced quotes: fall back to treating the value as one token.
            command = []
        command = command or [editor]

    with tempfile.NamedTemporaryFile("w+", suffix=".json", delete=False, encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
        path = f.name
    try:
        # check=False: the editor's exit status is ignored; the file content decides.
        subprocess.run([*command, path], check=False)
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    finally:
        os.unlink(path)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _already_merged_ids(g: Graph, source_id: str) -> set[str]:
    """Collect the ids linked to *source_id* by a MENTIONED_IN edge, in either direction.

    For an edge pointing at the source the edge's ``src`` is collected; for an
    edge starting at the source the edge's ``dst`` is collected.
    """
    linked: set[str] = set()
    for edge in g.edges:
        if edge.type != "MENTIONED_IN":
            continue
        if edge.dst == source_id:
            linked.add(edge.src)
        elif edge.src == source_id:
            linked.add(edge.dst)
    return linked
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _ask(prompt: str, valid: set[str]) -> str:
    """Prompt repeatedly until the stripped, lowercased answer is in *valid*.

    End-of-input is treated as a request to quit and returns ``"q"``, whether
    or not ``"q"`` is one of the valid answers.
    """
    while True:
        try:
            reply = input(prompt)
        except EOFError:
            return "q"
        reply = reply.strip().lower()
        if reply not in valid:
            print(f" ? choose one of: {sorted(valid)}")
            continue
        return reply
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def review(validated: dict, source_text: str,
           auto_accept_high: bool = False,
           auto_accept_all: bool = False) -> dict:
    """
    Walk the validated candidate nodes and return the approved subset.

    Args:
        validated: payload with ``source``, ``nodes`` and ``edges`` (and an
            optional ``_meta``), as produced by the validator.
        source_text: the source document, used to re-validate edited nodes.
        auto_accept_high: non-interactive; accept nodes whose confidence is
            "high" and skip the rest.
        auto_accept_all: non-interactive; accept every node.

    Returns:
        A payload of the same shape holding only approved nodes. Edges are
        kept when both endpoints are an approved node, a node already in the
        graph, or the source itself.

    Nodes already linked to this source in the graph are skipped. Every
    decision is appended to the eval log before returning. A failed edit
    (editor could not run, file is not valid JSON, or the result is not a JSON
    object) is reported and treated as a skip so the session keeps going.
    """
    g = Graph.load()
    src = validated["source"]
    already = _already_merged_ids(g, src["id"])

    approved_nodes: list[dict] = []
    decisions: list[dict] = []  # for eval_log

    auto = auto_accept_all or auto_accept_high
    for node in validated.get("nodes", []):
        if node["id"] in already:
            decisions.append({"kind": "review", "verdict": "skip_already_merged",
                              "candidate_id": node["id"], "source_id": src["id"],
                              "extractor_confidence": node.get("confidence")})
            continue
        if auto:
            verdict = "accept" if (auto_accept_all or node.get("confidence") == "high") else "skip"
            user_edit = None
        else:
            _print_node(node)
            choice = _ask(" [a]ccept [r]eject [e]dit [s]kip [q]uit > ",
                          {"a", "r", "e", "s", "q"})
            user_edit = None
            if choice == "q":
                break
            if choice == "e":
                edited = None
                try:
                    edited = _edit_in_editor(node)
                except (OSError, ValueError) as exc:
                    # OSError: editor could not be launched. ValueError covers
                    # json.JSONDecodeError from a file that is not valid JSON.
                    # Neither should abort the session and lose earlier decisions.
                    print(f" (edit could not be loaded: {exc})")
                revalidated: list[dict] = []
                if isinstance(edited, dict):
                    # re-validate just this candidate against the source
                    subset = {"source": src, "nodes": [edited], "edges": []}
                    v_payload, _ = validate(subset, source_text)
                    revalidated = v_payload["nodes"]
                if revalidated:
                    node = revalidated[0]
                    user_edit = edited
                    _print_node(node)
                    choice = _ask(" After edit: [a]ccept [r]eject [s]kip > ", {"a", "r", "s"})
                else:
                    print(" (edit failed validation, skipping)")
                    choice = "s"
            # Anything other than a/r/s (e.g. "q" from EOF after an edit) counts as skip.
            verdict = {"a": "accept", "r": "reject", "s": "skip"}.get(choice, "skip")
        decisions.append({"kind": "review", "verdict": verdict,
                          "candidate_id": node["id"], "source_id": src["id"],
                          "extractor_confidence": node.get("confidence"),
                          "user_edit": user_edit})
        if verdict == "accept":
            approved_nodes.append(node)

    approved_node_ids = {n["id"] for n in approved_nodes} | set(g.nodes.keys()) | {src["id"]}
    approved_edges: list[dict] = [
        edge for edge in validated.get("edges", [])
        if edge["src"] in approved_node_ids and edge["dst"] in approved_node_ids
    ]

    for d in decisions:
        eval_append(d)

    return {"source": src, "nodes": approved_nodes, "edges": approved_edges,
            "_meta": validated.get("_meta", {})}
|
mygraph/validator.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
validator.py — Stage 2 of the v1 ingest pipeline.
|
|
3
|
+
|
|
4
|
+
Takes the extractor's candidates.json and runs deterministic checks:
|
|
5
|
+
- shape (per the extractor's tool_schema)
|
|
6
|
+
- excerpt verification (`high` confidence MUST substring-match the source markdown,
|
|
7
|
+
after whitespace normalization and lowercasing)
|
|
8
|
+
- orphan edge check (src/dst must resolve to existing node OR another candidate)
|
|
9
|
+
- ID format (`^[a-z]+:[a-z0-9-]+$`)
|
|
10
|
+
|
|
11
|
+
Output: a validated payload (same shape as input) plus a manifest of
|
|
12
|
+
accepted / demoted-with-reason / rejected-with-reason.
|
|
13
|
+
|
|
14
|
+
No external deps. Lightweight schema check (we control both producer and consumer).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import re
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from mygraph import Graph, NODE_TYPES, EDGE_TYPES
|
|
25
|
+
|
|
26
|
+
# Accepted id shape: lowercase letters, a colon, then lowercase letters, digits or hyphens.
ID_RE = re.compile(r"^[a-z]+:[a-z0-9-]+$")
# Any run of whitespace characters.
WS_RE = re.compile(r"\s+")


def _norm(s: str) -> str:
    """Collapse each whitespace run to one space, trim both ends, and lowercase."""
    collapsed = WS_RE.sub(" ", s)
    return collapsed.strip().lower()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class Manifest:
    """Outcome of one validation run.

    The ``accepted_*`` lists hold the candidate dicts themselves; the
    ``demoted_*`` and ``rejected_*`` lists hold ``(candidate, reason)`` pairs.
    """

    accepted_nodes: list[dict] = field(default_factory=list)
    accepted_edges: list[dict] = field(default_factory=list)
    demoted_nodes: list[tuple[dict, str]] = field(default_factory=list)
    rejected_nodes: list[tuple[dict, str]] = field(default_factory=list)
    rejected_edges: list[tuple[dict, str]] = field(default_factory=list)

    def summary(self) -> str:
        """Return a three-line count summary (accepted / demoted / rejected)."""
        ok_nodes, ok_edges = len(self.accepted_nodes), len(self.accepted_edges)
        demoted = len(self.demoted_nodes)
        bad_nodes, bad_edges = len(self.rejected_nodes), len(self.rejected_edges)
        return (
            f" accepted : {ok_nodes} nodes / {ok_edges} edges\n"
            f" demoted : {demoted} nodes\n"
            f" rejected : {bad_nodes} nodes / {bad_edges} edges"
        )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _check_shape(payload: dict) -> list[str]:
    """Return a list of structural problems found in *payload* (empty when well-formed).

    Checks that the payload is a dict, that ``source`` is a dict with ``id``,
    ``label`` and ``body`` keys, and that ``nodes`` and ``edges`` are lists.
    """
    if not isinstance(payload, dict):
        return ["payload is not a dict"]

    problems: list[str] = []
    source = payload.get("source")
    if isinstance(source, dict):
        problems.extend(
            f"source missing field: {k}" for k in ("id", "label", "body") if k not in source
        )
    else:
        problems.append("missing source object")

    problems.extend(
        f"missing {key} list"
        for key in ("nodes", "edges")
        if not isinstance(payload.get(key), list)
    )
    return problems
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def validate(payload: dict, source_text: str) -> tuple[dict, Manifest]:
    """Run the deterministic checks on *payload* and return ``(validated_payload, manifest)``.

    The validated payload is a shallow copy of the input whose ``nodes`` and
    ``edges`` lists contain only the surviving candidates. Node dicts are
    modified in place: a "high" confidence node whose excerpt is missing or is
    not found in the normalized source text is demoted to "low" (and still
    accepted).

    Raises:
        ValueError: if the payload is malformed or the source id is not a
            well-formed id starting with ``source:``.
    """
    problems = _check_shape(payload)
    if problems:
        raise ValueError("validator: malformed payload → " + "; ".join(problems))

    g = Graph.load()
    manifest = Manifest()
    src_norm = _norm(source_text)
    levels = {"high", "medium", "low"}

    # validate Source
    src = payload["source"]
    if not (ID_RE.match(src["id"]) and src["id"].startswith("source:")):
        raise ValueError(f"validator: invalid source id: {src['id']!r}")

    # validate nodes
    valid_nodes: list[dict] = []
    candidate_ids: set[str] = {src["id"]}
    for node in payload.get("nodes", []):
        nid = node.get("id", "")
        if not ID_RE.match(nid):
            rejection = "id_format"
        elif node.get("type") not in NODE_TYPES:
            rejection = f"bad_type:{node.get('type')}"
        elif node.get("confidence") not in levels:
            rejection = "bad_confidence"
        else:
            rejection = None
        if rejection is not None:
            manifest.rejected_nodes.append((node, rejection))
            continue

        # provenance-or-bust: high → must have excerpt + must substring-match source
        excerpt = (node.get("excerpt") or "").strip()
        if node["confidence"] == "high":
            demotion = None
            if not excerpt:
                demotion = "no_excerpt"
            elif _norm(excerpt) not in src_norm:
                demotion = "excerpt_not_in_source"
            if demotion is not None:
                node["confidence"] = "low"
                manifest.demoted_nodes.append((node, demotion))

        candidate_ids.add(nid)
        valid_nodes.append(node)
        manifest.accepted_nodes.append(node)

    def _edge_rejection(edge: dict):
        # First reason this edge must be rejected, or None when it passes every check.
        if edge.get("type") not in EDGE_TYPES:
            return f"bad_type:{edge.get('type')}"
        if edge.get("confidence") not in levels:
            return "bad_confidence"
        for endpoint_key in ("src", "dst"):
            ep = edge.get(endpoint_key, "")
            if not ID_RE.match(ep):
                return f"{endpoint_key}_id_format"
            # An endpoint must be an existing graph node or one of this payload's candidates.
            if ep not in g.nodes and ep not in candidate_ids:
                return f"orphan_{endpoint_key}:{ep}"
        return None

    # validate edges
    valid_edges: list[dict] = []
    for edge in payload.get("edges", []):
        reason = _edge_rejection(edge)
        if reason is not None:
            manifest.rejected_edges.append((edge, reason))
            continue
        valid_edges.append(edge)
        manifest.accepted_edges.append(edge)

    validated = {**payload, "nodes": valid_nodes, "edges": valid_edges}
    return validated, manifest
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def main():
    """CLI entry point: validate ``<candidates.json>`` against ``<source.md>`` and print the summary.

    Returns 1 after printing usage when fewer than two arguments are given,
    otherwise 0.
    """
    import json
    import sys

    argv = sys.argv
    if len(argv) < 3:
        print("Usage: python validator.py <candidates.json> <source.md>")
        return 1

    candidates_file, source_file = argv[1], argv[2]
    payload = json.loads(Path(candidates_file).read_text(encoding="utf-8"))
    src_text = Path(source_file).read_text(encoding="utf-8")
    _, manifest = validate(payload, src_text)
    print(manifest.summary())
    return 0
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
if __name__ == "__main__":
    # Script entry point: use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|