knowledge-worker 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge_worker-0.6.0.dist-info/METADATA +365 -0
- knowledge_worker-0.6.0.dist-info/RECORD +27 -0
- knowledge_worker-0.6.0.dist-info/WHEEL +5 -0
- knowledge_worker-0.6.0.dist-info/entry_points.txt +3 -0
- knowledge_worker-0.6.0.dist-info/licenses/LICENSE +21 -0
- knowledge_worker-0.6.0.dist-info/top_level.txt +2 -0
- mygraph/__init__.py +23 -0
- mygraph/anthropic_client.py +199 -0
- mygraph/audit.py +137 -0
- mygraph/check.py +273 -0
- mygraph/discover.py +654 -0
- mygraph/eval_log.py +36 -0
- mygraph/export_context.py +124 -0
- mygraph/extractor.py +243 -0
- mygraph/extractor_openai.py +165 -0
- mygraph/ingest.py +170 -0
- mygraph/memory_audit.py +1094 -0
- mygraph/merge.py +133 -0
- mygraph/mygraph.py +773 -0
- mygraph/owl_io.py +202 -0
- mygraph/review.py +151 -0
- mygraph/validator.py +149 -0
- mygraph/viz.py +409 -0
- ollama_proxy/eval_compare.py +185 -0
- ollama_proxy/extractor_adapter.py +168 -0
- ollama_proxy/proxy.py +143 -0
- ollama_proxy/server.py +194 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
export_context.py — generate a compact LLM-ready context snapshot of mygraph.
|
|
3
|
+
Usage: mykg context [--graph PATH] [--out context.md] [--max-ideas 20]
|
|
4
|
+
"""
|
|
5
|
+
import argparse
|
|
6
|
+
import datetime
|
|
7
|
+
import json
|
|
8
|
+
import sys
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from .mygraph import resolve_graph_path
|
|
13
|
+
except ImportError: # direct script execution: python mygraph/export_context.py
|
|
14
|
+
from mygraph import resolve_graph_path
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load(path=None):
    """Read the graph JSON file and return the decoded object.

    ``path`` (default ``None``) is passed through ``resolve_graph_path``;
    whatever location that helper returns is opened as UTF-8 text.
    """
    graph_file = resolve_graph_path(path)
    with open(graph_file, encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)
|
|
21
|
+
|
|
22
|
+
def export_context(g, max_ideas=20):
    """Render graph ``g`` as a compact Markdown snapshot.

    Sections are emitted in a fixed order: Goals, Key Decisions, Ideas,
    Core Topics, Recent Sources, Open Questions. Every section except
    "Ideas" is omitted when it has no entries.

    Args:
        g: Mapping with a ``"nodes"`` dict (its values are node dicts; this
            function reads ``id``, ``type``, ``label``, ``confidence``,
            ``body`` and ``created_at``) and an ``"edges"`` list whose items
            carry a ``"dst"`` node id.
        max_ideas: Upper bound on the number of ideas listed. Negative
            values are clamped to 0 so they cannot turn into a
            "drop the last N" slice.

    Returns:
        The snapshot as a single newline-joined string.
    """
    nodes = g["nodes"]
    edges = g["edges"]

    # Incoming-edge count per node id; used to rank ideas and topics.
    in_degree = defaultdict(int)
    for edge in edges:
        in_degree[edge["dst"]] += 1

    def degree(node):
        return in_degree.get(node["id"], 0)

    def by_type(t):
        return [n for n in nodes.values() if n.get("type") == t]

    def conf_marker(c):
        if c == "low":
            return " WARN"
        if c == "medium":
            return " ~"
        return ""

    # Timezone-aware replacement for the deprecated, naive utcnow().
    stamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    lines = [
        "# mygraph - Context Snapshot",
        f"*Generated: {stamp} | {len(nodes)} nodes, {len(edges)} edges*\n",
    ]

    # Goals
    goals = by_type("goal")
    if goals:
        lines.append("## Goals")
        for n in goals:
            lines.append(f"- **{n['label']}**{conf_marker(n.get('confidence', ''))}")
            if n.get("body"):
                lines.append(f" {n['body'][:120]}")
        lines.append("")

    # Decisions: everything except low-confidence ones, capped at 20.
    decisions = [n for n in by_type("decision") if n.get("confidence") != "low"]
    if decisions:
        lines.append("## Key Decisions")
        for n in decisions[:20]:
            lines.append(f"- **{n['label']}**")
            if n.get("body"):
                lines.append(f" {n['body'][:100]}")
        lines.append("")

    # Ideas: most-connected first (stable for ties). The header is always
    # emitted, even when there is nothing to list.
    idea_limit = max(max_ideas, 0)
    ideas_sorted = sorted(by_type("idea"), key=lambda n: -degree(n))
    lines.append("## Ideas")
    for n in ideas_sorted[:idea_limit]:
        marker = conf_marker(n.get("confidence", ""))
        lines.append(f"- **{n['label']}**{marker} *(connections: {degree(n)})*")
        if n.get("body"):
            lines.append(f" {n['body'][:120]}")
    lines.append("")

    # Topics: k-core proxy, more than 1 incoming edge.
    topics = [n for n in by_type("topic") if degree(n) > 1]
    topics_sorted = sorted(topics, key=lambda n: -degree(n))
    if topics_sorted:
        lines.append("## Core Topics")
        for n in topics_sorted[:20]:
            lines.append(f"- {n['label']} *(x{degree(n)})*")
        lines.append("")

    # Recent sources (last 5 by created_at, newest first)
    sources = sorted(
        by_type("source"), key=lambda n: n.get("created_at", ""), reverse=True
    )[:5]
    if sources:
        lines.append("## Recent Sources")
        for n in sources:
            lines.append(f"- {n['label']}")
        lines.append("")

    # Questions
    questions = by_type("question")
    if questions:
        lines.append("## Open Questions")
        for n in questions[:10]:
            lines.append(f"- {n['label']}{conf_marker(n.get('confidence', ''))}")
        lines.append("")

    return "\n".join(lines)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def run_export_context(args: list[str]) -> int:
    """CLI entry point: parse ``args``, build the snapshot, emit it.

    Recognised options are ``--graph``, ``--out`` and ``--max-ideas``
    (int, default 20). With ``--out`` the text is written to that file as
    UTF-8 and a confirmation line is printed; otherwise the snapshot itself
    is printed. Always returns 0.
    """
    cli = argparse.ArgumentParser()
    option_table = (
        ("--graph", {"default": None}),
        ("--out", {"default": None}),
        ("--max-ideas", {"type": int, "default": 20}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args(args)

    snapshot = export_context(load(opts.graph), max_ideas=opts.max_ideas)

    if not opts.out:
        print(snapshot)
        return 0

    with open(opts.out, "w", encoding="utf-8") as sink:
        sink.write(snapshot)
    print(f"Written to {opts.out}")
    return 0
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
if __name__ == "__main__":
    # Script mode: forward the CLI arguments (program name dropped) and use
    # the returned int as the process exit status.
    raise SystemExit(run_export_context(sys.argv[1:]))
|
mygraph/extractor.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""
|
|
2
|
+
extractor.py — Stage 1 of the v1 ingest pipeline.
|
|
3
|
+
|
|
4
|
+
Reads a markdown file, calls the Anthropic API with a schema-constrained prompt,
|
|
5
|
+
and writes a candidates.json. No graph mutation here.
|
|
6
|
+
|
|
7
|
+
Provenance-or-bust: the prompt requires literal excerpts for `high`-confidence
|
|
8
|
+
candidates. Validator (Stage 2) enforces it; this stage just asks for it.
|
|
9
|
+
|
|
10
|
+
Env:
|
|
11
|
+
Anthropic auth is inferred from env. Supported providers:
|
|
12
|
+
- Anthropic API: ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN
|
|
13
|
+
- Foundry: ANTHROPIC_FOUNDRY_API_KEY plus resource/base URL
|
|
14
|
+
- Bedrock: AWS_BEARER_TOKEN_BEDROCK or AWS credentials plus region
|
|
15
|
+
MYGRAPH_MODEL — optional model override; default `claude-sonnet-4-6`.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import sys
|
|
23
|
+
from datetime import datetime, timezone
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from mygraph import Graph, NODE_TYPES, EDGE_TYPES, slug
|
|
28
|
+
try:
|
|
29
|
+
from .anthropic_client import get_anthropic_client
|
|
30
|
+
except ImportError: # direct script execution
|
|
31
|
+
from anthropic_client import get_anthropic_client
|
|
32
|
+
|
|
33
|
+
# Fallback model name: extract() records it in `_meta["model"]` when neither
# its `model` argument nor the MYGRAPH_MODEL environment variable is set.
DEFAULT_MODEL = "claude-sonnet-4-6"
|
|
34
|
+
|
|
35
|
+
# Tool definition handed to the model: a single `emit_candidates` tool whose
# input is an object with a `source` record plus `nodes` and `edges` arrays.
# The three sub-schemas are spelled out first and then assembled.
_SOURCE_SCHEMA = dict(
    type="object",
    properties=dict(
        id=dict(type="string"),
        label=dict(type="string"),
        body=dict(type="string"),
    ),
    required=["id", "label", "body"],
)

_NODE_SCHEMA = dict(
    type="object",
    properties=dict(
        id=dict(type="string"),
        type=dict(type="string", enum=sorted(NODE_TYPES)),
        label=dict(type="string"),
        body=dict(type="string"),
        confidence=dict(type="string", enum=["high", "medium", "low"]),
        excerpt=dict(type="string"),
    ),
    # `body` and `excerpt` are optional at the schema level.
    required=["id", "type", "label", "confidence"],
)

_EDGE_SCHEMA = dict(
    type="object",
    properties=dict(
        src=dict(type="string"),
        dst=dict(type="string"),
        type=dict(type="string", enum=sorted(EDGE_TYPES)),
        confidence=dict(type="string", enum=["high", "medium", "low"]),
        excerpt=dict(type="string"),
    ),
    required=["src", "dst", "type", "confidence"],
)

EXTRACTION_TOOL = dict(
    name="emit_candidates",
    description="Emit candidate nodes and edges extracted from the source markdown.",
    input_schema=dict(
        type="object",
        properties=dict(
            source=_SOURCE_SCHEMA,
            nodes=dict(type="array", items=_NODE_SCHEMA),
            edges=dict(type="array", items=_EDGE_SCHEMA),
        ),
        required=["source", "nodes", "edges"],
    ),
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
PROMPT_TEMPLATE = """\
|
|
86
|
+
You are extracting nodes and edges for a personal knowledge graph centered on the user.
|
|
87
|
+
The graph stores durable concepts (Person, Idea, Project, Goal, Topic, Reference,
|
|
88
|
+
Question, Decision, Source) and relations between them.
|
|
89
|
+
|
|
90
|
+
Rules:
|
|
91
|
+
1. Every node and edge MUST cite a literal excerpt from the source. No paraphrase.
|
|
92
|
+
2. Use confidence "high" only when you have a direct quote in the `excerpt` field.
|
|
93
|
+
3. Use confidence "medium" for clear paraphrase (still quote what you paraphrased FROM).
|
|
94
|
+
4. Use confidence "low" for inference. Quote what you inferred FROM.
|
|
95
|
+
5. Slug-style IDs: lowercase, hyphenated, type-prefixed. E.g. `idea:context-memory`.
|
|
96
|
+
6. Reuse existing IDs (below) when a candidate refers to an existing concept.
|
|
97
|
+
7. Do NOT invent biographical facts. If the source doesn't say it, it doesn't go in.
|
|
98
|
+
8. The Source node `id` MUST equal: {source_id}
|
|
99
|
+
9. Every NEW concept node MUST have a `MENTIONED_IN` edge to the Source.
|
|
100
|
+
10. Output the tool call exactly per schema. No prose.
|
|
101
|
+
|
|
102
|
+
Allowed node types: {node_types}
|
|
103
|
+
Allowed edge types: {edge_types}
|
|
104
|
+
|
|
105
|
+
Existing node IDs (reuse when applicable):
|
|
106
|
+
{existing_ids}
|
|
107
|
+
|
|
108
|
+
SOURCE METADATA:
|
|
109
|
+
id : {source_id}
|
|
110
|
+
label : {source_label}
|
|
111
|
+
path : {source_path}
|
|
112
|
+
|
|
113
|
+
SOURCE MARKDOWN follows between <<<SOURCE>>> markers. Extract.
|
|
114
|
+
|
|
115
|
+
<<<SOURCE>>>
|
|
116
|
+
{source_text}
|
|
117
|
+
<<<SOURCE>>>
|
|
118
|
+
"""
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def build_source_decl(md_path: Path) -> dict:
    """Describe the markdown file at ``md_path`` as a source declaration.

    Returns a dict with ``source_id`` ("source:" followed by the slugged
    file stem), ``source_label`` (the file name), ``source_path`` (the
    resolved path as a string) and ``ingested_at`` (current UTC time in
    ISO-8601 form).
    """
    return dict(
        source_id=f"source:{slug(md_path.stem)}",
        source_label=md_path.name,
        source_path=str(md_path.resolve()),
        ingested_at=datetime.now(timezone.utc).isoformat(),
    )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def call_anthropic(prompt: str, model: str | None = None) -> dict:
    """Send ``prompt`` to Claude, forcing the ``emit_candidates`` tool.

    Args:
        prompt: Full text of the single user message.
        model: Model name; when falsy, ``config.model`` as returned by
            ``get_anthropic_client()`` is used instead.

    Returns:
        The ``input`` of the first ``tool_use`` content block named
        ``emit_candidates``.

    Raises:
        SystemExit: If ``get_anthropic_client()`` raises ``RuntimeError``;
            the message also points at the ``--candidates-file`` workaround.
        RuntimeError: If the response contains no matching tool call.
    """
    tool_name = "emit_candidates"
    try:
        client, config = get_anthropic_client()
    except RuntimeError as e:
        workaround = (
            "Or run `mykg ingest <file> --candidates-file <path>` "
            "to skip extraction with a hand-curated candidates JSON."
        )
        raise SystemExit(f"extractor: {e}\n" + workaround) from e

    request = {
        "model": model or config.model,
        "max_tokens": 8000,
        "tools": [EXTRACTION_TOOL],
        # Force the model to answer through the extraction tool.
        "tool_choice": {"type": "tool", "name": tool_name},
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = client.messages.create(**request)

    tool_calls = (
        block
        for block in resp.content
        if getattr(block, "type", None) == "tool_use" and block.name == tool_name
    )
    first_call = next(tool_calls, None)
    if first_call is None:
        raise RuntimeError("extractor: model did not emit the emit_candidates tool call.")
    return first_call.input  # type: ignore[return-value]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def ensure_provenance_edges(payload: dict) -> int:
    """Backfill MENTIONED_IN edges when a gateway drops tool-emitted edges.

    For every dict in ``payload["nodes"]`` that has an ``id`` different from
    the source id and no existing ``(id, source id, "MENTIONED_IN")`` edge,
    one such edge is appended to ``payload["edges"]``. The new edge copies
    the node's ``confidence`` (default ``"medium"``) and uses the node's
    ``excerpt`` or ``body``, truncated to 300 characters, as its excerpt.

    ``payload`` is mutated in place; a missing or non-list ``edges`` value
    is replaced by a fresh list.

    Args:
        payload: Candidate payload with ``source``, ``nodes`` and ``edges``.

    Returns:
        Number of edges appended. 0 when the source id is unavailable or
        ``nodes`` is not a list.
    """
    source = payload.get("source")
    # A malformed payload may carry a null / non-object "source"; treat that
    # like a missing id instead of raising AttributeError.
    src_id = source.get("id") if isinstance(source, dict) else None
    if not src_id:
        return 0

    edges = payload.get("edges")
    if not isinstance(edges, list):
        edges = []
        payload["edges"] = edges
    nodes = payload.get("nodes", [])
    if not isinstance(nodes, list):
        return 0

    existing = {
        (e.get("src"), e.get("dst"), e.get("type"))
        for e in edges
        if isinstance(e, dict)
    }
    injected = 0
    for node in nodes:
        if not isinstance(node, dict):
            continue
        node_id = node.get("id")
        # Skip id-less entries and the source node itself.
        if not node_id or node_id == src_id:
            continue
        key = (node_id, src_id, "MENTIONED_IN")
        if key in existing:
            continue
        edges.append({
            "src": node_id,
            "dst": src_id,
            "type": "MENTIONED_IN",
            "confidence": node.get("confidence", "medium"),
            "excerpt": (node.get("excerpt") or node.get("body") or "")[:300],
        })
        # Track the new edge so duplicate node ids do not add it twice.
        existing.add(key)
        injected += 1
    return injected
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def extract(md_path: Path, out_path: Path | None = None,
            model: str | None = None) -> dict:
    """Run the full extraction for one markdown file.

    Loads the graph to list its node ids, fills ``PROMPT_TEMPLATE``, calls
    ``call_anthropic``, backfills provenance edges and stamps ``_meta``
    (``source_path``, ``ingested_at``, ``model``) on the payload.

    Args:
        md_path: Markdown file to read (UTF-8).
        out_path: When truthy, the payload is also written there as
            indented JSON.
        model: Passed to ``call_anthropic``; when falsy, ``_meta["model"]``
            records ``MYGRAPH_MODEL`` or ``DEFAULT_MODEL`` instead.

    Returns:
        The candidate payload dict.
    """
    graph = Graph.load()
    decl = build_source_decl(md_path)
    markdown = md_path.read_text(encoding="utf-8")
    id_lines = [f" - {node_id}" for node_id in sorted(graph.nodes.keys())]

    template_fields = {
        "source_id": decl["source_id"],
        "source_label": decl["source_label"],
        "source_path": decl["source_path"],
        "node_types": ", ".join(sorted(NODE_TYPES)),
        "edge_types": ", ".join(sorted(EDGE_TYPES)),
        "existing_ids": "\n".join(id_lines),
        "source_text": markdown,
    }
    payload = call_anthropic(PROMPT_TEMPLATE.format(**template_fields), model=model)

    added = ensure_provenance_edges(payload)
    if added:
        notice = (
            "extractor: gateway returned missing provenance edges; "
            f"synthesized {added} MENTIONED_IN edges."
        )
        print(notice, file=sys.stderr)

    # Carry source_path / ingested_at along for the downstream stages.
    meta = payload.setdefault("_meta", {})
    meta["source_path"] = decl["source_path"]
    meta["ingested_at"] = decl["ingested_at"]
    meta["model"] = model or os.environ.get("MYGRAPH_MODEL", DEFAULT_MODEL)

    if out_path:
        serialized = json.dumps(payload, indent=2)
        out_path.write_text(serialized, encoding="utf-8")
    return payload
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def main(argv: list[str]) -> int:
    """Command-line entry point for the Anthropic-backed extractor.

    Args:
        argv: ``sys.argv``-style list: ``argv[0]`` is the program name,
            ``argv[1]`` the markdown path and the optional ``argv[2]`` the
            output path. Without ``argv[2]`` the output goes to
            ``<stem>.candidates.json`` next to the markdown file.

    Returns:
        0 on success, 1 when the markdown path is missing (usage printed).
    """
    if len(argv) < 2:
        print("Usage: python extractor.py <path/to/file.md> [out.json]")
        return 1
    md = Path(argv[1]).expanduser().resolve()
    if len(argv) > 2:
        # Expand `~` here too, matching how the markdown path is handled.
        out = Path(argv[2]).expanduser().resolve()
    else:
        out = md.parent / f"{md.stem}.candidates.json"
    payload = extract(md, out)
    print(f"extractor: wrote {len(payload.get('nodes', []))} nodes, "
          f"{len(payload.get('edges', []))} edges -> {out}")
    return 0
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
if __name__ == "__main__":
    # main() takes the full sys.argv (program name included); its return
    # value becomes the process exit status.
    raise SystemExit(main(sys.argv))
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
extractor_openai.py — Stage 1 extractor backed by OpenAI Responses API.
|
|
3
|
+
|
|
4
|
+
Reads a markdown file, calls OpenAI with a JSON-schema constrained response,
|
|
5
|
+
and writes a candidates.json. No graph mutation here.
|
|
6
|
+
|
|
7
|
+
Env:
|
|
8
|
+
OPENAI_API_KEY — required.
|
|
9
|
+
MYGRAPH_OPENAI_MODEL — optional model override; default `gpt-5.2`.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from mygraph import Graph, NODE_TYPES, EDGE_TYPES
|
|
21
|
+
|
|
22
|
+
try:
|
|
23
|
+
from .extractor import (
|
|
24
|
+
EXTRACTION_TOOL,
|
|
25
|
+
PROMPT_TEMPLATE,
|
|
26
|
+
build_source_decl,
|
|
27
|
+
ensure_provenance_edges,
|
|
28
|
+
)
|
|
29
|
+
except ImportError: # direct script execution
|
|
30
|
+
from extractor import (
|
|
31
|
+
EXTRACTION_TOOL,
|
|
32
|
+
PROMPT_TEMPLATE,
|
|
33
|
+
build_source_decl,
|
|
34
|
+
ensure_provenance_edges,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Model used when a caller does not pass one. MYGRAPH_OPENAI_MODEL is read
# once, at import time, and overrides the built-in "gpt-5.2".
DEFAULT_MODEL = os.getenv("MYGRAPH_OPENAI_MODEL", "gpt-5.2")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _response_text(resp: Any) -> str:
|
|
42
|
+
"""Extract text from OpenAI Responses SDK objects across SDK versions."""
|
|
43
|
+
output_text = getattr(resp, "output_text", None)
|
|
44
|
+
if output_text:
|
|
45
|
+
return output_text
|
|
46
|
+
|
|
47
|
+
chunks: list[str] = []
|
|
48
|
+
for item in getattr(resp, "output", []) or []:
|
|
49
|
+
for content in getattr(item, "content", []) or []:
|
|
50
|
+
text = getattr(content, "text", None)
|
|
51
|
+
if text:
|
|
52
|
+
chunks.append(text)
|
|
53
|
+
return "".join(chunks)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _loads_json(text: str) -> dict:
|
|
57
|
+
stripped = text.strip()
|
|
58
|
+
if stripped.startswith("```"):
|
|
59
|
+
stripped = stripped.strip("`")
|
|
60
|
+
stripped = stripped.split("\n", 1)[1] if "\n" in stripped else stripped
|
|
61
|
+
stripped = stripped.rsplit("```", 1)[0] if stripped.endswith("```") else stripped
|
|
62
|
+
return json.loads(stripped)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def call_openai(prompt: str, model: str = DEFAULT_MODEL) -> dict:
    """Ask OpenAI for candidates as schema-shaped JSON and parse the reply.

    Args:
        prompt: Full text of the single user message.
        model: Model name to request.

    Returns:
        The JSON value decoded from the response text.

    Raises:
        SystemExit: If the ``openai`` package cannot be imported or
            ``OPENAI_API_KEY`` is unset or empty.
        RuntimeError: If the response carries no text or the text is not
            valid JSON.
    """
    try:
        from openai import OpenAI  # type: ignore
    except ImportError as e:
        install_hint = (
            "extractor_openai: `openai` package not installed. Run:\n"
            " python -m pip install -e '.[openai]'"
        )
        raise SystemExit(install_hint) from e

    if not os.environ.get("OPENAI_API_KEY"):
        key_hint = (
            "extractor_openai: OPENAI_API_KEY env var is not set.\n"
            "Either export it, or run `mykg ingest <file> --candidates-file <path>` "
            "to skip extraction with a hand-curated candidates JSON."
        )
        raise SystemExit(key_hint)

    client = OpenAI()
    # Reuse the Anthropic tool definition as the JSON schema for the reply.
    output_format = {
        "type": "json_schema",
        "name": EXTRACTION_TOOL["name"],
        "description": EXTRACTION_TOOL["description"],
        "schema": EXTRACTION_TOOL["input_schema"],
        "strict": False,
    }
    resp = client.responses.create(
        model=model,
        input=[{"role": "user", "content": prompt}],
        max_output_tokens=8000,
        text={"format": output_format},
    )

    raw = _response_text(resp)
    if not raw:
        raise RuntimeError("extractor_openai: model returned no text output.")
    try:
        parsed = _loads_json(raw)
    except json.JSONDecodeError as e:
        raise RuntimeError(
            f"extractor_openai: model returned non-JSON: {raw[:300]}"
        ) from e
    return parsed
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def extract(md_path: Path, out_path: Path | None = None,
            model: str = DEFAULT_MODEL) -> dict:
    """Run the full extraction for one markdown file through OpenAI.

    Loads the graph to list its node ids, fills ``PROMPT_TEMPLATE``, calls
    ``call_openai``, backfills provenance edges and stamps ``_meta``
    (``source_path``, ``ingested_at``, ``model``, ``backend="openai"``).

    Args:
        md_path: Markdown file to read (UTF-8).
        out_path: When truthy, the payload is also written there as
            indented JSON.
        model: Model name passed to ``call_openai`` and recorded in ``_meta``.

    Returns:
        The candidate payload dict.
    """
    graph = Graph.load()
    decl = build_source_decl(md_path)
    markdown = md_path.read_text(encoding="utf-8")
    id_lines = [f" - {node_id}" for node_id in sorted(graph.nodes.keys())]

    template_fields = {
        "source_id": decl["source_id"],
        "source_label": decl["source_label"],
        "source_path": decl["source_path"],
        "node_types": ", ".join(sorted(NODE_TYPES)),
        "edge_types": ", ".join(sorted(EDGE_TYPES)),
        "existing_ids": "\n".join(id_lines),
        "source_text": markdown,
    }
    payload = call_openai(PROMPT_TEMPLATE.format(**template_fields), model=model)

    added = ensure_provenance_edges(payload)
    if added:
        notice = (
            "extractor_openai: gateway returned missing provenance edges; "
            f"synthesized {added} MENTIONED_IN edges."
        )
        print(notice, file=sys.stderr)

    meta = payload.setdefault("_meta", {})
    meta["source_path"] = decl["source_path"]
    meta["ingested_at"] = decl["ingested_at"]
    meta["model"] = model
    meta["backend"] = "openai"

    if out_path:
        serialized = json.dumps(payload, indent=2)
        out_path.write_text(serialized, encoding="utf-8")
    return payload
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def main(argv: list[str]) -> int:
    """Command-line entry point for the OpenAI-backed extractor.

    Args:
        argv: ``sys.argv``-style list: ``argv[0]`` is the program name and
            ``argv[1]`` the markdown path. The remaining items may hold an
            output path and ``--model NAME`` in either order. Without an
            output path the result goes to ``<stem>.candidates.openai.json``
            next to the markdown file.

    Returns:
        0 on success; 1 when the markdown path is missing or ``--model`` is
        given without a value (usage is printed in both cases).
    """
    usage = "Usage: python extractor_openai.py <path/to/file.md> [out.json] [--model NAME]"
    if len(argv) < 2:
        print(usage)
        return 1
    md = Path(argv[1]).expanduser().resolve()

    rest = argv[2:]  # slice copy: deleting from it leaves argv untouched
    model = None
    if "--model" in rest:
        i = rest.index("--model")
        if i + 1 >= len(rest):
            # `--model` was the last token: report usage instead of
            # failing with IndexError.
            print(usage)
            return 1
        model = rest[i + 1]
        del rest[i:i + 2]
    if model is None:
        model = DEFAULT_MODEL

    if rest:
        out = Path(rest[0]).expanduser().resolve()
    else:
        out = md.parent / f"{md.stem}.candidates.openai.json"
    payload = extract(md, out, model=model)
    print(f"extractor_openai: model={model} "
          f"nodes={len(payload.get('nodes', []))} "
          f"edges={len(payload.get('edges', []))} -> {out}")
    return 0
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
if __name__ == "__main__":
    # main() reads the markdown path from argv[1] and treats argv[0] as the
    # program name, so it must receive sys.argv whole. Passing sys.argv[1:]
    # made `python extractor_openai.py file.md` print usage and exit 1.
    sys.exit(main(sys.argv))
|