knowledge-worker 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ """
2
+ export_context.py — generate a compact LLM-ready context snapshot of mygraph.
3
+ Usage: mykg context [--graph PATH] [--out context.md] [--max-ideas 20]
4
+ """
5
+ import argparse
6
+ import datetime
7
+ import json
8
+ import sys
9
+ from collections import defaultdict
10
+
11
+ try:
12
+ from .mygraph import resolve_graph_path
13
+ except ImportError: # direct script execution: python mygraph/export_context.py
14
+ from mygraph import resolve_graph_path
15
+
16
+
17
def load(path=None):
    """Read the graph JSON file and return its parsed contents.

    ``path`` is first passed through ``resolve_graph_path``; the resulting
    file is opened as UTF-8 text and decoded with ``json.load``.
    """
    graph_file = resolve_graph_path(path)
    with open(graph_file, encoding="utf-8") as handle:
        graph = json.load(handle)
    return graph
21
+
22
def export_context(g, max_ideas=20):
    """Render a compact Markdown snapshot of the graph for use as LLM context.

    Args:
        g: Parsed graph mapping. ``g["nodes"]`` is a dict whose values are
            node dicts (keys read here: ``id``, ``type``, ``label`` and the
            optional ``body``, ``confidence``, ``created_at``); ``g["edges"]``
            is a list of edge dicts, each carrying a ``dst`` node id.
        max_ideas: Upper bound on the number of ideas listed. Negative values
            are treated as zero; ``None`` means no limit.

    Returns:
        The snapshot as a single newline-joined string with the sections
        Goals, Key Decisions, Ideas, Core Topics, Recent Sources and Open
        Questions. Every section except Ideas is omitted when it is empty.
    """
    nodes = g["nodes"]
    edges = g["edges"]

    # Only the number of incoming edges is ever needed, so count them
    # instead of collecting the edge objects.
    in_degree = defaultdict(int)
    for edge in edges:
        in_degree[edge["dst"]] += 1

    def by_type(t):
        return [n for n in nodes.values() if n.get("type") == t]

    def degree(n):
        # .get() avoids inserting new keys into the defaultdict.
        return in_degree.get(n["id"], 0)

    def conf_marker(c):
        if c == "low":
            return " WARN"
        if c == "medium":
            return " ~"
        return ""

    # A negative limit would slice from the end and silently drop the
    # least-connected ideas; clamp it. None keeps the "no limit" meaning.
    idea_limit = None if max_ideas is None else max(0, max_ideas)

    # utcnow() is deprecated since Python 3.12; an aware UTC datetime
    # formats to exactly the same text.
    generated = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%d %H:%M UTC')

    lines = []
    lines.append("# mygraph - Context Snapshot")
    lines.append(f"*Generated: {generated} | "
                 f"{len(nodes)} nodes, {len(edges)} edges*\n")

    # Goals
    goals = by_type("goal")
    if goals:
        lines.append("## Goals")
        for n in goals:
            lines.append(f"- **{n['label']}**{conf_marker(n.get('confidence',''))}")
            if n.get("body"):
                lines.append(f" {n['body'][:120]}")
        lines.append("")

    # Decisions: everything except low-confidence ones, capped at 20.
    decisions = [n for n in by_type("decision") if n.get("confidence") != "low"]
    if decisions:
        lines.append("## Key Decisions")
        for n in decisions[:20]:
            lines.append(f"- **{n['label']}**")
            if n.get("body"):
                lines.append(f" {n['body'][:100]}")
        lines.append("")

    # Ideas: most-connected first (stable sort keeps file order for ties).
    # The header is emitted even when there are no ideas.
    ideas_sorted = sorted(by_type("idea"), key=lambda n: -degree(n))
    lines.append("## Ideas")
    for n in ideas_sorted[:idea_limit]:
        marker = conf_marker(n.get("confidence",""))
        lines.append(f"- **{n['label']}**{marker} *(connections: {degree(n)})*")
        if n.get("body"):
            lines.append(f" {n['body'][:120]}")
    lines.append("")

    # Topics: k-core proxy, more than 1 incoming edge.
    topics = [n for n in by_type("topic") if degree(n) > 1]
    topics_sorted = sorted(topics, key=lambda n: -degree(n))
    if topics_sorted:
        lines.append("## Core Topics")
        for n in topics_sorted[:20]:
            lines.append(f"- {n['label']} *(x{degree(n)})*")
        lines.append("")

    # Recent sources (last 5). A missing or None created_at sorts as the
    # empty string so it cannot raise when compared with real timestamps.
    sources = sorted(
        by_type("source"),
        key=lambda n: n.get("created_at") or "",
        reverse=True,
    )[:5]
    if sources:
        lines.append("## Recent Sources")
        for n in sources:
            lines.append(f"- {n['label']}")
        lines.append("")

    # Questions
    questions = by_type("question")
    if questions:
        lines.append("## Open Questions")
        for n in questions[:10]:
            lines.append(f"- {n['label']}{conf_marker(n.get('confidence',''))}")
        lines.append("")

    return "\n".join(lines)
102
+
103
+
104
def run_export_context(args: list[str]) -> int:
    """Command-line driver: parse ``args``, build the snapshot and emit it.

    Recognised options are ``--graph``, ``--out`` and ``--max-ideas``
    (integer, default 20). With ``--out`` the snapshot is written to that
    file as UTF-8 and a confirmation is printed; otherwise the snapshot is
    printed to stdout. Always returns 0.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--graph", default=None)
    cli.add_argument("--out", default=None)
    cli.add_argument("--max-ideas", type=int, default=20)
    options = cli.parse_args(args)

    graph = load(options.graph)
    snapshot = export_context(graph, max_ideas=options.max_ideas)

    # No output file requested: dump to stdout and finish.
    if not options.out:
        print(snapshot)
        return 0

    with open(options.out, "w", encoding="utf-8") as sink:
        sink.write(snapshot)
    print(f"Written to {options.out}")
    return 0
121
+
122
+
123
if __name__ == "__main__":
    # Script entry point: forward the CLI arguments (program name removed)
    # and use the returned integer as the process exit status.
    raise SystemExit(run_export_context(sys.argv[1:]))
mygraph/extractor.py ADDED
@@ -0,0 +1,243 @@
1
+ """
2
+ extractor.py — Stage 1 of the v1 ingest pipeline.
3
+
4
+ Reads a markdown file, calls the Anthropic API with a schema-constrained prompt,
5
+ and writes a candidates.json. No graph mutation here.
6
+
7
+ Provenance-or-bust: the prompt requires literal excerpts for `high`-confidence
8
+ candidates. Validator (Stage 2) enforces it; this stage just asks for it.
9
+
10
+ Env:
11
+ Anthropic auth is inferred from env. Supported providers:
12
+ - Anthropic API: ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN
13
+ - Foundry: ANTHROPIC_FOUNDRY_API_KEY plus resource/base URL
14
+ - Bedrock: AWS_BEARER_TOKEN_BEDROCK or AWS credentials plus region
15
+ MYGRAPH_MODEL — optional model override; default `claude-sonnet-4-6`.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import os
22
+ import sys
23
+ from datetime import datetime, timezone
24
+ from pathlib import Path
25
+ from typing import Any
26
+
27
+ from mygraph import Graph, NODE_TYPES, EDGE_TYPES, slug
28
+ try:
29
+ from .anthropic_client import get_anthropic_client
30
+ except ImportError: # direct script execution
31
+ from anthropic_client import get_anthropic_client
32
+
33
DEFAULT_MODEL = "claude-sonnet-4-6"


def _str_schema() -> dict:
    """Build a fresh JSON-schema fragment for a plain string field."""
    return {"type": "string"}


def _confidence_schema() -> dict:
    """Build a fresh JSON-schema fragment for the three confidence levels."""
    return {"type": "string", "enum": ["high", "medium", "low"]}


# The single Source node describing the ingested document.
_SOURCE_SCHEMA = {
    "type": "object",
    "properties": {
        "id": _str_schema(),
        "label": _str_schema(),
        "body": _str_schema(),
    },
    "required": ["id", "label", "body"],
}

# One candidate node; `body` and `excerpt` are optional.
_NODE_ITEM_SCHEMA = {
    "type": "object",
    "properties": {
        "id": _str_schema(),
        "type": {"type": "string", "enum": sorted(NODE_TYPES)},
        "label": _str_schema(),
        "body": _str_schema(),
        "confidence": _confidence_schema(),
        "excerpt": _str_schema(),
    },
    "required": ["id", "type", "label", "confidence"],
}

# One candidate edge; `excerpt` is optional.
_EDGE_ITEM_SCHEMA = {
    "type": "object",
    "properties": {
        "src": _str_schema(),
        "dst": _str_schema(),
        "type": {"type": "string", "enum": sorted(EDGE_TYPES)},
        "confidence": _confidence_schema(),
        "excerpt": _str_schema(),
    },
    "required": ["src", "dst", "type", "confidence"],
}

# Tool definition handed to the model; its input schema is the shape of the
# candidates payload (source + nodes + edges).
EXTRACTION_TOOL = {
    "name": "emit_candidates",
    "description": "Emit candidate nodes and edges extracted from the source markdown.",
    "input_schema": {
        "type": "object",
        "properties": {
            "source": _SOURCE_SCHEMA,
            "nodes": {"type": "array", "items": _NODE_ITEM_SCHEMA},
            "edges": {"type": "array", "items": _EDGE_ITEM_SCHEMA},
        },
        "required": ["source", "nodes", "edges"],
    },
}
83
+
84
+
85
# Prompt text for the extraction call. It is filled in with ``str.format``;
# the placeholders are {source_id} (used twice), {node_types}, {edge_types},
# {existing_ids}, {source_label}, {source_path} and {source_text}.
# The backslash right after the opening quotes suppresses the leading newline.
# The source markdown is wrapped between two identical <<<SOURCE>>> markers.
PROMPT_TEMPLATE = """\
You are extracting nodes and edges for a personal knowledge graph centered on the user.
The graph stores durable concepts (Person, Idea, Project, Goal, Topic, Reference,
Question, Decision, Source) and relations between them.

Rules:
1. Every node and edge MUST cite a literal excerpt from the source. No paraphrase.
2. Use confidence "high" only when you have a direct quote in the `excerpt` field.
3. Use confidence "medium" for clear paraphrase (still quote what you paraphrased FROM).
4. Use confidence "low" for inference. Quote what you inferred FROM.
5. Slug-style IDs: lowercase, hyphenated, type-prefixed. E.g. `idea:context-memory`.
6. Reuse existing IDs (below) when a candidate refers to an existing concept.
7. Do NOT invent biographical facts. If the source doesn't say it, it doesn't go in.
8. The Source node `id` MUST equal: {source_id}
9. Every NEW concept node MUST have a `MENTIONED_IN` edge to the Source.
10. Output the tool call exactly per schema. No prose.

Allowed node types: {node_types}
Allowed edge types: {edge_types}

Existing node IDs (reuse when applicable):
{existing_ids}

SOURCE METADATA:
id : {source_id}
label : {source_label}
path : {source_path}

SOURCE MARKDOWN follows between <<<SOURCE>>> markers. Extract.

<<<SOURCE>>>
{source_text}
<<<SOURCE>>>
"""
119
+
120
+
121
def build_source_decl(md_path: Path) -> dict:
    """Describe a markdown file as a Source declaration.

    Returns a dict with four string entries:
    ``source_id`` (``"source:"`` followed by the slug of the file stem),
    ``source_label`` (the file name), ``source_path`` (the resolved path)
    and ``ingested_at`` (the current UTC time in ISO-8601 form).
    """
    decl = {}
    decl["source_id"] = f"source:{slug(md_path.stem)}"
    decl["source_label"] = md_path.name
    decl["source_path"] = str(md_path.resolve())
    # Timezone-aware timestamp so the ISO string carries its UTC offset.
    decl["ingested_at"] = datetime.now(timezone.utc).isoformat()
    return decl
129
+
130
+
131
def call_anthropic(prompt: str, model: str | None = None) -> dict:
    """Send ``prompt`` to Claude with the extraction tool forced and return
    the tool call's input.

    Args:
        prompt: The fully formatted extraction prompt.
        model: Model name; when falsy, the ``model`` attribute of the config
            returned by ``get_anthropic_client`` is used.

    Raises:
        SystemExit: if ``get_anthropic_client`` raises ``RuntimeError``; the
            message includes a hint about ``--candidates-file``.
        RuntimeError: if no ``emit_candidates`` tool-use block is present in
            the response.
    """
    try:
        client, config = get_anthropic_client()
    except RuntimeError as e:
        hint = (
            f"extractor: {e}\n"
            "Or run `mykg ingest <file> --candidates-file <path>` "
            "to skip extraction with a hand-curated candidates JSON."
        )
        raise SystemExit(hint) from e

    chosen_model = model if model else config.model
    resp = client.messages.create(
        model=chosen_model,
        max_tokens=8000,
        tools=[EXTRACTION_TOOL],
        # Force the model to answer through the extraction tool.
        tool_choice={"type": "tool", "name": "emit_candidates"},
        messages=[{"role": "user", "content": prompt}],
    )

    # Return the input of the first matching tool-use block.
    for item in resp.content:
        is_tool_call = getattr(item, "type", None) == "tool_use"
        if is_tool_call and item.name == "emit_candidates":
            return item.input  # type: ignore[return-value]
    raise RuntimeError("extractor: model did not emit the emit_candidates tool call.")
153
+
154
+
155
def ensure_provenance_edges(payload: dict) -> int:
    """Backfill MENTIONED_IN edges when a gateway drops tool-emitted edges.

    For every node in ``payload["nodes"]`` other than the Source node itself,
    make sure an edge ``(node id -> source id, "MENTIONED_IN")`` exists in
    ``payload["edges"]``, appending one when it is missing. ``payload`` is
    modified in place; a missing or non-list ``edges`` entry is replaced by
    a new list.

    The payload comes from a model, so malformed parts are tolerated: a
    missing or non-dict ``source``, a non-list ``nodes``, non-dict node
    entries and nodes without a string id are skipped instead of raising.

    Args:
        payload: Candidates dict with ``source``, ``nodes`` and ``edges``.

    Returns:
        The number of edges that were appended.
    """
    source = payload.get("source")
    # `source` may be null or a non-object in a malformed response.
    src_id = source.get("id") if isinstance(source, dict) else None
    if not src_id:
        return 0

    edges = payload.get("edges")
    if not isinstance(edges, list):
        edges = []
        payload["edges"] = edges
    nodes = payload.get("nodes", [])
    if not isinstance(nodes, list):
        return 0

    existing = {
        (e.get("src"), e.get("dst"), e.get("type"))
        for e in edges
        if isinstance(e, dict)
    }
    injected = 0
    for node in nodes:
        if not isinstance(node, dict):
            continue
        node_id = node.get("id")
        if not isinstance(node_id, str) or not node_id or node_id == src_id:
            continue
        key = (node_id, src_id, "MENTIONED_IN")
        if key in existing:
            continue
        # Prefer the node's excerpt, then its body; ignore non-string values
        # so the 300-character cap below cannot fail.
        quote = next(
            (
                value
                for value in (node.get("excerpt"), node.get("body"))
                if isinstance(value, str) and value
            ),
            "",
        )
        edges.append({
            "src": node_id,
            "dst": src_id,
            "type": "MENTIONED_IN",
            # An explicit null confidence falls back to "medium" as well.
            "confidence": node.get("confidence") or "medium",
            "excerpt": quote[:300],
        })
        existing.add(key)
        injected += 1
    return injected
194
+
195
+
196
def extract(md_path: Path, out_path: Path | None = None,
            model: str | None = None) -> dict:
    """Run the full extraction for one markdown file.

    Loads the graph, formats the prompt from the file's text and the
    existing node ids, calls the model, backfills missing MENTIONED_IN
    edges, attaches ``_meta`` (source path, ingest time, model name) and,
    when ``out_path`` is truthy, writes the payload there as indented JSON.

    Returns:
        The candidates payload dict.
    """
    g = Graph.load()
    decl = build_source_decl(md_path)
    source_text = md_path.read_text(encoding="utf-8")

    # One bullet per known node id, in sorted order.
    id_bullets = [f" - {node_id}" for node_id in sorted(g.nodes.keys())]
    prompt = PROMPT_TEMPLATE.format(
        source_id=decl["source_id"],
        source_label=decl["source_label"],
        source_path=decl["source_path"],
        node_types=", ".join(sorted(NODE_TYPES)),
        edge_types=", ".join(sorted(EDGE_TYPES)),
        existing_ids="\n".join(id_bullets),
        source_text=source_text,
    )

    payload = call_anthropic(prompt, model=model)

    injected = ensure_provenance_edges(payload)
    if injected:
        notice = (
            "extractor: gateway returned missing provenance edges; "
            f"synthesized {injected} MENTIONED_IN edges."
        )
        print(notice, file=sys.stderr)

    # Carry source_path / ingested_at / model along for downstream stages.
    payload.setdefault("_meta", {})
    for field in ("source_path", "ingested_at"):
        payload["_meta"][field] = decl[field]
    payload["_meta"]["model"] = model or os.environ.get("MYGRAPH_MODEL", DEFAULT_MODEL)

    if out_path:
        serialized = json.dumps(payload, indent=2)
        out_path.write_text(serialized, encoding="utf-8")
    return payload
228
+
229
+
230
def main(argv: list[str]) -> int:
    """Command-line entry point for the extractor.

    Args:
        argv: Full argument vector including the program name at index 0:
            ``[prog, <file.md>, [out.json]]``.

    Returns:
        1 (after printing usage) when no markdown path is given, else 0.
        Without an explicit output path the candidates are written next to
        the markdown file as ``<stem>.candidates.json``.
    """
    if len(argv) < 2:
        print("Usage: python extractor.py <path/to/file.md> [out.json]")
        return 1
    md = Path(argv[1]).expanduser().resolve()
    if len(argv) > 2:
        # Expand "~" for the output path as well, like the input path.
        out = Path(argv[2]).expanduser().resolve()
    else:
        out = md.parent / f"{md.stem}.candidates.json"
    payload = extract(md, out)
    print(f"extractor: wrote {len(payload.get('nodes', []))} nodes, "
          f"{len(payload.get('edges', []))} edges -> {out}")
    return 0
240
+
241
+
242
if __name__ == "__main__":
    # Script entry point: main() expects the full argv, program name included,
    # and its return value becomes the exit status.
    raise SystemExit(main(sys.argv))
@@ -0,0 +1,165 @@
1
+ """
2
+ extractor_openai.py — Stage 1 extractor backed by OpenAI Responses API.
3
+
4
+ Reads a markdown file, calls OpenAI with a JSON-schema constrained response,
5
+ and writes a candidates.json. No graph mutation here.
6
+
7
+ Env:
8
+ OPENAI_API_KEY — required.
9
+ MYGRAPH_OPENAI_MODEL — optional model override; default `gpt-5.2`.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import os
16
+ import sys
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from mygraph import Graph, NODE_TYPES, EDGE_TYPES
21
+
22
+ try:
23
+ from .extractor import (
24
+ EXTRACTION_TOOL,
25
+ PROMPT_TEMPLATE,
26
+ build_source_decl,
27
+ ensure_provenance_edges,
28
+ )
29
+ except ImportError: # direct script execution
30
+ from extractor import (
31
+ EXTRACTION_TOOL,
32
+ PROMPT_TEMPLATE,
33
+ build_source_decl,
34
+ ensure_provenance_edges,
35
+ )
36
+
37
+
38
# Model name used when the caller does not pass one. The environment
# variable is read once, when this module is imported.
DEFAULT_MODEL = os.getenv("MYGRAPH_OPENAI_MODEL", "gpt-5.2")
39
+
40
+
41
def _response_text(resp: Any) -> str:
    """Return the text carried by an OpenAI Responses result.

    A truthy ``output_text`` attribute is returned as-is. Otherwise every
    truthy ``text`` attribute found under ``resp.output[*].content[*]`` is
    concatenated in order; missing or ``None`` attributes count as empty.
    """
    direct = getattr(resp, "output_text", None)
    if direct:
        return direct

    # Fallback: walk output items and their content parts.
    candidates = (
        getattr(part, "text", None)
        for item in (getattr(resp, "output", []) or [])
        for part in (getattr(item, "content", []) or [])
    )
    return "".join(piece for piece in candidates if piece)
54
+
55
+
56
def _loads_json(text: str) -> dict:
    """Parse JSON from model output, tolerating a Markdown code fence.

    When the text starts with a backtick fence, the fence characters are
    removed from both ends and the opening fence line (for example a
    ``json`` language tag) is dropped. The first line is kept when it
    already starts the JSON document, so a fence that is immediately
    followed by ``{`` or ``[`` no longer loses its first line.

    Raises:
        json.JSONDecodeError: if the remaining text is not valid JSON.
    """
    stripped = text.strip()
    if stripped.startswith("```"):
        # Removes the opening and the closing fence in one go.
        body = stripped.strip("`")
        first_line, newline, remainder = body.partition("\n")
        # Drop the fence line unless it is already JSON content.
        if newline and not first_line.lstrip().startswith(("{", "[")):
            body = remainder
        stripped = body
    return json.loads(stripped)
63
+
64
+
65
def call_openai(prompt: str, model: str = DEFAULT_MODEL) -> dict:
    """Ask OpenAI for a schema-shaped JSON answer and return it parsed.

    Args:
        prompt: The fully formatted extraction prompt.
        model: Model name to request.

    Raises:
        SystemExit: if the ``openai`` package cannot be imported, or if
            ``OPENAI_API_KEY`` is unset or empty.
        RuntimeError: if the response carries no text, or the text is not
            valid JSON.
    """
    try:
        from openai import OpenAI  # type: ignore
    except ImportError as e:
        raise SystemExit(
            "extractor_openai: `openai` package not installed. Run:\n"
            " python -m pip install -e '.[openai]'"
        ) from e

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise SystemExit(
            "extractor_openai: OPENAI_API_KEY env var is not set.\n"
            "Either export it, or run `mykg ingest <file> --candidates-file <path>` "
            "to skip extraction with a hand-curated candidates JSON."
        )

    # Reuse the Anthropic tool definition as the JSON-schema response format.
    response_format = {
        "type": "json_schema",
        "name": EXTRACTION_TOOL["name"],
        "description": EXTRACTION_TOOL["description"],
        "schema": EXTRACTION_TOOL["input_schema"],
        "strict": False,
    }
    client = OpenAI()
    resp = client.responses.create(
        model=model,
        input=[{"role": "user", "content": prompt}],
        max_output_tokens=8000,
        text={"format": response_format},
    )

    raw = _response_text(resp)
    if not raw:
        raise RuntimeError("extractor_openai: model returned no text output.")
    try:
        parsed = _loads_json(raw)
    except json.JSONDecodeError as e:
        raise RuntimeError(
            f"extractor_openai: model returned non-JSON: {raw[:300]}"
        ) from e
    return parsed
105
+
106
+
107
def extract(md_path: Path, out_path: Path | None = None,
            model: str = DEFAULT_MODEL) -> dict:
    """Run the full extraction for one markdown file through OpenAI.

    Loads the graph, formats the shared prompt from the file's text and the
    existing node ids, calls the model, backfills missing MENTIONED_IN
    edges, attaches ``_meta`` (source path, ingest time, model, backend)
    and, when ``out_path`` is truthy, writes the payload there as indented
    JSON.

    Returns:
        The candidates payload dict.
    """
    g = Graph.load()
    decl = build_source_decl(md_path)
    source_text = md_path.read_text(encoding="utf-8")

    # One bullet per known node id, in sorted order.
    id_bullets = [f" - {node_id}" for node_id in sorted(g.nodes.keys())]
    prompt = PROMPT_TEMPLATE.format(
        source_id=decl["source_id"],
        source_label=decl["source_label"],
        source_path=decl["source_path"],
        node_types=", ".join(sorted(NODE_TYPES)),
        edge_types=", ".join(sorted(EDGE_TYPES)),
        existing_ids="\n".join(id_bullets),
        source_text=source_text,
    )

    payload = call_openai(prompt, model=model)

    injected = ensure_provenance_edges(payload)
    if injected:
        notice = (
            "extractor_openai: gateway returned missing provenance edges; "
            f"synthesized {injected} MENTIONED_IN edges."
        )
        print(notice, file=sys.stderr)

    # Metadata consumed by later pipeline stages.
    payload.setdefault("_meta", {})
    for field in ("source_path", "ingested_at"):
        payload["_meta"][field] = decl[field]
    payload["_meta"]["model"] = model
    payload["_meta"]["backend"] = "openai"

    if out_path:
        serialized = json.dumps(payload, indent=2)
        out_path.write_text(serialized, encoding="utf-8")
    return payload
139
+
140
+
141
def main(argv: list[str]) -> int:
    """Command-line entry point for the OpenAI extractor.

    Args:
        argv: Full argument vector including the program name at index 0:
            ``[prog, <file.md>, [out.json], [--model NAME]]``.

    Returns:
        1 (after printing usage) when the markdown path is missing or
        ``--model`` is given without a value; otherwise 0. Without an
        explicit output path the candidates are written next to the
        markdown file as ``<stem>.candidates.openai.json``.
    """
    usage = "Usage: python extractor_openai.py <path/to/file.md> [out.json] [--model NAME]"
    if len(argv) < 2:
        print(usage)
        return 1

    # Pull "--model NAME" out of the optional arguments first, so a missing
    # value is reported as a usage error instead of raising IndexError.
    rest = argv[2:]
    model_override = None
    if "--model" in rest:
        i = rest.index("--model")
        if i + 1 >= len(rest):
            print(usage)
            return 1
        model_override = rest[i + 1]
        del rest[i:i + 2]
    model = DEFAULT_MODEL if model_override is None else model_override

    md = Path(argv[1]).expanduser().resolve()
    if rest:
        out = Path(rest[0]).expanduser().resolve()
    else:
        out = md.parent / f"{md.stem}.candidates.openai.json"
    payload = extract(md, out, model=model)
    print(f"extractor_openai: model={model} "
          f"nodes={len(payload.get('nodes', []))} "
          f"edges={len(payload.get('edges', []))} -> {out}")
    return 0
162
+
163
+
164
if __name__ == "__main__":
    # main() reads the markdown path from argv[1] and returns usage when
    # len(argv) < 2, so it needs the full argv including the program name.
    # Passing sys.argv[1:] shifted every argument by one position.
    sys.exit(main(sys.argv))