knowledge-worker 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mygraph/ingest.py ADDED
@@ -0,0 +1,170 @@
1
+ """
2
+ ingest.py — orchestrates the 5-stage v1 pipeline.
3
+
4
+ mykg ingest <path/to/file.md>
5
+ [--non-interactive]
6
+ [--auto-accept-high]
7
+ [--auto-accept-all]
8
+ [--candidates-file <path>] # skip Stage 1 (extractor)
9
+ [--keep-candidates] # don't delete intermediate JSON
10
+ [--backend claude|openai|ollama] # extractor LLM (default claude)
11
+ [--model <name>] # v1.5: override model tag
12
+
13
+ Stage 1 (extractor) → candidates.json
14
+ Stage 2 (validator) → manifest + validated.json (in-memory)
15
+ Stage 3 (review CLI) → approved subset
16
+ Stage 4 (merge) → graph mutated, saved
17
+ Stage 5 (eval log) → review verdicts appended
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import sys
24
+ from pathlib import Path
25
+
26
+ try:
27
+ from .validator import validate
28
+ from .review import review
29
+ from .merge import merge
30
+ from .eval_log import append as eval_append
31
+ except ImportError: # direct script execution
32
+ from validator import validate
33
+ from review import review
34
+ from merge import merge
35
+ from eval_log import append as eval_append
36
+
37
+
38
def _load_extractor(backend: str):
    """Resolve ``backend`` to the ``extract`` callable that implements it.

    Accepted values are ``"claude"``, ``"openai"`` and ``"ollama"``. Each
    backend module is imported only when it is actually requested, so a
    dependency missing for one backend does not prevent using another.

    Raises:
        ValueError: if ``backend`` is not one of the accepted values.
    """
    if backend == "claude":
        # Package-relative import first; plain import when run as a script.
        try:
            from .extractor import extract as extractor_fn
        except ImportError:
            from extractor import extract as extractor_fn
        return extractor_fn

    if backend == "openai":
        try:
            from .extractor_openai import extract as extractor_fn
        except ImportError:
            from extractor_openai import extract as extractor_fn
        return extractor_fn

    if backend == "ollama":
        import sys as _sys
        from pathlib import Path as _Path

        # ollama_proxy lives next to the mygraph/ directory, so its folder
        # has to be on sys.path before extractor_adapter can be imported.
        package_dir = _Path(__file__).resolve().parent
        proxy_dir = str(package_dir.parent / "ollama_proxy")
        if proxy_dir not in _sys.path:
            _sys.path.insert(0, proxy_dir)
        from extractor_adapter import extract as extractor_fn  # type: ignore
        return extractor_fn

    raise ValueError(f"ingest: unknown --backend {backend!r} (valid: claude, openai, ollama)")
64
+
65
+
66
def run_ingest(args: list[str]) -> int:
    """Run the five-stage ingest pipeline for a single markdown file.

    Args:
        args: CLI tokens following ``mykg ingest``. ``args[0]`` is the path
            to the markdown source; the remaining tokens are flags
            (``--non-interactive``, ``--auto-accept-high``,
            ``--auto-accept-all``, ``--keep-candidates``,
            ``--candidates-file <path>``, ``--backend <name>``,
            ``--model <name>``).

    Returns:
        ``0`` when all stages complete. ``1`` on a usage or input error:
        no arguments, source file not found, a value-bearing flag without a
        value, an unknown ``--backend``, or a candidates file that cannot be
        read or parsed.
    """
    if not args:
        print("Usage: mykg ingest <file.md> [flags]")
        return 1
    md_path = Path(args[0]).expanduser().resolve()
    if not md_path.exists():
        print(f"ingest: file not found: {md_path}")
        return 1

    flags = set(args[1:])  # simple set membership for the boolean flags

    # Value-bearing flags, checked in this order. A flag that is the last
    # token, or that is followed by another ``--flag``, has no usable value;
    # reporting it here avoids silently consuming the next flag as the value.
    missing_value_msgs = {
        "--candidates-file": "ingest: --candidates-file needs a path",
        "--backend": "ingest: --backend needs a value (claude|openai|ollama)",
        "--model": "ingest: --model needs a value",
    }
    values = {}
    for flag, complaint in missing_value_msgs.items():
        if flag not in args:
            continue
        pos = args.index(flag) + 1
        if pos >= len(args) or args[pos].startswith("--"):
            print(complaint)
            return 1
        values[flag] = args[pos]

    candidates_file = None
    if "--candidates-file" in values:
        candidates_file = Path(values["--candidates-file"]).expanduser().resolve()
    backend = values.get("--backend", "claude")
    model = values.get("--model")

    non_interactive = "--non-interactive" in flags
    auto_high = "--auto-accept-high" in flags
    auto_all = "--auto-accept-all" in flags
    keep_candidates = "--keep-candidates" in flags

    if non_interactive and not (auto_high or auto_all):
        # Default headless behavior: be conservative — accept only `high`.
        auto_high = True

    # ---- Stage 1: Extract --------------------------------------------------
    if candidates_file:
        print(f"[1/5] using candidates from: {candidates_file}")
        try:
            payload = json.loads(candidates_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # OSError: missing/unreadable file. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            print(f"ingest: cannot load candidates file {candidates_file}: {exc}")
            return 1
        candidates_path = candidates_file
    else:
        try:
            extract = _load_extractor(backend)
        except ValueError as exc:
            # Unknown backend: the message already carries the "ingest:" prefix.
            print(exc)
            return 1
        print(f"[1/5] extract -> backend={backend} on {md_path.name} ...")
        candidates_path = md_path.parent / f"{md_path.stem}.candidates.json"
        # Only pass model= when one was given, so extractors keep their default.
        if model:
            payload = extract(md_path, candidates_path, model=model)
        else:
            payload = extract(md_path, candidates_path)
        print(f"      wrote {candidates_path}")

    # ---- Stage 2: Validate -------------------------------------------------
    print("[2/5] validate ...")
    src_text = md_path.read_text(encoding="utf-8")
    validated, manifest = validate(payload, src_text)
    print(manifest.summary())

    # log the manifest
    eval_append({"kind": "extract_manifest", "source_id": payload["source"]["id"],
                 "source_path": str(md_path),
                 "n_accepted_nodes": len(manifest.accepted_nodes),
                 "n_accepted_edges": len(manifest.accepted_edges),
                 "n_demoted_nodes": len(manifest.demoted_nodes),
                 "n_rejected_nodes": len(manifest.rejected_nodes),
                 "n_rejected_edges": len(manifest.rejected_edges),
                 "demotions": [{"id": n["id"], "reason": r} for n, r in manifest.demoted_nodes],
                 "rejections_n": [{"id": n.get("id", "?"), "reason": r} for n, r in manifest.rejected_nodes],
                 "rejections_e": [{"src": e.get("src", "?"), "dst": e.get("dst", "?"),
                                   "type": e.get("type", "?"), "reason": r}
                                  for e, r in manifest.rejected_edges]})

    # ---- Stage 3: Review ---------------------------------------------------
    print("[3/5] review ...")
    approved = review(validated, src_text,
                      auto_accept_high=auto_high, auto_accept_all=auto_all)
    print(f"      approved: {len(approved['nodes'])} nodes, {len(approved['edges'])} edges")

    # ---- Stage 4: Merge ----------------------------------------------------
    print("[4/5] merge ...")
    n_added, e_added = merge(approved, interactive=not non_interactive)
    print(f"      +{n_added} nodes, +{e_added} edges")

    # ---- Stage 5: Eval log -------------------------------------------------
    eval_append({"kind": "ingest_complete", "source_id": approved["source"]["id"],
                 "source_path": str(md_path), "nodes_added": n_added,
                 "edges_added": e_added,
                 "candidates_file": str(candidates_path) if candidates_path else None,
                 "backend": backend, "model": model,
                 "non_interactive": non_interactive,
                 "auto_accept_high": auto_high, "auto_accept_all": auto_all})
    print("[5/5] eval log updated.")

    if not keep_candidates and candidates_path and candidates_file is None:
        # only auto-clean if WE wrote it; cleanup is best-effort, so a failed
        # delete must not turn a successful ingest into an error.
        try:
            candidates_path.unlink()
        except OSError:
            pass

    return 0
167
+
168
+
169
# Script entry point: hand the CLI tokens to run_ingest and exit with the
# code it returns.
if __name__ == "__main__":
    raise SystemExit(run_ingest(sys.argv[1:]))