kglite-docs 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
+ """kglite-docs — agent-first PDF knowledge base on top of kglite."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from kglite_docs.corpus import Corpus
6
+ from kglite_docs.errors import (
7
+ ConcurrencyError,
8
+ GroundingError,
9
+ IngestError,
10
+ InvalidEnumError,
11
+ KgliteDocsError,
12
+ MissingSourceError,
13
+ ReviewConflict,
14
+ SelfVerificationError,
15
+ UnsupportedFormatError,
16
+ )
17
+ from kglite_docs.schema import (
18
+ AGENT,
19
+ CHUNK,
20
+ CHUNK_TEXT_EMB,
21
+ CLUSTER,
22
+ DOCUMENT,
23
+ DOCUMENT_TITLE_EMB,
24
+ NOTE,
25
+ PAGE,
26
+ SUMMARY,
27
+ SUMMARY_TEXT_EMB,
28
+ TAG,
29
+ VIEW,
30
+ )
31
+
32
# Resolve the installed distribution's version. Both the metadata import and
# the lookup are best-effort: any failure (for example running from a source
# tree that was never installed) falls back to a local marker version.
try:
    from importlib.metadata import version as _pkg_version
except Exception:  # pragma: no cover - metadata API unavailable
    __version__ = "0.0.0+local"
else:
    try:
        __version__ = _pkg_version("kglite-docs")
    except Exception:  # pragma: no cover - not installed (e.g. running from source)
        __version__ = "0.0.0+local"
37
+
38
# Public API of the package, in the same order as before: schema constants
# first, then the corpus class and exception types, then the version string.
__all__ = [
    # Names imported from kglite_docs.schema.
    "AGENT", "CHUNK", "CHUNK_TEXT_EMB", "CLUSTER",
    "DOCUMENT", "DOCUMENT_TITLE_EMB",
    "NOTE", "PAGE",
    "SUMMARY", "SUMMARY_TEXT_EMB",
    "TAG", "VIEW",
    # Corpus plus the exception types from kglite_docs.errors.
    "ConcurrencyError", "Corpus",
    "GroundingError", "IngestError", "InvalidEnumError",
    "KgliteDocsError", "MissingSourceError", "ReviewConflict",
    "SelfVerificationError", "UnsupportedFormatError",
    # Package metadata.
    "__version__",
]
@@ -0,0 +1,146 @@
1
+ """Agent identity + view tracking.
2
+
3
+ Agents are lazily registered on their first mutation; views can be
4
+ recorded explicitly (with context) or implicitly (when `search` /
5
+ `get_chunk` are called with an `agent_id`).
6
+
7
+ Aggregate `view_count` + `last_viewed_at` on the Chunk are updated on
8
+ every recorded view — a cheap denormalisation so listings can sort by
9
+ attention without joining View nodes.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import uuid
15
+ from datetime import datetime, timezone
16
+ from typing import Any
17
+
18
+ from kglite_docs.schema import (
19
+ AGENT,
20
+ AUTHORED,
21
+ CHUNK,
22
+ VIEW,
23
+ VIEWED,
24
+ )
25
+ from kglite_docs.store import Store
26
+
27
+
28
def _now() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
30
+
31
+
32
+ from kglite_docs.store import rows as _df_dicts # noqa: E402
33
+
34
+
35
def register_agent(
    store: Store, *, agent_id: str, kind: str = "llm", model: str = ""
) -> dict[str, Any]:
    """Make sure an Agent node exists for ``agent_id`` and stamp its activity.

    A known agent has ``last_seen`` refreshed and ``action_count`` incremented;
    ``kind`` and ``model`` are only written when the node is first created.

    Returns a dict with ``id``, ``created`` (True only when the node was
    inserted by this call) and ``last_seen``.
    """
    stamp = _now()
    lookup = store.cypher(
        "MATCH (a:Agent {id: $id}) RETURN a.id AS id",
        params={"id": agent_id},
    )
    already_known = bool(_df_dicts(lookup))

    if not already_known:
        # First sighting: create the node with an initial action count of 1.
        fresh_node = {
            "id": agent_id,
            "title": agent_id,
            "kind": kind,
            "model": model,
            "first_seen": stamp,
            "last_seen": stamp,
            "action_count": 1,
        }
        store.upsert_nodes(AGENT, [fresh_node])
    else:
        # coalesce() tolerates nodes that never had an action_count set.
        store.cypher(
            "MATCH (a:Agent {id: $id}) SET a.last_seen = $now, "
            "a.action_count = coalesce(a.action_count, 0) + 1",
            params={"id": agent_id, "now": stamp},
        )

    return {"id": agent_id, "created": not already_known, "last_seen": stamp}
62
+
63
+
64
def list_agents(store: Store) -> list[dict[str, Any]]:
    """Return one row dict per Agent node, most recently seen first.

    Each row carries ``id``, ``kind``, ``model``, ``first_seen``,
    ``last_seen`` and ``actions`` (the node's ``action_count``).
    """
    query = (
        "MATCH (a:Agent) RETURN a.id AS id, a.kind AS kind, a.model AS model, "
        "a.first_seen AS first_seen, a.last_seen AS last_seen, a.action_count AS actions "
        "ORDER BY a.last_seen DESC"
    )
    return _df_dicts(store.cypher(query))
71
+
72
+
73
def record_view(
    store: Store,
    *,
    agent_id: str,
    target_id: str,
    target_kind: str = CHUNK,
    context: str = "",
) -> dict[str, Any]:
    """Record that ``agent_id`` looked at ``target_id``.

    The agent is registered first (with ``register_agent``'s default kind and
    model), which also bumps its action counter.

    - For chunk targets, the chunk's ``view_count`` is incremented and
      ``last_viewed_at`` is set; other target kinds get no aggregate update.
    - With a non-empty ``context`` a ``View`` node is created and linked from
      the agent via an AUTHORED edge (plus a VIEWED edge to the chunk for
      chunk targets). Without context no View node is written, which keeps
      the graph lean.

    Returns ``{"recorded": True, "view_node": <view id or None>}``.
    """
    register_agent(store, agent_id=agent_id)
    stamp = _now()
    targets_chunk = target_kind == CHUNK

    if targets_chunk:
        store.cypher(
            "MATCH (c:Chunk {id: $id}) "
            "SET c.view_count = coalesce(c.view_count, 0) + 1, c.last_viewed_at = $now",
            params={"id": target_id, "now": stamp},
        )

    if context:
        view_id = str(uuid.uuid4())
        view_node = {
            "id": view_id,
            "title": context[:60],
            "agent_id": agent_id,
            "target_id": target_id,
            "target_kind": target_kind,
            "at": stamp,
            "context": context,
        }
        store.upsert_nodes(VIEW, [view_node])
        store.upsert_edges(
            AUTHORED,
            [{"src": agent_id, "dst": view_id}],
            source_type=AGENT,
            target_type=VIEW,
        )
        if targets_chunk:
            # Aggregate Agent -> Chunk edge; repeated writes are tolerated,
            # uniqueness is not required here.
            viewed_edge = {
                "src": agent_id,
                "dst": target_id,
                "at": stamp,
                "context": context,
            }
            store.upsert_edges(
                VIEWED,
                [viewed_edge],
                source_type=AGENT,
                target_type=CHUNK,
            )
        return {"recorded": True, "view_node": view_id}

    return {"recorded": True, "view_node": None}
123
+
124
+
125
def agent_activity(store: Store, agent_id: str, *, limit: int = 50) -> dict[str, Any]:
    """Return an agent's profile row plus its most recent views and summaries.

    ``limit`` caps the views and the summaries independently; it is coerced
    with ``int()`` before being placed in the query text.

    For an unknown agent the result is
    ``{"agent": None, "views": [], "summaries": [], "tags": []}``.
    NOTE(review): the found-agent result has no ``tags`` key, so the two
    shapes differ; confirm which one callers expect.
    """
    by_id = {"id": agent_id}

    profile_rows = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id}) RETURN a.id AS id, a.kind AS kind, "
        "a.first_seen AS first_seen, a.last_seen AS last_seen, a.action_count AS actions",
        params=by_id,
    ))
    if not profile_rows:
        return {"agent": None, "views": [], "summaries": [], "tags": []}

    view_query = (
        "MATCH (a:Agent {id: $id})-[:AUTHORED]->(v:View) "
        "RETURN v.target_id AS target_id, v.target_kind AS target_kind, v.context AS context, v.at AS at "
        f"ORDER BY v.at DESC LIMIT {int(limit)}"
    )
    recent_views = _df_dicts(store.cypher(view_query, params={"id": agent_id}))

    summary_query = (
        "MATCH (a:Agent {id: $id})-[:AUTHORED]->(s:Summary) "
        "RETURN s.id AS id, s.target_id AS target_id, s.text AS text, s.verification_status AS status "
        f"ORDER BY s.created_at DESC LIMIT {int(limit)}"
    )
    recent_summaries = _df_dicts(store.cypher(summary_query, params={"id": agent_id}))

    return {
        "agent": profile_rows[0],
        "views": recent_views,
        "summaries": recent_summaries,
    }
kglite_docs/agents.py ADDED
@@ -0,0 +1,98 @@
1
+ """LLM caller abstraction used by the workflow demo and any user-built
2
+ agent loop.
3
+
4
+ Two backends:
5
+
6
+ - **anthropic_sdk** — if `ANTHROPIC_API_KEY` is set (and the `anthropic`
7
+ package installed), uses the official SDK. Best for production.
8
+ - **claude_cli** — shells out to the `claude -p` CLI. Reuses the user's
9
+ existing Claude Code auth — no separate API key required. Good for
10
+ one-off scripts and demos.
11
+
12
+ Pick automatically with `default_caller()`, or pass a specific one to
13
+ `call_agent()`.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import shutil
20
+ import subprocess
21
+ from typing import Protocol
22
+
23
+
24
class AgentCaller(Protocol):
    """Structural type for a stateless, one-shot LLM call.

    Any callable taking a positional ``prompt`` plus keyword-only ``system``
    and ``model`` and returning the response text satisfies it.
    """

    def __call__(
        self,
        prompt: str,
        *,
        system: str = "",
        model: str = "sonnet",
    ) -> str:
        ...
28
+
29
+
30
# Short CLI-style model names mapped to the full id sent to the Messages API.
# Only "sonnet" is listed because it is the default that `call_agent` and
# `call_cli` use; any other value is passed through untouched.
_SDK_MODEL_ALIASES = {"sonnet": "claude-sonnet-4-6"}


def call_sdk(prompt: str, *, system: str = "", model: str = "claude-sonnet-4-6") -> str:
    """Send one user message through the Anthropic SDK and return the reply text.

    Requires the `anthropic` package; the client is built with no arguments,
    so credentials come from the SDK's own defaults (`ANTHROPIC_API_KEY`).

    Args:
        prompt: Content of the single user message.
        system: System prompt; an empty value is replaced by a generic one.
        model: Model id. The short alias ``"sonnet"`` is translated to this
            function's default id, because `call_agent` forwards its own
            default ``model="sonnet"`` to whichever caller is selected and
            that alias is a CLI name rather than an API model id.

    Returns:
        The concatenated text of every response block that has a ``text``
        attribute, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the `anthropic` package cannot be imported.
    """
    try:
        import anthropic  # type: ignore
    except ImportError as e:
        raise RuntimeError(
            "anthropic SDK not installed. pip install anthropic"
        ) from e
    client = anthropic.Anthropic()
    msg = client.messages.create(
        model=_SDK_MODEL_ALIASES.get(model, model),
        max_tokens=4096,
        system=system or "You are a helpful assistant.",
        messages=[{"role": "user", "content": prompt}],
    )
    # Blocks without a `text` attribute are skipped.
    return "".join(
        block.text for block in msg.content if hasattr(block, "text")
    ).strip()
50
+
51
+
52
def call_cli(prompt: str, *, system: str = "", model: str = "sonnet") -> str:
    """Run one prompt through the `claude -p` command-line tool.

    The prompt is fed on stdin; a non-empty ``system`` is passed with
    ``--append-system-prompt``. The call is given 180 seconds.

    Returns the command's stdout with surrounding whitespace removed.

    Raises:
        RuntimeError: If `claude` is not on PATH, or it exits non-zero (the
            message carries the first 500 characters of stderr).
    """
    if not shutil.which("claude"):
        raise RuntimeError("claude CLI not found on PATH")

    # NOTE(review): confirm that `--bare` still lets the CLI reuse the user's
    # stored login, as the module docstring promises.
    system_flag = ["--append-system-prompt", system] if system else []
    argv = ["claude", "-p", "--bare", "--model", model, *system_flag]

    result = subprocess.run(
        argv,
        input=prompt,
        text=True,
        capture_output=True,
        timeout=180,
        check=False,
    )
    if result.returncode == 0:
        return result.stdout.strip()
    raise RuntimeError(
        f"claude CLI failed (exit {result.returncode}): {result.stderr[:500]}"
    )
71
+
72
+
73
def default_caller() -> AgentCaller:
    """Choose a backend: the SDK when usable, otherwise the `claude` CLI.

    `call_sdk` is returned when ``ANTHROPIC_API_KEY`` is set to a non-empty
    value and `anthropic` imports; failing that, `call_cli` is returned when
    `claude` is on PATH.

    Raises:
        RuntimeError: If neither backend is available.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if api_key:
        try:
            import anthropic  # noqa: F401
        except ImportError:
            # Key present but SDK missing: fall through to the CLI check.
            pass
        else:
            return call_sdk

    if not shutil.which("claude"):
        raise RuntimeError(
            "No LLM caller available — set ANTHROPIC_API_KEY (with `pip install anthropic`) "
            "or install the `claude` CLI."
        )
    return call_cli
87
+
88
+
89
def call_agent(
    prompt: str,
    *,
    system: str = "",
    model: str = "sonnet",
    caller: AgentCaller | None = None,
) -> str:
    """Make a single agent call and return its text response.

    Uses ``caller`` when one is supplied, otherwise whatever
    `default_caller()` selects; ``system`` and ``model`` are forwarded as
    keyword arguments.
    """
    chosen = caller if caller else default_caller()
    return chosen(prompt, system=system, model=model)
kglite_docs/cli.py ADDED
@@ -0,0 +1,291 @@
1
+ """kglite-docs CLI — ingest, search, list, cluster from the shell.
2
+
3
+ Examples::
4
+
5
+ kglite-docs ingest paper.pdf --db kb.kgl
6
+ kglite-docs ingest ./pdfs/ --db kb.kgl --recursive
7
+ kglite-docs search "transformer attention" --db kb.kgl
8
+ kglite-docs list --db kb.kgl
9
+ kglite-docs cluster --db kb.kgl --algorithm louvain
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ from kglite_docs import Corpus
20
+
21
+
22
def _open_or_create(db_path: str | Path) -> Corpus:
    """Open the corpus stored at ``db_path``, creating one if the path is absent."""
    location = Path(db_path)
    if location.exists():
        return Corpus.open(location)
    return Corpus.create(location)
25
+
26
+
27
def _cmd_ingest(args: argparse.Namespace) -> int:
    """Ingest a single file or a directory, print a JSON report, save the corpus.

    A directory target yields aggregate counts (ingested / skipped / chunks /
    pages awaiting OCR); a file target yields that document's own figures.
    Always returns 0.
    """
    corpus = _open_or_create(args.db)
    source = Path(args.target)

    if not source.is_dir():
        outcome = corpus.ingest(source)
        report = {
            "doc_id": outcome.doc_id,
            "created": outcome.created,
            "pages": outcome.page_count,
            "chunks": outcome.chunk_count,
            "ocr_pending": outcome.ocr_pending_pages,
        }
    else:
        outcomes = corpus.ingest_dir(source, recursive=args.recursive)
        report = {
            "ingested": len([o for o in outcomes if o.created]),
            "skipped": len([o for o in outcomes if not o.created]),
            "total_chunks": sum(o.chunk_count for o in outcomes),
            "ocr_pending": sum(o.ocr_pending_pages for o in outcomes),
        }

    print(json.dumps(report, indent=2))
    corpus.save(args.db)
    return 0
47
+
48
+
49
def _cmd_search(args: argparse.Namespace) -> int:
    """Run a search and print one line per hit: score, id, page, text snippet.

    An empty ``--agent`` value is passed to the corpus as ``None``. Always
    returns 0.
    """
    corpus = Corpus.open(args.db)
    agent = args.agent or None
    for hit in corpus.search(args.query, top_k=args.top_k, agent_id=agent):
        score = hit.get("score", 0)
        hit_id = hit["id"]
        page = hit.get("page")
        snippet = (hit.get("text") or "")[: args.snippet]
        print(f"[{score:.3f}] {hit_id} p.{page} {snippet}")
    return 0
56
+
57
+
58
def _cmd_list(args: argparse.Namespace) -> int:
    """Print id, title, page count and chunk count for up to ``--limit`` documents.

    Always returns 0.
    """
    corpus = Corpus.open(args.db)
    for doc in corpus.list_documents(limit=args.limit):
        doc_id = doc.get("id")
        title = doc.get("title")
        pages = doc.get("pages")
        chunks = doc.get("chunk_count")
        print(f"{doc_id} {title} pages={pages} chunks={chunks}")
    return 0
64
+
65
+
66
def _cmd_cluster(args: argparse.Namespace) -> int:
    """Cluster the corpus chunks, print the result as JSON, and save.

    The database is created when it does not exist yet. Always returns 0.
    """
    corpus = _open_or_create(args.db)
    outcome = corpus.cluster_chunks(algorithm=args.algorithm)
    rendered = json.dumps(outcome, indent=2)
    print(rendered)
    corpus.save(args.db)
    return 0
72
+
73
+
74
def _build_agent_command(
    template: str, values: dict[str, object], *, use_shell: bool
) -> str | list[str]:
    """Fill the ``--agent-cmd`` template without letting values reshape it.

    - argv mode: the template is split into tokens first and each token is
      formatted afterwards, so a value containing spaces or quotes stays
      inside the single argument its placeholder appeared in.
    - shell mode: string values are passed through ``shlex.quote`` before
      substitution so shell metacharacters in them are inert. Non-string
      values are left as they are, which keeps format specs such as
      ``{page:03d}`` working.

    Raises KeyError / IndexError / ValueError for a malformed template.
    """
    import shlex

    if use_shell:
        safe = {
            key: shlex.quote(val) if isinstance(val, str) else val
            for key, val in values.items()
        }
        return template.format(**safe)
    return [token.format(**values) for token in shlex.split(template)]


def _cmd_ocr_do(args: argparse.Namespace) -> int:
    """Run an agent command across every page flagged `needs_ocr=True`.

    The command may use these placeholders:

    - ``{image}`` — path to a freshly-rendered PNG of the page
    - ``{page}`` — 1-based page number
    - ``{doc_title}`` — document title
    - ``{doc_id}`` — document id

    Placeholder values are substituted as data: without ``--shell yes`` each
    one stays within the argument it appears in, and with ``--shell yes``
    string values are shell-quoted, so a document title cannot add arguments
    or shell syntax to the command.

    The command's stdout is taken as the OCR markdown and passed to
    `submit_ocr`. A page is skipped and logged to stderr when its render
    fails, the command cannot be started, it times out, exits non-zero, or
    prints nothing. Pages are processed serially in v1.

    Returns 0 when there was nothing to do, on a dry run, or when every page
    succeeded; 1 when at least one page failed; 2 when ``--agent-cmd`` lacks
    ``{image}`` or is not a usable template.
    """
    import subprocess
    import sys
    import tempfile
    from pathlib import Path

    corpus = Corpus.open(args.db)
    pending = corpus.list_pending_ocr(
        doc_id=args.doc or None,
        limit=args.limit,
        include_images=False,  # we render to a temp file ourselves
        dpi=args.dpi,
    )
    if not pending:
        print("nothing to do — no pages flagged needs_ocr=True")
        return 0

    print(f"{len(pending)} pages pending across {len({p['doc_id'] for p in pending})} docs")
    if args.dry_run:
        for p in pending:
            print(f" would process p.{p['page_number']} of {p['doc_title']} ({p['page_id']})")
        return 0

    if "{image}" not in args.agent_cmd:
        print(
            "ERROR: --agent-cmd must contain the {image} placeholder so the\n"
            " page render can be passed to your vision agent.",
            file=sys.stderr,
        )
        return 2

    use_shell = args.shell == "yes"
    # Reject a malformed template (unknown placeholder, stray brace,
    # unbalanced quote) once, before any page is rendered.
    try:
        _build_agent_command(
            args.agent_cmd,
            {"image": "page.png", "page": 1, "doc_title": "title", "doc_id": "doc"},
            use_shell=use_shell,
        )
    except (KeyError, IndexError, ValueError) as exc:
        print(f"ERROR: --agent-cmd is not a usable template — {exc!r}", file=sys.stderr)
        return 2

    from kglite_docs.ingest.formats import render_page_image
    succeeded = failed = 0
    for p in pending:
        # Render the page to a temp PNG using whatever path was stored on
        # the Document node when it was ingested.
        try:
            doc_path = corpus.cypher(
                "MATCH (d:Document {id: $id}) RETURN d.path AS path",
                params={"id": p["doc_id"]},
            ).to_list()[0]["path"]
            png = render_page_image(doc_path, int(p["page_number"]), dpi=args.dpi)
        except Exception as exc:
            print(f" ✗ p.{p['page_number']} {p['page_id']}: render failed — {exc}", file=sys.stderr)
            failed += 1
            continue

        with tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False) as fh:
            fh.write(png)
            image_path = Path(fh.name)
        try:
            try:
                cmd = _build_agent_command(
                    args.agent_cmd,
                    {
                        "image": str(image_path),
                        "page": p["page_number"],
                        "doc_title": p.get("doc_title", ""),
                        "doc_id": p["doc_id"],
                    },
                    use_shell=use_shell,
                )
                proc = subprocess.run(
                    cmd,
                    shell=use_shell,
                    capture_output=True, text=True,
                    timeout=args.timeout, check=False,
                )
            except subprocess.TimeoutExpired:
                print(f" ✗ p.{p['page_number']} {p['page_id']}: agent timed out (>{args.timeout}s)", file=sys.stderr)
                failed += 1
                continue
            except (OSError, KeyError, IndexError, ValueError) as exc:
                # E.g. the agent binary is missing. Count the page as failed
                # so pages already submitted are still saved below.
                print(f" ✗ p.{p['page_number']} {p['page_id']}: could not run agent — {exc}", file=sys.stderr)
                failed += 1
                continue
            if proc.returncode != 0:
                stderr_preview = (proc.stderr or "")[:160].replace("\n", " ")
                print(f" ✗ p.{p['page_number']} {p['page_id']}: agent exited {proc.returncode} — {stderr_preview}", file=sys.stderr)
                failed += 1
                continue
            md = (proc.stdout or "").strip()
            if not md:
                print(f" ✗ p.{p['page_number']} {p['page_id']}: empty agent output", file=sys.stderr)
                failed += 1
                continue
            corpus.submit_ocr(
                p["page_id"], md,
                agent_id=args.agent_id, model=args.model,
            )
            print(f" ✓ p.{p['page_number']} of {p.get('doc_title','')} ({len(md)} chars)")
            succeeded += 1
        finally:
            # The temp PNG is removed on every path, including `continue`.
            image_path.unlink(missing_ok=True)

    corpus.save(args.db)
    total = succeeded + failed
    print(f"\nfinished: {succeeded}/{total} pages OCR'd, {failed} failures")
    return 0 if failed == 0 else 1
179
+
180
+
181
def _cmd_ocr_status(args: argparse.Namespace) -> int:
    """Print OCR coverage for the corpus, optionally one line per document.

    An empty ``--doc`` value is passed to the corpus as ``None``.
    Returns 0 when no pages are pending OCR and 1 otherwise.
    """
    corpus = Corpus.open(args.db)
    status = corpus.ocr_status(doc_id=args.doc or None)

    headline = (
        f"{status['pending_pages']}/{status['total_pages']} pages pending OCR "
        f"({status['documents_with_pending']}/{status['documents_total']} docs)"
    )
    print(headline)

    if args.verbose:
        for doc in status["documents"]:
            # "!" marks documents that still have pages waiting.
            marker = " " if not doc["pending"] else "!"
            detail = (
                f" {marker} {doc['pending']:>3}/{doc['pages']:<3} {doc['format']:<5} "
                f"{doc['title']} ({doc['doc_id'][:18]}…)"
            )
            print(detail)

    if status["pending_pages"] == 0:
        return 0
    return 1
196
+
197
+
198
def _cmd_show(args: argparse.Namespace) -> int:
    """Print a document or a chunk as indented JSON.

    Chunks are fetched with neighbors and summaries included. Values JSON
    cannot encode natively are rendered with ``str``. Returns 0, or 2 for an
    unrecognised kind.
    """
    corpus = Corpus.open(args.db)
    kind = args.kind
    if kind not in ("doc", "chunk"):
        print(f"unknown kind: {args.kind}", file=sys.stderr)
        return 2

    if kind == "doc":
        payload = corpus.get_document(args.id)
    else:
        payload = corpus.get_chunk(args.id, with_neighbors=True, with_summaries=True)

    print(json.dumps(payload, indent=2, default=str))
    return 0
209
+
210
+
211
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the chosen sub-command.

    ``argv`` defaults to the process arguments when ``None``. Returns the
    handler's exit status as an int; a falsy status is reported as 0.
    """
    parser = argparse.ArgumentParser(prog="kglite-docs")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # ingest
    ingest = commands.add_parser(
        "ingest",
        help="Ingest a document or directory (PDF/DOCX/PPTX/MD/HTML/TXT/images)",
    )
    ingest.add_argument("target", help="Path to a file or a directory")
    ingest.add_argument("--db", required=True)
    ingest.add_argument("--recursive", action="store_true")
    ingest.set_defaults(func=_cmd_ingest)

    # search
    search = commands.add_parser("search", help="Semantic search")
    search.add_argument("query")
    search.add_argument("--db", required=True)
    search.add_argument("--top-k", type=int, default=10)
    search.add_argument("--snippet", type=int, default=180)
    search.add_argument("--agent", default="")
    search.set_defaults(func=_cmd_search)

    # list
    listing = commands.add_parser("list", help="List documents")
    listing.add_argument("--db", required=True)
    listing.add_argument("--limit", type=int, default=100)
    listing.set_defaults(func=_cmd_list)

    # cluster
    cluster = commands.add_parser("cluster", help="Run clustering")
    cluster.add_argument("--db", required=True)
    cluster.add_argument("--algorithm", default="louvain")
    cluster.set_defaults(func=_cmd_cluster)

    # ocr-status
    ocr_status = commands.add_parser(
        "ocr-status",
        help="OCR coverage summary across the corpus",
    )
    ocr_status.add_argument("--db", required=True)
    ocr_status.add_argument("--doc", default="", help="Scope to one document id")
    ocr_status.add_argument(
        "-v", "--verbose", action="store_true", help="Per-document detail"
    )
    ocr_status.set_defaults(func=_cmd_ocr_status)

    # ocr-do
    ocr_do = commands.add_parser(
        "ocr-do",
        help="Run an agent command across every page flagged needs_ocr=True",
        description=(
            "Iterate over pages that need OCR. For each, render the page to a "
            "PNG, run the supplied agent command (must include the {image} "
            "placeholder), and submit the command's stdout back as the page's "
            "markdown. Pages are processed serially."
        ),
    )
    ocr_do.add_argument("--db", required=True)
    ocr_do.add_argument(
        "--agent-cmd",
        required=True,
        help='Command template — must contain {image}. Other placeholders: '
             '{page}, {doc_title}, {doc_id}. Example: '
             '\'claude -p --bare --image {image} "Transcribe to markdown"\'',
    )
    ocr_do.add_argument(
        "--agent-id",
        default="cli-ocr-agent",
        help="Agent id recorded on each submission (default: cli-ocr-agent)",
    )
    ocr_do.add_argument(
        "--model",
        default="",
        help="Model name to record on each page (informational)",
    )
    ocr_do.add_argument("--doc", default="", help="Scope to one document id")
    ocr_do.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Max pages to process this run (default: 100)",
    )
    ocr_do.add_argument(
        "--dpi",
        type=int,
        default=200,
        help="DPI for the page render handed to the agent",
    )
    ocr_do.add_argument(
        "--timeout",
        type=int,
        default=180,
        help="Per-page agent timeout in seconds (default: 180)",
    )
    ocr_do.add_argument(
        "--shell",
        choices=["no", "yes"],
        default="no",
        help='If "yes", run the command through a shell (allows '
             'pipes/quoting). Default splits with shlex.',
    )
    ocr_do.add_argument(
        "--dry-run",
        action="store_true",
        help="List what would be processed; don't invoke the agent",
    )
    ocr_do.set_defaults(func=_cmd_ocr_do)

    # show
    show = commands.add_parser("show", help="Show a document or chunk by id")
    show.add_argument("kind", choices=["doc", "chunk"])
    show.add_argument("id")
    show.add_argument("--db", required=True)
    show.set_defaults(func=_cmd_show)

    parsed = parser.parse_args(argv)
    status = parsed.func(parsed)
    return int(status or 0)
288
+
289
+
290
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)