kglite-docs 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kglite_docs/__init__.py +62 -0
- kglite_docs/activity.py +146 -0
- kglite_docs/agents.py +98 -0
- kglite_docs/cli.py +291 -0
- kglite_docs/cluster.py +277 -0
- kglite_docs/context.py +81 -0
- kglite_docs/corpus.py +754 -0
- kglite_docs/embed.py +37 -0
- kglite_docs/enrich.py +343 -0
- kglite_docs/errors.py +65 -0
- kglite_docs/export.py +334 -0
- kglite_docs/ingest/__init__.py +12 -0
- kglite_docs/ingest/chunker.py +242 -0
- kglite_docs/ingest/formats.py +242 -0
- kglite_docs/ingest/hashing.py +55 -0
- kglite_docs/ingest/parser.py +76 -0
- kglite_docs/ingest/pipeline.py +272 -0
- kglite_docs/mcp_server/__init__.py +2 -0
- kglite_docs/mcp_server/__main__.py +37 -0
- kglite_docs/mcp_server/server.py +42 -0
- kglite_docs/mcp_server/tools.py +432 -0
- kglite_docs/ocr.py +244 -0
- kglite_docs/quality.py +212 -0
- kglite_docs/review.py +511 -0
- kglite_docs/schema.py +93 -0
- kglite_docs/store.py +215 -0
- kglite_docs/tagging.py +161 -0
- kglite_docs/translate.py +157 -0
- kglite_docs/types.py +306 -0
- kglite_docs-0.0.1.dist-info/METADATA +135 -0
- kglite_docs-0.0.1.dist-info/RECORD +34 -0
- kglite_docs-0.0.1.dist-info/WHEEL +4 -0
- kglite_docs-0.0.1.dist-info/entry_points.txt +3 -0
- kglite_docs-0.0.1.dist-info/licenses/LICENSE +21 -0
kglite_docs/__init__.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""kglite-docs — agent-first PDF knowledge base on top of kglite."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from kglite_docs.corpus import Corpus
|
|
6
|
+
from kglite_docs.errors import (
|
|
7
|
+
ConcurrencyError,
|
|
8
|
+
GroundingError,
|
|
9
|
+
IngestError,
|
|
10
|
+
InvalidEnumError,
|
|
11
|
+
KgliteDocsError,
|
|
12
|
+
MissingSourceError,
|
|
13
|
+
ReviewConflict,
|
|
14
|
+
SelfVerificationError,
|
|
15
|
+
UnsupportedFormatError,
|
|
16
|
+
)
|
|
17
|
+
from kglite_docs.schema import (
|
|
18
|
+
AGENT,
|
|
19
|
+
CHUNK,
|
|
20
|
+
CHUNK_TEXT_EMB,
|
|
21
|
+
CLUSTER,
|
|
22
|
+
DOCUMENT,
|
|
23
|
+
DOCUMENT_TITLE_EMB,
|
|
24
|
+
NOTE,
|
|
25
|
+
PAGE,
|
|
26
|
+
SUMMARY,
|
|
27
|
+
SUMMARY_TEXT_EMB,
|
|
28
|
+
TAG,
|
|
29
|
+
VIEW,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Resolve the version of the installed "kglite-docs" distribution. Any failure
# (the handler is intentionally broad) falls back to a local placeholder.
try:
    from importlib.metadata import version as _pkg_version
    __version__ = _pkg_version("kglite-docs")
except Exception:  # pragma: no cover - not installed (e.g. running from source)
    __version__ = "0.0.0+local"

# Public API of the package: the schema constants, the Corpus class, the
# error types imported above, and the version string.
__all__ = [
    "AGENT",
    "CHUNK",
    "CHUNK_TEXT_EMB",
    "CLUSTER",
    "DOCUMENT",
    "DOCUMENT_TITLE_EMB",
    "NOTE",
    "PAGE",
    "SUMMARY",
    "SUMMARY_TEXT_EMB",
    "TAG",
    "VIEW",
    "ConcurrencyError",
    "Corpus",
    "GroundingError",
    "IngestError",
    "InvalidEnumError",
    "KgliteDocsError",
    "MissingSourceError",
    "ReviewConflict",
    "SelfVerificationError",
    "UnsupportedFormatError",
    "__version__",
]
|
kglite_docs/activity.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Agent identity + view tracking.
|
|
2
|
+
|
|
3
|
+
Agents are lazily registered on their first mutation; views can be
|
|
4
|
+
recorded explicitly (with context) or implicitly (when `search` /
|
|
5
|
+
`get_chunk` are called with an `agent_id`).
|
|
6
|
+
|
|
7
|
+
Aggregate `view_count` + `last_viewed_at` on the Chunk is updated on
|
|
8
|
+
every recorded view — a cheap denormalisation so listings can sort by
|
|
9
|
+
attention without joining View nodes.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import uuid
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from kglite_docs.schema import (
|
|
19
|
+
AGENT,
|
|
20
|
+
AUTHORED,
|
|
21
|
+
CHUNK,
|
|
22
|
+
VIEW,
|
|
23
|
+
VIEWED,
|
|
24
|
+
)
|
|
25
|
+
from kglite_docs.store import Store
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _now() -> str:
|
|
29
|
+
return datetime.now(timezone.utc).isoformat()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
from kglite_docs.store import rows as _df_dicts # noqa: E402
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def register_agent(
    store: Store, *, agent_id: str, kind: str = "llm", model: str = ""
) -> dict[str, Any]:
    """Create the Agent node for `agent_id`, or touch it if it already exists.

    An existing agent gets `last_seen` refreshed and `action_count`
    incremented; `kind` and `model` are only written when the node is first
    created. Returns `{"id", "created", "last_seen"}`.
    """
    timestamp = _now()
    lookup = store.cypher(
        "MATCH (a:Agent {id: $id}) RETURN a.id AS id", params={"id": agent_id}
    )
    already_known = bool(_df_dicts(lookup))

    if not already_known:
        # First sighting: seed every bookkeeping field on the new node.
        new_agent = {
            "id": agent_id,
            "title": agent_id,
            "kind": kind,
            "model": model,
            "first_seen": timestamp,
            "last_seen": timestamp,
            "action_count": 1,
        }
        store.upsert_nodes(AGENT, [new_agent])
    else:
        store.cypher(
            "MATCH (a:Agent {id: $id}) SET a.last_seen = $now, a.action_count = coalesce(a.action_count, 0) + 1",
            params={"id": agent_id, "now": timestamp},
        )

    return {"id": agent_id, "created": not already_known, "last_seen": timestamp}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def list_agents(store: Store) -> list[dict[str, Any]]:
    """Return one row per Agent node, most recently seen first."""
    query = (
        "MATCH (a:Agent) RETURN a.id AS id, a.kind AS kind, a.model AS model, "
        "a.first_seen AS first_seen, a.last_seen AS last_seen, a.action_count AS actions "
        "ORDER BY a.last_seen DESC"
    )
    result = store.cypher(query)
    return _df_dicts(result)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def record_view(
    store: Store,
    *,
    agent_id: str,
    target_id: str,
    target_kind: str = CHUNK,
    context: str = "",
) -> dict[str, Any]:
    """Record that `agent_id` looked at `target_id`.

    The agent is registered (or touched) first. When the target is a Chunk
    its `view_count` / `last_viewed_at` are updated; other target kinds get
    no counter update here. A `View` node with an AUTHORED edge from the
    agent is written only when `context` is non-empty, and for Chunk targets
    a VIEWED edge (Agent -> Chunk) is written alongside it.

    Returns `{"recorded": True, "view_node": <view id or None>}`.
    """
    register_agent(store, agent_id=agent_id)
    timestamp = _now()
    is_chunk = target_kind == CHUNK

    if is_chunk:
        # Denormalised counters kept on the Chunk itself.
        store.cypher(
            "MATCH (c:Chunk {id: $id}) "
            "SET c.view_count = coalesce(c.view_count, 0) + 1, c.last_viewed_at = $now",
            params={"id": target_id, "now": timestamp},
        )

    if context:
        view_id = str(uuid.uuid4())
        view_node = {
            "id": view_id,
            "title": context[:60],
            "agent_id": agent_id,
            "target_id": target_id,
            "target_kind": target_kind,
            "at": timestamp,
            "context": context,
        }
        store.upsert_nodes(VIEW, [view_node])
        store.upsert_edges(
            AUTHORED,
            [{"src": agent_id, "dst": view_id}],
            source_type=AGENT,
            target_type=VIEW,
        )
        if is_chunk:
            # Repeated writes of this aggregate edge are tolerated; no
            # uniqueness is required.
            viewed_edge = {
                "src": agent_id,
                "dst": target_id,
                "at": timestamp,
                "context": context,
            }
            store.upsert_edges(
                VIEWED, [viewed_edge], source_type=AGENT, target_type=CHUNK
            )
        return {"recorded": True, "view_node": view_id}

    # Context-free visit: counters only, no View node.
    return {"recorded": True, "view_node": None}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def agent_activity(store: Store, agent_id: str, *, limit: int = 50) -> dict[str, Any]:
    """Return summary + recent activity for an agent.

    Args:
        store: Graph store to query.
        agent_id: Id of the Agent node to report on.
        limit: Maximum number of views and of summaries returned (coerced
            with `int()` before being placed in the query text).

    Returns:
        A dict with the keys `agent`, `views`, `summaries` and `tags`.
        `agent` is `None` (and every list empty) when the agent is unknown.
        `tags` is always an empty list: this function does not query tag
        activity, the key is present so both branches share one shape.
    """
    agent_rows = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id}) RETURN a.id AS id, a.kind AS kind, "
        "a.first_seen AS first_seen, a.last_seen AS last_seen, a.action_count AS actions",
        params={"id": agent_id},
    ))
    if not agent_rows:
        return {"agent": None, "views": [], "summaries": [], "tags": []}

    views = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id})-[:AUTHORED]->(v:View) "
        "RETURN v.target_id AS target_id, v.target_kind AS target_kind, v.context AS context, v.at AS at "
        f"ORDER BY v.at DESC LIMIT {int(limit)}",
        params={"id": agent_id},
    ))
    sums = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id})-[:AUTHORED]->(s:Summary) "
        "RETURN s.id AS id, s.target_id AS target_id, s.text AS text, s.verification_status AS status "
        f"ORDER BY s.created_at DESC LIMIT {int(limit)}",
        params={"id": agent_id},
    ))
    # Previously the "tags" key existed only in the unknown-agent result, so
    # callers indexing ["tags"] failed for known agents.
    return {"agent": agent_rows[0], "views": views, "summaries": sums, "tags": []}
|
kglite_docs/agents.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""LLM caller abstraction used by the workflow demo and any user-built
|
|
2
|
+
agent loop.
|
|
3
|
+
|
|
4
|
+
Two backends:
|
|
5
|
+
|
|
6
|
+
- **anthropic_sdk** — if `ANTHROPIC_API_KEY` is set (and the `anthropic`
|
|
7
|
+
package installed), uses the official SDK. Best for production.
|
|
8
|
+
- **claude_cli** — shells out to the `claude -p` CLI. Reuses the user's
|
|
9
|
+
existing Claude Code auth — no separate API key required. Good for
|
|
10
|
+
one-off scripts and demos.
|
|
11
|
+
|
|
12
|
+
Pick automatically with `default_caller()`, or pass a specific one to
|
|
13
|
+
`call_agent()`.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
import subprocess
|
|
21
|
+
from typing import Protocol
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class AgentCaller(Protocol):
    """Structural type for a stateless, one-shot LLM call."""

    def __call__(self, prompt: str, *, system: str = "", model: str = "sonnet") -> str:
        """Send `prompt` (with optional `system` text) and return the reply text."""
        ...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def call_sdk(prompt: str, *, system: str = "", model: str = "claude-sonnet-4-6") -> str:
    """Anthropic SDK path. Requires `ANTHROPIC_API_KEY` and `anthropic`.

    Args:
        prompt: User message sent as the single turn of the conversation.
        system: System prompt; a generic assistant prompt is used when empty.
        model: Model id passed to the Messages API. The short alias
            ``"sonnet"`` (the default used by `call_agent` and the
            `AgentCaller` protocol) is translated to this function's default
            id.

    Returns:
        The concatenated text of every content block that carries text,
        stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the `anthropic` package is not installed.
    """
    try:
        import anthropic  # type: ignore
    except ImportError as e:
        raise RuntimeError(
            "anthropic SDK not installed. pip install anthropic"
        ) from e

    # `call_agent` forwards its own default, the CLI-style alias "sonnet", to
    # whichever backend is selected. NOTE(review): the Messages API expects a
    # full model id, so the bare alias is mapped to this function's default.
    if model == "sonnet":
        model = "claude-sonnet-4-6"

    client = anthropic.Anthropic()
    msg = client.messages.create(
        model=model,
        max_tokens=4096,
        system=system or "You are a helpful assistant.",
        messages=[{"role": "user", "content": prompt}],
    )
    # Blocks without a `text` attribute are skipped.
    return "".join(
        block.text for block in msg.content if hasattr(block, "text")
    ).strip()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def call_cli(prompt: str, *, system: str = "", model: str = "sonnet") -> str:
    """Run one prompt through the `claude -p` command-line tool.

    The prompt is fed on stdin; `system`, when non-empty, is passed via
    `--append-system-prompt`. The call is bounded by a 180 second timeout.

    Returns the CLI's stdout, stripped. Raises RuntimeError when the
    executable is not on PATH or exits with a non-zero status.
    """
    if shutil.which("claude") is None:
        raise RuntimeError("claude CLI not found on PATH")

    command = ["claude", "-p", "--bare", "--model", model]
    if system:
        command += ["--append-system-prompt", system]

    result = subprocess.run(
        command,
        input=prompt,
        text=True,
        capture_output=True,
        timeout=180,
        check=False,
    )
    if result.returncode:
        # Only the first 500 characters of stderr go into the message.
        raise RuntimeError(
            f"claude CLI failed (exit {result.returncode}): {result.stderr[:500]}"
        )
    return result.stdout.strip()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def default_caller() -> AgentCaller:
    """Choose a backend: `call_sdk` when usable, otherwise `call_cli`.

    The SDK path needs both a non-empty `ANTHROPIC_API_KEY` and an importable
    `anthropic` package; the CLI path needs `claude` on PATH. Raises
    RuntimeError when neither is available.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if api_key:
        try:
            import anthropic  # noqa: F401
        except ImportError:
            # Key present but SDK missing: fall through to the CLI check.
            pass
        else:
            return call_sdk

    cli_path = shutil.which("claude")
    if cli_path:
        return call_cli

    raise RuntimeError(
        "No LLM caller available — set ANTHROPIC_API_KEY (with `pip install anthropic`) "
        "or install the `claude` CLI."
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def call_agent(
    prompt: str,
    *,
    system: str = "",
    model: str = "sonnet",
    caller: AgentCaller | None = None,
) -> str:
    """Make a single LLM call and return its text.

    Uses `caller` when one is supplied, otherwise whatever
    `default_caller()` selects; `system` and `model` are forwarded as
    keyword arguments.
    """
    backend = caller if caller else default_caller()
    reply = backend(prompt, system=system, model=model)
    return reply
|
kglite_docs/cli.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""kglite-docs CLI — ingest, search, list, cluster from the shell.
|
|
2
|
+
|
|
3
|
+
Examples::
|
|
4
|
+
|
|
5
|
+
kglite-docs ingest paper.pdf --db kb.kgl
|
|
6
|
+
kglite-docs ingest ./pdfs/ --db kb.kgl --recursive
|
|
7
|
+
kglite-docs search "transformer attention" --db kb.kgl
|
|
8
|
+
kglite-docs list --db kb.kgl
|
|
9
|
+
kglite-docs cluster --db kb.kgl --algorithm louvain
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from kglite_docs import Corpus
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _open_or_create(db_path: str | Path) -> Corpus:
    """Open the corpus at `db_path` if the path exists, else create one there."""
    location = Path(db_path)
    if location.exists():
        return Corpus.open(location)
    return Corpus.create(location)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _cmd_ingest(args: argparse.Namespace) -> int:
    """Handle `ingest`: ingest a file or directory, print a JSON report, save.

    A directory target prints aggregate counts; a single file prints that
    document's id and counts. The corpus is saved back to `args.db` either
    way. Always returns 0.
    """
    corpus = _open_or_create(args.db)
    source = Path(args.target)

    if not source.is_dir():
        outcome = corpus.ingest(source)
        report = {
            "doc_id": outcome.doc_id,
            "created": outcome.created,
            "pages": outcome.page_count,
            "chunks": outcome.chunk_count,
            "ocr_pending": outcome.ocr_pending_pages,
        }
    else:
        results = corpus.ingest_dir(source, recursive=args.recursive)
        report = {
            "ingested": len([item for item in results if item.created]),
            "skipped": len([item for item in results if not item.created]),
            "total_chunks": sum(item.chunk_count for item in results),
            "ocr_pending": sum(item.ocr_pending_pages for item in results),
        }

    print(json.dumps(report, indent=2))
    corpus.save(args.db)
    return 0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _cmd_search(args: argparse.Namespace) -> int:
    """Handle `search`: print one line per hit (score, id, page, snippet).

    An empty `--agent` value is passed to the corpus as `None`. The snippet
    is the hit text truncated to `args.snippet` characters. Always returns 0.
    """
    corpus = Corpus.open(args.db)
    agent = args.agent or None
    results = corpus.search(args.query, top_k=args.top_k, agent_id=agent)
    for hit in results:
        score = hit.get("score", 0)
        hit_id = hit["id"]
        page = hit.get("page")
        text = (hit.get("text") or "")[: args.snippet]
        print(f"[{score:.3f}] {hit_id} p.{page} {text}")
    return 0
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _cmd_list(args: argparse.Namespace) -> int:
    """Handle `list`: print id, title, page count and chunk count per document."""
    corpus = Corpus.open(args.db)
    for doc in corpus.list_documents(limit=args.limit):
        doc_id = doc.get("id")
        title = doc.get("title")
        pages = doc.get("pages")
        chunks = doc.get("chunk_count")
        print(f"{doc_id} {title} pages={pages} chunks={chunks}")
    return 0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _cmd_cluster(args: argparse.Namespace) -> int:
    """Handle `cluster`: cluster chunks, print the result as JSON, save."""
    corpus = _open_or_create(args.db)
    outcome = corpus.cluster_chunks(algorithm=args.algorithm)
    rendered = json.dumps(outcome, indent=2)
    print(rendered)
    corpus.save(args.db)
    return 0
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _expand_agent_cmd(
    template: str, values: dict[str, object], *, use_shell: bool
) -> str | list[str]:
    """Substitute placeholder values into an ``--agent-cmd`` template.

    Without a shell the template is tokenised *before* substitution, so a
    value containing spaces or quote characters stays a single argument.
    With a shell, string values are escaped with ``shlex.quote`` so they are
    not interpreted as shell syntax; the template should therefore not wrap
    placeholders in quotes of its own.

    Raises KeyError, IndexError or ValueError for a malformed template.
    """
    import shlex

    if use_shell:
        escaped = {
            key: shlex.quote(val) if isinstance(val, str) else val
            for key, val in values.items()
        }
        return template.format(**escaped)
    return [token.format(**values) for token in shlex.split(template)]


def _cmd_ocr_do(args: argparse.Namespace) -> int:
    """Run an agent command across every page flagged `needs_ocr=True`.

    The command may use these placeholders:

    - ``{image}`` — path to a freshly-rendered PNG of the page
    - ``{page}`` — 1-based page number
    - ``{doc_title}`` — document title
    - ``{doc_id}`` — document id

    The command's stdout is taken as the OCR markdown and passed to
    `submit_ocr`. Exit code != 0 (or empty stdout) on a page → that page
    is skipped and logged. Pages are processed serially in v1.

    Placeholder values are substituted per argument (or shell-escaped when
    ``--shell yes``), so a document title containing spaces, quotes or shell
    metacharacters cannot alter the command line.

    Returns 0 when every page succeeded (or nothing was pending / dry run),
    1 when any page failed, and 2 when the command template is unusable.
    """
    import subprocess
    import sys
    import tempfile
    from pathlib import Path

    corpus = Corpus.open(args.db)
    pending = corpus.list_pending_ocr(
        doc_id=args.doc or None,
        limit=args.limit,
        include_images=False,  # we render to a temp file ourselves
        dpi=args.dpi,
    )
    if not pending:
        print("nothing to do — no pages flagged needs_ocr=True")
        return 0

    print(f"{len(pending)} pages pending across {len({p['doc_id'] for p in pending})} docs")
    if args.dry_run:
        for p in pending:
            print(f" would process p.{p['page_number']} of {p['doc_title']} ({p['page_id']})")
        return 0

    if "{image}" not in args.agent_cmd:
        print(
            "ERROR: --agent-cmd must contain the {image} placeholder so the\n"
            " page render can be passed to your vision agent.",
            file=sys.stderr,
        )
        return 2

    use_shell = args.shell == "yes"
    # Fail fast on a malformed template (unknown placeholder, stray brace,
    # unbalanced quote) instead of crashing part-way through the run.
    try:
        _expand_agent_cmd(
            args.agent_cmd,
            {"image": "", "page": 0, "doc_title": "", "doc_id": ""},
            use_shell=use_shell,
        )
    except (KeyError, IndexError, ValueError) as exc:
        print(f"ERROR: --agent-cmd is not a valid template — {exc!r}", file=sys.stderr)
        return 2

    from kglite_docs.ingest.formats import render_page_image
    succeeded = failed = 0
    for p in pending:
        # Render the page to a temp PNG using whatever path was stored on
        # the Document node when it was ingested.
        try:
            doc_path = corpus.cypher(
                "MATCH (d:Document {id: $id}) RETURN d.path AS path",
                params={"id": p["doc_id"]},
            ).to_list()[0]["path"]
            png = render_page_image(doc_path, int(p["page_number"]), dpi=args.dpi)
        except Exception as exc:
            print(f" ✗ p.{p['page_number']} {p['page_id']}: render failed — {exc}", file=sys.stderr)
            failed += 1
            continue

        with tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False) as fh:
            fh.write(png)
            image_path = Path(fh.name)
        try:
            try:
                cmd = _expand_agent_cmd(
                    args.agent_cmd,
                    {
                        "image": str(image_path),
                        "page": p["page_number"],
                        "doc_title": p.get("doc_title", ""),
                        "doc_id": p["doc_id"],
                    },
                    use_shell=use_shell,
                )
            except (KeyError, IndexError, ValueError) as exc:
                # A value-dependent formatting failure affects this page only.
                print(f" ✗ p.{p['page_number']} {p['page_id']}: could not build agent command — {exc!r}", file=sys.stderr)
                failed += 1
                continue
            try:
                proc = subprocess.run(
                    cmd,
                    shell=use_shell,
                    capture_output=True, text=True,
                    timeout=args.timeout, check=False,
                )
            except subprocess.TimeoutExpired:
                print(f" ✗ p.{p['page_number']} {p['page_id']}: agent timed out (>{args.timeout}s)", file=sys.stderr)
                failed += 1
                continue
            if proc.returncode != 0:
                stderr_preview = (proc.stderr or "")[:160].replace("\n", " ")
                print(f" ✗ p.{p['page_number']} {p['page_id']}: agent exited {proc.returncode} — {stderr_preview}", file=sys.stderr)
                failed += 1
                continue
            md = (proc.stdout or "").strip()
            if not md:
                print(f" ✗ p.{p['page_number']} {p['page_id']}: empty agent output", file=sys.stderr)
                failed += 1
                continue
            corpus.submit_ocr(
                p["page_id"], md,
                agent_id=args.agent_id, model=args.model,
            )
            print(f" ✓ p.{p['page_number']} of {p.get('doc_title','')} ({len(md)} chars)")
            succeeded += 1
        finally:
            # The temp PNG is removed whether or not the page succeeded.
            image_path.unlink(missing_ok=True)

    corpus.save(args.db)
    total = succeeded + failed
    print(f"\nfinished: {succeeded}/{total} pages OCR'd, {failed} failures")
    return 0 if failed == 0 else 1
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _cmd_ocr_status(args: argparse.Namespace) -> int:
    """Handle `ocr-status`: print OCR coverage, optionally per document.

    Returns 0 when no pages are pending and 1 otherwise.
    """
    corpus = Corpus.open(args.db)
    report = corpus.ocr_status(doc_id=args.doc or None)

    headline = (
        f"{report['pending_pages']}/{report['total_pages']} pages pending OCR "
        f"({report['documents_with_pending']}/{report['documents_total']} docs)"
    )
    print(headline)

    if args.verbose:
        for doc in report["documents"]:
            # "!" flags documents that still have pending pages.
            if doc["pending"]:
                marker = "!"
            else:
                marker = " "
            line = (
                f" {marker} {doc['pending']:>3}/{doc['pages']:<3} {doc['format']:<5} "
                f"{doc['title']} ({doc['doc_id'][:18]}…)"
            )
            print(line)

    if report["pending_pages"] == 0:
        return 0
    return 1
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _cmd_show(args: argparse.Namespace) -> int:
    """Handle `show`: dump a document or chunk as JSON.

    Chunks are fetched with neighbours and summaries. Returns 2 for an
    unrecognised kind, 0 otherwise. Values JSON cannot encode are
    stringified.
    """
    corpus = Corpus.open(args.db)

    if args.kind not in ("doc", "chunk"):
        print(f"unknown kind: {args.kind}", file=sys.stderr)
        return 2

    if args.kind == "doc":
        payload = corpus.get_document(args.id)
    else:
        payload = corpus.get_chunk(args.id, with_neighbors=True, with_summaries=True)

    print(json.dumps(payload, indent=2, default=str))
    return 0
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def main(argv: list[str] | None = None) -> int:
    """Build the `kglite-docs` argument parser and dispatch to a sub-command.

    `argv` defaults to the process arguments when None. The selected
    handler's return value is converted to an int exit status (a falsy
    value becomes 0).
    """
    parser = argparse.ArgumentParser(prog="kglite-docs")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # ingest
    ingest_cmd = commands.add_parser(
        "ingest",
        help="Ingest a document or directory (PDF/DOCX/PPTX/MD/HTML/TXT/images)",
    )
    ingest_cmd.add_argument("target", help="Path to a file or a directory")
    ingest_cmd.add_argument("--db", required=True)
    ingest_cmd.add_argument("--recursive", action="store_true")
    ingest_cmd.set_defaults(func=_cmd_ingest)

    # search
    search_cmd = commands.add_parser("search", help="Semantic search")
    search_cmd.add_argument("query")
    search_cmd.add_argument("--db", required=True)
    search_cmd.add_argument("--top-k", type=int, default=10)
    search_cmd.add_argument("--snippet", type=int, default=180)
    search_cmd.add_argument("--agent", default="")
    search_cmd.set_defaults(func=_cmd_search)

    # list
    list_cmd = commands.add_parser("list", help="List documents")
    list_cmd.add_argument("--db", required=True)
    list_cmd.add_argument("--limit", type=int, default=100)
    list_cmd.set_defaults(func=_cmd_list)

    # cluster
    cluster_cmd = commands.add_parser("cluster", help="Run clustering")
    cluster_cmd.add_argument("--db", required=True)
    cluster_cmd.add_argument("--algorithm", default="louvain")
    cluster_cmd.set_defaults(func=_cmd_cluster)

    # ocr-status
    status_cmd = commands.add_parser(
        "ocr-status", help="OCR coverage summary across the corpus"
    )
    status_cmd.add_argument("--db", required=True)
    status_cmd.add_argument("--doc", default="", help="Scope to one document id")
    status_cmd.add_argument(
        "-v", "--verbose", action="store_true", help="Per-document detail"
    )
    status_cmd.set_defaults(func=_cmd_ocr_status)

    # ocr-do
    ocr_description = (
        "Iterate over pages that need OCR. For each, render the page to a "
        "PNG, run the supplied agent command (must include the {image} "
        "placeholder), and submit the command's stdout back as the page's "
        "markdown. Pages are processed serially."
    )
    ocr_cmd = commands.add_parser(
        "ocr-do",
        help="Run an agent command across every page flagged needs_ocr=True",
        description=ocr_description,
    )
    ocr_cmd.add_argument("--db", required=True)
    ocr_cmd.add_argument(
        "--agent-cmd",
        required=True,
        help='Command template — must contain {image}. Other placeholders: '
             '{page}, {doc_title}, {doc_id}. Example: '
             '\'claude -p --bare --image {image} "Transcribe to markdown"\'',
    )
    ocr_cmd.add_argument(
        "--agent-id",
        default="cli-ocr-agent",
        help="Agent id recorded on each submission (default: cli-ocr-agent)",
    )
    ocr_cmd.add_argument(
        "--model",
        default="",
        help="Model name to record on each page (informational)",
    )
    ocr_cmd.add_argument("--doc", default="", help="Scope to one document id")
    ocr_cmd.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Max pages to process this run (default: 100)",
    )
    ocr_cmd.add_argument(
        "--dpi",
        type=int,
        default=200,
        help="DPI for the page render handed to the agent",
    )
    ocr_cmd.add_argument(
        "--timeout",
        type=int,
        default=180,
        help="Per-page agent timeout in seconds (default: 180)",
    )
    ocr_cmd.add_argument(
        "--shell",
        choices=["no", "yes"],
        default="no",
        help='If "yes", run the command through a shell (allows '
             'pipes/quoting). Default splits with shlex.',
    )
    ocr_cmd.add_argument(
        "--dry-run",
        action="store_true",
        help="List what would be processed; don't invoke the agent",
    )
    ocr_cmd.set_defaults(func=_cmd_ocr_do)

    # show
    show_cmd = commands.add_parser("show", help="Show a document or chunk by id")
    show_cmd.add_argument("kind", choices=["doc", "chunk"])
    show_cmd.add_argument("id")
    show_cmd.add_argument("--db", required=True)
    show_cmd.set_defaults(func=_cmd_show)

    parsed = parser.parse_args(argv)
    status = parsed.func(parsed)
    return int(status or 0)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# Script entry point: exit the process with main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|