chunksmith-agent 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_agent/__init__.py +13 -0
- chunksmith_agent/agent.py +59 -0
- chunksmith_agent/element_retrieval.py +164 -0
- chunksmith_agent/index_builder.py +325 -0
- chunksmith_agent/index_context.py +310 -0
- chunksmith_agent/langchain_runtime.py +101 -0
- chunksmith_agent/models.py +60 -0
- chunksmith_agent/retrieval.py +80 -0
- chunksmith_agent/session.py +44 -0
- chunksmith_agent/settings.py +68 -0
- chunksmith_agent/tool_agent.py +264 -0
- chunksmith_agent-0.4.0.dist-info/METADATA +82 -0
- chunksmith_agent-0.4.0.dist-info/RECORD +15 -0
- chunksmith_agent-0.4.0.dist-info/WHEEL +5 -0
- chunksmith_agent-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Reasoning-based Q&A over ChunkSmith multimodal indexes (outline tree + elements)."""
|
|
2
|
+
|
|
3
|
+
from chunksmith_agent.agent import ChunkSmithAgent
|
|
4
|
+
from chunksmith_agent.index_builder import build_document_index, build_document_index_from_saved
|
|
5
|
+
from chunksmith_agent.models import AgentAnswer, DocumentIndex
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"AgentAnswer",
|
|
9
|
+
"ChunkSmithAgent",
|
|
10
|
+
"DocumentIndex",
|
|
11
|
+
"build_document_index",
|
|
12
|
+
"build_document_index_from_saved",
|
|
13
|
+
]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""ChunkSmith agent session: Q&A over a built document index."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Iterator
|
|
6
|
+
|
|
7
|
+
from chunksmith_agent.index_builder import build_document_index
|
|
8
|
+
from chunksmith_agent.models import AgentAnswer, DocumentIndex
|
|
9
|
+
from chunksmith_agent.retrieval import answer_question, iter_answer_events
|
|
10
|
+
from chunksmith_agent.session import AgentConversation
|
|
11
|
+
from chunksmith_agent.settings import AgentSettings, load_settings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ChunkSmithAgent:
    """Question-answering session bound to a single ``DocumentIndex``.

    Each instance owns an ``AgentConversation`` that is passed to every
    retrieval call, so consecutive questions share that object until
    :meth:`reset_conversation` swaps in a new one.
    """

    def __init__(
        self,
        index: DocumentIndex,
        settings: AgentSettings | None = None,
    ) -> None:
        """Keep *index*, resolve the settings, and open a fresh conversation.

        A falsy *settings* value is replaced by the result of ``load_settings()``.
        """
        self.index = index
        self.settings = settings if settings else load_settings()
        self.conversation = AgentConversation()

    @classmethod
    def from_multimodal_output(cls, out: dict[str, Any], settings: AgentSettings | None = None) -> ChunkSmithAgent:
        """Alternate constructor: index *out* with ``build_document_index`` first."""
        document_index = build_document_index(out)
        return cls(document_index, settings=settings)

    def reset_conversation(self) -> None:
        """Drop the current conversation object and start over with a new one."""
        self.conversation = AgentConversation()

    def ask(self, query: str, *, stream_tokens: bool = False) -> AgentAnswer:
        """Answer *query* against the held index using the current conversation.

        NOTE(review): ``stream_tokens`` is accepted but never forwarded to
        ``answer_question`` -- confirm whether that is intentional.
        """
        return answer_question(
            self.index,
            query,
            self.settings,
            conversation=self.conversation,
        )

    def ask_events(
        self,
        query: str,
        *,
        event_sink: Any | None = None,
        emit_image_events: bool = True,
        emit_table_events: bool = True,
    ) -> Iterator[tuple[str, dict[str, Any]]]:
        """Yield ``(event_name, payload)`` pairs produced by ``iter_answer_events``.

        All keyword options are forwarded unchanged, together with the
        current conversation.
        """
        forwarded: dict[str, Any] = {
            "event_sink": event_sink,
            "emit_image_events": emit_image_events,
            "emit_table_events": emit_table_events,
            "conversation": self.conversation,
        }
        yield from iter_answer_events(self.index, query, self.settings, **forwarded)
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Element helpers for building per-node media from canonical bundles (standalone)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# Row keys consulted first when looking for a section's starting anchor.
ANCHOR_START_KEYS = ("split_document_anchor_start", "anchor_start")
# Row keys naming a section's ending anchor.
ANCHOR_END_KEYS = ("split_document_anchor_end", "anchor_end")
# Row keys consulted for the starting anchor only when no primary key has a value.
FALLBACK_START_KEYS = ("split_document_anchor", "anchor")


def anchor_start_from_row(row: dict[str, Any]) -> str:
    """Return the first non-blank start anchor found in *row*.

    Keys are tried in priority order: ``ANCHOR_START_KEYS`` first, then
    ``FALLBACK_START_KEYS``. Each value is stringified and stripped; the
    first non-empty result is returned, cut to 240 characters. An empty
    string is returned when no key yields text.
    """
    for key in (*ANCHOR_START_KEYS, *FALLBACK_START_KEYS):
        candidate = str(row.get(key) or "").strip()
        if candidate:
            return candidate[:240]
    return ""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def flatten_nodes(structure: list[Any]) -> list[dict[str, Any]]:
    """Return every dict node of an outline tree in depth-first pre-order.

    A node's children are read from its ``"nodes"`` key and are visited
    only when that value is a list. Entries that are not dicts are skipped.
    The returned list holds the original node objects, not copies.
    """
    flat: list[dict[str, Any]] = []
    # Explicit stack; items are pushed reversed so they pop in document order.
    pending: list[Any] = list(structure)
    pending.reverse()
    while pending:
        entry = pending.pop()
        if not isinstance(entry, dict):
            continue
        flat.append(entry)
        children = entry.get("nodes")
        if isinstance(children, list):
            pending.extend(reversed(children))
    return flat
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class BundleElement:
    """A single element taken from a canonical bundle's ``elements`` list."""

    # Kind label of the element (for example "Text" or "Table").
    element_type: str
    # Plain-text content; may be an empty string.
    text: str
    # HTML rendering of the element when one is available, otherwise None.
    text_as_html: str | None
    # Page the element belongs to.
    page_number: int
    # Identifier carried over from the bundle.
    element_id: int | str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _parse_element(raw: dict[str, Any]) -> BundleElement | None:
|
|
51
|
+
if not isinstance(raw, dict):
|
|
52
|
+
return None
|
|
53
|
+
pn = raw.get("page_number")
|
|
54
|
+
if pn is None:
|
|
55
|
+
return None
|
|
56
|
+
eid = raw.get("element_id", 0)
|
|
57
|
+
etype = str(raw.get("element_type") or raw.get("type") or "Text")
|
|
58
|
+
text = str(raw.get("text") or "")
|
|
59
|
+
html = raw.get("text_as_html")
|
|
60
|
+
html_str = str(html) if isinstance(html, str) and html.strip() else None
|
|
61
|
+
return BundleElement(
|
|
62
|
+
element_type=etype,
|
|
63
|
+
text=text,
|
|
64
|
+
text_as_html=html_str,
|
|
65
|
+
page_number=max(1, int(pn)),
|
|
66
|
+
element_id=eid,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _bundle_elements(bundle: dict[str, Any]) -> list[BundleElement]:
|
|
71
|
+
out: list[BundleElement] = []
|
|
72
|
+
for raw in bundle.get("elements") or []:
|
|
73
|
+
el = _parse_element(raw)
|
|
74
|
+
if el is not None:
|
|
75
|
+
out.append(el)
|
|
76
|
+
return out
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _element_body(el: BundleElement) -> str:
|
|
80
|
+
if el.element_type.strip().lower() == "table" and el.text_as_html:
|
|
81
|
+
return el.text_as_html
|
|
82
|
+
return el.text or ""
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _elements_in_page_span(
|
|
86
|
+
elements: list[BundleElement],
|
|
87
|
+
start_page: int,
|
|
88
|
+
end_page: int,
|
|
89
|
+
) -> list[BundleElement]:
|
|
90
|
+
lo = max(1, int(start_page))
|
|
91
|
+
hi = max(lo, int(end_page))
|
|
92
|
+
return [el for el in elements if lo <= el.page_number <= hi]
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _needle_from_node(node: dict[str, Any]) -> str:
|
|
96
|
+
pseudo = str(node.get("split_document_anchor") or "").strip()
|
|
97
|
+
if pseudo:
|
|
98
|
+
return pseudo[:240]
|
|
99
|
+
return anchor_start_from_row(node)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _next_needle_from_structure(
    structure: list[dict[str, Any]],
    node: dict[str, Any],
) -> str:
    """Return the text that marks where the section after *node* begins.

    The outline is flattened with ``flatten_nodes`` and the first row whose
    ``node_id`` equals *node*'s is located. The row that follows it in that
    flattened order supplies the needle: its start anchor when available,
    otherwise its stripped title cut to 240 characters. An empty string is
    returned when *node* is not found or is the last row.
    """
    ordered = flatten_nodes(structure)
    wanted = node.get("node_id")
    position = next(
        (i for i, row in enumerate(ordered) if row.get("node_id") == wanted),
        None,
    )
    if position is None or position + 1 >= len(ordered):
        return ""
    successor = ordered[position + 1]
    anchor = anchor_start_from_row(successor)
    if anchor:
        return anchor
    return str(successor.get("title") or "").strip()[:240]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _span_blob_with_offsets(
|
|
123
|
+
span_els: list[BundleElement],
|
|
124
|
+
) -> tuple[str, list[tuple[BundleElement, int, int]]]:
|
|
125
|
+
parts: list[str] = []
|
|
126
|
+
offsets: list[tuple[BundleElement, int, int]] = []
|
|
127
|
+
pos = 0
|
|
128
|
+
for el in span_els:
|
|
129
|
+
body = _element_body(el)
|
|
130
|
+
if not body:
|
|
131
|
+
continue
|
|
132
|
+
start = pos
|
|
133
|
+
parts.append(body)
|
|
134
|
+
pos += len(body)
|
|
135
|
+
parts.append("\n\n")
|
|
136
|
+
pos += 2
|
|
137
|
+
offsets.append((el, start, pos - 2))
|
|
138
|
+
return "".join(parts), offsets
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _slice_section_range(
|
|
142
|
+
blob: str,
|
|
143
|
+
offsets: list[tuple[BundleElement, int, int]],
|
|
144
|
+
*,
|
|
145
|
+
anchor: str,
|
|
146
|
+
pseudo_node: dict[str, Any],
|
|
147
|
+
next_needle: str,
|
|
148
|
+
) -> tuple[int, int]:
|
|
149
|
+
del pseudo_node # reserved for future anchor refinement
|
|
150
|
+
if not blob.strip():
|
|
151
|
+
return 0, 0
|
|
152
|
+
start = 0
|
|
153
|
+
if anchor:
|
|
154
|
+
idx = blob.find(anchor)
|
|
155
|
+
if idx >= 0:
|
|
156
|
+
start = idx
|
|
157
|
+
end = len(blob)
|
|
158
|
+
if next_needle:
|
|
159
|
+
idx = blob.find(next_needle, start + max(1, len(anchor)))
|
|
160
|
+
if idx >= 0:
|
|
161
|
+
end = idx
|
|
162
|
+
if not offsets:
|
|
163
|
+
return start, end
|
|
164
|
+
return start, end
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
"""Build ``DocumentIndex`` from a multimodal pipeline result or saved JSON."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from chunksmith_agent.index_context import flatten_structure
|
|
10
|
+
from chunksmith_agent.models import DocumentIndex
|
|
11
|
+
from chunksmith_agent.element_retrieval import (
|
|
12
|
+
_bundle_elements,
|
|
13
|
+
_element_body,
|
|
14
|
+
_elements_in_page_span,
|
|
15
|
+
_needle_from_node,
|
|
16
|
+
_next_needle_from_structure,
|
|
17
|
+
_slice_section_range,
|
|
18
|
+
_span_blob_with_offsets,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
_TABLE_TYPES = frozenset({"table"})
|
|
22
|
+
_IMAGE_TYPES = frozenset({"image", "figure", "chart"})
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _node_page_span(node: dict[str, Any]) -> tuple[int, int] | None:
|
|
26
|
+
si = node.get("start_index")
|
|
27
|
+
ei = node.get("end_index")
|
|
28
|
+
if si is None or ei is None:
|
|
29
|
+
pi = node.get("physical_index")
|
|
30
|
+
if pi is not None:
|
|
31
|
+
p = int(pi)
|
|
32
|
+
return p, p
|
|
33
|
+
return None
|
|
34
|
+
return max(1, int(si)), max(int(si), int(ei))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _load_png_b64(path: Path, *, max_bytes: int = 4_000_000) -> str | None:
|
|
38
|
+
try:
|
|
39
|
+
raw = path.read_bytes()
|
|
40
|
+
if len(raw) > max_bytes:
|
|
41
|
+
return None
|
|
42
|
+
return base64.standard_b64encode(raw).decode("ascii")
|
|
43
|
+
except OSError:
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _media_for_node(
    node: dict[str, Any],
    *,
    structure: list[dict[str, Any]],
    elements: list[Any],
    image_dir: Path | None,
    png_cycle: list[Path],
) -> dict[str, Any]:
    """Collect the text, tables and images that belong to one outline node.

    The node's page span selects candidate elements; when their combined
    text is non-blank, the span is narrowed to the part between the node's
    own anchor and the anchor of the next node in the flattened outline.
    If narrowing raises, all elements of the page span are used instead.

    Returns a dict with:
        ``text``: the node's own ``text`` when non-blank, otherwise the
            joined bodies of the selected elements.
        ``tables``: one entry per table element that carries HTML.
        ``images``: one entry per image/figure/chart element, ranked in
            order of appearance within this node.

    ``image_dir`` is accepted but not read here.
    """
    span = _node_page_span(node)
    text = str(node.get("text") or "").strip()
    tables: list[dict[str, Any]] = []
    images: list[dict[str, Any]] = []

    if not (span and elements):
        return {"text": text, "tables": tables, "images": images}

    first_page, last_page = span
    span_els = _elements_in_page_span(elements, first_page, last_page)
    blob, offsets = _span_blob_with_offsets(span_els)

    # Default to the whole page span; narrowed below when anchors allow it.
    section_els = span_els
    if blob.strip():
        needle = _needle_from_node(node)
        following = _next_needle_from_structure(structure, node)
        pseudo = {
            "split_document_anchor": needle,
            "title": node.get("title") or "",
        }
        try:
            lo, hi = _slice_section_range(
                blob,
                offsets,
                anchor=needle,
                pseudo_node=pseudo,
                next_needle=following,
            )
            # Keep every element whose body overlaps the [lo, hi) slice.
            section_els = [el for el, s, e in offsets if s < hi and e > lo]
        except Exception:
            # Best effort: fall back to the unsliced page span.
            section_els = span_els

    if not text:
        bodies = (_element_body(el).strip() for el in section_els)
        text = "\n\n".join(body for body in bodies if body)

    image_rank = 0
    for el in section_els:
        kind = el.element_type.strip().lower()
        if kind in _TABLE_TYPES and el.text_as_html:
            tables.append(
                {
                    "page_number": el.page_number,
                    "html": el.text_as_html,
                    "element_id": el.element_id,
                }
            )
        if kind not in _IMAGE_TYPES:
            continue
        image_rank += 1
        entry: dict[str, Any] = {
            "page_number": el.page_number,
            "rank": image_rank,
            "element_id": el.element_id,
            "summary": (el.text or "")[:500],
        }
        if png_cycle:
            # NOTE(review): files are assigned purely by per-node rank, so
            # every node's first image maps to png_cycle[0] -- confirm that
            # this pairing is intended.
            png = png_cycle[(image_rank - 1) % len(png_cycle)]
            entry["image_path"] = str(png.resolve())
            encoded = _load_png_b64(png)
            if encoded:
                entry["image_base64"] = encoded
        images.append(entry)

    return {"text": text, "tables": tables, "images": images}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def build_document_index(multimodal_out: dict[str, Any]) -> DocumentIndex:
    """Build a ``DocumentIndex`` from multimodal pipeline output.

    The input is the result of :func:`build_multimodal_index` /
    ``ChunkSmith_MultimodalIndexer_PDF``. Keys read from it:

    - ``structure``: outline tree; anything that is not a list is treated as empty.
    - ``canonical_bundle``: dict with ``elements`` and optionally
      ``path_image`` / ``coded_formate``; anything else is ignored.
    - ``artifact_root``: used to find an ``image`` directory when the bundle
      names none.
    - ``compressed_tree_string``: fallback for the coded tree string.
    - ``doc_name``: defaults to ``"document"``.

    Media is collected for every flattened node that has a non-blank
    ``node_id``, keyed by the stripped string form of that id.
    """
    structure = multimodal_out.get("structure")
    if not isinstance(structure, list):
        structure = []

    bundle = multimodal_out.get("canonical_bundle")
    if not isinstance(bundle, dict):
        bundle = None

    # Image directory: the bundle's own path wins; otherwise <artifact_root>/image.
    image_dir: Path | None = None
    if bundle:
        declared = bundle.get("path_image")
        if isinstance(declared, str) and declared.strip():
            image_dir = Path(declared)
    if image_dir is None:
        root = multimodal_out.get("artifact_root")
        if isinstance(root, str):
            candidate = Path(root) / "image"
            if candidate.is_dir():
                image_dir = candidate

    png_cycle: list[Path] = (
        sorted(image_dir.glob("*.png")) if image_dir and image_dir.is_dir() else []
    )
    elements = _bundle_elements(bundle) if bundle else []

    media_by_node: dict[str, dict[str, Any]] = {}
    for node in flatten_structure(structure):
        raw_id = node.get("node_id")
        key = "" if raw_id is None else str(raw_id).strip()
        if not key:
            continue
        media_by_node[key] = _media_for_node(
            node,
            structure=structure,
            elements=elements,
            image_dir=image_dir,
            png_cycle=png_cycle,
        )

    # Coded tree string: the bundle value is preferred over the top-level one.
    bundle_tree = bundle.get("coded_formate") if bundle else None
    fallback_tree = multimodal_out.get("compressed_tree_string")
    if isinstance(bundle_tree, str):
        coded = bundle_tree
    elif isinstance(fallback_tree, str):
        coded = fallback_tree
    else:
        coded = None

    return DocumentIndex(
        doc_name=str(multimodal_out.get("doc_name") or "document"),
        structure=structure,
        media_by_node=media_by_node,
        canonical_bundle=bundle,
        coded_formate=coded,
        image_dir=str(image_dir.resolve()) if image_dir else None,
    )
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _page_outline_summary(bundle: dict[str, Any], page: int) -> str:
|
|
182
|
+
"""Short preview for page-level nodes so the picker can match topics."""
|
|
183
|
+
parts: list[str] = []
|
|
184
|
+
has_table = False
|
|
185
|
+
has_figure = False
|
|
186
|
+
for el in bundle.get("elements") or []:
|
|
187
|
+
if not isinstance(el, dict):
|
|
188
|
+
continue
|
|
189
|
+
if el.get("page_number") is None or int(el["page_number"]) != page:
|
|
190
|
+
continue
|
|
191
|
+
kind = str(el.get("element_type") or "").strip().lower()
|
|
192
|
+
if kind == "table":
|
|
193
|
+
has_table = True
|
|
194
|
+
if kind in _IMAGE_TYPES:
|
|
195
|
+
has_figure = True
|
|
196
|
+
text = str(el.get("text") or "").strip().replace("\n", " ")
|
|
197
|
+
if not text or len(text) < 4:
|
|
198
|
+
continue
|
|
199
|
+
if kind in ("title", "header", "subtitle", "narrativetext", "listitem") and len(parts) < 4:
|
|
200
|
+
snippet = text[:140] + ("…" if len(text) > 140 else "")
|
|
201
|
+
if snippet not in parts:
|
|
202
|
+
parts.append(snippet)
|
|
203
|
+
preview = " · ".join(parts)[:420].strip()
|
|
204
|
+
flags: list[str] = []
|
|
205
|
+
if has_table:
|
|
206
|
+
flags.append("table")
|
|
207
|
+
if has_figure:
|
|
208
|
+
flags.append("figure")
|
|
209
|
+
if flags:
|
|
210
|
+
preview = f"{preview} [{' '.join(flags)}]" if preview else f"[{' '.join(flags)}]"
|
|
211
|
+
return preview or f"Page {page}"
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def structure_from_canonical_bundle(bundle: dict[str, Any]) -> list[dict[str, Any]]:
    """Derive a flat, page-per-node outline from a canonical bundle.

    Used when no ``*_pageindex.json`` exists. One node is produced per
    distinct page number found in ``bundle["elements"]`` (values below 1
    count as page 1), in ascending page order, with zero-padded sequential
    ``node_id`` values starting at ``"0000"``. When no element carries a
    page number, a single placeholder node covering page 1 is returned.
    """
    page_numbers = {
        max(1, int(el["page_number"]))
        for el in bundle.get("elements") or []
        if isinstance(el, dict) and el.get("page_number") is not None
    }

    if not page_numbers:
        placeholder = {
            "node_id": "0000",
            "title": "Document",
            "start_index": 1,
            "end_index": 1,
            "summary": "Full document (no page numbers in bundle).",
        }
        return [placeholder]

    return [
        {
            "node_id": f"{position:04d}",
            "title": f"Page {page_no}",
            "start_index": page_no,
            "end_index": page_no,
            "summary": _page_outline_summary(bundle, page_no),
        }
        for position, page_no in enumerate(sorted(page_numbers))
    ]
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _artifact_root_for_saved_bundle(
|
|
246
|
+
canonical_bundle_path: Path,
|
|
247
|
+
bundle: dict[str, Any],
|
|
248
|
+
) -> Path | None:
|
|
249
|
+
ar = bundle.get("artifact_root")
|
|
250
|
+
if isinstance(ar, str) and ar.strip():
|
|
251
|
+
root = Path(ar)
|
|
252
|
+
if root.is_dir():
|
|
253
|
+
return root.resolve()
|
|
254
|
+
parent = canonical_bundle_path.parent
|
|
255
|
+
if parent.name == "json":
|
|
256
|
+
root = parent.parent
|
|
257
|
+
if (root / "image").is_dir() or (root / "json").is_dir():
|
|
258
|
+
return root.resolve()
|
|
259
|
+
return None
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _bundle_with_resolved_image_dir(
|
|
263
|
+
bundle: dict[str, Any],
|
|
264
|
+
artifact_root: Path | None,
|
|
265
|
+
) -> dict[str, Any]:
|
|
266
|
+
if artifact_root is None:
|
|
267
|
+
return bundle
|
|
268
|
+
image_dir = artifact_root / "image"
|
|
269
|
+
if not image_dir.is_dir():
|
|
270
|
+
return bundle
|
|
271
|
+
pi = bundle.get("path_image")
|
|
272
|
+
if isinstance(pi, str) and pi.strip() and Path(pi).is_dir():
|
|
273
|
+
return bundle
|
|
274
|
+
out = dict(bundle)
|
|
275
|
+
out["path_image"] = str(image_dir.resolve())
|
|
276
|
+
out.setdefault("artifact_root", str(artifact_root.resolve()))
|
|
277
|
+
return out
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def build_document_index_from_saved(
    *,
    pageindex_path: Path | None = None,
    canonical_bundle_path: Path | None = None,
    artifact_root: Path | None = None,
) -> DocumentIndex:
    """Load a ``DocumentIndex`` from saved JSON files.

    Args:
        pageindex_path: Page-index / outline JSON. When this file exists it
            is the primary input and the canonical bundle (if also given and
            present) is attached to it.
        canonical_bundle_path: Canonical bundle JSON. Used on its own when
            no page-index file exists; a page-per-node outline is then
            derived from the bundle.
        artifact_root: Overrides the artifact root otherwise derived from
            the bundle file's location or contents.

    Raises:
        ValueError: Neither path was given, the page-index JSON is not an
            object, or the bundle file is not a canonical bundle.
        FileNotFoundError: None of the supplied paths names an existing file.
    """
    import json

    if pageindex_path is None and canonical_bundle_path is None:
        raise ValueError("pageindex_path or canonical_bundle_path is required")

    if pageindex_path is not None and pageindex_path.is_file():
        raw = json.loads(pageindex_path.read_text(encoding="utf-8"))
        if not isinstance(raw, dict):
            # Fail with a clear message instead of AttributeError on ``.get``.
            raise ValueError(f"Not a page index: {pageindex_path}")
        if isinstance(raw.get("elements"), list) and not isinstance(raw.get("structure"), list):
            # NOTE(review): this file has the shape of a canonical bundle
            # (elements, no structure), yet it is indexed as-is and any
            # canonical_bundle_path is ignored -- confirm this is intended.
            return build_document_index(raw)
        if canonical_bundle_path and canonical_bundle_path.is_file():
            cb = json.loads(canonical_bundle_path.read_text(encoding="utf-8"))
            if isinstance(cb, dict):
                ar = artifact_root or _artifact_root_for_saved_bundle(canonical_bundle_path, cb)
                cb = _bundle_with_resolved_image_dir(cb, ar)
                raw["canonical_bundle"] = cb
                if ar is not None:
                    raw["artifact_root"] = str(ar)
        return build_document_index(raw)

    if canonical_bundle_path is None or not canonical_bundle_path.is_file():
        # Name every path the caller supplied; previously only pageindex_path
        # was reported, which printed "None" for bundle-only calls.
        missing = ", ".join(
            str(candidate)
            for candidate in (pageindex_path, canonical_bundle_path)
            if candidate is not None
        )
        raise FileNotFoundError(f"Index file not found: {missing}")

    bundle = json.loads(canonical_bundle_path.read_text(encoding="utf-8"))
    if not isinstance(bundle, dict) or not isinstance(bundle.get("elements"), list):
        raise ValueError(f"Not a canonical bundle: {canonical_bundle_path}")

    doc_name = str(bundle.get("doc_name") or canonical_bundle_path.stem)
    if doc_name.endswith("_canonical_bundle"):
        doc_name = doc_name[: -len("_canonical_bundle")]
    ar = artifact_root or _artifact_root_for_saved_bundle(canonical_bundle_path, bundle)
    bundle = _bundle_with_resolved_image_dir(bundle, ar)
    raw = {
        "doc_name": doc_name,
        "structure": structure_from_canonical_bundle(bundle),
        "canonical_bundle": bundle,
    }
    if ar is not None:
        raw["artifact_root"] = str(ar)
    return build_document_index(raw)
|