chunksmith-agent 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ """Reasoning-based Q&A over ChunkSmith multimodal indexes (outline tree + elements)."""
2
+
3
+ from chunksmith_agent.agent import ChunkSmithAgent
4
+ from chunksmith_agent.index_builder import build_document_index, build_document_index_from_saved
5
+ from chunksmith_agent.models import AgentAnswer, DocumentIndex
6
+
7
# Names re-exported at package level; each one is imported above.
__all__ = [
    "AgentAnswer",
    "ChunkSmithAgent",
    "DocumentIndex",
    "build_document_index",
    "build_document_index_from_saved",
]
@@ -0,0 +1,59 @@
1
+ """ChunkSmith agent session: Q&A over a built document index."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Iterator
6
+
7
+ from chunksmith_agent.index_builder import build_document_index
8
+ from chunksmith_agent.models import AgentAnswer, DocumentIndex
9
+ from chunksmith_agent.retrieval import answer_question, iter_answer_events
10
+ from chunksmith_agent.session import AgentConversation
11
+ from chunksmith_agent.settings import AgentSettings, load_settings
12
+
13
+
14
class ChunkSmithAgent:
    """Question-answering session bound to one ``DocumentIndex``.

    A single ``AgentConversation`` is kept on the agent and passed to every
    retrieval call, so follow-up questions share conversation state until
    :meth:`reset_conversation` swaps in a fresh one.
    """

    def __init__(
        self,
        index: DocumentIndex,
        settings: AgentSettings | None = None,
    ) -> None:
        """Store *index*, resolve settings, and start an empty conversation.

        Args:
            index: The document index to answer questions over.
            settings: Agent settings; when falsy, ``load_settings()`` is used.
        """
        self.index = index
        self.settings = settings if settings else load_settings()
        self.conversation = AgentConversation()

    @classmethod
    def from_multimodal_output(
        cls,
        out: dict[str, Any],
        settings: AgentSettings | None = None,
    ) -> ChunkSmithAgent:
        """Alternate constructor: build the index from *out* via ``build_document_index``."""
        document_index = build_document_index(out)
        return cls(document_index, settings=settings)

    def reset_conversation(self) -> None:
        """Drop chat history and cached section selection by starting a new conversation."""
        self.conversation = AgentConversation()

    def ask(self, query: str, *, stream_tokens: bool = False) -> AgentAnswer:
        """Answer *query* against the index using the current conversation.

        Note: ``stream_tokens`` is accepted for interface compatibility but is
        not forwarded to ``answer_question`` by this method.
        """
        answer = answer_question(
            self.index,
            query,
            self.settings,
            conversation=self.conversation,
        )
        return answer

    def ask_events(
        self,
        query: str,
        *,
        event_sink: Any | None = None,
        emit_image_events: bool = True,
        emit_table_events: bool = True,
    ) -> Iterator[tuple[str, dict[str, Any]]]:
        """Yield ``(event_name, payload)`` pairs for CLI streaming.

        All keyword options are passed straight through to
        ``iter_answer_events`` together with the current conversation.
        """
        events = iter_answer_events(
            self.index,
            query,
            self.settings,
            event_sink=event_sink,
            emit_image_events=emit_image_events,
            emit_table_events=emit_table_events,
            conversation=self.conversation,
        )
        yield from events
@@ -0,0 +1,164 @@
1
+ """Element helpers for building per-node media from canonical bundles (standalone)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
# Row keys probed first (in this order) by ``anchor_start_from_row``.
ANCHOR_START_KEYS = (
    "split_document_anchor_start",
    "anchor_start",
)
# Row keys naming the end anchor of a section.
ANCHOR_END_KEYS = (
    "split_document_anchor_end",
    "anchor_end",
)
# Row keys probed by ``anchor_start_from_row`` only after ANCHOR_START_KEYS yield nothing.
FALLBACK_START_KEYS = (
    "split_document_anchor",
    "anchor",
)
11
+
12
+
13
def anchor_start_from_row(row: dict[str, Any]) -> str:
    """Return the first non-blank start anchor found on *row*, capped at 240 characters.

    Keys are probed in priority order: every entry of ``ANCHOR_START_KEYS``
    first, then every entry of ``FALLBACK_START_KEYS``. Each value is
    stringified and stripped; falsy or whitespace-only values are skipped.
    Returns ``""`` when no key holds a usable value.
    """
    for key in (*ANCHOR_START_KEYS, *FALLBACK_START_KEYS):
        candidate = str(row.get(key) or "").strip()
        if candidate:
            return candidate[:240]
    return ""
23
+
24
+
25
def flatten_nodes(structure: list[Any]) -> list[dict[str, Any]]:
    """Return every dict node of an outline tree in depth-first, pre-order.

    Children are read from each node's ``"nodes"`` key and are only descended
    into when that value is a list. Non-dict entries are ignored. The returned
    list holds the original node objects, not copies.

    The walk uses an explicit stack instead of recursion so that very deep
    outlines cannot hit the interpreter's recursion limit. The structure is
    assumed to be acyclic (as anything loaded from JSON is).
    """
    flat: list[dict[str, Any]] = []
    # Stack is kept reversed so that pop() yields entries in document order.
    pending: list[Any] = list(structure)
    pending.reverse()
    while pending:
        node = pending.pop()
        if not isinstance(node, dict):
            continue
        flat.append(node)
        children = node.get("nodes")
        if isinstance(children, list):
            # Push children reversed: the first child is visited right after its parent.
            pending.extend(reversed(children))
    return flat
39
+
40
+
41
@dataclass
class BundleElement:
    """One entry of a canonical bundle's ``elements`` list, in typed form."""

    # Kind label of the element; callers compare it case-insensitively (e.g. "table").
    element_type: str
    # Plain-text content of the element.
    text: str
    # HTML rendering of the element, or None when none is available.
    text_as_html: str | None
    # Page the element belongs to.
    page_number: int
    # Identifier carried over from the bundle.
    element_id: int | str
49
+
50
+ def _parse_element(raw: dict[str, Any]) -> BundleElement | None:
51
+ if not isinstance(raw, dict):
52
+ return None
53
+ pn = raw.get("page_number")
54
+ if pn is None:
55
+ return None
56
+ eid = raw.get("element_id", 0)
57
+ etype = str(raw.get("element_type") or raw.get("type") or "Text")
58
+ text = str(raw.get("text") or "")
59
+ html = raw.get("text_as_html")
60
+ html_str = str(html) if isinstance(html, str) and html.strip() else None
61
+ return BundleElement(
62
+ element_type=etype,
63
+ text=text,
64
+ text_as_html=html_str,
65
+ page_number=max(1, int(pn)),
66
+ element_id=eid,
67
+ )
68
+
69
+
70
+ def _bundle_elements(bundle: dict[str, Any]) -> list[BundleElement]:
71
+ out: list[BundleElement] = []
72
+ for raw in bundle.get("elements") or []:
73
+ el = _parse_element(raw)
74
+ if el is not None:
75
+ out.append(el)
76
+ return out
77
+
78
+
79
+ def _element_body(el: BundleElement) -> str:
80
+ if el.element_type.strip().lower() == "table" and el.text_as_html:
81
+ return el.text_as_html
82
+ return el.text or ""
83
+
84
+
85
+ def _elements_in_page_span(
86
+ elements: list[BundleElement],
87
+ start_page: int,
88
+ end_page: int,
89
+ ) -> list[BundleElement]:
90
+ lo = max(1, int(start_page))
91
+ hi = max(lo, int(end_page))
92
+ return [el for el in elements if lo <= el.page_number <= hi]
93
+
94
+
95
+ def _needle_from_node(node: dict[str, Any]) -> str:
96
+ pseudo = str(node.get("split_document_anchor") or "").strip()
97
+ if pseudo:
98
+ return pseudo[:240]
99
+ return anchor_start_from_row(node)
100
+
101
+
102
def _next_needle_from_structure(
    structure: list[dict[str, Any]],
    node: dict[str, Any],
) -> str:
    """Return the search string marking the start of the node that follows *node*.

    The outline is flattened with ``flatten_nodes`` and the first row whose
    ``node_id`` equals ``node["node_id"]`` is located. The row right after it
    supplies the needle: its start anchor when it has one, otherwise its
    stripped title capped at 240 characters. Returns ``""`` when *node* is
    not found or is the last row.
    """
    ordered = flatten_nodes(structure)
    wanted_id = node.get("node_id")
    position = next(
        (i for i, row in enumerate(ordered) if row.get("node_id") == wanted_id),
        None,
    )
    if position is None or position + 1 >= len(ordered):
        return ""
    follower = ordered[position + 1]
    anchor = anchor_start_from_row(follower)
    if anchor:
        return anchor
    return str(follower.get("title") or "").strip()[:240]
120
+
121
+
122
+ def _span_blob_with_offsets(
123
+ span_els: list[BundleElement],
124
+ ) -> tuple[str, list[tuple[BundleElement, int, int]]]:
125
+ parts: list[str] = []
126
+ offsets: list[tuple[BundleElement, int, int]] = []
127
+ pos = 0
128
+ for el in span_els:
129
+ body = _element_body(el)
130
+ if not body:
131
+ continue
132
+ start = pos
133
+ parts.append(body)
134
+ pos += len(body)
135
+ parts.append("\n\n")
136
+ pos += 2
137
+ offsets.append((el, start, pos - 2))
138
+ return "".join(parts), offsets
139
+
140
+
141
+ def _slice_section_range(
142
+ blob: str,
143
+ offsets: list[tuple[BundleElement, int, int]],
144
+ *,
145
+ anchor: str,
146
+ pseudo_node: dict[str, Any],
147
+ next_needle: str,
148
+ ) -> tuple[int, int]:
149
+ del pseudo_node # reserved for future anchor refinement
150
+ if not blob.strip():
151
+ return 0, 0
152
+ start = 0
153
+ if anchor:
154
+ idx = blob.find(anchor)
155
+ if idx >= 0:
156
+ start = idx
157
+ end = len(blob)
158
+ if next_needle:
159
+ idx = blob.find(next_needle, start + max(1, len(anchor)))
160
+ if idx >= 0:
161
+ end = idx
162
+ if not offsets:
163
+ return start, end
164
+ return start, end
@@ -0,0 +1,325 @@
1
+ """Build ``DocumentIndex`` from a multimodal pipeline result or saved JSON."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from chunksmith_agent.index_context import flatten_structure
10
+ from chunksmith_agent.models import DocumentIndex
11
+ from chunksmith_agent.element_retrieval import (
12
+ _bundle_elements,
13
+ _element_body,
14
+ _elements_in_page_span,
15
+ _needle_from_node,
16
+ _next_needle_from_structure,
17
+ _slice_section_range,
18
+ _span_blob_with_offsets,
19
+ )
20
+
21
+ _TABLE_TYPES = frozenset({"table"})
22
+ _IMAGE_TYPES = frozenset({"image", "figure", "chart"})
23
+
24
+
25
+ def _node_page_span(node: dict[str, Any]) -> tuple[int, int] | None:
26
+ si = node.get("start_index")
27
+ ei = node.get("end_index")
28
+ if si is None or ei is None:
29
+ pi = node.get("physical_index")
30
+ if pi is not None:
31
+ p = int(pi)
32
+ return p, p
33
+ return None
34
+ return max(1, int(si)), max(int(si), int(ei))
35
+
36
+
37
+ def _load_png_b64(path: Path, *, max_bytes: int = 4_000_000) -> str | None:
38
+ try:
39
+ raw = path.read_bytes()
40
+ if len(raw) > max_bytes:
41
+ return None
42
+ return base64.standard_b64encode(raw).decode("ascii")
43
+ except OSError:
44
+ return None
45
+
46
+
47
def _media_for_node(
    node: dict[str, Any],
    *,
    structure: list[dict[str, Any]],
    elements: list[Any],
    image_dir: Path | None,
    png_cycle: list[Path],
) -> dict[str, Any]:
    """Collect the text, tables and images that belong to one outline node.

    Args:
        node: Outline node to build media for.
        structure: Full outline, used to find the node that follows *node*.
        elements: Parsed bundle elements for the whole document.
        image_dir: Accepted for interface compatibility; not read here.
        png_cycle: PNG files handed out to image elements in rank order,
            wrapping around when there are more images than files.

    Returns:
        ``{"text": str, "tables": [...], "images": [...]}``. The node's own
        ``"text"`` wins when non-blank; otherwise text is assembled from the
        selected elements. Without a page span or without elements, tables
        and images stay empty.
    """
    page_span = _node_page_span(node)
    body_text = str(node.get("text") or "").strip()
    table_rows: list[dict[str, Any]] = []
    image_rows: list[dict[str, Any]] = []

    if not (page_span and elements):
        return {"text": body_text, "tables": table_rows, "images": image_rows}

    first_page, last_page = page_span
    page_elements = _elements_in_page_span(elements, first_page, last_page)
    blob, offsets = _span_blob_with_offsets(page_elements)

    # Default to everything on the node's pages; narrow it when the blob has text.
    chosen = page_elements
    if blob.strip():
        anchor = _needle_from_node(node)
        following = _next_needle_from_structure(structure, node)
        pseudo = {
            "split_document_anchor": anchor,
            "title": node.get("title") or "",
        }
        try:
            lo, hi = _slice_section_range(
                blob,
                offsets,
                anchor=anchor,
                pseudo_node=pseudo,
                next_needle=following,
            )
            # Keep every element whose character range overlaps [lo, hi).
            chosen = [
                element
                for element, el_start, el_end in offsets
                if el_start < hi and el_end > lo
            ]
        except Exception:
            # Best effort: if slicing fails, fall back to the whole page span.
            chosen = page_elements

    if not body_text:
        stripped_bodies = (_element_body(element).strip() for element in chosen)
        body_text = "\n\n".join(body for body in stripped_bodies if body)

    image_rank = 0
    for element in chosen:
        kind = element.element_type.strip().lower()
        if kind in _TABLE_TYPES and element.text_as_html:
            table_rows.append(
                {
                    "page_number": element.page_number,
                    "html": element.text_as_html,
                    "element_id": element.element_id,
                }
            )
        if kind not in _IMAGE_TYPES:
            continue
        image_rank += 1
        entry: dict[str, Any] = {
            "page_number": element.page_number,
            "rank": image_rank,
            "element_id": element.element_id,
            "summary": (element.text or "")[:500],
        }
        if png_cycle:
            # Files are matched to images purely by rank, cycling through the list.
            png = png_cycle[(image_rank - 1) % len(png_cycle)]
            entry["image_path"] = str(png.resolve())
            encoded = _load_png_b64(png)
            if encoded:
                entry["image_base64"] = encoded
        image_rows.append(entry)

    return {"text": body_text, "tables": table_rows, "images": image_rows}
117
+
118
+
119
def build_document_index(multimodal_out: dict[str, Any]) -> DocumentIndex:
    """Build a searchable ``DocumentIndex`` from a multimodal pipeline result.

    The expected input is the output of :func:`build_multimodal_index` /
    ``ChunkSmith_MultimodalIndexer_PDF``. Keys read from *multimodal_out*:
    ``structure``, ``canonical_bundle``, ``artifact_root``,
    ``compressed_tree_string`` and ``doc_name``. A non-list structure becomes
    an empty outline and a non-dict bundle is treated as absent.

    The image directory is the bundle's ``path_image`` when that is a
    non-blank string, otherwise ``<artifact_root>/image`` when it exists.
    Per-node media is computed for every outline node with a non-blank
    ``node_id``.
    """
    raw_structure = multimodal_out.get("structure")
    structure = raw_structure if isinstance(raw_structure, list) else []

    raw_bundle = multimodal_out.get("canonical_bundle")
    bundle = raw_bundle if isinstance(raw_bundle, dict) else None

    image_dir: Path | None = None
    if bundle:
        declared = bundle.get("path_image")
        if isinstance(declared, str) and declared.strip():
            image_dir = Path(declared)
    if image_dir is None:
        root = multimodal_out.get("artifact_root")
        if isinstance(root, str):
            candidate = Path(root) / "image"
            if candidate.is_dir():
                image_dir = candidate

    if image_dir and image_dir.is_dir():
        png_cycle: list[Path] = sorted(image_dir.glob("*.png"))
    else:
        png_cycle = []

    elements = _bundle_elements(bundle) if bundle else []

    media_by_node: dict[str, dict[str, Any]] = {}
    for node in flatten_structure(structure):
        raw_id = node.get("node_id")
        key = "" if raw_id is None else str(raw_id).strip()
        if not key:
            continue
        media_by_node[key] = _media_for_node(
            node,
            structure=structure,
            elements=elements,
            image_dir=image_dir,
            png_cycle=png_cycle,
        )

    # Prefer the bundle's own coded form; fall back to the pipeline's tree string.
    coded = None
    if bundle and isinstance(bundle.get("coded_formate"), str):
        coded = bundle["coded_formate"]
    elif isinstance(multimodal_out.get("compressed_tree_string"), str):
        coded = multimodal_out.get("compressed_tree_string")

    resolved_image_dir = str(image_dir.resolve()) if image_dir else None
    return DocumentIndex(
        doc_name=str(multimodal_out.get("doc_name") or "document"),
        structure=structure,
        media_by_node=media_by_node,
        canonical_bundle=bundle,
        coded_formate=coded,
        image_dir=resolved_image_dir,
    )
179
+
180
+
181
+ def _page_outline_summary(bundle: dict[str, Any], page: int) -> str:
182
+ """Short preview for page-level nodes so the picker can match topics."""
183
+ parts: list[str] = []
184
+ has_table = False
185
+ has_figure = False
186
+ for el in bundle.get("elements") or []:
187
+ if not isinstance(el, dict):
188
+ continue
189
+ if el.get("page_number") is None or int(el["page_number"]) != page:
190
+ continue
191
+ kind = str(el.get("element_type") or "").strip().lower()
192
+ if kind == "table":
193
+ has_table = True
194
+ if kind in _IMAGE_TYPES:
195
+ has_figure = True
196
+ text = str(el.get("text") or "").strip().replace("\n", " ")
197
+ if not text or len(text) < 4:
198
+ continue
199
+ if kind in ("title", "header", "subtitle", "narrativetext", "listitem") and len(parts) < 4:
200
+ snippet = text[:140] + ("…" if len(text) > 140 else "")
201
+ if snippet not in parts:
202
+ parts.append(snippet)
203
+ preview = " · ".join(parts)[:420].strip()
204
+ flags: list[str] = []
205
+ if has_table:
206
+ flags.append("table")
207
+ if has_figure:
208
+ flags.append("figure")
209
+ if flags:
210
+ preview = f"{preview} [{' '.join(flags)}]" if preview else f"[{' '.join(flags)}]"
211
+ return preview or f"Page {page}"
212
+
213
+
214
def structure_from_canonical_bundle(bundle: dict[str, Any]) -> list[dict[str, Any]]:
    """Build a page-level outline when no ``*_pageindex.json`` exists.

    One node is produced per distinct page number found in the bundle's
    elements (clamped to at least 1), in ascending page order, with
    zero-padded sequential ``node_id`` values and a summary from
    ``_page_outline_summary``. When no element carries a page number, a
    single ``"Document"`` node covering page 1 is returned.
    """
    page_numbers = {
        max(1, int(item["page_number"]))
        for item in bundle.get("elements") or []
        if isinstance(item, dict) and item.get("page_number") is not None
    }
    if not page_numbers:
        whole_document = {
            "node_id": "0000",
            "title": "Document",
            "start_index": 1,
            "end_index": 1,
            "summary": "Full document (no page numbers in bundle).",
        }
        return [whole_document]

    outline: list[dict[str, Any]] = []
    for position, page in enumerate(sorted(page_numbers)):
        outline.append(
            {
                "node_id": f"{position:04d}",
                "title": f"Page {page}",
                "start_index": page,
                "end_index": page,
                "summary": _page_outline_summary(bundle, page),
            }
        )
    return outline
243
+
244
+
245
+ def _artifact_root_for_saved_bundle(
246
+ canonical_bundle_path: Path,
247
+ bundle: dict[str, Any],
248
+ ) -> Path | None:
249
+ ar = bundle.get("artifact_root")
250
+ if isinstance(ar, str) and ar.strip():
251
+ root = Path(ar)
252
+ if root.is_dir():
253
+ return root.resolve()
254
+ parent = canonical_bundle_path.parent
255
+ if parent.name == "json":
256
+ root = parent.parent
257
+ if (root / "image").is_dir() or (root / "json").is_dir():
258
+ return root.resolve()
259
+ return None
260
+
261
+
262
+ def _bundle_with_resolved_image_dir(
263
+ bundle: dict[str, Any],
264
+ artifact_root: Path | None,
265
+ ) -> dict[str, Any]:
266
+ if artifact_root is None:
267
+ return bundle
268
+ image_dir = artifact_root / "image"
269
+ if not image_dir.is_dir():
270
+ return bundle
271
+ pi = bundle.get("path_image")
272
+ if isinstance(pi, str) and pi.strip() and Path(pi).is_dir():
273
+ return bundle
274
+ out = dict(bundle)
275
+ out["path_image"] = str(image_dir.resolve())
276
+ out.setdefault("artifact_root", str(artifact_root.resolve()))
277
+ return out
278
+
279
+
280
def build_document_index_from_saved(
    *,
    pageindex_path: Path | None = None,
    canonical_bundle_path: Path | None = None,
    artifact_root: Path | None = None,
) -> DocumentIndex:
    """Load an index from saved JSON (pageindex, outline JSON, or canonical bundle only).

    Args:
        pageindex_path: Saved outline / pageindex JSON. Used when the file
            exists; a canonical bundle, if also given and present, is attached
            to it.
        canonical_bundle_path: Saved canonical bundle JSON. Used on its own
            (with a page-level outline derived from it) when no pageindex file
            is available.
        artifact_root: Explicit artifact root; when omitted it is inferred
            from the bundle and its location on disk.

    Raises:
        ValueError: Neither path was given, the pageindex JSON is not an
            object, or the bundle file is not a canonical bundle.
        FileNotFoundError: None of the given paths points to an existing file.
    """
    import json

    if pageindex_path is None and canonical_bundle_path is None:
        raise ValueError("pageindex_path or canonical_bundle_path is required")

    if pageindex_path is not None and pageindex_path.is_file():
        raw = json.loads(pageindex_path.read_text(encoding="utf-8"))
        if not isinstance(raw, dict):
            raise ValueError(f"Not an index JSON object: {pageindex_path}")
        if isinstance(raw.get("elements"), list) and not isinstance(raw.get("structure"), list):
            # NOTE(review): the file is bundle-shaped (elements, no structure) and is
            # handed on unchanged — confirm this is the intended handling.
            return build_document_index(raw)
        if canonical_bundle_path and canonical_bundle_path.is_file():
            cb = json.loads(canonical_bundle_path.read_text(encoding="utf-8"))
            if isinstance(cb, dict):
                ar = artifact_root or _artifact_root_for_saved_bundle(canonical_bundle_path, cb)
                cb = _bundle_with_resolved_image_dir(cb, ar)
                raw["canonical_bundle"] = cb
                if ar is not None:
                    raw["artifact_root"] = str(ar)
        return build_document_index(raw)

    if canonical_bundle_path is None or not canonical_bundle_path.is_file():
        # Name every path that was actually supplied, not just the pageindex one.
        missing = ", ".join(
            str(path)
            for path in (pageindex_path, canonical_bundle_path)
            if path is not None
        )
        raise FileNotFoundError(f"Index file not found: {missing}")

    bundle = json.loads(canonical_bundle_path.read_text(encoding="utf-8"))
    if not isinstance(bundle, dict) or not isinstance(bundle.get("elements"), list):
        raise ValueError(f"Not a canonical bundle: {canonical_bundle_path}")

    doc_name = str(bundle.get("doc_name") or canonical_bundle_path.stem)
    suffix = "_canonical_bundle"
    if doc_name.endswith(suffix):
        doc_name = doc_name[: -len(suffix)]
    ar = artifact_root or _artifact_root_for_saved_bundle(canonical_bundle_path, bundle)
    bundle = _bundle_with_resolved_image_dir(bundle, ar)
    synthesized: dict[str, Any] = {
        "doc_name": doc_name,
        "structure": structure_from_canonical_bundle(bundle),
        "canonical_bundle": bundle,
    }
    if ar is not None:
        synthesized["artifact_root"] = str(ar)
    return build_document_index(synthesized)