chunksmith-agent 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunksmith-agent
3
+ Version: 0.4.0
4
+ Summary: ChunkSmith document Q&A agent over saved multi-indexing outlines.
5
+ Author-email: AnshulParate2004 <anshulnparate@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/AnshulParate2004/ChunkSmith
8
+ Project-URL: Repository, https://github.com/AnshulParate2004/ChunkSmith
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: python-dotenv>=1.0.0
12
+ Requires-Dist: pydantic>=2.10.0
13
+ Provides-Extra: langchain
14
+ Requires-Dist: langchain-core>=0.3.28; extra == "langchain"
15
+ Requires-Dist: langchain-openai>=0.3.0; extra == "langchain"
16
+ Requires-Dist: langchain-litellm>=0.2.0; extra == "langchain"
17
+
18
+ # chunksmith-agent
19
+
20
+ Standalone document Q&A over **saved ChunkSmith index JSON** (no dependency on `chunksmith-core`, `chunksmith-multimodal`, or `chunksmith-pageindex`).
21
+
22
+ ## Install
23
+
24
+ ```bash
25
+ pip install chunksmith-agent
26
+ pip install "chunksmith-agent[langchain]" # LangChain tool-calling Q&A
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ```python
32
+ from pathlib import Path
33
+ from chunksmith_agent import ChunkSmithAgent
34
+ from chunksmith_agent.index_builder import build_document_index_from_saved
35
+
36
+ index = build_document_index_from_saved(
37
+ pageindex_path=Path("runs/my-doc/json/my-doc_pageindex.json"),
38
+ )
39
+ agent = ChunkSmithAgent(index)
40
+ answer = agent.ask("What is this document about?")
41
+ print(answer.answer)
42
+ ```
43
+
44
+ ## JSON artifact contract
45
+
46
+ The agent reads files produced by `chunksmith-cli` (or compatible tools) under a run folder:
47
+
48
+ | File | Fields used |
49
+ |------|-------------|
50
+ | `json/*_pageindex.json` | `doc_name`, `structure`, optional embedded `canonical_bundle` |
51
+ | `json/*_canonical_bundle.json` | `elements[]`, `coded_formate`, `path_image` |
52
+ | Outline nodes | `node_id`, `title`, `summary`, `start_index`/`end_index`, anchor fields |
53
+
54
+ ## Environment variables
55
+
56
+ The agent reads the same LLM environment variables as the ChunkSmith CLI and the MVL app:
57
+
58
+ - `OPENAI_API_KEY` (or `CHATGPT_API_KEY`)
59
+ - `PAGEINDEX_MODEL`, `CHUNKSMITH_LLM_MODEL`, `LLM_MODEL`
60
+ - Azure: `AZURE_API_KEY`, `AZURE_API_BASE`, `AZURE_API_VERSION`
61
+
62
+ ## Integration patterns (loose coupling)
63
+
64
+ This package deliberately has **no** dependency on `chunksmith-adapters`, and none should be added. Load the index in the calling application and pass it to the agent:
65
+
66
+ | Caller | How to load index | Agent config |
67
+ |--------|-------------------|--------------|
68
+ | **CLI** | `build_document_index_from_saved(pageindex_path=...)` | `load_settings()` from `.env` |
69
+ | **MVL app** | `chunksmith_agent_bridge.load_document_index_from_mvl(repo, ...)` | `load_settings()` or explicit `AgentSettings` |
70
+ | **Custom app** | Fetch JSON from your DB/S3 → `build_document_index(dict)` | Your env / settings |
71
+
72
+ Install separately from CLI or MVL:
73
+
74
+ ```bash
75
+ pip install "chunksmith-agent[langchain]"
76
+ ```
77
+
78
+ ## Extensibility
79
+
80
+ - **Outline nodes:** come from saved JSON (`structure`); re-index or edit JSON to add sections.
81
+ - **Tools:** extend `_make_tools()` in `tool_agent.py` (e.g. add `get_page_images`).
82
+ - **Another agent in your app:** compose `ChunkSmithAgent` alongside your own planners/retrievers — this package is one document Q&A brain, not your whole system.
@@ -0,0 +1,65 @@
1
+ # chunksmith-agent
2
+
3
+ Standalone document Q&A over **saved ChunkSmith index JSON** (no dependency on `chunksmith-core`, `chunksmith-multimodal`, or `chunksmith-pageindex`).
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install chunksmith-agent
9
+ pip install "chunksmith-agent[langchain]" # LangChain tool-calling Q&A
10
+ ```
11
+
12
+ ## Usage
13
+
14
+ ```python
15
+ from pathlib import Path
16
+ from chunksmith_agent import ChunkSmithAgent
17
+ from chunksmith_agent.index_builder import build_document_index_from_saved
18
+
19
+ index = build_document_index_from_saved(
20
+ pageindex_path=Path("runs/my-doc/json/my-doc_pageindex.json"),
21
+ )
22
+ agent = ChunkSmithAgent(index)
23
+ answer = agent.ask("What is this document about?")
24
+ print(answer.answer)
25
+ ```
26
+
27
+ ## JSON artifact contract
28
+
29
+ The agent reads files produced by `chunksmith-cli` (or compatible tools) under a run folder:
30
+
31
+ | File | Fields used |
32
+ |------|-------------|
33
+ | `json/*_pageindex.json` | `doc_name`, `structure`, optional embedded `canonical_bundle` |
34
+ | `json/*_canonical_bundle.json` | `elements[]`, `coded_formate`, `path_image` |
35
+ | Outline nodes | `node_id`, `title`, `summary`, `start_index`/`end_index`, anchor fields |
36
+
37
+ ## Environment variables
38
+
39
+ The agent reads the same LLM environment variables as the ChunkSmith CLI and the MVL app:
40
+
41
+ - `OPENAI_API_KEY` (or `CHATGPT_API_KEY`)
42
+ - `PAGEINDEX_MODEL`, `CHUNKSMITH_LLM_MODEL`, `LLM_MODEL`
43
+ - Azure: `AZURE_API_KEY`, `AZURE_API_BASE`, `AZURE_API_VERSION`
44
+
45
+ ## Integration patterns (loose coupling)
46
+
47
+ This package deliberately has **no** dependency on `chunksmith-adapters`, and none should be added. Load the index in the calling application and pass it to the agent:
48
+
49
+ | Caller | How to load index | Agent config |
50
+ |--------|-------------------|--------------|
51
+ | **CLI** | `build_document_index_from_saved(pageindex_path=...)` | `load_settings()` from `.env` |
52
+ | **MVL app** | `chunksmith_agent_bridge.load_document_index_from_mvl(repo, ...)` | `load_settings()` or explicit `AgentSettings` |
53
+ | **Custom app** | Fetch JSON from your DB/S3 → `build_document_index(dict)` | Your env / settings |
54
+
55
+ Install separately from CLI or MVL:
56
+
57
+ ```bash
58
+ pip install "chunksmith-agent[langchain]"
59
+ ```
60
+
61
+ ## Extensibility
62
+
63
+ - **Outline nodes:** come from saved JSON (`structure`); re-index or edit JSON to add sections.
64
+ - **Tools:** extend `_make_tools()` in `tool_agent.py` (e.g. add `get_page_images`).
65
+ - **Another agent in your app:** compose `ChunkSmithAgent` alongside your own planners/retrievers — this package is one document Q&A brain, not your whole system.
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "chunksmith-agent"
7
+ version = "0.4.0"
8
+ description = "ChunkSmith document Q&A agent over saved multi-indexing outlines."
9
+ requires-python = ">=3.10"
10
+ license = "MIT"
11
+ authors = [{ name = "AnshulParate2004", email = "anshulnparate@gmail.com" }]
12
+ readme = "README.md"
13
+ dependencies = [
14
+ "python-dotenv>=1.0.0",
15
+ "pydantic>=2.10.0",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ langchain = [
20
+ "langchain-core>=0.3.28",
21
+ "langchain-openai>=0.3.0",
22
+ "langchain-litellm>=0.2.0",
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://github.com/AnshulParate2004/ChunkSmith"
27
+ Repository = "https://github.com/AnshulParate2004/ChunkSmith"
28
+
29
+ [tool.setuptools.packages.find]
30
+ where = ["src"]
31
+ include = ["chunksmith_agent", "chunksmith_agent.*"]
32
+
33
+ [tool.pytest.ini_options]
34
+ testpaths = ["tests"]
35
+ pythonpath = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,13 @@
1
+ """Reasoning-based Q&A over ChunkSmith multimodal indexes (outline tree + elements)."""
2
+
3
+ from chunksmith_agent.agent import ChunkSmithAgent
4
+ from chunksmith_agent.index_builder import build_document_index, build_document_index_from_saved
5
+ from chunksmith_agent.models import AgentAnswer, DocumentIndex
6
+
7
+ __all__ = [
8
+ "AgentAnswer",
9
+ "ChunkSmithAgent",
10
+ "DocumentIndex",
11
+ "build_document_index",
12
+ "build_document_index_from_saved",
13
+ ]
@@ -0,0 +1,59 @@
1
+ """ChunkSmith agent session: Q&A over a built document index."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Iterator
6
+
7
+ from chunksmith_agent.index_builder import build_document_index
8
+ from chunksmith_agent.models import AgentAnswer, DocumentIndex
9
+ from chunksmith_agent.retrieval import answer_question, iter_answer_events
10
+ from chunksmith_agent.session import AgentConversation
11
+ from chunksmith_agent.settings import AgentSettings, load_settings
12
+
13
+
14
class ChunkSmithAgent:
    """Question-answering session bound to a single document index.

    One ``AgentConversation`` is kept on the instance and handed to every
    retrieval call, so whatever the retrieval layer records there (chat
    history, cached section selection) carries over to follow-up questions
    until :meth:`reset_conversation` replaces it.
    """

    def __init__(
        self,
        index: DocumentIndex,
        settings: AgentSettings | None = None,
    ) -> None:
        """Store *index* and the settings used for every question.

        Args:
            index: Document index the agent answers from.
            settings: Agent configuration; when omitted (or falsy),
                ``load_settings()`` provides it.
        """
        if not settings:
            settings = load_settings()
        self.index = index
        self.settings = settings
        self.conversation = AgentConversation()

    @classmethod
    def from_multimodal_output(
        cls,
        out: dict[str, Any],
        settings: AgentSettings | None = None,
    ) -> ChunkSmithAgent:
        """Alternate constructor: build the index from *out*, then wrap it in an agent."""
        index = build_document_index(out)
        return cls(index, settings=settings)

    def reset_conversation(self) -> None:
        """Start over with a fresh ``AgentConversation`` (drops history and cached sections)."""
        self.conversation = AgentConversation()

    def ask(self, query: str, *, stream_tokens: bool = False) -> AgentAnswer:
        """Answer *query* from the stored index within the current conversation.

        ``stream_tokens`` is accepted for call-site compatibility but is not
        forwarded to ``answer_question``.
        """
        answer = answer_question(
            self.index,
            query,
            self.settings,
            conversation=self.conversation,
        )
        return answer

    def ask_events(
        self,
        query: str,
        *,
        event_sink: Any | None = None,
        emit_image_events: bool = True,
        emit_table_events: bool = True,
    ) -> Iterator[tuple[str, dict[str, Any]]]:
        """Yield ``(event_name, payload)`` pairs for CLI streaming.

        All keyword options are passed straight through to
        ``iter_answer_events`` together with the current conversation.
        """
        events = iter_answer_events(
            self.index,
            query,
            self.settings,
            conversation=self.conversation,
            event_sink=event_sink,
            emit_image_events=emit_image_events,
            emit_table_events=emit_table_events,
        )
        yield from events
@@ -0,0 +1,164 @@
1
+ """Element helpers for building per-node media from canonical bundles (standalone)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
# Keys probed, in priority order, when looking for a node's anchor text.
ANCHOR_START_KEYS = ("split_document_anchor_start", "anchor_start")
ANCHOR_END_KEYS = ("split_document_anchor_end", "anchor_end")
FALLBACK_START_KEYS = ("split_document_anchor", "anchor")


def anchor_start_from_row(row: dict[str, Any]) -> str:
    """Return the first non-blank start anchor stored on *row*.

    The explicit start keys are tried before the fallback keys. Each value is
    stringified and stripped; the first one that is non-empty wins and is cut
    to at most 240 characters. An empty string means no anchor was found.
    """
    for key in (*ANCHOR_START_KEYS, *FALLBACK_START_KEYS):
        candidate = str(row.get(key) or "").strip()
        if candidate:
            return candidate[:240]
    return ""
23
+
24
+
25
def flatten_nodes(structure: list[Any]) -> list[dict[str, Any]]:
    """Return every dict node of the outline tree in depth-first pre-order.

    Entries that are not dicts are skipped. Children are read from a node's
    ``"nodes"`` key and are only descended into when that value is a list.
    The returned list holds the original node objects, not copies.
    """

    def _iter_nodes(level: list[Any]):
        for entry in level:
            if isinstance(entry, dict):
                # Parent first, then its subtree, to keep document order.
                yield entry
                children = entry.get("nodes")
                if isinstance(children, list):
                    yield from _iter_nodes(children)

    return list(_iter_nodes(structure))
39
+
40
+
41
+ @dataclass
42
+ class BundleElement:
43
+ element_type: str
44
+ text: str
45
+ text_as_html: str | None
46
+ page_number: int
47
+ element_id: int | str
48
+
49
+
50
+ def _parse_element(raw: dict[str, Any]) -> BundleElement | None:
51
+ if not isinstance(raw, dict):
52
+ return None
53
+ pn = raw.get("page_number")
54
+ if pn is None:
55
+ return None
56
+ eid = raw.get("element_id", 0)
57
+ etype = str(raw.get("element_type") or raw.get("type") or "Text")
58
+ text = str(raw.get("text") or "")
59
+ html = raw.get("text_as_html")
60
+ html_str = str(html) if isinstance(html, str) and html.strip() else None
61
+ return BundleElement(
62
+ element_type=etype,
63
+ text=text,
64
+ text_as_html=html_str,
65
+ page_number=max(1, int(pn)),
66
+ element_id=eid,
67
+ )
68
+
69
+
70
def _bundle_elements(bundle: dict[str, Any]) -> list[BundleElement]:
    """Parse ``bundle["elements"]`` into elements, dropping entries that fail to parse.

    A missing or empty ``elements`` value yields an empty list. Input order is
    preserved.
    """
    parsed = (_parse_element(raw) for raw in bundle.get("elements") or [])
    return [element for element in parsed if element is not None]
77
+
78
+
79
+ def _element_body(el: BundleElement) -> str:
80
+ if el.element_type.strip().lower() == "table" and el.text_as_html:
81
+ return el.text_as_html
82
+ return el.text or ""
83
+
84
+
85
+ def _elements_in_page_span(
86
+ elements: list[BundleElement],
87
+ start_page: int,
88
+ end_page: int,
89
+ ) -> list[BundleElement]:
90
+ lo = max(1, int(start_page))
91
+ hi = max(lo, int(end_page))
92
+ return [el for el in elements if lo <= el.page_number <= hi]
93
+
94
+
95
def _needle_from_node(node: dict[str, Any]) -> str:
    """Return the search needle marking where *node*'s section begins.

    A non-blank ``split_document_anchor`` takes precedence (cut to 240
    characters); otherwise the generic anchor lookup decides.
    """
    direct = str(node.get("split_document_anchor") or "").strip()
    return direct[:240] if direct else anchor_start_from_row(node)
100
+
101
+
102
def _next_needle_from_structure(
    structure: list[dict[str, Any]],
    node: dict[str, Any],
) -> str:
    """Return the needle of the node that follows *node* in flattened outline order.

    *node* is located by the first row with a matching ``node_id``. The
    follower's start anchor is preferred, with its stripped title (cut to 240
    characters) as the fallback. The result is an empty string when *node* is
    not found, is the last node, or the follower has neither anchor nor title.
    """
    ordered = flatten_nodes(structure)
    wanted_id = node.get("node_id")
    position = next(
        (i for i, row in enumerate(ordered) if row.get("node_id") == wanted_id),
        None,
    )
    if position is None or position + 1 >= len(ordered):
        return ""
    follower = ordered[position + 1]
    return anchor_start_from_row(follower) or str(follower.get("title") or "").strip()[:240]
120
+
121
+
122
def _span_blob_with_offsets(
    span_els: list[BundleElement],
) -> tuple[str, list[tuple[BundleElement, int, int]]]:
    """Join element bodies into one text blob and record where each body sits.

    Elements with an empty body are skipped. Every kept body is followed by a
    blank-line separator (``"\\n\\n"``), including the last one. Each offsets
    entry is ``(element, start, end)`` with ``blob[start:end]`` equal to that
    element's body, separator excluded.
    """
    pieces: list[str] = []
    spans: list[tuple[BundleElement, int, int]] = []
    cursor = 0
    for el in span_els:
        body = _element_body(el)
        if body:
            body_end = cursor + len(body)
            spans.append((el, cursor, body_end))
            pieces.append(body + "\n\n")
            # Step over the two-character separator.
            cursor = body_end + 2
    return "".join(pieces), spans
139
+
140
+
141
+ def _slice_section_range(
142
+ blob: str,
143
+ offsets: list[tuple[BundleElement, int, int]],
144
+ *,
145
+ anchor: str,
146
+ pseudo_node: dict[str, Any],
147
+ next_needle: str,
148
+ ) -> tuple[int, int]:
149
+ del pseudo_node # reserved for future anchor refinement
150
+ if not blob.strip():
151
+ return 0, 0
152
+ start = 0
153
+ if anchor:
154
+ idx = blob.find(anchor)
155
+ if idx >= 0:
156
+ start = idx
157
+ end = len(blob)
158
+ if next_needle:
159
+ idx = blob.find(next_needle, start + max(1, len(anchor)))
160
+ if idx >= 0:
161
+ end = idx
162
+ if not offsets:
163
+ return start, end
164
+ return start, end