PyPI - chunksmith-agent - Versions diffs - 0.4.0__py3-none-any.whl - Mend

chunksmith-agent 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

chunksmith_agent/__init__.py +13 -0
chunksmith_agent/agent.py +59 -0
chunksmith_agent/element_retrieval.py +164 -0
chunksmith_agent/index_builder.py +325 -0
chunksmith_agent/index_context.py +310 -0
chunksmith_agent/langchain_runtime.py +101 -0
chunksmith_agent/models.py +60 -0
chunksmith_agent/retrieval.py +80 -0
chunksmith_agent/session.py +44 -0
chunksmith_agent/settings.py +68 -0
chunksmith_agent/tool_agent.py +264 -0
chunksmith_agent-0.4.0.dist-info/METADATA +82 -0
chunksmith_agent-0.4.0.dist-info/RECORD +15 -0
chunksmith_agent-0.4.0.dist-info/WHEEL +5 -0
chunksmith_agent-0.4.0.dist-info/top_level.txt +1 -0

chunksmith_agent/index_context.py ADDED Viewed

@@ -0,0 +1,310 @@
+"""Shared outline traversal and context assembly for retrieval backends."""
+from __future__ import annotations
+import os
+import re
+from difflib import get_close_matches
+from pathlib import Path
+from typing import Any
+from chunksmith_agent.models import DocumentIndex
+def flatten_structure(nodes: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return every outline node in depth-first, parent-before-children order.
+    Children are read from each node's ``"nodes"`` key and are only followed
+    when that value is a list. The returned list holds the original dict
+    objects, not copies.
+    """
+    def _preorder(level: list[dict[str, Any]]):
+        for entry in level:
+            yield entry
+            children = entry.get("nodes")
+            if isinstance(children, list):
+                yield from _preorder(children)
+    return list(_preorder(nodes))
+def normalize_node_id(raw: Any) -> str | None:
+    """Coerce a raw node id to a stripped string; ``None`` or blank input yields ``None``."""
+    cleaned = "" if raw is None else str(raw).strip()
+    return cleaned if cleaned else None
+def fallback_nodes_lexical(
+    flat_nodes: list[dict[str, Any]],
+    query: str,
+    max_nodes: int,
+) -> list[str]:
+    """Rank outline nodes by lexical overlap between the query and title/summary.
+    Query tokens of three or more characters score 4 when they occur as a
+    substring of the node title or summary, otherwise 2 when a word of four or
+    more letters in the title/summary is a close match (difflib cutoff 0.72).
+    Args:
+        flat_nodes: Flattened outline nodes (dicts with ``node_id``, ``title``, ``summary``).
+        query: User question; ``None`` or empty is tolerated.
+        max_nodes: Maximum number of ids to return; values <= 0 yield an empty list.
+    Returns:
+        Distinct node ids ordered by descending score (ties keep outline order).
+    """
+    # Guard first: the append-then-check loop below would otherwise emit one id.
+    if max_nodes <= 0:
+        return []
+    q_tokens = [t for t in re.split(r"\W+", (query or "").lower()) if len(t) >= 3]
+    if not q_tokens:
+        return []
+    scored: list[tuple[int, str]] = []
+    for node in flat_nodes:
+        nid = normalize_node_id(node.get("node_id"))
+        if not nid:
+            continue
+        # str() keeps non-string titles/summaries (e.g. numbers) from crashing .lower().
+        title = str(node.get("title") or "").lower()
+        summary = str(node.get("summary") or "").lower()
+        # Built once per node instead of once per query token.
+        blob_words = list(set(re.findall(r"[a-z]{4,}", f"{title} {summary}")))
+        score = 0
+        for t in q_tokens:
+            if t in title or t in summary:
+                score += 4
+            elif blob_words and get_close_matches(t, blob_words, n=1, cutoff=0.72):
+                score += 2
+        if score > 0:
+            scored.append((score, nid))
+    # list.sort is stable, so equal scores stay in outline order.
+    scored.sort(key=lambda item: -item[0])
+    seen: set[str] = set()
+    result: list[str] = []
+    for _, nid in scored:
+        if nid in seen:
+            continue
+        seen.add(nid)
+        result.append(nid)
+        if len(result) >= max_nodes:
+            break
+    return result
+def outline_prompt_lines(index: DocumentIndex, *, limit: int = 128) -> str:
+    """Render the first ``limit`` flattened outline nodes as prompt lines.
+    Each line has the form ``node_id: title || summary``. Newlines inside the
+    title and summary become spaces, and summaries longer than 400 characters
+    are cut to 400 characters followed by ``...``.
+    """
+    def _render(entry: dict[str, Any]) -> str:
+        ident = normalize_node_id(entry.get("node_id")) or ""
+        heading = (entry.get("title") or "").replace("\n", " ")
+        blurb = (entry.get("summary") or "").replace("\n", " ")
+        if len(blurb) > 400:
+            blurb = blurb[:400] + "..."
+        return f"{ident}: {heading} || {blurb}"
+    visible = flatten_structure(index.structure)[:limit]
+    return "\n".join(_render(entry) for entry in visible)
+def collect_context(index: DocumentIndex, node_ids: list[str]) -> tuple[str, str]:
+    """Assemble prompt context and table HTML for the selected outline nodes.
+    Args:
+        index: Document index providing ``structure`` and ``media_by_node``.
+        node_ids: Ids of the nodes whose content should be included.
+    Returns:
+        ``(context, tables)``. ``context`` joins one "Title/Summary/Text"
+        fragment per matching node in outline (depth-first) order; the text
+        prefers the media entry's ``text`` over the node's own ``text``.
+        ``tables`` concatenates one block per dict-shaped table, in
+        ``node_ids`` order.
+    """
+    wanted = set(node_ids)  # O(1) membership while walking the outline
+    media_by_node = index.media_by_node
+    fragments: list[str] = []
+    def _walk(nodes: list[dict[str, Any]]) -> None:
+        for node in nodes:
+            nid = normalize_node_id(node.get("node_id"))
+            if nid and nid in wanted:
+                title = node.get("title") or ""
+                summary = node.get("summary") or ""
+                # `or {}` tolerates a media entry stored as None.
+                media = media_by_node.get(nid) or {}
+                text = media.get("text") or node.get("text") or ""
+                fragments.append(f"Title: {title}\nSummary: {summary}\nText:\n{text}")
+            ch = node.get("nodes")
+            if isinstance(ch, list):
+                _walk(ch)
+    _walk(index.structure)
+    context = "\n\n".join(fragments).strip()
+    table_parts: list[str] = []
+    for nid in node_ids:
+        media = media_by_node.get(nid) or {}
+        for tbl in media.get("tables") or []:
+            if isinstance(tbl, dict):
+                table_parts.append(
+                    f"\nTable (node {nid}, page {tbl.get('page_number')}):\n{tbl.get('html')}\n"
+                )
+    return context, "".join(table_parts)
+# Caption-style figure mention: "Figure <digits>:" followed by the rest of the line (case-insensitive).
+_FIGURE_MENTION = re.compile(r"Figure\s+\d+\s*:[^\n]+", re.IGNORECASE)
+# Table mention: "Table <digits>" plus whatever follows on the same line (case-insensitive); no colon required.
+_TABLE_MENTION = re.compile(r"Table\s+\d+[^\n]*", re.IGNORECASE)
+def index_media_counts(index: DocumentIndex) -> tuple[int, int]:
+    """Return ``(table_count, image_count)`` summed over every media entry in the index.
+    Media entries that are ``None`` or lack a ``tables``/``images`` list count as zero.
+    """
+    tables = 0
+    images = 0
+    for media in index.media_by_node.values():
+        # Same `or {}` tolerance the other media readers in this module use.
+        entry = media or {}
+        tables += len(entry.get("tables") or [])
+        images += len(entry.get("images") or [])
+    return tables, images
+def index_media_inventory(
+    index: DocumentIndex,
+    *,
+    max_list: int = 20,
+) -> tuple[list[str], list[str]]:
+    """Human-readable lines: where tables and figures were loaded (by node/page).
+    Args:
+        index: Document index providing ``structure`` and ``media_by_node``.
+        max_list: Maximum number of lines returned in each list.
+    Returns:
+        ``(table_lines, figure_lines)``. Nodes are visited with numeric ids
+        first (in numeric order), then all other ids in string order.
+    """
+    def _sort_key(node_id: Any) -> tuple[int, int, str]:
+        # Uniform tuple keys: comparing a bare int key with a bare str key
+        # raises TypeError as soon as numeric and non-numeric ids are mixed.
+        text = str(node_id)
+        # isdecimal() only accepts characters int() can parse (isdigit() also
+        # accepts e.g. superscript digits, which int() rejects).
+        if text.isdecimal():
+            return (0, int(text), "")
+        return (1, 0, text)
+    table_lines: list[str] = []
+    figure_lines: list[str] = []
+    flat = {normalize_node_id(n.get("node_id")): n for n in flatten_structure(index.structure)}
+    for nid in sorted(index.media_by_node, key=_sort_key):
+        media = index.media_by_node.get(nid) or {}
+        node = flat.get(nid) or {}
+        # Fall back to a generic label when the outline has no title for this id.
+        title = str(node.get("title") or f"node {nid}")
+        for i, tbl in enumerate(media.get("tables") or [], start=1):
+            if not isinstance(tbl, dict):
+                continue
+            pg = tbl.get("page_number", "?")
+            table_lines.append(f"• {escape_display(title)} (node {nid}, page {pg}, table {i})")
+        for i, img in enumerate(media.get("images") or [], start=1):
+            if not isinstance(img, dict):
+                continue
+            pg = img.get("page_number", "?")
+            path = str(img.get("image_path") or "").strip()
+            # Show only the file name; images without a path get a positional label.
+            name = Path(path).name if path else f"figure {i}"
+            figure_lines.append(f"• {escape_display(title)} (node {nid}, page {pg}) — {name}")
+    return table_lines[:max_list], figure_lines[:max_list]
+def escape_display(text: str) -> str:
+    """Backslash-escape every ``[`` in ``text``; falsy input yields an empty string."""
+    if not text:
+        return ""
+    return text.replace("[", "\\[")
+def node_text(index: DocumentIndex, node_id: str) -> str:
+    """Return the stripped text for ``node_id``.
+    The media entry's ``text`` wins when it is non-blank; otherwise the first
+    outline node with a matching id supplies its own ``text``. Returns ``""``
+    when neither source has any.
+    """
+    stored = str((index.media_by_node.get(node_id) or {}).get("text") or "").strip()
+    if stored:
+        return stored
+    match = next(
+        (
+            entry
+            for entry in flatten_structure(index.structure)
+            if normalize_node_id(entry.get("node_id")) == node_id
+        ),
+        None,
+    )
+    if match is None:
+        return ""
+    return str(match.get("text") or "").strip()
+def text_media_mentions(
+    index: DocumentIndex,
+    node_ids: list[str],
+    *,
+    max_each: int = 8,
+) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
+    """Figure/table captions mentioned in section text (no image/HTML files).
+    Args:
+        index: Document index used to look up each node's text.
+        node_ids: Nodes to scan, in priority order.
+        max_each: Hard cap on the number of figure captions and, separately,
+            on the number of table captions, across all nodes.
+    Returns:
+        ``(figures, tables)``; each item is ``{"node_id": ..., "caption": ...}``
+        with captions de-duplicated by their stripped text.
+    """
+    figures: list[dict[str, str]] = []
+    tables: list[dict[str, str]] = []
+    seen_f: set[str] = set()
+    seen_t: set[str] = set()
+    def _harvest(pattern: Any, text: str, nid: str, bucket: list[dict[str, str]], seen: set[str]) -> None:
+        # The cap is checked before each append so it holds across nodes;
+        # breaking only the inner loop let every later node add one more.
+        for match in pattern.findall(text):
+            if len(bucket) >= max_each:
+                return
+            line = match.strip()
+            if line in seen:
+                continue
+            seen.add(line)
+            bucket.append({"node_id": nid, "caption": line})
+    for nid in node_ids:
+        if len(figures) >= max_each and len(tables) >= max_each:
+            break  # both lists are full; no need to read more node text
+        text = node_text(index, nid)
+        if not text:
+            continue
+        _harvest(_FIGURE_MENTION, text, nid, figures, seen_f)
+        _harvest(_TABLE_MENTION, text, nid, tables, seen_t)
+    return figures, tables
+# Whole-word, case-insensitive keywords used by query_wants_table_display().
+_QUERY_WANTS_TABLES = re.compile(
+    r"\b("
+    r"table|tables|tabular|chart|matrix|grid|spreadsheet|"
+    r"compare|comparison|statistics|stats|metrics|numbers|data"
+    r")\b",
+    re.IGNORECASE,
+)
+# Whole-word, case-insensitive keywords used by query_wants_figure_display().
+_QUERY_WANTS_FIGURES = re.compile(
+    r"\b(figure|figures|image|images|diagram|plot|visual|picture|illustration)\b",
+    re.IGNORECASE,
+)
+# Tag-name boundaries matter: a bare "<th" prefix also matches "<thead", and
+# "<tr" also matches "<track".
+_TABLE_ROW_TAG = re.compile(r"<tr\b")
+_TABLE_CELL_TAG = re.compile(r"<t[dh]\b")
+def is_substantive_table_html(html: str) -> bool:
+    """True when HTML looks like a real table (not TOC prose tagged as Table).
+    A table qualifies when it has at least one ``<tr>`` row and at least two
+    ``<td>``/``<th>`` cells in total. Matching is case-insensitive; ``None``
+    or empty input is not substantive.
+    """
+    h = (html or "").lower()
+    if not _TABLE_ROW_TAG.search(h):
+        return False
+    return len(_TABLE_CELL_TAG.findall(h)) >= 2
+def query_wants_table_display(query: str) -> bool:
+    """Report whether the query contains any keyword from ``_QUERY_WANTS_TABLES``."""
+    hit = _QUERY_WANTS_TABLES.search(query or "")
+    return hit is not None
+def query_wants_figure_display(query: str) -> bool:
+    """Report whether the query contains any keyword from ``_QUERY_WANTS_FIGURES``."""
+    hit = _QUERY_WANTS_FIGURES.search(query or "")
+    return hit is not None
+def select_tables(
+    index: DocumentIndex,
+    node_ids: list[str],
+    *,
+    max_tables: int = 6,
+    substantive_only: bool = False,
+) -> list[dict[str, Any]]:
+    """Collect table records attached to the given nodes.
+    Args:
+        index: Document index providing ``media_by_node``.
+        node_ids: Nodes to read tables from, in priority order.
+        max_tables: Maximum number of tables returned; values <= 0 yield ``[]``.
+        substantive_only: When true, keep only tables accepted by
+            ``is_substantive_table_html``.
+    Returns:
+        Shallow copies of the table dicts, each with ``node_id`` added. Non-dict
+        entries and tables with blank ``html`` are skipped.
+    """
+    chosen: list[dict[str, Any]] = []
+    # Guard first: the append-then-check loop below would otherwise emit one table.
+    if max_tables <= 0:
+        return chosen
+    for nid in node_ids:
+        # `or {}` tolerates a media entry stored as None.
+        media = index.media_by_node.get(nid) or {}
+        for tbl in media.get("tables") or []:
+            if not isinstance(tbl, dict):
+                continue
+            html = tbl.get("html")
+            if not html or not str(html).strip():
+                continue
+            if substantive_only and not is_substantive_table_html(str(html)):
+                continue
+            row = dict(tbl)  # copy so the index entry is not mutated
+            row["node_id"] = nid
+            chosen.append(row)
+            if len(chosen) >= max_tables:
+                return chosen
+    return chosen
+def select_tables_for_display(
+    index: DocumentIndex,
+    node_ids: list[str],
+    query: str,
+    *,
+    max_tables: int = 3,
+) -> list[dict[str, Any]]:
+    """Substantive tables in retrieved sections (skip TOC-like blocks).
+    Returns ``[]`` when the ``CHUNKSMITH_CLI_TABLES_MODE`` environment variable
+    is set to an "off" value. ``query`` is accepted but not used.
+    """
+    _ = query
+    disabled_values = ("0", "false", "no", "off", "never")
+    setting = os.environ.get("CHUNKSMITH_CLI_TABLES_MODE", "auto")
+    if setting.strip().lower() in disabled_values:
+        return []
+    return select_tables(
+        index,
+        node_ids,
+        max_tables=max_tables,
+        substantive_only=True,
+    )
+def select_images_for_display(
+    index: DocumentIndex,
+    node_ids: list[str],
+    query: str,
+    *,
+    max_images: int = 3,
+) -> list[dict[str, Any]]:
+    """Figures from retrieved sections (capped so the terminal stays readable).
+    Returns ``[]`` when the ``CHUNKSMITH_CLI_IMAGES_MODE`` environment variable
+    is set to an "off" value. ``query`` is accepted but not used.
+    """
+    _ = query
+    disabled_values = ("0", "false", "no", "off", "never")
+    setting = os.environ.get("CHUNKSMITH_CLI_IMAGES_MODE", "auto")
+    if setting.strip().lower() in disabled_values:
+        return []
+    return select_images(
+        index,
+        node_ids,
+        max_images=max_images,
+    )
+def select_images(index: DocumentIndex, node_ids: list[str], *, max_images: int = 8) -> list[dict[str, Any]]:
+    """Collect de-duplicated image records attached to the given nodes.
+    Args:
+        index: Document index providing ``media_by_node``.
+        node_ids: Nodes to read images from, in priority order.
+        max_images: Maximum number of images returned; values <= 0 yield ``[]``.
+    Returns:
+        Shallow copies of the image dicts, each with ``node_id`` added. Images
+        are de-duplicated by ``image_path``, falling back to ``element_id``.
+    """
+    chosen: list[dict[str, Any]] = []
+    # Guard first: the append-then-check loop below would otherwise emit one image.
+    if max_images <= 0:
+        return chosen
+    seen: set[str] = set()
+    for nid in node_ids:
+        # `or {}` tolerates a media entry stored as None.
+        media = index.media_by_node.get(nid) or {}
+        for img in media.get("images") or []:
+            if not isinstance(img, dict):
+                continue
+            path = str(img.get("image_path") or "")
+            # Without a path or element id, the object identity is the key,
+            # so only the very same dict object counts as a duplicate.
+            fp = path or str(img.get("element_id") or id(img))
+            if fp in seen:
+                continue
+            seen.add(fp)
+            row = dict(img)  # copy so the index entry is not mutated
+            row["node_id"] = nid
+            chosen.append(row)
+            if len(chosen) >= max_images:
+                return chosen
+    return chosen

chunksmith_agent/langchain_runtime.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""LangChain helpers for the tool-calling agent."""
+from __future__ import annotations
+from typing import Any
+from chunksmith_agent.settings import AgentSettings
+from pydantic import BaseModel, Field
+from chunksmith_agent.index_context import (
+    fallback_nodes_lexical,
+    flatten_structure,
+    normalize_node_id,
+    outline_prompt_lines,
+)
+from chunksmith_agent.models import DocumentIndex
+def build_chat_model(settings: AgentSettings, *, temperature: float = 0.0):
+    """Build the LangChain chat client described by ``settings``.
+    An ``AzureChatOpenAI`` client is returned when the resolved model name
+    starts with ``azure/`` or the LiteLLM kwargs carry an ``api_base``;
+    otherwise a ``ChatOpenAI`` client is returned.
+    """
+    from langchain_openai import AzureChatOpenAI, ChatOpenAI
+    options = dict(settings.litellm_kwargs or {})
+    resolved = settings.llm_model or settings.pageindex_model
+    use_azure = str(resolved).startswith("azure/") or bool(options.get("api_base"))
+    # NOTE(review): `resolved` only drives the Azure check; both clients below
+    # are built from `settings.pageindex_model`. Confirm whether `llm_model`
+    # was meant to be used for the model/deployment name.
+    if not use_azure:
+        return ChatOpenAI(
+            model=settings.pageindex_model,
+            api_key=settings.openai_api_key,
+            temperature=temperature,
+        )
+    deployment = options.get("azure_deployment") or settings.pageindex_model
+    endpoint = options.get("api_base") or options.get("azure_endpoint")
+    key = options.get("api_key") or settings.openai_api_key
+    version = options.get("api_version") or "2024-02-15-preview"
+    return AzureChatOpenAI(
+        azure_endpoint=endpoint,
+        api_key=key,
+        api_version=version,
+        azure_deployment=str(deployment).replace("azure/", ""),
+        temperature=temperature,
+    )
+# Structured-output schema for node selection: the model's reasoning plus the
+# chosen node ids. Documented with comments rather than a class docstring
+# because pydantic would publish a docstring as the schema description.
+class NodeSelection(BaseModel):
+    # Free-text rationale returned alongside the selection.
+    thinking: str = Field(description="Brief reasoning")
+    # Selected outline ids; the "up to 8" limit is only stated in the
+    # description, not enforced by the schema.
+    node_list: list[str] = Field(description="Up to 8 node_id strings")
+def _structured_output(llm: Any, schema: type[BaseModel]) -> Any:
+    """Wrap ``llm`` so it returns ``schema`` instances via function calling.
+    The ``function_calling`` method is used to avoid OpenAI ``parsed``
+    serialization warnings.
+    """
+    structured = llm.with_structured_output(schema, method="function_calling")
+    return structured
+def select_relevant_nodes(
+    index: DocumentIndex,
+    query: str,
+    settings: AgentSettings,
+    *,
+    max_nodes: int = 8,
+) -> tuple[list[str], str]:
+    """Ask the LLM which outline nodes are most likely to answer ``query``.
+    Returns ``(node_ids, thinking)``. The ids are normalized, de-duplicated and
+    capped at ``max_nodes``. When the model returns no usable id, a lexical
+    title/summary match is used instead and ``thinking`` is tagged to say so.
+    """
+    from langchain_core.prompts import ChatPromptTemplate
+    llm = build_chat_model(settings)
+    system_text = (
+        "Pick outline node_ids most likely to answer the user question. "
+        f"Return at most {max_nodes} ids. "
+        "Prefer pages marked [table] or [figure] when the question is about "
+        "statistics, inflation, prices, economic data, or charts."
+    )
+    human_text = "Question:\n{query}\n\nOutline (node_id: title || summary):\n{nodes}"
+    prompt = ChatPromptTemplate.from_messages([("system", system_text), ("human", human_text)])
+    chain = prompt | _structured_output(llm, NodeSelection)
+    selection: NodeSelection = chain.invoke({"query": query, "nodes": outline_prompt_lines(index)})
+    picked: list[str] = []
+    for candidate in map(normalize_node_id, selection.node_list):
+        # Keep the first occurrence of each non-empty id.
+        if candidate and candidate not in picked:
+            picked.append(candidate)
+            if len(picked) >= max_nodes:
+                break
+    thinking = (selection.thinking or "").strip()
+    if picked:
+        return picked, thinking
+    picked = fallback_nodes_lexical(flatten_structure(index.structure), query, max_nodes)
+    if picked:
+        thinking = (thinking + " [fallback: lexical title match]").strip()
+    return picked, thinking
+def chunk_content(chunk: Any) -> str:
+    """Extract text from a LangChain stream chunk.
+    Message content may be a plain string or a list of content blocks. For a
+    list, plain strings and ``{"type": "text", "text": ...}`` blocks are
+    concatenated in order; other block types (tool calls, images) are ignored.
+    Returns ``""`` when the chunk has no textual content.
+    """
+    content = getattr(chunk, "content", None)
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        return ""
+    parts: list[str] = []
+    for block in content:
+        if isinstance(block, str):
+            parts.append(block)
+        elif isinstance(block, dict) and block.get("type", "text") == "text":
+            text = block.get("text")
+            if isinstance(text, str):
+                parts.append(text)
+    return "".join(parts)

chunksmith_agent/models.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""Agent index and answer types."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+@dataclass
+class DocumentIndex:
+    """Searchable document: a nested outline plus per-node media."""
+    doc_name: str
+    structure: list[dict[str, Any]]
+    media_by_node: dict[str, dict[str, Any]] = field(default_factory=dict)
+    canonical_bundle: dict[str, Any] | None = None
+    coded_formate: str | None = None
+    image_dir: str | None = None
+    def to_dict(self) -> dict[str, Any]:
+        """Return the fields as a plain dict (values are shared, not copied)."""
+        names = (
+            "doc_name",
+            "structure",
+            "media_by_node",
+            "canonical_bundle",
+            "coded_formate",
+            "image_dir",
+        )
+        return {name: getattr(self, name) for name in names}
+    @classmethod
+    def from_dict(cls, raw: dict[str, Any]) -> DocumentIndex:
+        """Build an index from a dict, replacing wrongly-typed values with defaults.
+        A string ``coded_formate`` inside a non-empty ``canonical_bundle``
+        takes precedence over the top-level ``coded_formate``.
+        """
+        def _typed(key: str, kind: type, fallback: Any) -> Any:
+            value = raw.get(key)
+            return value if isinstance(value, kind) else fallback
+        bundle = _typed("canonical_bundle", dict, None)
+        coded = raw.get("coded_formate")
+        if bundle and isinstance(bundle.get("coded_formate"), str):
+            coded = bundle.get("coded_formate")
+        return cls(
+            doc_name=str(raw.get("doc_name") or "document"),
+            structure=_typed("structure", list, []),
+            media_by_node=_typed("media_by_node", dict, {}),
+            canonical_bundle=bundle,
+            coded_formate=coded if isinstance(coded, str) else None,
+            image_dir=_typed("image_dir", str, None),
+        )
+@dataclass
+class AgentAnswer:
+    """Result of one agent question: the answer text plus retrieval metadata."""
+    # Final answer text.
+    answer: str
+    # Ids of the outline nodes the answer drew on.
+    nodes_used: list[str]
+    # Reasoning text from node selection (may be empty).
+    selection_thinking: str
+    # Image records associated with the nodes used.
+    images: list[dict[str, Any]]
+    # Extra diagnostic payload; empty by default.
+    raw_context: dict[str, Any] = field(default_factory=dict)

chunksmith_agent/retrieval.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""Agent Q&A entry points (delegates to the LangChain tool agent)."""
+from __future__ import annotations
+from typing import Any, Callable, Iterator
+from chunksmith_agent.index_context import select_images as _select_images
+from chunksmith_agent.models import AgentAnswer, DocumentIndex
+from chunksmith_agent.session import AgentConversation
+from chunksmith_agent.settings import AgentSettings
+def answer_question(
+    index: DocumentIndex,
+    query: str,
+    settings: AgentSettings,
+    *,
+    conversation: AgentConversation | None = None,
+    on_thinking_delta: Callable[[str], None] | None = None,
+    on_token: Callable[[str], None] | None = None,
+) -> AgentAnswer:
+    """Run the agent to completion and return the collected answer.
+    The event stream from ``iter_answer_events`` is drained while a sink
+    accumulates ``agent:token`` content into the answer, forwards
+    ``agent:thinking`` text to ``on_thinking_delta``, and records the node ids
+    from ``agent:complete``. Image events are disabled for the stream; images
+    are selected from the final node ids instead.
+    """
+    conv = AgentConversation() if conversation is None else conversation
+    # Mutable holder in place of `nonlocal` rebinding inside the sink.
+    state: dict[str, Any] = {"answer": "", "node_ids": []}
+    def _sink(name: str, payload: dict[str, Any]) -> None:
+        if name == "agent:token":
+            piece = payload.get("content") or ""
+            state["answer"] += piece
+            if on_token and piece:
+                on_token(piece)
+            return
+        if name == "agent:thinking":
+            note = str(payload.get("text") or "")
+            if on_thinking_delta and note:
+                on_thinking_delta(note)
+            return
+        if name == "agent:complete":
+            state["node_ids"] = list(payload.get("node_ids") or [])
+    events = iter_answer_events(
+        index,
+        query,
+        settings,
+        event_sink=_sink,
+        emit_image_events=False,
+        conversation=conv,
+    )
+    # The events themselves are ignored here; the sink does the collecting.
+    for _event in events:
+        pass
+    node_ids: list[str] = state["node_ids"]
+    return AgentAnswer(
+        answer=state["answer"],
+        nodes_used=node_ids,
+        selection_thinking="",
+        images=_select_images(index, node_ids),
+        raw_context={"node_ids": node_ids, "mode": "tools"},
+    )
+def iter_answer_events(
+    index: DocumentIndex,
+    query: str,
+    settings: AgentSettings,
+    *,
+    event_sink: Callable[[str, dict[str, Any]], None] | None = None,
+    emit_image_events: bool = True,
+    emit_table_events: bool = True,
+    conversation: AgentConversation | None = None,
+) -> Iterator[tuple[str, dict[str, Any]]]:
+    """Event stream for CLI — LangChain tool-calling agent.
+    Yields every ``(name, payload)`` event produced by
+    ``iter_tool_agent_events``. A fresh ``AgentConversation`` is used when
+    none is supplied.
+    """
+    active = AgentConversation() if conversation is None else conversation
+    # Imported at call time rather than at module import.
+    from chunksmith_agent.tool_agent import iter_tool_agent_events
+    events = iter_tool_agent_events(
+        index,
+        query,
+        settings,
+        conversation=active,
+        event_sink=event_sink,
+        emit_image_events=emit_image_events,
+        emit_table_events=emit_table_events,
+    )
+    yield from events

chunksmith_agent/session.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""Conversation memory for multi-turn agent chat."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+@dataclass
+class ChatTurn:
+    """One question/answer exchange kept in conversation memory."""
+    # The user's question for this turn.
+    query: str
+    # The assistant's answer text for this turn.
+    answer: str
+    # Outline node ids recorded for this turn.
+    node_ids: list[str]
+@dataclass
+class AgentConversation:
+    """In-session memory for one loaded document."""
+    turns: list[ChatTurn] = field(default_factory=list)
+    last_node_ids: list[str] = field(default_factory=list)
+    def _recent_turns(self, max_turns: int) -> list[ChatTurn]:
+        """Return the last ``max_turns`` turns; none when ``max_turns`` <= 0."""
+        # A plain ``turns[-0:]`` slice would return the whole history.
+        return self.turns[-max_turns:] if max_turns > 0 else []
+    def record(self, query: str, answer: str, node_ids: list[str]) -> None:
+        """Append a turn; ``last_node_ids`` is only replaced by a non-empty id list."""
+        ids = list(node_ids)  # copy so later caller mutations do not leak in
+        self.turns.append(ChatTurn(query=query, answer=answer, node_ids=ids))
+        if ids:
+            self.last_node_ids = ids
+    def chat_messages(self, *, max_turns: int = 6) -> list[dict[str, str]]:
+        """Return the most recent turns as role/content chat messages.
+        Each turn contributes a ``user`` message and, when its answer is not
+        blank, an ``assistant`` message. ``max_turns`` <= 0 yields ``[]``.
+        """
+        out: list[dict[str, str]] = []
+        for t in self._recent_turns(max_turns):
+            out.append({"role": "user", "content": t.query})
+            if t.answer.strip():
+                out.append({"role": "assistant", "content": t.answer})
+        return out
+    def recent_context(self, *, max_turns: int = 3) -> str:
+        """Return a ``User:``/``Assistant:`` transcript of the most recent turns.
+        Answers are cut to 400 characters. ``"(no prior turns)"`` is returned
+        when there is nothing to show, including when ``max_turns`` <= 0.
+        """
+        recent = self._recent_turns(max_turns)
+        if not recent:
+            return "(no prior turns)"
+        lines: list[str] = []
+        for t in recent:
+            lines.append(f"User: {t.query}")
+            lines.append(f"Assistant: {t.answer[:400]}")
+        return "\n".join(lines)

chunksmith_agent/settings.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Agent LLM settings (env-only; no chunksmith pipeline imports)."""
+from __future__ import annotations
+import os
+from dataclasses import dataclass, field
+from typing import Any
+from dotenv import load_dotenv
+def _clean_env(name: str) -> str | None:
+    """Read an environment variable, dropping a trailing inline comment.
+    A ``#`` starts a comment only at the beginning of the value or when it is
+    preceded by whitespace (the rule dotenv files use for unquoted values), so
+    values that contain ``#`` themselves are kept intact.
+    Returns:
+        The stripped value, or ``None`` when the variable is unset, blank, or
+        consists only of a comment.
+    """
+    raw = os.getenv(name)
+    if raw is None:
+        return None
+    value = str(raw).strip()
+    if value.startswith("#"):
+        return None
+    # Cutting at each marker in turn leaves the text before the earliest one.
+    for marker in (" #", "\t#"):
+        value = value.partition(marker)[0]
+    value = value.strip()
+    return value or None
+@dataclass(frozen=True)
+class AgentSettings:
+    """Immutable LLM configuration for the agent."""
+    # Model name resolved for the agent LLM (may carry an "azure/" prefix).
+    llm_model: str
+    # OpenAI API key, when one was found in the environment.
+    openai_api_key: str | None
+    # Base model name; also the fallback when no LLM model override is set.
+    pageindex_model: str
+    # Extra client kwargs (e.g. api_key, api_base, api_version); empty by default.
+    litellm_kwargs: dict[str, Any] = field(default_factory=dict)
+def _resolve_litellm(*, pageindex_model: str) -> tuple[str, dict[str, Any]]:
+    """Resolve the LLM model name and client kwargs from the environment.
+    The model comes from ``CHUNKSMITH_LLM_MODEL``, then ``LLM_MODEL``, then
+    ``pageindex_model``. For ``azure/`` models the Azure key, base URL
+    (trailing slashes removed) and API version are added when present; for
+    other models only the OpenAI key is added when present.
+    """
+    def _first(*names: str) -> str | None:
+        # First non-empty value among the given variable names.
+        for env_name in names:
+            found = _clean_env(env_name)
+            if found:
+                return found
+        return None
+    model = (_first("CHUNKSMITH_LLM_MODEL", "LLM_MODEL") or pageindex_model).strip()
+    if not model.startswith("azure/"):
+        openai_key = _first("OPENAI_API_KEY", "CHATGPT_API_KEY")
+        return model, ({"api_key": openai_key} if openai_key else {})
+    kwargs: dict[str, Any] = {}
+    azure_key = _first("AZURE_API_KEY", "AZURE_OPENAI_API_KEY")
+    azure_base = _first("AZURE_API_BASE", "AZURE_OPENAI_ENDPOINT")
+    azure_version = _first("AZURE_API_VERSION", "AZURE_OPENAI_API_VERSION")
+    if azure_key:
+        kwargs["api_key"] = azure_key
+    if azure_base:
+        kwargs["api_base"] = azure_base.rstrip("/")
+    if azure_version:
+        kwargs["api_version"] = azure_version
+    return model, kwargs
+def load_settings(*, pageindex_model: str | None = None) -> AgentSettings:
+    """Load agent LLM settings from environment (``.env`` supported).
+    Args:
+        pageindex_model: Explicit base model; when omitted, ``PAGEINDEX_MODEL``,
+            then ``LLM_MODEL``, then ``gpt-4o-2024-11-20`` is used.
+    Raises:
+        ValueError: An ``azure/`` model lacks an API key or base URL, or a
+            non-Azure model has no OpenAI API key.
+    """
+    load_dotenv()
+    chosen = (
+        pageindex_model
+        or _clean_env("PAGEINDEX_MODEL")
+        or _clean_env("LLM_MODEL")
+        or "gpt-4o-2024-11-20"
+    )
+    base_model = str(chosen).strip()
+    llm_model, litellm_kwargs = _resolve_litellm(pageindex_model=base_model)
+    openai_key = _clean_env("OPENAI_API_KEY") or _clean_env("CHATGPT_API_KEY")
+    if llm_model.startswith("azure/"):
+        azure_ready = litellm_kwargs.get("api_key") and litellm_kwargs.get("api_base")
+        if not azure_ready:
+            raise ValueError(f"Azure model {llm_model!r} requires AZURE_API_KEY and AZURE_API_BASE in .env")
+    elif not (openai_key or litellm_kwargs.get("api_key")):
+        raise ValueError("Missing OPENAI_API_KEY in .env")
+    return AgentSettings(
+        llm_model=llm_model,
+        openai_api_key=openai_key,
+        pageindex_model=base_model,
+        litellm_kwargs=litellm_kwargs,
+    )