chunksmith-pageindex 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. chunksmith_pageindex-0.3.0/PKG-INFO +19 -0
  2. chunksmith_pageindex-0.3.0/pyproject.toml +32 -0
  3. chunksmith_pageindex-0.3.0/setup.cfg +4 -0
  4. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/__init__.py +6 -0
  5. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/config.py +75 -0
  6. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/config_defaults.py +11 -0
  7. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/exceptions.py +13 -0
  8. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/__init__.py +5 -0
  9. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/client.py +170 -0
  10. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/doc_description.py +51 -0
  11. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/extractor.py +96 -0
  12. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/prompts.py +56 -0
  13. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/llm_config.py +43 -0
  14. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/parser/__init__.py +5 -0
  15. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/parser/chunker.py +65 -0
  16. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/parser/document.py +66 -0
  17. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/__init__.py +5 -0
  18. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/builder.py +127 -0
  19. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/node_text.py +84 -0
  20. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/physical_index.py +114 -0
  21. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/schema.py +50 -0
  22. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/tree_transform.py +98 -0
  23. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/PKG-INFO +19 -0
  24. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/SOURCES.txt +25 -0
  25. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/dependency_links.txt +1 -0
  26. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/requires.txt +11 -0
  27. chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/top_level.txt +1 -0
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunksmith-pageindex
3
+ Version: 0.3.0
4
+ Summary: ChunkSmith PDF page-index outline pipeline (no Unstructured partition).
5
+ Author-email: AnshulParate2004 <anshulnparate@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/AnshulParate2004/chunksmith-lib
8
+ Project-URL: Repository, https://github.com/AnshulParate2004/chunksmith-lib
9
+ Project-URL: Changelog, https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: chunksmith-core>=0.3.0
12
+ Provides-Extra: pdf
13
+ Requires-Dist: pymupdf>=1.24.0; extra == "pdf"
14
+ Requires-Dist: PyPDF2>=3.0.0; extra == "pdf"
15
+ Provides-Extra: llm
16
+ Requires-Dist: httpx>=0.27.0; extra == "llm"
17
+ Requires-Dist: tiktoken>=0.7.0; extra == "llm"
18
+ Requires-Dist: langchain-core>=0.3.28; extra == "llm"
19
+ Requires-Dist: langchain-litellm>=0.2.0; extra == "llm"
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "chunksmith-pageindex"
7
+ version = "0.3.0"
8
+ description = "ChunkSmith PDF page-index outline pipeline (no Unstructured partition)."
9
+ requires-python = ">=3.10"
10
+ license = "MIT"
11
+ authors = [{ name = "AnshulParate2004", email = "anshulnparate@gmail.com" }]
12
+ dependencies = [
13
+ "chunksmith-core>=0.3.0",
14
+ ]
15
+
16
+ [project.optional-dependencies]
17
+ pdf = ["pymupdf>=1.24.0", "PyPDF2>=3.0.0"]
18
+ llm = [
19
+ "httpx>=0.27.0",
20
+ "tiktoken>=0.7.0",
21
+ "langchain-core>=0.3.28",
22
+ "langchain-litellm>=0.2.0",
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://github.com/AnshulParate2004/chunksmith-lib"
27
+ Repository = "https://github.com/AnshulParate2004/chunksmith-lib"
28
+ Changelog = "https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md"
29
+
30
+ [tool.setuptools.packages.find]
31
+ where = ["src"]
32
+ include = ["chunksmith_pageindex", "chunksmith_pageindex.*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,6 @@
1
+ """ChunkSmith PageIndexer (PDF parser + LLM outline)."""
2
+
3
+ from chunksmith_pageindex.config import RuntimeSettings, load_settings
4
+ from chunksmith_pageindex.tree.builder import build_outline_from_pdf
5
+
6
+ __all__ = ["RuntimeSettings", "load_settings", "build_outline_from_pdf"]
@@ -0,0 +1,75 @@
1
+ """Runtime settings: env vars + Python defaults (no secrets in defaults module)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+
9
+ from dotenv import load_dotenv
10
+
11
+ from chunksmith_pageindex.config_defaults import DEFAULTS
12
+ from chunksmith_pageindex.llm_config import _clean_env, resolve_litellm_config
13
+
14
+
15
def _env_bool(name: str, default: bool) -> bool:
    """Read a boolean flag from the environment.

    Returns ``default`` when the variable is unset or set to the empty string.
    Otherwise the value is stripped and lower-cased, and only ``1``, ``true``,
    ``yes`` and ``on`` count as true; any other text is false.
    """
    raw = os.getenv(name)
    if not raw:
        return default
    truthy = ("1", "true", "yes", "on")
    return raw.strip().lower() in truthy
20
+
21
+
22
@dataclass(frozen=True)
class RuntimeSettings:
    """Immutable snapshot of the configuration resolved by ``load_settings``."""

    # Model identifier passed to the chat client.
    llm_model: str
    # OpenAI-style API key, or None when none was found in the environment.
    openai_api_key: str | None
    # Model name used for outline extraction and token counting.
    pageindex_model: str
    # PDF backend name; the loader accepts "PyPDF2" or "PyMuPDF".
    pdf_parser: str
    # Token budget used when grouping tagged pages into LLM chunks.
    max_tokens_per_chunk: int
    # Number of pages repeated between consecutive chunks.
    overlap_pages: int
    # Whether a one-sentence document description should be generated.
    generate_doc_summary: bool
    # Extra keyword arguments forwarded to the chat model constructor.
    litellm_kwargs: dict[str, Any] = field(default_factory=dict)
32
+
33
+
34
def _default_values(overrides: dict[str, Any] | None = None) -> dict[str, Any]:
    """Return a fresh copy of ``DEFAULTS`` with ``overrides`` layered on top.

    ``DEFAULTS`` itself is never mutated; a falsy ``overrides`` leaves the
    copy unchanged.
    """
    combined = {**DEFAULTS}
    combined.update(overrides or {})
    return combined
39
+
40
+
41
def load_settings(*, defaults: dict[str, Any] | None = None) -> RuntimeSettings:
    """Build :class:`RuntimeSettings` from the environment and defaults.

    ``load_dotenv()`` is called first. Each value is then resolved in the
    order: environment variable, ``defaults`` merged over the packaged
    ``DEFAULTS``, hard-coded fallback.

    Args:
        defaults: Optional overrides merged on top of ``DEFAULTS``.

    Returns:
        The resolved, immutable settings.

    Raises:
        ValueError: If the credentials required for the selected model are
            missing, if an integer setting cannot be parsed, or if
            ``max_tokens_per_chunk`` resolves to a non-positive number.
    """
    load_dotenv()
    y = _default_values(defaults)

    openai_key = _clean_env("OPENAI_API_KEY") or _clean_env("CHATGPT_API_KEY")
    # Every model variable goes through _clean_env so blank values, stray
    # whitespace and inline comments are treated the same for all of them.
    model = (
        _clean_env("CHUNKSMITH_LLM_MODEL")
        or _clean_env("PAGEINDEX_MODEL")
        or _clean_env("LLM_MODEL")
        or y.get("model")
        or "gpt-4o-2024-11-20"
    )
    pdf_parser = _clean_env("CHUNKSMITH_PDF_PARSER") or y.get("pdf_parser") or "PyPDF2"

    max_tokens = int(_clean_env("CHUNKSMITH_MAX_TOKENS_PER_CHUNK") or y.get("max_tokens_per_chunk") or 20000)
    if max_tokens <= 0:
        # The chunker divides by this value; reject it here with a clear message.
        raise ValueError(f"max_tokens_per_chunk must be a positive integer, got {max_tokens}")

    # An overlap of 0 pages is a valid choice (it is clamped with max(0, ...)
    # below), so test for "missing" explicitly instead of relying on truthiness,
    # which would silently turn a configured 0 into 1.
    overlap_raw: Any = _clean_env("CHUNKSMITH_OVERLAP_PAGES")
    if overlap_raw is None:
        overlap_raw = y.get("overlap_pages")
    overlap = 1 if overlap_raw is None or overlap_raw == "" else int(overlap_raw)

    gen_doc = _env_bool("CHUNKSMITH_GENERATE_DOC_SUMMARY", bool(y.get("generate_doc_summary", False)))

    litellm = resolve_litellm_config(pageindex_model=str(model).strip())

    if litellm.model.startswith("azure/"):
        if not (_clean_env("AZURE_API_KEY") and _clean_env("AZURE_API_BASE")):
            raise ValueError(f"Azure model {litellm.model!r} requires AZURE_API_KEY and AZURE_API_BASE in .env")
    elif not openai_key:
        raise ValueError("Missing OPENAI_API_KEY in .env")

    return RuntimeSettings(
        llm_model=litellm.model,
        openai_api_key=openai_key,
        pageindex_model=str(model).strip(),
        pdf_parser=str(pdf_parser).strip(),
        max_tokens_per_chunk=max_tokens,
        overlap_pages=max(0, overlap),
        generate_doc_summary=gen_doc,
        litellm_kwargs=dict(litellm.kwargs),
    )
@@ -0,0 +1,11 @@
1
+ """Non-secret defaults (override with env vars — see repo .env.example)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ DEFAULTS: dict[str, object] = {
6
+ "model": "gpt-4o-2024-11-20",
7
+ "pdf_parser": "PyPDF2",
8
+ "max_tokens_per_chunk": 20_000,
9
+ "overlap_pages": 1,
10
+ "generate_doc_summary": False,
11
+ }
@@ -0,0 +1,13 @@
1
+ """Domain-specific errors for the page-index outline pipeline."""
2
+
3
+
4
class ChunksmithError(Exception):
    """Root of the package's exception hierarchy."""


class PdfLoadError(ChunksmithError):
    """Raised when a PDF cannot be opened or read."""


class OutlineExtractionError(ChunksmithError):
    """Raised when the model output is unusable or ``finish_reason`` is not ``finished``."""
@@ -0,0 +1,5 @@
1
+ """LLM client and outline extractors."""
2
+
3
+ from chunksmith_pageindex.indexer import client, doc_description, extractor, prompts
4
+
5
+ __all__ = ["client", "doc_description", "extractor", "prompts"]
@@ -0,0 +1,170 @@
1
+ """ChatLiteLLM client, token counting, and JSON extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import time
8
+ from typing import Any, List, Optional, Tuple
9
+
10
+ import tiktoken
11
+ from langchain_core.messages import AIMessage, HumanMessage
12
+ from langchain_litellm import ChatLiteLLM
13
+
14
+ from chunksmith_pageindex.config import RuntimeSettings
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ _USAGE_TOTALS: dict[str, int] = {
19
+ "calls": 0,
20
+ "prompt_tokens": 0,
21
+ "completion_tokens": 0,
22
+ "total_tokens": 0,
23
+ }
24
+
25
+
26
def reset_usage_totals() -> None:
    """Zero every counter in the module-level usage totals, in place."""
    _USAGE_TOTALS.update(dict.fromkeys(_USAGE_TOTALS, 0))
29
+
30
+
31
def get_usage_totals() -> dict[str, int]:
    """Return a copy of the usage totals so callers cannot mutate the shared state."""
    return _USAGE_TOTALS.copy()
33
+
34
+
35
def _record_usage(response: Any) -> None:
    """Add a response's token usage to the module-level totals.

    Usage is read from ``response.response_metadata`` under ``token_usage``
    or, failing that, ``usage``. When neither is present nothing is recorded,
    not even the call itself.
    """
    metadata = getattr(response, "response_metadata", None) or {}
    usage = metadata.get("token_usage") or metadata.get("usage")
    if not usage:
        return
    _USAGE_TOTALS["calls"] += 1
    for counter in ("prompt_tokens", "completion_tokens", "total_tokens"):
        # "or 0" guards against providers that report an explicit null.
        _USAGE_TOTALS[counter] += int(usage.get(counter, 0) or 0)
44
+
45
+
46
def _tiktoken_encoding_for(model: str | None) -> tiktoken.Encoding:
    """Pick the tiktoken encoding for ``model``.

    Only the part after the last ``/`` is looked up (``gpt-4o`` when ``model``
    is empty). Models tiktoken does not know fall back to ``cl100k_base``.
    """
    name = model if model else "gpt-4o"
    short_name = name.rsplit("/", 1)[-1]
    try:
        encoding = tiktoken.encoding_for_model(short_name)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    return encoding
52
+
53
+
54
def count_tokens(text: str | None, model: str | None = None) -> int:
    """Return the number of tokens in ``text`` for ``model``; 0 for empty or ``None`` text."""
    if text:
        return len(_tiktoken_encoding_for(model).encode(text))
    return 0
59
+
60
+
61
def build_chat_model(settings: RuntimeSettings, model: str | None = None) -> ChatLiteLLM:
    """Create a ``ChatLiteLLM`` client with temperature 0.

    ``model`` overrides ``settings.llm_model`` when given; the keyword
    arguments stored in ``settings.litellm_kwargs`` are forwarded unchanged.
    """
    chosen_model = model if model else settings.llm_model
    return ChatLiteLLM(model=chosen_model, temperature=0, **settings.litellm_kwargs)
67
+
68
+
69
def _to_langchain_messages(
    prompt: str,
    chat_history: Optional[List[dict]] = None,
) -> list[HumanMessage | AIMessage]:
    """Convert role/content history rows plus the new prompt into LangChain messages.

    Rows whose role is ``assistant`` (case-insensitive, after stripping) become
    ``AIMessage``; every other row becomes ``HumanMessage``. The prompt is
    always appended last as a ``HumanMessage``.
    """

    def _convert(row: dict) -> HumanMessage | AIMessage:
        is_assistant = str(row.get("role") or "").strip().lower() == "assistant"
        body = str(row.get("content") or "")
        return AIMessage(content=body) if is_assistant else HumanMessage(content=body)

    converted: list[HumanMessage | AIMessage] = [_convert(row) for row in (chat_history or [])]
    converted.append(HumanMessage(content=prompt))
    return converted
85
+
86
+
87
def _finish_reason_from_response(response: Any) -> str:
    """Map the provider's stop reason onto ``finished`` / ``max_output_reached``.

    The reason is read from ``response.response_metadata`` (``finish_reason``
    first, then ``stop_reason``). Only the exact strings ``length`` and
    ``max_tokens`` mean the output was truncated; anything else, including a
    missing reason, is reported as ``finished``.
    """
    metadata = getattr(response, "response_metadata", None) or {}
    reason = metadata.get("finish_reason") or metadata.get("stop_reason")
    truncated = isinstance(reason, str) and reason in ("length", "max_tokens")
    return "max_output_reached" if truncated else "finished"
95
+
96
+
97
def ChatGPT_API_with_finish_reason(
    settings: RuntimeSettings,
    model: str,
    prompt: str,
    chat_history: Optional[List[dict]] = None,
) -> Tuple[str, str]:
    """Run one chat completion with retries and report how it finished.

    Args:
        settings: Runtime settings used to build the chat client.
        model: Model override; a falsy value selects ``settings.llm_model``.
        prompt: The user message for this call.
        chat_history: Optional earlier role/content rows sent before ``prompt``.

    Returns:
        ``(text, finish_reason)``. After 10 failed attempts, with a one-second
        pause between them, the placeholder pair ``("Error", "error")`` is
        returned instead of raising.
    """
    max_retries = 10
    llm = build_chat_model(settings, model=model or None)
    messages = _to_langchain_messages(prompt, chat_history)
    for attempt in range(1, max_retries + 1):
        try:
            response = llm.invoke(messages)
            _record_usage(response)
            meta = getattr(response, "response_metadata", None) or {}
            # _record_usage counts the call only when it finds usage under
            # "token_usage" or "usage". Count it here in exactly the remaining
            # case, so a response carrying only "usage" is not counted twice.
            if not (meta.get("token_usage") or meta.get("usage")):
                _USAGE_TOTALS["calls"] += 1
            text = response.content if isinstance(response.content, str) else str(response.content or "")
            return text, _finish_reason_from_response(response)
        except Exception as e:
            logger.warning("Chat completion retry %s: %s", attempt, e)
            if attempt < max_retries:
                time.sleep(1)
    logger.error("Max retries reached for chat completion")
    return "Error", "error"
122
+
123
+
124
def ChatGPT_API(
    settings: RuntimeSettings,
    model: str,
    prompt: str,
    chat_history: Optional[List[dict]] = None,
) -> str:
    """Return only the completion text, discarding the finish reason."""
    return ChatGPT_API_with_finish_reason(settings, model, prompt, chat_history)[0]
132
+
133
+
134
def extract_json(content: str) -> Any:
    """Parse JSON out of an LLM reply, tolerating common formatting slips.

    If the reply contains a ```` ```json ```` fence, only the fenced body is
    parsed; a reply cut off before its closing fence is parsed up to the end.
    Parsing is attempted on progressively more heavily repaired text:

    1. the text exactly as received, so string values stay verbatim;
    2. with bare ``None`` turned into ``null`` and trailing commas removed,
       both only outside string literals;
    3. the legacy normalisation (global ``None`` to ``null``, whitespace
       collapsed), kept so previously accepted replies still parse;
    4. the legacy text with trailing commas removed.

    Args:
        content: Raw assistant text.

    Returns:
        The decoded JSON value, or ``{}`` when nothing could be parsed.
    """
    import re  # local import: the module does not import ``re`` at file level

    try:
        fence_open = content.find("```json")
        if fence_open != -1:
            body_start = fence_open + len("```json")
            fence_close = content.rfind("```", body_start)
            # Without a closing fence (truncated reply) keep everything after
            # the opening fence instead of slicing to an empty string.
            raw = content[body_start:] if fence_close == -1 else content[body_start:fence_close]
        else:
            raw = content
        raw = raw.strip()
    except Exception:
        logger.exception("Unexpected error while extracting JSON")
        return {}

    def _fix_outside_strings(match: "re.Match[str]") -> str:
        token = match.group(0)
        if token.startswith('"'):
            return token  # string literal: leave untouched
        return "null" if token == "None" else ""  # bare None, or a trailing comma

    repaired = re.sub(
        r'"(?:\\.|[^"\\])*"|\bNone\b|,(?=\s*[\]}])',
        _fix_outside_strings,
        raw,
        flags=re.DOTALL,
    )
    legacy = " ".join(raw.replace("None", "null").split())
    legacy_no_trailing = re.sub(r",\s*([\]}])", r"\1", legacy)

    last_error: Exception | None = None
    for candidate in (raw, repaired, legacy, legacy_no_trailing):
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError) as exc:
            last_error = exc
    logger.error("Failed to parse JSON from model output", exc_info=last_error)
    return {}
158
+
159
+
160
def llm_completion(
    settings: RuntimeSettings,
    model: str | None,
    prompt: str,
    chat_history: Optional[List[dict]] = None,
    return_finish_reason: bool = False,
):
    """Run a completion, optionally returning the finish reason as well.

    Returns the text alone by default, or a ``(text, finish_reason)`` tuple
    when ``return_finish_reason`` is true. A falsy ``model`` selects
    ``settings.llm_model``.
    """
    chosen_model = model or settings.llm_model
    call = ChatGPT_API_with_finish_reason if return_finish_reason else ChatGPT_API
    return call(settings, chosen_model, prompt, chat_history=chat_history)
@@ -0,0 +1,51 @@
1
+ """One-shot document description after the outline tree is built (separate LLM call from TOC extraction)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any
8
+
9
+ from chunksmith_pageindex.config import RuntimeSettings
10
+ from chunksmith_pageindex.indexer import prompts
11
+ from chunksmith_pageindex.indexer.client import llm_completion
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def create_clean_structure_for_description(structure: Any) -> Any:
    """
    Reduce an outline tree to the fields needed for the description prompt.

    Dict nodes keep only ``title``, ``node_id`` and ``summary`` (when present)
    plus a recursively cleaned ``nodes`` list when that list is non-empty.
    Lists are cleaned element by element; any other value is returned as is.
    """
    if isinstance(structure, list):
        return [create_clean_structure_for_description(child) for child in structure]
    if not isinstance(structure, dict):
        return structure

    kept: dict[str, Any] = {
        key: structure[key] for key in ("title", "node_id", "summary") if key in structure
    }
    nested = structure.get("nodes")
    if isinstance(nested, list) and nested:
        kept["nodes"] = [create_clean_structure_for_description(child) for child in nested]
    return kept
33
+
34
+
35
def generate_doc_description(
    settings: RuntimeSettings,
    structure: Any,
    *,
    model: str | None = None,
) -> str:
    """
    Ask the model for a description of the document from its cleaned outline.

    This is a single completion with no chat history. The stripped reply is
    returned as is; an empty reply or the ``Error`` placeholder is only logged
    as a warning, not raised.
    """
    slim_tree = create_clean_structure_for_description(structure)
    prompt = prompts.build_doc_description_prompt(
        json.dumps(slim_tree, ensure_ascii=False, indent=2)
    )
    description = (llm_completion(settings, model, prompt, chat_history=None) or "").strip()
    if description in ("", "Error"):
        logger.warning("Document description LLM returned empty or error placeholder")
    return description
@@ -0,0 +1,96 @@
1
+ """LLM calls for init/continue outline extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any, List
8
+
9
+ from chunksmith_pageindex.config import RuntimeSettings
10
+ from chunksmith_pageindex.exceptions import OutlineExtractionError
11
+ from chunksmith_pageindex.indexer import client
12
+ from chunksmith_pageindex.indexer.prompts import (
13
+ ANCHOR_INSTRUCTION,
14
+ SUMMARY_INSTRUCTION,
15
+ TOC_CONTINUE_SYSTEM,
16
+ TOC_INIT_SYSTEM,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def _outline_extra_instructions(*, include_summary: bool, include_word_range: bool) -> str:
    """Build the optional instruction suffix for the outline prompts.

    Each enabled instruction is stripped and prefixed with a newline, summary
    first, then anchor. Returns an empty string when neither is enabled.
    """
    suffix = ""
    if include_summary:
        suffix += "\n" + SUMMARY_INSTRUCTION.strip()
    if include_word_range:
        suffix += "\n" + ANCHOR_INSTRUCTION.strip()
    return suffix
31
+
32
+
33
def format_given_text_block(part: str) -> str:
    """Return ``part`` prefixed with the ``Given text`` marker used in the outline prompts."""
    return "\nGiven text\n:{}".format(part)
36
+
37
+
38
def _rows_from_llm_response(response: str) -> List[dict[str, Any]]:
    """Extract outline rows from assistant text.

    Accepts either an object whose ``sections`` value is a list or a bare JSON
    array; any other shape yields an empty list.
    """
    parsed = client.extract_json(response)
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict) and "sections" in parsed:
        sections = parsed["sections"]
        if isinstance(sections, list):
            return sections
    return []
48
+
49
+
50
def generate_toc_init(
    settings: RuntimeSettings,
    part: str,
    model: str | None = None,
    *,
    include_summary: bool = False,
    include_word_range: bool = False,
) -> List[dict[str, Any]]:
    """Extract outline rows from the first excerpt of a document.

    Args:
        settings: Runtime settings; ``pageindex_model`` is used when ``model`` is falsy.
        part: Tagged text of the first excerpt.
        model: Optional model override.
        include_summary: Append the per-section summary instruction.
        include_word_range: Append the section-anchor instruction.

    Returns:
        The flat list of rows parsed from the model output.

    Raises:
        OutlineExtractionError: If the completion did not finish normally or
            no rows could be parsed from it.
    """
    chosen_model = model or settings.pageindex_model
    extras = _outline_extra_instructions(
        include_summary=include_summary,
        include_word_range=include_word_range,
    )
    prompt = "".join((TOC_INIT_SYSTEM.strip(), extras, format_given_text_block(part)))

    text, status = client.llm_completion(settings, chosen_model, prompt, return_finish_reason=True)
    if status != "finished":
        raise OutlineExtractionError(f"generate_toc_init finish_reason={status!r}")
    sections = _rows_from_llm_response(text)
    if sections:
        return sections
    raise OutlineExtractionError("generate_toc_init: no sections in model output")
72
+
73
+
74
def generate_toc_continue(
    settings: RuntimeSettings,
    toc_content: List[dict[str, Any]],
    part: str,
    model: str | None = None,
    *,
    include_summary: bool = False,
    include_word_range: bool = False,
) -> List[dict[str, Any]]:
    """Extract the additional outline rows contributed by a later excerpt.

    The prompt holds the continue instructions, the current excerpt, and the
    outline gathered so far serialised as indented JSON.

    Args:
        settings: Runtime settings; ``pageindex_model`` is used when ``model`` is falsy.
        toc_content: Rows collected from the earlier excerpts.
        part: Tagged text of the current excerpt.
        model: Optional model override.
        include_summary: Append the per-section summary instruction.
        include_word_range: Append the section-anchor instruction.

    Returns:
        The rows parsed from the model output for this excerpt.

    Raises:
        OutlineExtractionError: If the completion did not finish normally or
            no rows could be parsed from it.
    """
    chosen_model = model or settings.pageindex_model
    extras = _outline_extra_instructions(
        include_summary=include_summary,
        include_word_range=include_word_range,
    )
    prompt = "".join(
        (
            TOC_CONTINUE_SYSTEM.strip(),
            extras,
            format_given_text_block(part),
            "\nPrevious outline (JSON)\n:",
            json.dumps(toc_content, indent=2),
        )
    )

    text, status = client.llm_completion(settings, chosen_model, prompt, return_finish_reason=True)
    if status != "finished":
        raise OutlineExtractionError(f"generate_toc_continue finish_reason={status!r}")
    sections = _rows_from_llm_response(text)
    if sections:
        return sections
    raise OutlineExtractionError("generate_toc_continue: no sections in model output")
@@ -0,0 +1,56 @@
1
+ """Prompt templates for flat outline extraction (no-TOC path)."""
2
+
3
+ # Appended to init/continue instructions when ``add_summary`` is enabled (same completion as outline rows).
4
+ SUMMARY_INSTRUCTION = """
5
+ Each row must also include "summary" (string): one or two concise sentences describing what that section covers,
6
+ using only information visible in the given excerpt (no outside knowledge)."""
7
+
8
+ # Appended when ``add_word_range=True``: one verbatim alignment string per row (no word indices).
9
+ ANCHOR_INSTRUCTION = """
10
+ When section anchors are requested in the **same** API response, every row must include the usual outline keys
11
+ (structure, title, physical_index, and "summary" only if that was also requested) **plus**:
12
+
13
+ • ``split_document_anchor`` (string) — a **short verbatim substring** copied from the **Given text** excerpt below
14
+ (the text after ``Given text`` + newline + ``:``). Use text at or immediately after where that section begins
15
+ in this excerpt (e.g. heading like ``1 Introduction``, ``Abstract``, ``3.2 Attention``). Used to locate and
16
+ verify the section start in the excerpt; it must appear exactly as in the Given text.
17
+
18
+ Apply this on **every** row for this message. For **continue** calls, anchors refer only to the **current** excerpt
19
+ (the new Given text in this message), not to prior parts."""
20
+
21
+ TOC_INIT_SYSTEM = """
22
+ You are an expert in extracting hierarchical tree structure. Generate the tree structure of the document.
23
+ structure: numeric index, e.g. "1", "1.1", "1.2", "1.2.1", "1.2.2", "1.2.3", "1.2.4", "1.2.5", "1.2.6", "1.2.7", "1.2.8", "1.2.9", "1.2.10".
24
+ Tags <physical_index_X> in the given text mark where PDF page X begins. Long PDFs are split into multiple excerpts; this message is the first excerpt only.
25
+ Several sections may start on the same page—that is normal; set each row's physical_index from the tag nearest that section's start.
26
+ **Important:** ``physical_index`` is always the **printed PDF page** from those tags only—never infer it from section numbers in headings (e.g. a section titled ``3 Model Architecture`` may start on page 2 if the ``<physical_index_2>`` block contains that heading).
27
+ Each section row must include: "structure" (string), "title" (string), and "physical_index"
28
+ (either an integer start page or a string tag like "<physical_index_5>").
29
+ If the prompt also includes appended instructions (e.g. per-section summaries or ``split_document_anchor`` on the excerpt),
30
+ include those fields on every row as specified there.
31
+ Return JSON only: a markdown code block ```json containing either a JSON array of rows in reading order,
32
+ or an object {"sections": [ ...rows... ]}."""
33
+
34
+ TOC_CONTINUE_SYSTEM = """
35
+ You are an expert in extracting hierarchical tree structure.
36
+ You are given the previous outline as JSON and the text of the current part of the document.
37
+ Continue the outline: add new rows only for sections that appear in the current part.
38
+ structure: numeric index, e.g. "1", "1.1", "1.2", "1.2.1", "1.2.2", "1.2.3", "1.2.4", "1.2.5", "1.2.6", "1.2.7", "1.2.8", "1.2.9", "1.2.10".
39
+ Tags <physical_index_X> in the given text mark where PDF page X begins. The start of this excerpt often overlaps the end of the prior excerpt (the same pages can appear again)—use the tags to assign physical_index and do **not** duplicate sections already present in the previous outline JSON.
40
+ Several sections may start on the same page; that is normal.
41
+ Each row: "structure" (string), "title" (string), "physical_index" (int or "<physical_index_N>" string).
42
+ ``physical_index`` must be the PDF page from ``<physical_index_N>`` tags only, not from numeric prefixes in section titles.
43
+ When ``split_document_anchor`` is requested, it must be copied from **this** message's Given text excerpt only.
44
+ If the prompt also includes appended instructions (e.g. summaries or anchors), include those fields on each new row.
45
+ Return JSON only: ```json with either an array of only the additional rows, or {"sections": [ ... ]}."""
46
+
47
+
48
+ def build_doc_description_prompt(structure_json: str) -> str:
49
+ """User message for a standalone completion after the TOC tree exists (not mixed with outline extraction)."""
50
+ return f"""You are an expert in generating descriptions for a document.
51
+ You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.
52
+
53
+ Document Structure:
54
+ {structure_json}
55
+
56
+ Directly return the description, do not include any other text."""
@@ -0,0 +1,43 @@
1
+ """LiteLLM / ChatLiteLLM env resolution for PageIndexer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+
9
+
10
def _clean_env(name: str) -> str | None:
    """Read an environment variable, dropping any inline ``#`` comment and whitespace.

    Returns ``None`` when the variable is unset or nothing remains after cleaning.
    """
    raw = os.getenv(name)
    if raw is None:
        return None
    before_comment, _, _ = str(raw).partition("#")
    cleaned = before_comment.strip()
    return cleaned if cleaned else None
16
+
17
+
18
@dataclass(frozen=True)
class LiteLLMConfig:
    """Resolved model name plus the keyword arguments for the chat client."""

    # Model identifier after environment resolution.
    model: str
    # Credentials and endpoint options (api_key, api_base, api_version) found in the environment.
    kwargs: dict[str, Any]
22
+
23
+
24
def resolve_litellm_config(*, pageindex_model: str) -> LiteLLMConfig:
    """Resolve the model name and client keyword arguments from the environment.

    ``CHUNKSMITH_LLM_MODEL`` overrides ``pageindex_model``. Models starting
    with ``azure/`` take their key, base URL (trailing ``/`` removed) and API
    version from the ``AZURE_API_*`` variables. All other models use
    ``OPENAI_API_KEY``, falling back to ``CHATGPT_API_KEY``. Variables that
    are not set are simply left out of the keyword arguments.
    """
    resolved_model = (_clean_env("CHUNKSMITH_LLM_MODEL") or pageindex_model).strip()
    options: dict[str, Any] = {}

    if resolved_model.startswith("azure/"):
        key = _clean_env("AZURE_API_KEY")
        base = _clean_env("AZURE_API_BASE")
        version = _clean_env("AZURE_API_VERSION")
        if key:
            options["api_key"] = key
        if base:
            options["api_base"] = base.rstrip("/")
        if version:
            options["api_version"] = version
    else:
        key = _clean_env("OPENAI_API_KEY") or _clean_env("CHATGPT_API_KEY")
        if key:
            options["api_key"] = key

    return LiteLLMConfig(model=resolved_model, kwargs=options)
@@ -0,0 +1,5 @@
1
+ """PDF loading and page tagging."""
2
+
3
+ from chunksmith_pageindex.parser import chunker, document
4
+
5
+ __all__ = ["chunker", "document"]
@@ -0,0 +1,65 @@
1
+ """Tag pages with physical_index markers and split into token-budget groups."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from typing import List, Tuple
7
+
8
+ from chunksmith_pageindex.indexer.client import count_tokens
9
+
10
+
11
def build_tagged_page_strings(
    page_list: List[Tuple[str, int]],
    *,
    start_index: int = 1,
    model: str | None = None,
) -> Tuple[List[str], List[int]]:
    """
    Surround each page's text with ``<physical_index_N>`` markers.

    Pages are numbered from ``start_index``. The token count carried in
    ``page_list`` is ignored; counts are recomputed on the tagged strings.

    Returns the tagged strings and their token counts as two parallel lists.
    """
    tagged_pages = [
        f"<physical_index_{number}>\n{text}\n<physical_index_{number}>\n\n"
        for number, (text, _) in enumerate(page_list, start=start_index)
    ]
    tagged_lengths = [count_tokens(tagged, model) for tagged in tagged_pages]
    return tagged_pages, tagged_lengths
30
+
31
+
32
def page_list_to_group_text(
    page_contents: List[str],
    token_lengths: List[int],
    *,
    max_tokens: int = 20000,
    overlap_page: int = 1,
) -> List[str]:
    """
    Split tagged page strings into groups of roughly ``max_tokens`` tokens.

    When everything fits in ``max_tokens`` a single group is returned.
    Otherwise pages are accumulated until adding the next one would exceed a
    per-group budget (the mean of ``max_tokens`` and the even share per
    expected group). Each new group starts with the last ``overlap_page``
    pages of the previous one.

    Args:
        page_contents: Tagged page strings, in reading order.
        token_lengths: Token count of each entry in ``page_contents``.
        max_tokens: Upper token target per group; must be positive when the
            document has to be split.
        overlap_page: Number of preceding pages repeated at the start of each
            new group.

    Returns:
        The concatenated text of each group; never contains an empty group
        produced by an oversized first page.

    Raises:
        ValueError: If the document must be split and ``max_tokens`` is not positive.
    """
    total_tokens = sum(token_lengths)
    if total_tokens <= max_tokens:
        return ["".join(page_contents)]
    if max_tokens <= 0:
        # Guard the division below and the otherwise meaningless budget.
        raise ValueError(f"max_tokens must be positive, got {max_tokens!r}")

    expected_parts = math.ceil(total_tokens / max_tokens)
    budget = math.ceil(((total_tokens / expected_parts) + max_tokens) / 2)

    groups: List[str] = []
    current: List[str] = []
    current_tokens = 0
    for i, (page, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        # Only close a group that actually holds pages: when the very first
        # page alone exceeds the budget there is nothing to flush yet, and
        # flushing would emit an empty group.
        if current and current_tokens + page_tokens > budget:
            groups.append("".join(current))
            overlap_start = max(i - overlap_page, 0)
            current = list(page_contents[overlap_start:i])
            current_tokens = sum(token_lengths[overlap_start:i])
        current.append(page)
        current_tokens += page_tokens

    if current:
        groups.append("".join(current))
    return groups
@@ -0,0 +1,66 @@
1
+ """Load PDF pages as (text, token_length) pairs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from io import BytesIO
7
+ from pathlib import Path
8
+ from typing import List, Tuple, Union
9
+
10
+ import PyPDF2
11
+ import pymupdf
12
+
13
+ from chunksmith_pageindex.config import RuntimeSettings
14
+ from chunksmith_pageindex.exceptions import PdfLoadError
15
+ from chunksmith_pageindex.indexer.client import count_tokens
16
+
17
+ PdfSource = Union[str, Path, BytesIO]
18
+
19
+
20
def load_pdf_pages(source: PdfSource, settings: RuntimeSettings) -> List[Tuple[str, int]]:
    """
    Return ``(text, token_count)`` for every page of the PDF.

    The backend is chosen by ``settings.pdf_parser``, which must be exactly
    ``PyPDF2`` or ``PyMuPDF``. Tokens are counted for
    ``settings.pageindex_model``.

    Raises ``PdfLoadError`` for any other parser name.
    """
    parser = settings.pdf_parser
    model = settings.pageindex_model

    loaders = {"PyPDF2": _load_pypdf2, "PyMuPDF": _load_pymupdf}
    loader = loaders.get(parser)
    if loader is None:
        raise PdfLoadError(f"Unsupported pdf_parser: {parser!r} (use PyPDF2 or PyMuPDF)")
    return loader(source, model)
34
+
35
+
36
def _load_pypdf2(source: PdfSource, model: str) -> List[Tuple[str, int]]:
    """Read every page with PyPDF2 and pair its text with a token count.

    A ``str``/``Path`` source must name an existing file, otherwise
    ``PdfLoadError`` is raised; any other source is handed to ``PdfReader``
    unchanged. Pages with no extractable text yield an empty string.
    """
    target = source
    if isinstance(source, (str, Path)):
        target = str(source)
        if not os.path.isfile(target):
            raise PdfLoadError(f"Not a file: {target}")
    reader = PyPDF2.PdfReader(target)

    page_texts = (page.extract_text() or "" for page in reader.pages)
    return [(text, count_tokens(text, model)) for text in page_texts]
49
+
50
+
51
def _load_pymupdf(source: PdfSource, model: str) -> List[Tuple[str, int]]:
    """Read every page with PyMuPDF and pair its text with a token count.

    A ``str``/``Path`` source must name an existing file, otherwise
    ``PdfLoadError`` is raised; any other source is opened as a PDF stream.
    The document is closed once its pages have been read, even on error.
    """
    if not isinstance(source, (str, Path)):
        doc = pymupdf.open(stream=source, filetype="pdf")
    else:
        location = str(source)
        if not os.path.isfile(location):
            raise PdfLoadError(f"Not a file: {location}")
        doc = pymupdf.open(location)

    try:
        page_texts = (page.get_text() or "" for page in doc)
        return [(text, count_tokens(text, model)) for text in page_texts]
    finally:
        doc.close()
@@ -0,0 +1,5 @@
1
+ """Flat row validation and nested tree assembly."""
2
+
3
+ from chunksmith_pageindex.tree import builder, schema
4
+
5
+ __all__ = ["builder", "schema"]
@@ -0,0 +1,127 @@
1
+ """Merge flat TOC rows, convert indices, build nested tree, assign node IDs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ from pathlib import Path
7
+ from typing import Any, Optional, Union
8
+
9
+ from chunksmith_pageindex.config import RuntimeSettings, load_settings
10
+ from chunksmith_pageindex.exceptions import OutlineExtractionError
11
+ from chunksmith_pageindex.indexer import doc_description, extractor
12
+ from chunksmith_pageindex.parser import chunker, document
13
+ from chunksmith_pageindex.tree import schema
14
+ from chunksmith_pageindex.tree.node_text import add_node_text
15
+ from chunksmith_pageindex.tree.physical_index import (
16
+ convert_physical_index_to_int,
17
+ refine_physical_index_from_excerpt_tags,
18
+ refine_physical_index_from_pdf_page_text,
19
+ )
20
+ from chunksmith_pageindex.tree.tree_transform import (
21
+ add_preface_if_needed,
22
+ post_processing,
23
+ write_node_id,
24
+ )
25
+
26
+
27
def build_outline_from_pdf(
    pdf_path: Union[str, Path],
    settings: Optional[RuntimeSettings] = None,
    *,
    add_text: bool = False,
    assign_node_ids: bool = False,
    add_summary: bool = False,
    add_word_range: bool = False,
    generate_doc_summary: Optional[bool] = None,
) -> dict[str, Any]:
    """
    End-to-end no-TOC outline: PDF → tagged chunks → LLM flat rows → tree.

    ``settings`` falls back to ``load_settings()`` when omitted. The result is a dict with
    ``doc_name`` (the PDF file name) and ``structure`` (the nested outline), plus
    ``doc_description`` when document summarisation is enabled (``generate_doc_summary``, or
    ``settings.generate_doc_summary`` when that argument is ``None``).

    Optional enrichments:

    * ``add_summary`` – keep each row's ``summary``.
    * ``add_word_range`` – each row also has ``split_document_anchor`` (verbatim excerpt substring
      at the section start) and ``chunk_excerpt_index`` (which LLM chunk produced that row).
    * ``assign_node_ids`` – number the nodes via ``write_node_id``.
    * ``add_text`` – attach page text to each node via ``add_node_text``.

    Raises ``OutlineExtractionError`` when no pages are extracted or row validation fails.
    """
    settings = settings or load_settings()
    path = Path(pdf_path).resolve()
    page_list = document.load_pdf_pages(path, settings)
    if not page_list:
        raise OutlineExtractionError("No pages extracted from PDF")

    tagged, lengths = chunker.build_tagged_page_strings(page_list, start_index=1, model=settings.pageindex_model)
    groups = chunker.page_list_to_group_text(
        tagged,
        lengths,
        max_tokens=settings.max_tokens_per_chunk,
        overlap_page=settings.overlap_pages,
    )

    # First chunk seeds the outline; later chunks extend it with the rows gathered so far as context.
    init_rows = extractor.generate_toc_init(
        settings,
        groups[0],
        include_summary=add_summary,
        include_word_range=add_word_range,
    )
    chunk_of_row: list[int] = [0] * len(init_rows)
    flat = list(init_rows)
    for chunk_idx, part in enumerate(groups[1:], start=1):
        cont_rows = extractor.generate_toc_continue(
            settings,
            flat,
            part,
            include_summary=add_summary,
            include_word_range=add_word_range,
        )
        chunk_of_row += [chunk_idx] * len(cont_rows)
        flat += cont_rows

    convert_physical_index_to_int(flat)

    # Keep ``chunk_of_row`` aligned with ``flat`` if a synthetic Preface row was prepended.
    rows_before_preface = len(chunk_of_row)
    flat = add_preface_if_needed(flat, include_summary=add_summary)
    if len(flat) == rows_before_preface + 1:
        chunk_of_row.insert(0, 0)
        if add_word_range and flat and str(flat[0].get("title", "")).strip() == "Preface":
            flat[0].setdefault("split_document_anchor", "Preface")

    # Drop rows without a page index, filtering both parallel lists together.
    kept = [(chunk, row) for chunk, row in zip(chunk_of_row, flat) if row.get("physical_index") is not None]
    chunk_of_row = [chunk for chunk, _ in kept]
    flat = [row for _, row in kept]

    refine_physical_index_from_excerpt_tags(flat, chunk_of_row, groups)
    refine_physical_index_from_pdf_page_text(flat, page_list)

    try:
        validated = schema.validate_toc_rows(flat)
        use_rows: list[dict[str, Any]] = []
        for position, model_row in enumerate(validated):
            dumped = model_row.model_dump()
            if add_word_range:
                dumped["chunk_excerpt_index"] = chunk_of_row[position]
            use_rows.append(dumped)
    except Exception as e:
        raise OutlineExtractionError(f"TOC row validation failed: {e}") from e

    # Strip optional fields the caller did not ask for.
    for dumped in use_rows:
        if not add_summary:
            dumped.pop("summary", None)
        if not add_word_range:
            dumped.pop("split_document_anchor", None)
            dumped.pop("chunk_excerpt_index", None)

    tree = post_processing(copy.deepcopy(use_rows), len(page_list))

    if assign_node_ids:
        write_node_id(tree)

    if add_text:
        add_node_text(tree, page_list)

    if generate_doc_summary is None:
        do_doc_summary = settings.generate_doc_summary
    else:
        do_doc_summary = generate_doc_summary

    out: dict[str, Any] = {"doc_name": path.name, "structure": tree}
    if do_doc_summary:
        out["doc_description"] = doc_description.generate_doc_description(
            settings, tree, model=settings.pageindex_model
        )
    return out
@@ -0,0 +1,84 @@
1
+ """Fill outline node ``text`` from PDF pages using ``start_index`` / ``end_index``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, List
6
+
7
+
8
+ def _needle_in_blob(node: dict[str, Any], blob: str) -> str:
9
+ """Prefer ``split_document_anchor``, then ``title``, first substring that appears in ``blob``."""
10
+ for key in ("split_document_anchor", "title"):
11
+ v = str(node.get(key) or "").strip()
12
+ if v and v in blob:
13
+ return v
14
+ return ""
15
+
16
+
17
def page_span_plain_text(
    start_1b: int,
    end_1b_inclusive: int,
    pdf_pages: List[tuple[str, int]],
) -> str:
    """
    Concatenate extracted text for PDF pages ``start_1b`` … ``end_1b_inclusive`` (1-based, inclusive).

    Reversed bounds are swapped, page numbers outside the document are ignored, and page texts are
    joined without any separator.
    """
    low, high = int(start_1b), int(end_1b_inclusive)
    if high < low:
        low, high = high, low
    # Convert to a 0-based half-open slice; clamping at 0 keeps negative numbers from wrapping around.
    first = max(low - 1, 0)
    stop = max(high, 0)
    return "".join(page[0] for page in pdf_pages[first:stop])
32
+
33
+
34
def iter_outline_nodes_preorder(tree: Any) -> List[dict[str, Any]]:
    """
    Collect every dict node of ``tree`` in depth-first pre-order.

    ``tree`` may be a single node dict or a (nested) list of them; children are read from each
    node's ``nodes`` key when it holds a list. Anything that is neither a dict nor a list is skipped.
    """
    visited: List[dict[str, Any]] = []
    pending: List[Any] = [tree]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            visited.append(current)
            children = current.get("nodes")
            if isinstance(children, list):
                # Reversed so the first child is popped (and therefore visited) first.
                pending.extend(reversed(children))
        elif isinstance(current, list):
            pending.extend(reversed(current))
    return visited
51
+
52
+
53
def add_node_text(tree: Any, pdf_pages: List[tuple[str, int]]) -> None:
    """
    Attach a ``text`` entry to every outline node that has both ``start_index`` and ``end_index``.

    The node's page span (inclusive) is flattened to plain text and stripped. When the node's own
    needle (anchor or title) occurs in that text, the result starts there and stops at the needle
    of the next node in pre-order if that one is found further on, so sections sharing a page do
    not repeat each other's bodies. Without a matching own needle the whole span text is used.
    Nodes are modified in place.
    """
    ordered = iter_outline_nodes_preorder(tree)
    last_position = len(ordered) - 1
    for position, node in enumerate(ordered):
        first_page = node.get("start_index")
        last_page = node.get("end_index")
        if first_page is None or last_page is None:
            continue

        span_text = page_span_plain_text(int(first_page), int(last_page), pdf_pages).strip()
        if not span_text:
            node["text"] = ""
            continue

        own_needle = _needle_in_blob(node, span_text)
        if not own_needle:
            node["text"] = span_text
            continue

        following_needle = ""
        if position < last_position:
            following_needle = _needle_in_blob(ordered[position + 1], span_text)

        begin = span_text.find(own_needle)
        cut = -1
        if following_needle:
            # Look strictly after this node's heading so an identical needle cannot match itself.
            cut = span_text.find(following_needle, begin + max(1, len(own_needle)))
        stop = cut if cut >= 0 else len(span_text)
        node["text"] = span_text[begin:stop].strip()
@@ -0,0 +1,114 @@
1
+ """Physical index refinement from excerpt tags and PDF page text."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, List
6
+
7
+
8
def convert_physical_index_to_int(data: List[dict[str, Any]]) -> None:
    """
    Normalize ``physical_index`` strings like ``<physical_index_5>`` to ints in-place.

    For every dict in ``data`` that has a string ``physical_index``:

    * ``<physical_index_N>`` / ``physical_index_N`` (surrounding whitespace tolerated) becomes ``N``;
    * a bare integer string such as ``"5"`` becomes ``5``;
    * a tag whose number cannot be parsed becomes ``None`` instead of raising, so callers that
      filter out rows without a page index drop it rather than abort;
    * any other string is left unchanged.

    Non-dict items, items without the key, and non-string values are not touched.
    """
    tag_prefixes = ("<physical_index_", "physical_index_")
    for item in data:
        if not isinstance(item, dict) or "physical_index" not in item:
            continue
        raw = item["physical_index"]
        if not isinstance(raw, str):
            continue

        text = raw.strip()
        if text.startswith(tag_prefixes):
            # The page number is whatever follows the last underscore, minus the closing bracket.
            number = text.rsplit("_", 1)[-1].rstrip(">").strip()
            try:
                item["physical_index"] = int(number)
            except ValueError:
                item["physical_index"] = None
            continue

        try:
            parsed = int(text)
        except ValueError:
            # Not a tag and not a number: keep the original value for the caller to judge.
            continue
        item["physical_index"] = parsed
19
+
20
+
21
def physical_page_at_char(tagged_excerpt: str, char_index: int) -> int:
    """
    Return the page number in effect at ``char_index`` of a tagged excerpt.

    Walks the ``<physical_index_N>`` tags that start before ``char_index`` (clamped to the excerpt
    bounds) and returns the number of the last one; the result is 1 when no tag precedes the
    position. Scanning stops at a tag with no closing ``>``.

    Raises ``ValueError`` if the text between a tag prefix and the next ``>`` is not an integer.
    """
    marker = "<physical_index_"
    stop = max(0, min(len(tagged_excerpt), char_index))
    page = 1
    cursor = tagged_excerpt.find(marker)
    while 0 <= cursor < stop:
        closing = tagged_excerpt.find(">", cursor)
        if closing == -1:
            break
        page = int(tagged_excerpt[cursor + len(marker) : closing])
        # Jump straight to the next tag after this one.
        cursor = tagged_excerpt.find(marker, closing + 1)
    return page
37
+
38
+
39
def find_row_start_in_excerpt(excerpt: str, row: dict[str, Any], search_from: int) -> int:
    """
    Locate where ``row`` starts inside ``excerpt``, searching from ``search_from`` onwards.

    The stripped ``split_document_anchor`` is tried first and the stripped ``title`` second; the
    offset of the first hit is returned, or ``-1`` when neither is present or found.
    """
    for key in ("split_document_anchor", "title"):
        needle = str(row.get(key) or "").strip()
        if not needle:
            continue
        offset = excerpt.find(needle, search_from)
        if offset >= 0:
            return offset
    return -1
51
+
52
+
53
def refine_physical_index_from_excerpt_tags(
    rows: List[dict[str, Any]],
    chunk_idx_per_row: List[int],
    groups: List[str],
) -> None:
    """
    Overwrite each row's ``physical_index`` with the page tag found at its position in its chunk.

    ``chunk_idx_per_row[i]`` names the entry of ``groups`` (tagged excerpt text) that produced
    ``rows[i]``. Within one chunk, rows are searched left to right so a later row never matches
    before an earlier one; the search position resets whenever the chunk changes. Rows are left
    untouched when they have no ``physical_index``, when their chunk index is invalid, when
    neither anchor nor title is found, or when a tag cannot be parsed. Does nothing if the two
    lists differ in length. Rows are modified in place.
    """
    if not rows or len(rows) != len(chunk_idx_per_row):
        return

    active_chunk = -1
    cursor = 0
    for position, row in enumerate(rows):
        chunk_index = chunk_idx_per_row[position]
        if not isinstance(row, dict) or row.get("physical_index") is None:
            continue
        if not isinstance(chunk_index, int) or not 0 <= chunk_index < len(groups):
            continue

        if chunk_index != active_chunk:
            active_chunk = chunk_index
            cursor = 0
        excerpt = groups[chunk_index]

        offset = find_row_start_in_excerpt(excerpt, row, cursor)
        if offset < 0:
            continue
        try:
            row["physical_index"] = physical_page_at_char(excerpt, offset)
        except ValueError:
            # Unparseable tag: keep the existing page and do not advance the cursor.
            continue
        cursor = offset + 1
79
+
80
+
81
def refine_physical_index_from_pdf_page_text(
    rows: List[dict[str, Any]],
    page_list: List[tuple[str, int]],
) -> None:
    """
    Re-anchor each row's ``physical_index`` on the first page whose text contains its needle.

    Needles are the stripped ``split_document_anchor`` and ``title`` (deduplicated). The search
    starts at the page matched for the previous row, so assigned pages never move backwards.
    Rows without a ``physical_index``, without needles, or without any matching page keep their
    current value. Rows are modified in place; page numbers are 1-based.
    """
    if not rows or not page_list:
        return

    bodies = [entry[0] for entry in page_list]
    floor_page = 1
    for row in rows:
        if not isinstance(row, dict) or row.get("physical_index") is None:
            continue

        anchor = str(row.get("split_document_anchor") or "").strip()
        title = str(row.get("title") or "").strip()
        # dict.fromkeys removes a title identical to the anchor while keeping anchor-first order.
        needles = [text for text in dict.fromkeys((anchor, title)) if text]
        if not needles:
            continue

        first_candidate = max(0, floor_page - 1)
        match = next(
            (
                page_idx + 1
                for page_idx in range(first_candidate, len(bodies))
                if any(needle in bodies[page_idx] for needle in needles)
            ),
            None,
        )
        if match is not None:
            row["physical_index"] = match
            floor_page = match
@@ -0,0 +1,50 @@
1
+ """Pydantic models for LLM flat rows and optional nested outline nodes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, List, Optional
6
+
7
+ from pydantic import AliasChoices, BaseModel, ConfigDict, Field, TypeAdapter
8
+
9
+
10
class TocFlatRow(BaseModel):
    """
    One outline row (LLM output).

    Keys that are not declared below are discarded during validation (``extra="ignore"``).
    """

    model_config = ConfigDict(extra="ignore")

    # Required hierarchy key of the row.
    # NOTE(review): presumably dotted numbering such as "1.2" — confirm against the prompts.
    structure: str
    title: str = ""
    # Page reference: a string, an int, or missing.
    physical_index: str | int | None = None
    # Filled when ``add_summary=True`` on ``build_outline_from_pdf`` (same outline API calls).
    summary: str = ""
    # When ``add_word_range=True``: verbatim substring from the excerpt at the section start.
    # The value is also accepted under the two alternative input keys listed below.
    split_document_anchor: str = Field(
        default="",
        validation_alias=AliasChoices(
            "split_document_anchor",
            "document_anchor_word",
            "split_from_document_word",
        ),
    )
29
+
30
+
31
def validate_toc_rows(raw: Any) -> List[TocFlatRow]:
    """
    Validate ``raw`` as a list of ``TocFlatRow`` models.

    Returns ``[]`` when ``raw`` is not a list; otherwise the validated models, with pydantic's
    validation error propagating if any row is invalid.
    """
    if isinstance(raw, list):
        return TypeAdapter(List[TocFlatRow]).validate_python(raw)
    return []
36
+
37
+
38
class OutlineNode(BaseModel):
    """
    Nested node after ``post_processing`` / ``list_to_tree`` (optional validation).

    Keys that are not declared below are kept on the model (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    title: str = ""
    # Page span of the section.
    # NOTE(review): presumably 1-based and inclusive, as consumed by ``add_node_text`` — confirm.
    start_index: Optional[int] = None
    end_index: Optional[int] = None
    # Optional enrichments; each stays ``None`` when absent from the input.
    node_id: Optional[str] = None
    summary: Optional[str] = None
    split_document_anchor: Optional[str] = None
    text: Optional[str] = None
    # Child sections (recursive); ``None`` when the key is absent.
    nodes: Optional[List[OutlineNode]] = None
@@ -0,0 +1,98 @@
1
+ """Flat TOC row → nested tree transforms."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, List
6
+
7
+
8
def add_preface_if_needed(data: List[dict[str, Any]], *, include_summary: bool = False) -> List[dict[str, Any]]:
    """
    Prepend a synthetic "Preface" row when the first row starts after page 1.

    The row is inserted into ``data`` in place (structure ``"0"``, ``physical_index`` 1, plus an
    empty ``summary`` when ``include_summary`` is set) and the same list object is returned.
    Empty input, or a first row whose ``physical_index`` is ``None`` or not greater than 1,
    is returned unchanged.
    """
    if not data:
        return data

    first_page = data[0].get("physical_index")
    if first_page is not None and first_page > 1:
        preface: dict[str, Any] = {
            "structure": "0",
            "title": "Preface",
            "physical_index": 1,
            **({"summary": ""} if include_summary else {}),
        }
        data[:0] = [preface]
    return data
17
+
18
+
19
def list_to_tree(data: List[dict[str, Any]]) -> List[dict[str, Any]]:
    """
    Nest flat outline rows into a tree using their dotted ``structure`` keys.

    Each row becomes a node with ``title``, ``start_index`` and ``end_index``, plus ``summary``
    (when the key exists on the row), ``split_document_anchor`` (when non-empty after stripping)
    and ``chunk_excerpt_index`` (when not ``None``, coerced to int). A row whose parent key
    (everything before the last ``.``) was seen earlier is attached under that parent's ``nodes``;
    every other row becomes a root. Nodes without children carry no ``nodes`` key.
    """
    by_structure: dict[str, dict[str, Any]] = {}
    roots: List[dict[str, Any]] = []

    for item in data:
        raw_key = item.get("structure")
        key = "" if raw_key is None else str(raw_key)

        node: dict[str, Any] = {
            "title": item.get("title"),
            "start_index": item.get("start_index"),
            "end_index": item.get("end_index"),
            "nodes": [],
        }
        if "summary" in item:
            node["summary"] = str(item.get("summary") or "").strip()
        anchor = str(item.get("split_document_anchor") or "").strip()
        if anchor:
            node["split_document_anchor"] = anchor
        excerpt_index = item.get("chunk_excerpt_index")
        if excerpt_index is not None:
            node["chunk_excerpt_index"] = int(excerpt_index)
        by_structure[key] = node

        # "1.2.3" -> parent "1.2"; a key without a dot (or with an empty prefix) has no parent.
        parent_key, dot, _ = key.rpartition(".")
        parent = by_structure.get(parent_key) if dot and parent_key else None
        if parent is not None:
            parent["nodes"].append(node)
        else:
            roots.append(node)

    # Remove the placeholder "nodes" list from every leaf reachable from the roots.
    pending = list(roots)
    while pending:
        current = pending.pop()
        children = current.get("nodes")
        if children:
            pending.extend(children)
        else:
            current.pop("nodes", None)
    return roots
66
+
67
+
68
def post_processing(structure: List[dict[str, Any]], end_physical_index: int) -> List[dict[str, Any]]:
    """
    Derive page spans for flat rows and nest them into a tree.

    Every row gets ``start_index`` from its ``physical_index``. Its ``end_index`` is the next
    row's ``physical_index`` (minus one when that next row has ``appear_start == "yes"``); the
    last row ends at ``end_physical_index``. Rows are modified in place. The nested result of
    ``list_to_tree`` is returned when it is non-empty; otherwise the flat rows are returned with
    ``appear_start`` and ``physical_index`` removed.
    """
    last_position = len(structure) - 1
    for position, item in enumerate(structure):
        item["start_index"] = item.get("physical_index")
        if position == last_position:
            item["end_index"] = end_physical_index
            continue
        following = structure[position + 1]
        starts_page = following.get("appear_start") == "yes"
        next_page = following["physical_index"]
        item["end_index"] = next_page - 1 if starts_page else next_page

    nested = list_to_tree(structure)
    if not nested:
        for row in structure:
            row.pop("appear_start", None)
            row.pop("physical_index", None)
        return structure
    return nested
85
+
86
+
87
def write_node_id(data: Any, node_id: int = 0) -> int:
    """
    Number every dict node of ``data`` in pre-order and return the next unused id.

    Each node receives ``node_id`` as a zero-padded four-character string; numbering starts at the
    given ``node_id`` and descends into ``nodes`` lists. Lists are walked element by element and
    any other value is ignored. Nodes are modified in place.
    """
    if isinstance(data, list):
        for entry in data:
            node_id = write_node_id(entry, node_id)
        return node_id
    if not isinstance(data, dict):
        return node_id

    data["node_id"] = str(node_id).zfill(4)
    next_id = node_id + 1
    children = data.get("nodes")
    if isinstance(children, list):
        for child in children:
            next_id = write_node_id(child, next_id)
    return next_id
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunksmith-pageindex
3
+ Version: 0.3.0
4
+ Summary: ChunkSmith PDF page-index outline pipeline (no Unstructured partition).
5
+ Author-email: AnshulParate2004 <anshulnparate@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/AnshulParate2004/chunksmith-lib
8
+ Project-URL: Repository, https://github.com/AnshulParate2004/chunksmith-lib
9
+ Project-URL: Changelog, https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: chunksmith-core>=0.3.0
12
+ Provides-Extra: pdf
13
+ Requires-Dist: pymupdf>=1.24.0; extra == "pdf"
14
+ Requires-Dist: PyPDF2>=3.0.0; extra == "pdf"
15
+ Provides-Extra: llm
16
+ Requires-Dist: httpx>=0.27.0; extra == "llm"
17
+ Requires-Dist: tiktoken>=0.7.0; extra == "llm"
18
+ Requires-Dist: langchain-core>=0.3.28; extra == "llm"
19
+ Requires-Dist: langchain-litellm>=0.2.0; extra == "llm"
@@ -0,0 +1,25 @@
1
+ pyproject.toml
2
+ src/chunksmith_pageindex/__init__.py
3
+ src/chunksmith_pageindex/config.py
4
+ src/chunksmith_pageindex/config_defaults.py
5
+ src/chunksmith_pageindex/exceptions.py
6
+ src/chunksmith_pageindex/llm_config.py
7
+ src/chunksmith_pageindex.egg-info/PKG-INFO
8
+ src/chunksmith_pageindex.egg-info/SOURCES.txt
9
+ src/chunksmith_pageindex.egg-info/dependency_links.txt
10
+ src/chunksmith_pageindex.egg-info/requires.txt
11
+ src/chunksmith_pageindex.egg-info/top_level.txt
12
+ src/chunksmith_pageindex/indexer/__init__.py
13
+ src/chunksmith_pageindex/indexer/client.py
14
+ src/chunksmith_pageindex/indexer/doc_description.py
15
+ src/chunksmith_pageindex/indexer/extractor.py
16
+ src/chunksmith_pageindex/indexer/prompts.py
17
+ src/chunksmith_pageindex/parser/__init__.py
18
+ src/chunksmith_pageindex/parser/chunker.py
19
+ src/chunksmith_pageindex/parser/document.py
20
+ src/chunksmith_pageindex/tree/__init__.py
21
+ src/chunksmith_pageindex/tree/builder.py
22
+ src/chunksmith_pageindex/tree/node_text.py
23
+ src/chunksmith_pageindex/tree/physical_index.py
24
+ src/chunksmith_pageindex/tree/schema.py
25
+ src/chunksmith_pageindex/tree/tree_transform.py
@@ -0,0 +1,11 @@
1
+ chunksmith-core>=0.3.0
2
+
3
+ [llm]
4
+ httpx>=0.27.0
5
+ tiktoken>=0.7.0
6
+ langchain-core>=0.3.28
7
+ langchain-litellm>=0.2.0
8
+
9
+ [pdf]
10
+ pymupdf>=1.24.0
11
+ PyPDF2>=3.0.0