chunksmith-pageindex 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_pageindex-0.3.0/PKG-INFO +19 -0
- chunksmith_pageindex-0.3.0/pyproject.toml +32 -0
- chunksmith_pageindex-0.3.0/setup.cfg +4 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/__init__.py +6 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/config.py +75 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/config_defaults.py +11 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/exceptions.py +13 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/__init__.py +5 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/client.py +170 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/doc_description.py +51 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/extractor.py +96 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/indexer/prompts.py +56 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/llm_config.py +43 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/parser/__init__.py +5 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/parser/chunker.py +65 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/parser/document.py +66 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/__init__.py +5 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/builder.py +127 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/node_text.py +84 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/physical_index.py +114 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/schema.py +50 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex/tree/tree_transform.py +98 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/PKG-INFO +19 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/SOURCES.txt +25 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/dependency_links.txt +1 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/requires.txt +11 -0
- chunksmith_pageindex-0.3.0/src/chunksmith_pageindex.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chunksmith-pageindex
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: ChunkSmith PDF page-index outline pipeline (no Unstructured partition).
|
|
5
|
+
Author-email: AnshulParate2004 <anshulnparate@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/AnshulParate2004/chunksmith-lib
|
|
8
|
+
Project-URL: Repository, https://github.com/AnshulParate2004/chunksmith-lib
|
|
9
|
+
Project-URL: Changelog, https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: chunksmith-core>=0.3.0
|
|
12
|
+
Provides-Extra: pdf
|
|
13
|
+
Requires-Dist: pymupdf>=1.24.0; extra == "pdf"
|
|
14
|
+
Requires-Dist: PyPDF2>=3.0.0; extra == "pdf"
|
|
15
|
+
Provides-Extra: llm
|
|
16
|
+
Requires-Dist: httpx>=0.27.0; extra == "llm"
|
|
17
|
+
Requires-Dist: tiktoken>=0.7.0; extra == "llm"
|
|
18
|
+
Requires-Dist: langchain-core>=0.3.28; extra == "llm"
|
|
19
|
+
Requires-Dist: langchain-litellm>=0.2.0; extra == "llm"
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "chunksmith-pageindex"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "ChunkSmith PDF page-index outline pipeline (no Unstructured partition)."
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
authors = [{ name = "AnshulParate2004", email = "anshulnparate@gmail.com" }]
|
|
12
|
+
dependencies = [
|
|
13
|
+
"chunksmith-core>=0.3.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
pdf = ["pymupdf>=1.24.0", "PyPDF2>=3.0.0"]
|
|
18
|
+
llm = [
|
|
19
|
+
"httpx>=0.27.0",
|
|
20
|
+
"tiktoken>=0.7.0",
|
|
21
|
+
"langchain-core>=0.3.28",
|
|
22
|
+
"langchain-litellm>=0.2.0",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/AnshulParate2004/chunksmith-lib"
|
|
27
|
+
Repository = "https://github.com/AnshulParate2004/chunksmith-lib"
|
|
28
|
+
Changelog = "https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md"
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
where = ["src"]
|
|
32
|
+
include = ["chunksmith_pageindex", "chunksmith_pageindex.*"]
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"""ChunkSmith PageIndexer (PDF parser + LLM outline)."""
|
|
2
|
+
|
|
3
|
+
from chunksmith_pageindex.config import RuntimeSettings, load_settings
|
|
4
|
+
from chunksmith_pageindex.tree.builder import build_outline_from_pdf
|
|
5
|
+
|
|
6
|
+
__all__ = ["RuntimeSettings", "load_settings", "build_outline_from_pdf"]
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Runtime settings: env vars + Python defaults (no secrets in defaults module)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
|
|
11
|
+
from chunksmith_pageindex.config_defaults import DEFAULTS
|
|
12
|
+
from chunksmith_pageindex.llm_config import _clean_env, resolve_litellm_config
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _env_bool(name: str, default: bool) -> bool:
|
|
16
|
+
v = os.getenv(name)
|
|
17
|
+
if v is None or v == "":
|
|
18
|
+
return default
|
|
19
|
+
return v.strip().lower() in ("1", "true", "yes", "on")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class RuntimeSettings:
    """Immutable bundle of the configuration values resolved at start-up."""

    # Model identifier handed to the chat client.
    llm_model: str
    # OpenAI-style API key, or None when no key was found.
    openai_api_key: str | None
    # Model name used for outline extraction and token counting.
    pageindex_model: str
    # Name of the PDF text-extraction backend.
    pdf_parser: str
    # Token budget used when grouping pages into chunks.
    max_tokens_per_chunk: int
    # Number of pages repeated between consecutive chunks.
    overlap_pages: int
    # Whether a document-level description should be generated.
    generate_doc_summary: bool
    # Extra keyword arguments forwarded to the chat client constructor.
    litellm_kwargs: dict[str, Any] = field(default_factory=dict)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _default_values(overrides: dict[str, Any] | None = None) -> dict[str, Any]:
    """Return a fresh copy of ``DEFAULTS`` with ``overrides`` layered on top.

    ``DEFAULTS`` itself is never mutated; an empty or missing ``overrides``
    yields a plain copy.
    """
    combined = dict(DEFAULTS)
    if not overrides:
        return combined
    combined.update(overrides)
    return combined
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def load_settings(*, defaults: dict[str, Any] | None = None) -> RuntimeSettings:
    """Resolve runtime settings from the environment and the defaults table.

    Environment variables take precedence over ``defaults``, which in turn
    override the packaged ``DEFAULTS``.

    Args:
        defaults: Optional overrides merged on top of ``DEFAULTS``.

    Returns:
        A frozen ``RuntimeSettings`` instance.

    Raises:
        ValueError: If an ``azure/`` model is selected without both
            ``AZURE_API_KEY`` and ``AZURE_API_BASE``, if a non-Azure model is
            selected without an OpenAI key, or if a numeric setting cannot be
            converted with ``int``.
    """
    load_dotenv()
    y = _default_values(defaults)

    openai_key = _clean_env("OPENAI_API_KEY") or _clean_env("CHATGPT_API_KEY")
    model = (
        _clean_env("CHUNKSMITH_LLM_MODEL")
        or os.getenv("PAGEINDEX_MODEL")
        or os.getenv("LLM_MODEL")
        or y.get("model")
        or "gpt-4o-2024-11-20"
    )
    pdf_parser = os.getenv("CHUNKSMITH_PDF_PARSER") or y.get("pdf_parser") or "PyPDF2"
    max_tokens = int(os.getenv("CHUNKSMITH_MAX_TOKENS_PER_CHUNK") or y.get("max_tokens_per_chunk") or 20000)

    # 0 is a legitimate "no overlap" value (see the max(0, ...) clamp below),
    # so fall back to 1 only when nothing was configured, not when a configured
    # default happens to be falsy.
    overlap_raw: Any = os.getenv("CHUNKSMITH_OVERLAP_PAGES")
    if not overlap_raw:
        overlap_raw = y.get("overlap_pages")
    overlap = 1 if overlap_raw in (None, "") else int(overlap_raw)

    gen_doc = _env_bool("CHUNKSMITH_GENERATE_DOC_SUMMARY", bool(y.get("generate_doc_summary", False)))

    litellm = resolve_litellm_config(pageindex_model=str(model).strip())

    # Credentials are validated up front so a misconfiguration fails here
    # rather than on the first LLM call.
    if litellm.model.startswith("azure/"):
        if not (_clean_env("AZURE_API_KEY") and _clean_env("AZURE_API_BASE")):
            raise ValueError(f"Azure model {litellm.model!r} requires AZURE_API_KEY and AZURE_API_BASE in .env")
    elif not openai_key:
        raise ValueError("Missing OPENAI_API_KEY in .env")

    return RuntimeSettings(
        llm_model=litellm.model,
        openai_api_key=openai_key,
        pageindex_model=str(model).strip(),
        pdf_parser=pdf_parser.strip(),
        max_tokens_per_chunk=max_tokens,
        overlap_pages=max(0, overlap),
        generate_doc_summary=gen_doc,
        litellm_kwargs=dict(litellm.kwargs),
    )
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Non-secret defaults (override with env vars — see repo .env.example)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# Baseline, non-secret settings; callers layer environment values and explicit
# overrides on top of this table.
DEFAULTS: dict[str, object] = dict(
    model="gpt-4o-2024-11-20",
    pdf_parser="PyPDF2",
    max_tokens_per_chunk=20_000,
    overlap_pages=1,
    generate_doc_summary=False,
)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Domain-specific errors for the page-index outline pipeline."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ChunksmithError(Exception):
    """Root of the package's exception hierarchy."""


class PdfLoadError(ChunksmithError):
    """Raised when a PDF cannot be opened or read."""


class OutlineExtractionError(ChunksmithError):
    """Raised when the LLM output is unusable or its finish reason is not ``finished``."""
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""ChatLiteLLM client, token counting, and JSON extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import time
|
|
8
|
+
from typing import Any, List, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
import tiktoken
|
|
11
|
+
from langchain_core.messages import AIMessage, HumanMessage
|
|
12
|
+
from langchain_litellm import ChatLiteLLM
|
|
13
|
+
|
|
14
|
+
from chunksmith_pageindex.config import RuntimeSettings
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
_USAGE_TOTALS: dict[str, int] = {
|
|
19
|
+
"calls": 0,
|
|
20
|
+
"prompt_tokens": 0,
|
|
21
|
+
"completion_tokens": 0,
|
|
22
|
+
"total_tokens": 0,
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def reset_usage_totals() -> None:
    """Zero every counter in the module-level usage table, in place."""
    _USAGE_TOTALS.update(dict.fromkeys(_USAGE_TOTALS, 0))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_usage_totals() -> dict[str, int]:
    """Return a snapshot copy of the usage counters (safe for the caller to mutate)."""
    return _USAGE_TOTALS.copy()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _record_usage(response: Any) -> None:
    """Add the token usage reported on ``response`` to the module totals.

    Usage is read from ``response.response_metadata`` under ``token_usage``
    or, failing that, ``usage``. Nothing is recorded, not even the call
    count, when neither entry is present or both are empty.
    """
    metadata = getattr(response, "response_metadata", None) or {}
    token_usage = metadata.get("token_usage") or metadata.get("usage") or {}
    if token_usage:
        _USAGE_TOTALS["calls"] += 1
        for counter in ("prompt_tokens", "completion_tokens", "total_tokens"):
            # Missing or None counters contribute zero.
            _USAGE_TOTALS[counter] += int(token_usage.get(counter, 0) or 0)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _tiktoken_encoding_for(model: str | None) -> tiktoken.Encoding:
    """Pick the tiktoken encoding for ``model``.

    Only the text after the last ``/`` of the model id is looked up, and a
    missing model is treated as ``gpt-4o``. If tiktoken raises ``KeyError``
    for the name, ``cl100k_base`` is used instead.
    """
    bare_name = (model or "gpt-4o").rsplit("/", 1)[-1]
    try:
        encoding = tiktoken.encoding_for_model(bare_name)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    return encoding
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def count_tokens(text: str | None, model: str | None = None) -> int:
    """Return the number of tokens in ``text`` for ``model``'s encoding.

    Empty or missing text counts as zero tokens without loading an encoding.
    """
    if text:
        return len(_tiktoken_encoding_for(model).encode(text))
    return 0
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def build_chat_model(settings: RuntimeSettings, model: str | None = None) -> ChatLiteLLM:
    """Construct a ``ChatLiteLLM`` client at temperature 0.

    ``model`` overrides ``settings.llm_model`` when given; the entries of
    ``settings.litellm_kwargs`` are passed through as constructor arguments.
    """
    chosen_model = model if model else settings.llm_model
    return ChatLiteLLM(model=chosen_model, temperature=0, **settings.litellm_kwargs)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _to_langchain_messages(
    prompt: str,
    chat_history: Optional[List[dict]] = None,
) -> list[HumanMessage | AIMessage]:
    """Convert role/content history rows plus ``prompt`` into LangChain messages.

    Rows whose role is ``assistant`` (case-insensitive, trimmed) become
    ``AIMessage``; every other row becomes ``HumanMessage``. ``prompt`` is
    always appended last as a ``HumanMessage``.
    """
    converted: list[HumanMessage | AIMessage] = []
    for entry in chat_history or []:
        speaker = str(entry.get("role") or "").strip().lower()
        body = str(entry.get("content") or "")
        message_cls = AIMessage if speaker == "assistant" else HumanMessage
        converted.append(message_cls(content=body))
    converted.append(HumanMessage(content=prompt))
    return converted
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _finish_reason_from_response(response: Any) -> str:
|
|
88
|
+
meta = getattr(response, "response_metadata", None) or {}
|
|
89
|
+
reason = meta.get("finish_reason") or meta.get("stop_reason")
|
|
90
|
+
if isinstance(reason, str) and reason.strip():
|
|
91
|
+
if reason in ("length", "max_tokens"):
|
|
92
|
+
return "max_output_reached"
|
|
93
|
+
return "finished"
|
|
94
|
+
return "finished"
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def ChatGPT_API_with_finish_reason(
    settings: RuntimeSettings,
    model: str,
    prompt: str,
    chat_history: Optional[List[dict]] = None,
) -> Tuple[str, str]:
    """Run one chat completion with retries and report how it finished.

    Args:
        settings: Runtime configuration used to build the chat client.
        model: Model id; a falsy value falls back to the configured model.
        prompt: Final user message.
        chat_history: Optional earlier role/content rows sent before ``prompt``.

    Returns:
        ``(text, finish_reason)``. After ten failed attempts the placeholder
        pair ``("Error", "error")`` is returned instead of raising.
    """
    max_retries = 10
    llm = build_chat_model(settings, model=model or None)
    messages = _to_langchain_messages(prompt, chat_history)
    for attempt in range(1, max_retries + 1):
        try:
            response = llm.invoke(messages)
            _record_usage(response)
            meta = getattr(response, "response_metadata", None) or {}
            # _record_usage counts the call whenever usage is found under
            # "token_usage" OR "usage"; count it here only when neither is
            # present, otherwise a "usage"-only response is counted twice.
            if not (meta.get("token_usage") or meta.get("usage")):
                _USAGE_TOTALS["calls"] += 1
            text = response.content if isinstance(response.content, str) else str(response.content or "")
            return text, _finish_reason_from_response(response)
        except Exception as e:
            # Best-effort retry loop: any failure is logged and retried.
            logger.warning("Chat completion retry %s: %s", attempt, e)
            if attempt < max_retries:
                time.sleep(1)
    logger.error("Max retries reached for chat completion")
    return "Error", "error"
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def ChatGPT_API(
    settings: RuntimeSettings,
    model: str,
    prompt: str,
    chat_history: Optional[List[dict]] = None,
) -> str:
    """Return only the reply text of ``ChatGPT_API_with_finish_reason``."""
    reply_and_reason = ChatGPT_API_with_finish_reason(settings, model, prompt, chat_history)
    return reply_and_reason[0]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def extract_json(content: str) -> Any:
    """Parse JSON from model output, tolerating common formatting slips.

    The payload is taken from a ```` ```json ```` fenced block when one is
    present (up to the last closing fence, or to the end of the text if the
    fence was never closed), otherwise from the whole text. A strict parse is
    tried first so valid JSON is returned untouched; only if that fails are
    the lenient repairs applied (Python ``None`` -> ``null``, whitespace
    collapsing, trailing-comma removal).

    Args:
        content: Raw assistant text.

    Returns:
        The decoded JSON value, or ``{}`` when nothing parseable is found.
    """
    import re  # local import: only needed by the repair path

    try:
        fence_start = content.find("```json")
        if fence_start != -1:
            body_start = fence_start + len("```json")
            fence_end = content.rfind("```")
            # With no closing fence, rfind lands on the opening one; take the
            # rest of the text instead of an empty slice.
            if fence_end < body_start:
                fence_end = len(content)
            candidate = content[body_start:fence_end].strip()
        else:
            candidate = content.strip()
    except Exception:
        logger.exception("Unexpected error while extracting JSON")
        return {}

    # Strict parse first: the repairs below also rewrite text inside string
    # values, so they must not touch output that is already valid JSON.
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass

    repaired = " ".join(candidate.replace("None", "null").split())
    try:
        return json.loads(repaired)
    except json.JSONDecodeError:
        pass

    try:
        # Drop trailing commas, including ones followed by whitespace.
        return json.loads(re.sub(r",\s*([\]}])", r"\1", repaired))
    except Exception:
        logger.exception("Failed to parse JSON from model output")
        return {}
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def llm_completion(
    settings: RuntimeSettings,
    model: str | None,
    prompt: str,
    chat_history: Optional[List[dict]] = None,
    return_finish_reason: bool = False,
):
    """Run a chat completion, optionally returning the finish reason too.

    Returns the reply text, or a ``(text, finish_reason)`` tuple when
    ``return_finish_reason`` is true. A falsy ``model`` falls back to
    ``settings.llm_model``.
    """
    resolved_model = model if model else settings.llm_model
    backend = ChatGPT_API_with_finish_reason if return_finish_reason else ChatGPT_API
    return backend(settings, resolved_model, prompt, chat_history=chat_history)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""One-shot document description after the outline tree is built (separate LLM call from TOC extraction)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from chunksmith_pageindex.config import RuntimeSettings
|
|
10
|
+
from chunksmith_pageindex.indexer import prompts
|
|
11
|
+
from chunksmith_pageindex.indexer.client import llm_completion
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_clean_structure_for_description(structure: Any) -> Any:
    """Return a slimmed copy of an outline tree for the description prompt.

    Dict nodes keep only ``title``, ``node_id`` and ``summary`` (when present)
    plus a recursively cleaned ``nodes`` list when that list is non-empty.
    Lists are cleaned element-wise; any other value is returned unchanged.
    """
    if isinstance(structure, list):
        return [create_clean_structure_for_description(child) for child in structure]
    if not isinstance(structure, dict):
        return structure

    slim: dict[str, Any] = {
        name: structure[name]
        for name in ("title", "node_id", "summary")
        if name in structure
    }
    nested = structure.get("nodes")
    if isinstance(nested, list) and nested:
        slim["nodes"] = create_clean_structure_for_description(nested)
    return slim
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def generate_doc_description(
    settings: RuntimeSettings,
    structure: Any,
    *,
    model: str | None = None,
) -> str:
    """Ask the LLM for a document description based on the outline tree.

    The tree is slimmed first, serialized as indented JSON, and sent as a
    single completion with no chat history. The stripped reply is returned
    as-is; an empty reply or the ``Error`` placeholder only triggers a warning.
    """
    slim_tree = create_clean_structure_for_description(structure)
    prompt = prompts.build_doc_description_prompt(
        json.dumps(slim_tree, ensure_ascii=False, indent=2)
    )
    description = (llm_completion(settings, model, prompt, chat_history=None) or "").strip()
    if description in ("", "Error"):
        logger.warning("Document description LLM returned empty or error placeholder")
    return description
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""LLM calls for init/continue outline extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any, List
|
|
8
|
+
|
|
9
|
+
from chunksmith_pageindex.config import RuntimeSettings
|
|
10
|
+
from chunksmith_pageindex.exceptions import OutlineExtractionError
|
|
11
|
+
from chunksmith_pageindex.indexer import client
|
|
12
|
+
from chunksmith_pageindex.indexer.prompts import (
|
|
13
|
+
ANCHOR_INSTRUCTION,
|
|
14
|
+
SUMMARY_INSTRUCTION,
|
|
15
|
+
TOC_CONTINUE_SYSTEM,
|
|
16
|
+
TOC_INIT_SYSTEM,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _outline_extra_instructions(*, include_summary: bool, include_word_range: bool) -> str:
|
|
23
|
+
parts: List[str] = []
|
|
24
|
+
if include_summary:
|
|
25
|
+
parts.append(SUMMARY_INSTRUCTION.strip())
|
|
26
|
+
if include_word_range:
|
|
27
|
+
parts.append(ANCHOR_INSTRUCTION.strip())
|
|
28
|
+
if not parts:
|
|
29
|
+
return ""
|
|
30
|
+
return "\n" + "\n".join(parts)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def format_given_text_block(part: str) -> str:
    """Return ``part`` prefixed with the ``Given text`` header used in outline prompts."""
    return "\nGiven text\n:{}".format(part)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _rows_from_llm_response(response: str) -> List[dict[str, Any]]:
    """Extract outline rows from assistant text.

    Accepts either a bare JSON array or an object whose ``sections`` entry is
    a list. Anything else yields an empty list.
    """
    payload = client.extract_json(response)
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and "sections" in payload:
        sections = payload["sections"]
        if isinstance(sections, list):
            return sections
    return []
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def generate_toc_init(
    settings: RuntimeSettings,
    part: str,
    model: str | None = None,
    *,
    include_summary: bool = False,
    include_word_range: bool = False,
) -> List[dict[str, Any]]:
    """Extract outline rows from the first excerpt of a document.

    The prompt is the init system text, any optional instruction suffix, and
    the excerpt block. ``model`` falls back to ``settings.pageindex_model``.

    Raises:
        OutlineExtractionError: If the completion did not finish normally or
            the reply contains no rows.
    """
    chosen_model = model or settings.pageindex_model
    extras = _outline_extra_instructions(
        include_summary=include_summary,
        include_word_range=include_word_range,
    )
    prompt = "".join((TOC_INIT_SYSTEM.strip(), extras, format_given_text_block(part)))

    reply, status = client.llm_completion(settings, chosen_model, prompt, return_finish_reason=True)
    if status != "finished":
        raise OutlineExtractionError(f"generate_toc_init finish_reason={status!r}")

    sections = _rows_from_llm_response(reply)
    if sections:
        return sections
    raise OutlineExtractionError("generate_toc_init: no sections in model output")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def generate_toc_continue(
    settings: RuntimeSettings,
    toc_content: List[dict[str, Any]],
    part: str,
    model: str | None = None,
    *,
    include_summary: bool = False,
    include_word_range: bool = False,
) -> List[dict[str, Any]]:
    """Extract the additional outline rows found in a follow-up excerpt.

    The prompt is the continue system text, any optional instruction suffix,
    the excerpt block, and the previous outline serialized as indented JSON.
    ``model`` falls back to ``settings.pageindex_model``.

    Raises:
        OutlineExtractionError: If the completion did not finish normally or
            the reply contains no rows.
    """
    chosen_model = model or settings.pageindex_model
    extras = _outline_extra_instructions(
        include_summary=include_summary,
        include_word_range=include_word_range,
    )
    prompt = "".join(
        (
            TOC_CONTINUE_SYSTEM.strip(),
            extras,
            format_given_text_block(part),
            "\nPrevious outline (JSON)\n:",
            json.dumps(toc_content, indent=2),
        )
    )

    reply, status = client.llm_completion(settings, chosen_model, prompt, return_finish_reason=True)
    if status != "finished":
        raise OutlineExtractionError(f"generate_toc_continue finish_reason={status!r}")

    sections = _rows_from_llm_response(reply)
    if sections:
        return sections
    raise OutlineExtractionError("generate_toc_continue: no sections in model output")
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Prompt templates for flat outline extraction (no-TOC path)."""
|
|
2
|
+
|
|
3
|
+
# Appended to init/continue instructions when ``add_summary`` is enabled (same completion as outline rows).
|
|
4
|
+
SUMMARY_INSTRUCTION = """
|
|
5
|
+
Each row must also include "summary" (string): one or two concise sentences describing what that section covers,
|
|
6
|
+
using only information visible in the given excerpt (no outside knowledge)."""
|
|
7
|
+
|
|
8
|
+
# Appended when ``add_word_range=True``: one verbatim alignment string per row (no word indices).
|
|
9
|
+
ANCHOR_INSTRUCTION = """
|
|
10
|
+
When section anchors are requested in the **same** API response, every row must include the usual outline keys
|
|
11
|
+
(structure, title, physical_index, and "summary" only if that was also requested) **plus**:
|
|
12
|
+
|
|
13
|
+
• ``split_document_anchor`` (string) — a **short verbatim substring** copied from the **Given text** excerpt below
|
|
14
|
+
(the text after ``Given text`` + newline + ``:``). Use text at or immediately after where that section begins
|
|
15
|
+
in this excerpt (e.g. heading like ``1 Introduction``, ``Abstract``, ``3.2 Attention``). Used to locate and
|
|
16
|
+
verify the section start in the excerpt; it must appear exactly as in the Given text.
|
|
17
|
+
|
|
18
|
+
Apply this on **every** row for this message. For **continue** calls, anchors refer only to the **current** excerpt
|
|
19
|
+
(the new Given text in this message), not to prior parts."""
|
|
20
|
+
|
|
21
|
+
TOC_INIT_SYSTEM = """
|
|
22
|
+
You are an expert in extracting hierarchical tree structure. Generate the tree structure of the document.
|
|
23
|
+
structure: numeric index, e.g. "1", "1.1", "1.2", "1.2.1", "1.2.2", "1.2.3", "1.2.4", "1.2.5", "1.2.6", "1.2.7", "1.2.8", "1.2.9", "1.2.10".
|
|
24
|
+
Tags <physical_index_X> in the given text mark where PDF page X begins. Long PDFs are split into multiple excerpts; this message is the first excerpt only.
|
|
25
|
+
Several sections may start on the same page—that is normal; set each row's physical_index from the tag nearest that section's start.
|
|
26
|
+
**Important:** ``physical_index`` is always the **printed PDF page** from those tags only—never infer it from section numbers in headings (e.g. a section titled ``3 Model Architecture`` may start on page 2 if the ``<physical_index_2>`` block contains that heading).
|
|
27
|
+
Each section row must include: "structure" (string), "title" (string), and "physical_index"
|
|
28
|
+
(either an integer start page or a string tag like "<physical_index_5>").
|
|
29
|
+
If the prompt also includes appended instructions (e.g. per-section summaries or ``split_document_anchor`` on the excerpt),
|
|
30
|
+
include those fields on every row as specified there.
|
|
31
|
+
Return JSON only: a markdown code block ```json containing either a JSON array of rows in reading order,
|
|
32
|
+
or an object {"sections": [ ...rows... ]}."""
|
|
33
|
+
|
|
34
|
+
TOC_CONTINUE_SYSTEM = """
|
|
35
|
+
You are an expert in extracting hierarchical tree structure.
|
|
36
|
+
You are given the previous outline as JSON and the text of the current part of the document.
|
|
37
|
+
Continue the outline: add new rows only for sections that appear in the current part.
|
|
38
|
+
structure: numeric index, e.g. "1", "1.1", "1.2", "1.2.1", "1.2.2", "1.2.3", "1.2.4", "1.2.5", "1.2.6", "1.2.7", "1.2.8", "1.2.9", "1.2.10".
|
|
39
|
+
Tags <physical_index_X> in the given text mark where PDF page X begins. The start of this excerpt often overlaps the end of the prior excerpt (the same pages can appear again)—use the tags to assign physical_index and do **not** duplicate sections already present in the previous outline JSON.
|
|
40
|
+
Several sections may start on the same page; that is normal.
|
|
41
|
+
Each row: "structure" (string), "title" (string), "physical_index" (int or "<physical_index_N>" string).
|
|
42
|
+
``physical_index`` must be the PDF page from ``<physical_index_N>`` tags only, not from numeric prefixes in section titles.
|
|
43
|
+
When ``split_document_anchor`` is requested, it must be copied from **this** message's Given text excerpt only.
|
|
44
|
+
If the prompt also includes appended instructions (e.g. summaries or anchors), include those fields on each new row.
|
|
45
|
+
Return JSON only: ```json with either an array of only the additional rows, or {"sections": [ ... ]}."""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def build_doc_description_prompt(structure_json: str) -> str:
|
|
49
|
+
"""User message for a standalone completion after the TOC tree exists (not mixed with outline extraction)."""
|
|
50
|
+
return f"""You are an expert in generating descriptions for a document.
|
|
51
|
+
You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.
|
|
52
|
+
|
|
53
|
+
Document Structure:
|
|
54
|
+
{structure_json}
|
|
55
|
+
|
|
56
|
+
Directly return the description, do not include any other text."""
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""LiteLLM / ChatLiteLLM env resolution for PageIndexer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _clean_env(name: str) -> str | None:
|
|
11
|
+
raw = os.getenv(name)
|
|
12
|
+
if raw is None:
|
|
13
|
+
return None
|
|
14
|
+
value = str(raw).split("#", 1)[0].strip()
|
|
15
|
+
return value or None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class LiteLLMConfig:
    """Resolved model id together with the client keyword arguments for it."""

    # Model identifier after environment resolution.
    model: str
    # Keyword arguments (credentials/endpoints) to pass to the chat client.
    kwargs: dict[str, Any]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def resolve_litellm_config(*, pageindex_model: str) -> LiteLLMConfig:
    """Work out the model id and client kwargs from the environment.

    ``CHUNKSMITH_LLM_MODEL`` overrides ``pageindex_model``. For ``azure/``
    models the Azure key, base URL (trailing slashes removed) and API version
    are passed along when set; for every other model the OpenAI key
    (``OPENAI_API_KEY`` or ``CHATGPT_API_KEY``) is passed as ``api_key`` when set.
    """
    resolved_model = (_clean_env("CHUNKSMITH_LLM_MODEL") or pageindex_model).strip()
    extra: dict[str, Any] = {}

    if resolved_model.startswith("azure/"):
        azure_sources = (
            ("api_key", "AZURE_API_KEY"),
            ("api_base", "AZURE_API_BASE"),
            ("api_version", "AZURE_API_VERSION"),
        )
        for kwarg_name, env_name in azure_sources:
            value = _clean_env(env_name)
            if value:
                extra[kwarg_name] = value.rstrip("/") if kwarg_name == "api_base" else value
    else:
        openai_key = _clean_env("OPENAI_API_KEY") or _clean_env("CHATGPT_API_KEY")
        if openai_key:
            extra["api_key"] = openai_key

    return LiteLLMConfig(model=resolved_model, kwargs=extra)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Tag pages with physical_index markers and split into token-budget groups."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from typing import List, Tuple
|
|
7
|
+
|
|
8
|
+
from chunksmith_pageindex.indexer.client import count_tokens
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def build_tagged_page_strings(
    page_list: List[Tuple[str, int]],
    *,
    start_index: int = 1,
    model: str | None = None,
) -> Tuple[List[str], List[int]]:
    """Wrap every page in ``<physical_index_N>`` markers and count its tokens.

    Pages are numbered consecutively from ``start_index``. The token count
    stored in each input pair is ignored; counts are recomputed on the tagged
    string.

    Returns:
        ``(tagged_pages, token_counts)``, two lists aligned by position.
    """
    tagged_pages: List[str] = []
    token_counts: List[int] = []
    for page_num, (page_text, _stored_tokens) in enumerate(page_list, start=start_index):
        marker = f"<physical_index_{page_num}>"
        tagged = f"{marker}\n{page_text}\n{marker}\n\n"
        tagged_pages.append(tagged)
        token_counts.append(count_tokens(tagged, model))
    return tagged_pages, token_counts
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def page_list_to_group_text(
|
|
33
|
+
page_contents: List[str],
|
|
34
|
+
token_lengths: List[int],
|
|
35
|
+
*,
|
|
36
|
+
max_tokens: int = 20000,
|
|
37
|
+
overlap_page: int = 1,
|
|
38
|
+
) -> List[str]:
|
|
39
|
+
"""
|
|
40
|
+
Split tagged page strings into groups that each stay near ``max_tokens`` total.
|
|
41
|
+
|
|
42
|
+
Mirrors the original PageIndex grouping logic (with overlap between groups).
|
|
43
|
+
"""
|
|
44
|
+
num_tokens = sum(token_lengths)
|
|
45
|
+
if num_tokens <= max_tokens:
|
|
46
|
+
return ["".join(page_contents)]
|
|
47
|
+
|
|
48
|
+
subsets: List[str] = []
|
|
49
|
+
current_subset: List[str] = []
|
|
50
|
+
current_token_count = 0
|
|
51
|
+
expected_parts_num = math.ceil(num_tokens / max_tokens)
|
|
52
|
+
average_tokens_per_part = math.ceil(((num_tokens / expected_parts_num) + max_tokens) / 2)
|
|
53
|
+
|
|
54
|
+
for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
|
|
55
|
+
if current_token_count + page_tokens > average_tokens_per_part:
|
|
56
|
+
subsets.append("".join(current_subset))
|
|
57
|
+
overlap_start = max(i - overlap_page, 0)
|
|
58
|
+
current_subset = list(page_contents[overlap_start:i])
|
|
59
|
+
current_token_count = sum(token_lengths[overlap_start:i])
|
|
60
|
+
current_subset.append(page_content)
|
|
61
|
+
current_token_count += page_tokens
|
|
62
|
+
|
|
63
|
+
if current_subset:
|
|
64
|
+
subsets.append("".join(current_subset))
|
|
65
|
+
return subsets
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Load PDF pages as (text, token_length) pairs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import List, Tuple, Union
|
|
9
|
+
|
|
10
|
+
import PyPDF2
|
|
11
|
+
import pymupdf
|
|
12
|
+
|
|
13
|
+
from chunksmith_pageindex.config import RuntimeSettings
|
|
14
|
+
from chunksmith_pageindex.exceptions import PdfLoadError
|
|
15
|
+
from chunksmith_pageindex.indexer.client import count_tokens
|
|
16
|
+
|
|
17
|
+
# Accepted PDF inputs: a filesystem path (``str`` / ``Path``) or an in-memory ``BytesIO`` buffer.
PdfSource = Union[str, Path, BytesIO]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_pdf_pages(source: PdfSource, settings: RuntimeSettings) -> List[Tuple[str, int]]:
    """
    Return ``(page_text, token_count)`` for every page of a PDF (input for chunking).

    The backend is chosen by ``settings.pdf_parser`` (``PyPDF2`` or ``PyMuPDF``);
    token counts are computed for ``settings.pageindex_model``.

    Raises:
        PdfLoadError: If ``settings.pdf_parser`` names neither supported backend.
    """
    backend = settings.pdf_parser
    token_model = settings.pageindex_model

    if backend == "PyPDF2":
        loader = _load_pypdf2
    elif backend == "PyMuPDF":
        loader = _load_pymupdf
    else:
        raise PdfLoadError(f"Unsupported pdf_parser: {backend!r} (use PyPDF2 or PyMuPDF)")
    return loader(source, token_model)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _load_pypdf2(source: PdfSource, model: str) -> List[Tuple[str, int]]:
    """
    Extract per-page text with PyPDF2 and pair each page with its token count.

    A ``str`` / ``Path`` source must point at an existing file; any other source
    is handed to ``PyPDF2.PdfReader`` unchanged.

    Raises:
        PdfLoadError: If a path source does not refer to a regular file.
    """
    target = source
    if isinstance(source, (str, Path)):
        target = str(source)
        if not os.path.isfile(target):
            raise PdfLoadError(f"Not a file: {target}")
    pdf = PyPDF2.PdfReader(target)

    # Pages without extractable text are represented by an empty string.
    page_texts = (pdf_page.extract_text() or "" for pdf_page in pdf.pages)
    return [(body, count_tokens(body, model)) for body in page_texts]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _load_pymupdf(source: PdfSource, model: str) -> List[Tuple[str, int]]:
    """
    Extract per-page text with PyMuPDF and pair each page with its token count.

    A ``str`` / ``Path`` source must point at an existing file; any other source
    is opened as an in-memory PDF stream. The document is always closed before
    returning, including when extraction or token counting fails.

    Raises:
        PdfLoadError: If a path source does not refer to a regular file.
    """
    if isinstance(source, (str, Path)):
        location = str(source)
        if not os.path.isfile(location):
            raise PdfLoadError(f"Not a file: {location}")
        pdf = pymupdf.open(location)
    else:
        pdf = pymupdf.open(stream=source, filetype="pdf")

    try:
        # Pages without extractable text are represented by an empty string.
        page_texts = (pdf_page.get_text() or "" for pdf_page in pdf)
        return [(body, count_tokens(body, model)) for body in page_texts]
    finally:
        pdf.close()
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Merge flat TOC rows, convert indices, build nested tree, assign node IDs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import copy
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Optional, Union
|
|
8
|
+
|
|
9
|
+
from chunksmith_pageindex.config import RuntimeSettings, load_settings
|
|
10
|
+
from chunksmith_pageindex.exceptions import OutlineExtractionError
|
|
11
|
+
from chunksmith_pageindex.indexer import doc_description, extractor
|
|
12
|
+
from chunksmith_pageindex.parser import chunker, document
|
|
13
|
+
from chunksmith_pageindex.tree import schema
|
|
14
|
+
from chunksmith_pageindex.tree.node_text import add_node_text
|
|
15
|
+
from chunksmith_pageindex.tree.physical_index import (
|
|
16
|
+
convert_physical_index_to_int,
|
|
17
|
+
refine_physical_index_from_excerpt_tags,
|
|
18
|
+
refine_physical_index_from_pdf_page_text,
|
|
19
|
+
)
|
|
20
|
+
from chunksmith_pageindex.tree.tree_transform import (
|
|
21
|
+
add_preface_if_needed,
|
|
22
|
+
post_processing,
|
|
23
|
+
write_node_id,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def build_outline_from_pdf(
    pdf_path: Union[str, Path],
    settings: Optional[RuntimeSettings] = None,
    *,
    add_text: bool = False,
    assign_node_ids: bool = False,
    add_summary: bool = False,
    add_word_range: bool = False,
    generate_doc_summary: Optional[bool] = None,
) -> dict[str, Any]:
    """
    End-to-end no-TOC outline: PDF → tagged chunks → LLM flat rows → tree.

    If ``add_word_range`` is True, each row also has ``split_document_anchor`` (verbatim excerpt substring at the
    section start) and ``chunk_excerpt_index`` (which LLM chunk produced that row).

    Args:
        pdf_path: Path of the PDF to outline; resolved to an absolute path before loading.
        settings: Runtime settings; ``load_settings()`` is used when omitted (or falsy).
        add_text: When True, fill each node's ``text`` via ``add_node_text``.
        assign_node_ids: When True, number the nodes via ``write_node_id``.
        add_summary: Forwarded to the extractor as ``include_summary``; when False the
            ``summary`` key is removed from every row.
        add_word_range: Forwarded to the extractor as ``include_word_range``; when False the
            ``split_document_anchor`` and ``chunk_excerpt_index`` keys are removed from every row.
        generate_doc_summary: Overrides ``settings.generate_doc_summary`` when not ``None``.

    Returns:
        A dict with ``doc_name`` (file name of the resolved path) and ``structure`` (the outline
        tree), plus ``doc_description`` when a document summary was requested.

    Raises:
        OutlineExtractionError: If no pages could be extracted, or if TOC row validation fails.
    """
    settings = settings or load_settings()
    path = Path(pdf_path).resolve()
    page_list = document.load_pdf_pages(path, settings)
    if not page_list:
        raise OutlineExtractionError("No pages extracted from PDF")

    # Tag pages (numbering starts at 1) and pack them into token-bounded, overlapping groups.
    tagged, lengths = chunker.build_tagged_page_strings(page_list, start_index=1, model=settings.pageindex_model)
    groups = chunker.page_list_to_group_text(
        tagged,
        lengths,
        max_tokens=settings.max_tokens_per_chunk,
        overlap_page=settings.overlap_pages,
    )

    # First group seeds the outline; every later group is given the rows collected so far.
    init_rows = extractor.generate_toc_init(
        settings,
        groups[0],
        include_summary=add_summary,
        include_word_range=add_word_range,
    )
    # ``chunk_of_row[k]`` is the index of the group that produced ``flat[k]``; kept parallel to ``flat``.
    chunk_of_row: list[int] = [0] * len(init_rows)
    flat = list(init_rows)
    for chunk_idx, part in enumerate(groups[1:], start=1):
        cont_rows = extractor.generate_toc_continue(
            settings,
            flat,
            part,
            include_summary=add_summary,
            include_word_range=add_word_range,
        )
        chunk_of_row.extend([chunk_idx] * len(cont_rows))
        flat.extend(cont_rows)

    convert_physical_index_to_int(flat)
    before_preface = len(chunk_of_row)
    flat = add_preface_if_needed(flat, include_summary=add_summary)
    # A row was prepended: attribute it to the first group so both lists stay aligned.
    if len(flat) == before_preface + 1:
        chunk_of_row.insert(0, 0)
    if add_word_range and flat and str(flat[0].get("title", "")).strip() == "Preface":
        # Give a leading "Preface" row an anchor unless it already carries one.
        flat[0].setdefault("split_document_anchor", "Preface")

    # Drop rows without a page index, filtering both parallel lists together.
    paired = [(c, x) for c, x in zip(chunk_of_row, flat) if x.get("physical_index") is not None]
    chunk_of_row = [p[0] for p in paired]
    flat = [p[1] for p in paired]

    # Two refinement passes over the page indices; the second runs on the result of the first.
    refine_physical_index_from_excerpt_tags(flat, chunk_of_row, groups)
    refine_physical_index_from_pdf_page_text(flat, page_list)

    try:
        validated = schema.validate_toc_rows(flat)
        use_rows: list[dict[str, Any]] = []
        for i, r in enumerate(validated):
            d = r.model_dump()
            if add_word_range:
                d["chunk_excerpt_index"] = chunk_of_row[i]
            use_rows.append(d)
    except Exception as e:
        # Any failure while validating/dumping rows is reported as an extraction error.
        raise OutlineExtractionError(f"TOC row validation failed: {e}") from e
    if not add_summary:
        for d in use_rows:
            d.pop("summary", None)
    if not add_word_range:
        for d in use_rows:
            d.pop("split_document_anchor", None)
            d.pop("chunk_excerpt_index", None)

    # The last row's page range ends at the final page of the document.
    end_page = len(page_list)
    # Work on a deep copy so ``use_rows`` is not touched by the tree transform.
    tree = post_processing(copy.deepcopy(use_rows), end_page)

    if assign_node_ids:
        write_node_id(tree)

    if add_text:
        add_node_text(tree, page_list)

    # Explicit argument wins over the configured default.
    do_doc_summary = settings.generate_doc_summary if generate_doc_summary is None else generate_doc_summary
    out: dict[str, Any] = {
        "doc_name": path.name,
        "structure": tree,
    }
    if do_doc_summary:
        out["doc_description"] = doc_description.generate_doc_description(
            settings, tree, model=settings.pageindex_model
        )
    return out
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Fill outline node ``text`` from PDF pages using ``start_index`` / ``end_index``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, List
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _needle_in_blob(node: dict[str, Any], blob: str) -> str:
|
|
9
|
+
"""Prefer ``split_document_anchor``, then ``title``, first substring that appears in ``blob``."""
|
|
10
|
+
for key in ("split_document_anchor", "title"):
|
|
11
|
+
v = str(node.get(key) or "").strip()
|
|
12
|
+
if v and v in blob:
|
|
13
|
+
return v
|
|
14
|
+
return ""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def page_span_plain_text(
    start_1b: int,
    end_1b_inclusive: int,
    pdf_pages: List[tuple[str, int]],
) -> str:
    """
    Join the extracted text of pages ``start_1b`` … ``end_1b_inclusive`` (1-based, inclusive).

    The two bounds are swapped when given in reverse order, and page numbers
    outside the document are ignored.
    """
    first, last = sorted((int(start_1b), int(end_1b_inclusive)))
    # Translate to a clamped 0-based half-open slice; a non-positive upper
    # bound must not be interpreted as a negative slice index.
    lo = max(first - 1, 0)
    hi = min(max(last, 0), len(pdf_pages))
    return "".join(page[0] for page in pdf_pages[lo:hi])
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def iter_outline_nodes_preorder(tree: Any) -> List[dict[str, Any]]:
    """
    Collect every dict node of a nested outline in depth-first pre-order.

    A dict is emitted before the entries of its ``nodes`` list (when that value
    is a list); a list contributes its items in order. Any other value is
    skipped.
    """

    def visit(item: Any):
        if isinstance(item, dict):
            yield item
            children = item.get("nodes")
            if isinstance(children, list):
                for child in children:
                    yield from visit(child)
        elif isinstance(item, list):
            for element in item:
                yield from visit(element)

    return list(visit(tree))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def add_node_text(tree: Any, pdf_pages: List[tuple[str, int]]) -> None:
    """
    Write a ``text`` entry into every outline node that has a page span.

    For each node (visited in pre-order) the plain text of pages
    ``start_index`` … ``end_index`` (inclusive) is assembled and stripped. The
    text is then cut to start at the node's own needle (anchor or title) and to
    stop at the first occurrence, after that needle, of the needle of the node
    that follows in pre-order, so sections sharing a page do not repeat each
    other's bodies. When the node's own needle is not found, the whole page span
    is used. Nodes lacking ``start_index`` or ``end_index`` are left untouched.
    """
    ordered = iter_outline_nodes_preorder(tree)
    total = len(ordered)
    for position, current in enumerate(ordered):
        first_page = current.get("start_index")
        last_page = current.get("end_index")
        if first_page is None or last_page is None:
            continue

        span_text = page_span_plain_text(int(first_page), int(last_page), pdf_pages).strip()
        if not span_text:
            current["text"] = ""
            continue

        own_needle = _needle_in_blob(current, span_text)
        following_needle = ""
        if position + 1 < total:
            following_needle = _needle_in_blob(ordered[position + 1], span_text)

        if not own_needle:
            # No heading to cut at: keep the full page span.
            current["text"] = span_text
            continue

        begin = span_text.find(own_needle)
        stop = len(span_text)
        if following_needle:
            # Search past this node's own needle so identical headings do not match themselves.
            hit = span_text.find(following_needle, begin + max(1, len(own_needle)))
            if hit >= 0:
                stop = hit
        current["text"] = span_text[begin:stop].strip()
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Physical index refinement from excerpt tags and PDF page text."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, List
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def convert_physical_index_to_int(data: List[dict[str, Any]]) -> None:
    """
    Normalize ``physical_index`` strings like ``<physical_index_5>`` to ints in-place.

    Recognized string forms (surrounding whitespace is ignored):
    ``<physical_index_N>``, ``physical_index_N`` and a bare decimal number
    ``N``. Items that are not dicts, have no ``physical_index`` key, or whose
    value is not a string are left untouched, as are strings in any other form.

    Args:
        data: Rows to normalize; matching rows are modified in place.

    Raises:
        ValueError: If a value carries the ``physical_index_`` tag but its
            numeric part is not an integer.
    """
    for item in data:
        if not isinstance(item, dict) or "physical_index" not in item:
            continue
        raw = item["physical_index"]
        if not isinstance(raw, str):
            continue
        # Strip first so whitespace around the tag (e.g. a trailing space after
        # ">") cannot end up inside the number being parsed.
        value = raw.strip()
        if value.startswith(("<physical_index_", "physical_index_")):
            item["physical_index"] = int(value.split("_")[-1].rstrip(">").strip())
        elif value.isdecimal():
            # A bare page number: downstream code compares the index with ints.
            item["physical_index"] = int(value)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def physical_page_at_char(tagged_excerpt: str, char_index: int) -> int:
    """
    Return the number of the last ``<physical_index_N>`` tag that starts before ``char_index``.

    ``char_index`` is clamped to ``[0, len(tagged_excerpt)]``. When no tag starts
    before it, ``1`` is returned. Scanning stops at a tag that has no closing
    ``>``.

    Raises:
        ValueError: If the text between the tag prefix and ``>`` is not an integer.
    """
    tag_open = "<physical_index_"
    stop = max(0, min(len(tagged_excerpt), char_index))
    page = 1
    cursor = 0
    while cursor < stop:
        tag_at = tagged_excerpt.find(tag_open, cursor)
        # Only tags that begin before the clamped position are relevant.
        if tag_at < 0 or tag_at >= stop:
            break
        tag_close = tagged_excerpt.find(">", tag_at)
        if tag_close < 0:
            break
        page = int(tagged_excerpt[tag_at + len(tag_open) : tag_close])
        cursor = tag_close + 1
    return page
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def find_row_start_in_excerpt(excerpt: str, row: dict[str, Any], search_from: int) -> int:
    """
    Locate where a row begins inside ``excerpt``, searching from ``search_from``.

    The row's stripped ``split_document_anchor`` is tried first, then its
    stripped ``title``; empty values are skipped. Returns the offset of the first
    match, or ``-1`` when neither is found.
    """
    for field in ("split_document_anchor", "title"):
        needle = str(row.get(field) or "").strip()
        if not needle:
            continue
        offset = excerpt.find(needle, search_from)
        if offset >= 0:
            return offset
    return -1
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def refine_physical_index_from_excerpt_tags(
    rows: List[dict[str, Any]],
    chunk_idx_per_row: List[int],
    groups: List[str],
) -> None:
    """
    Re-derive each row's ``physical_index`` from the page tags of its source excerpt.

    ``chunk_idx_per_row[k]`` names the entry of ``groups`` that produced
    ``rows[k]``. Within one excerpt, rows are searched for left to right: each
    search resumes just after the previous match and restarts at the beginning
    whenever the excerpt changes. Rows are updated in place; a row is left as it
    is when it has no ``physical_index``, its chunk index is invalid, its
    anchor/title is not found, or the governing page tag cannot be parsed.
    Nothing happens when ``rows`` is empty or the two lists differ in length.
    """
    if not rows or len(rows) != len(chunk_idx_per_row):
        return

    active_chunk = -1
    cursor = 0
    group_count = len(groups)
    for entry, chunk_no in zip(rows, chunk_idx_per_row):
        if not isinstance(entry, dict) or entry.get("physical_index") is None:
            continue
        if not isinstance(chunk_no, int) or not 0 <= chunk_no < group_count:
            continue

        if chunk_no != active_chunk:
            # New excerpt: start searching from its beginning again.
            active_chunk, cursor = chunk_no, 0
        text = groups[chunk_no]

        hit = find_row_start_in_excerpt(text, entry, cursor)
        if hit < 0:
            continue
        try:
            page = physical_page_at_char(text, hit)
        except ValueError:
            # Unparseable tag: keep the row's current index and search position.
            continue
        entry["physical_index"] = page
        cursor = hit + 1
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def refine_physical_index_from_pdf_page_text(
    rows: List[dict[str, Any]],
    page_list: List[tuple[str, int]],
) -> None:
    """
    Set each row's ``physical_index`` to the first page whose text contains its anchor or title.

    Pages are scanned in order, starting at the page matched for the previous
    row (page 1 initially), so the resulting indices never go backwards. The
    match is the first page containing either the stripped
    ``split_document_anchor`` or the stripped ``title``. Rows are updated in
    place; rows without a ``physical_index``, without any needle, or without a
    match keep their current value.
    """
    if not rows or not page_list:
        return

    bodies = [entry[0] for entry in page_list]
    page_count = len(bodies)
    last_hit = 1
    for row in rows:
        if not isinstance(row, dict) or row.get("physical_index") is None:
            continue

        anchor = str(row.get("split_document_anchor") or "").strip()
        title = str(row.get("title") or "").strip()
        needles: List[str] = [anchor] if anchor else []
        if title and title not in needles:
            needles.append(title)
        if not needles:
            continue

        first = max(0, last_hit - 1)
        # Page numbers are 1-based, list positions 0-based.
        match = next(
            (
                idx + 1
                for idx in range(first, page_count)
                if any(needle in bodies[idx] for needle in needles)
            ),
            None,
        )
        if match is not None:
            row["physical_index"] = match
            last_hit = match
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Pydantic models for LLM flat rows and optional nested outline nodes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, List, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import AliasChoices, BaseModel, ConfigDict, Field, TypeAdapter
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TocFlatRow(BaseModel):
    """One outline row (LLM output)."""

    # Keys that are not declared below are ignored instead of failing validation.
    model_config = ConfigDict(extra="ignore")

    # Outline position label; the only field without a default.
    structure: str
    # Section heading; empty string when absent.
    title: str = ""
    # Page reference; may still be a raw string, an int, or missing.
    physical_index: str | int | None = None
    # Filled when ``add_summary=True`` on ``build_outline_from_pdf`` (same outline API calls).
    summary: str = ""
    # When ``add_word_range=True``: verbatim substring from the excerpt at the section start.
    # Input may supply it under any of the alias names listed below.
    split_document_anchor: str = Field(
        default="",
        validation_alias=AliasChoices(
            "split_document_anchor",
            "document_anchor_word",
            "split_from_document_word",
        ),
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def validate_toc_rows(raw: Any) -> List[TocFlatRow]:
    """
    Validate raw outline rows into ``TocFlatRow`` models.

    Returns an empty list when ``raw`` is not a list; otherwise the whole list
    is validated in one pass and validation errors propagate to the caller.
    """
    if isinstance(raw, list):
        return TypeAdapter(List[TocFlatRow]).validate_python(raw)
    return []
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class OutlineNode(BaseModel):
    """Nested node after ``post_processing`` / ``list_to_tree`` (optional validation)."""

    # Keys that are not declared below are kept on the model rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Section heading; empty string when absent.
    title: str = ""
    # Page span of the node; both bounds are optional.
    start_index: Optional[int] = None
    end_index: Optional[int] = None
    # Optional per-node attributes; each defaults to ``None`` when not present.
    node_id: Optional[str] = None
    summary: Optional[str] = None
    split_document_anchor: Optional[str] = None
    text: Optional[str] = None
    # Child nodes (recursive); ``None`` when the node has no children entry.
    nodes: Optional[List[OutlineNode]] = None
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Flat TOC row → nested tree transforms."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, List
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def add_preface_if_needed(data: List[dict[str, Any]], *, include_summary: bool = False) -> List[dict[str, Any]]:
    """
    Prepend a "Preface" row when the first row starts after page 1.

    The list is modified in place and also returned. The inserted row has
    ``structure`` ``"0"`` and ``physical_index`` ``1``, plus an empty
    ``summary`` when ``include_summary`` is True. Nothing is inserted for an
    empty list or when the first row has no ``physical_index``.
    """
    if not data:
        return data

    first_page = data[0].get("physical_index")
    if first_page is None or not first_page > 1:
        return data

    preface: dict[str, Any] = {"structure": "0", "title": "Preface", "physical_index": 1}
    if include_summary:
        preface["summary"] = ""
    data.insert(0, preface)
    return data
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def list_to_tree(data: List[dict[str, Any]]) -> List[dict[str, Any]]:
    """
    Nest flat rows into a tree using their dotted ``structure`` labels.

    A row labelled ``"a.b.c"`` becomes a child of the most recent row labelled
    ``"a.b"``; rows with a single-part (or missing) label, and rows whose parent
    label has not been seen yet, become roots. Each node carries ``title``,
    ``start_index`` and ``end_index``; ``summary`` is copied (stripped) when the
    row has that key, ``split_document_anchor`` when it is non-empty, and
    ``chunk_excerpt_index`` (as ``int``) when it is not ``None``. Nodes without
    children have no ``nodes`` key.
    """
    by_structure: dict[str, dict[str, Any]] = {}
    roots: List[dict[str, Any]] = []

    for row in data:
        raw_key = row.get("structure")
        key = "" if raw_key is None else str(raw_key)

        entry: dict[str, Any] = {
            "title": row.get("title"),
            "start_index": row.get("start_index"),
            "end_index": row.get("end_index"),
            "nodes": [],
        }
        if "summary" in row:
            entry["summary"] = str(row.get("summary") or "").strip()
        anchor_text = str(row.get("split_document_anchor") or "").strip()
        if anchor_text:
            entry["split_document_anchor"] = anchor_text
        excerpt_idx = row.get("chunk_excerpt_index")
        if excerpt_idx is not None:
            entry["chunk_excerpt_index"] = int(excerpt_idx)
        by_structure[key] = entry

        # Everything before the last dot is the parent label ("" when there is no dot).
        parent_key = key.rpartition(".")[0]
        parent = by_structure.get(parent_key) if parent_key else None
        siblings = roots if parent is None else parent["nodes"]
        siblings.append(entry)

    def prune(entry: dict[str, Any]) -> dict[str, Any]:
        """Remove empty ``nodes`` lists throughout the subtree rooted at ``entry``."""
        children = entry.get("nodes")
        if children:
            for child in children:
                prune(child)
        else:
            entry.pop("nodes", None)
        return entry

    return [prune(root) for root in roots]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def post_processing(structure: List[dict[str, Any]], end_physical_index: int) -> List[dict[str, Any]]:
    """
    Derive page spans for flat rows and nest them into a tree.

    Every row gets ``start_index`` from its own ``physical_index``. Its
    ``end_index`` is the next row's ``physical_index`` (minus one when that next
    row has ``appear_start == "yes"``); the last row ends at
    ``end_physical_index``. Rows are modified in place. The nested tree is
    returned when it is non-empty; otherwise the flat rows are returned with
    ``appear_start`` and ``physical_index`` removed.
    """
    final_position = len(structure) - 1
    for position, row in enumerate(structure):
        row["start_index"] = row.get("physical_index")
        if position >= final_position:
            row["end_index"] = end_physical_index
            continue
        successor = structure[position + 1]
        starts_on_new_page = successor.get("appear_start") == "yes"
        next_page = successor["physical_index"]
        row["end_index"] = next_page - 1 if starts_on_new_page else next_page

    nested = list_to_tree(structure)
    if nested:
        return nested

    for row in structure:
        row.pop("appear_start", None)
        row.pop("physical_index", None)
    return structure
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def write_node_id(data: Any, node_id: int = 0) -> int:
    """
    Number outline nodes in depth-first pre-order, starting at ``node_id``.

    Each dict receives ``node_id`` as a string zero-padded to four characters,
    then the entries of its ``nodes`` list (when that value is a list) are
    numbered. A list has its items numbered in order; any other value is
    ignored. Returns the next unused number.
    """
    if isinstance(data, list):
        pending = data
    elif isinstance(data, dict):
        data["node_id"] = str(node_id).zfill(4)
        node_id += 1
        children = data.get("nodes")
        pending = children if isinstance(children, list) else ()
    else:
        return node_id

    for child in pending:
        node_id = write_node_id(child, node_id)
    return node_id
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chunksmith-pageindex
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: ChunkSmith PDF page-index outline pipeline (no Unstructured partition).
|
|
5
|
+
Author-email: AnshulParate2004 <anshulnparate@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/AnshulParate2004/chunksmith-lib
|
|
8
|
+
Project-URL: Repository, https://github.com/AnshulParate2004/chunksmith-lib
|
|
9
|
+
Project-URL: Changelog, https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: chunksmith-core>=0.3.0
|
|
12
|
+
Provides-Extra: pdf
|
|
13
|
+
Requires-Dist: pymupdf>=1.24.0; extra == "pdf"
|
|
14
|
+
Requires-Dist: PyPDF2>=3.0.0; extra == "pdf"
|
|
15
|
+
Provides-Extra: llm
|
|
16
|
+
Requires-Dist: httpx>=0.27.0; extra == "llm"
|
|
17
|
+
Requires-Dist: tiktoken>=0.7.0; extra == "llm"
|
|
18
|
+
Requires-Dist: langchain-core>=0.3.28; extra == "llm"
|
|
19
|
+
Requires-Dist: langchain-litellm>=0.2.0; extra == "llm"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
src/chunksmith_pageindex/__init__.py
|
|
3
|
+
src/chunksmith_pageindex/config.py
|
|
4
|
+
src/chunksmith_pageindex/config_defaults.py
|
|
5
|
+
src/chunksmith_pageindex/exceptions.py
|
|
6
|
+
src/chunksmith_pageindex/llm_config.py
|
|
7
|
+
src/chunksmith_pageindex.egg-info/PKG-INFO
|
|
8
|
+
src/chunksmith_pageindex.egg-info/SOURCES.txt
|
|
9
|
+
src/chunksmith_pageindex.egg-info/dependency_links.txt
|
|
10
|
+
src/chunksmith_pageindex.egg-info/requires.txt
|
|
11
|
+
src/chunksmith_pageindex.egg-info/top_level.txt
|
|
12
|
+
src/chunksmith_pageindex/indexer/__init__.py
|
|
13
|
+
src/chunksmith_pageindex/indexer/client.py
|
|
14
|
+
src/chunksmith_pageindex/indexer/doc_description.py
|
|
15
|
+
src/chunksmith_pageindex/indexer/extractor.py
|
|
16
|
+
src/chunksmith_pageindex/indexer/prompts.py
|
|
17
|
+
src/chunksmith_pageindex/parser/__init__.py
|
|
18
|
+
src/chunksmith_pageindex/parser/chunker.py
|
|
19
|
+
src/chunksmith_pageindex/parser/document.py
|
|
20
|
+
src/chunksmith_pageindex/tree/__init__.py
|
|
21
|
+
src/chunksmith_pageindex/tree/builder.py
|
|
22
|
+
src/chunksmith_pageindex/tree/node_text.py
|
|
23
|
+
src/chunksmith_pageindex/tree/physical_index.py
|
|
24
|
+
src/chunksmith_pageindex/tree/schema.py
|
|
25
|
+
src/chunksmith_pageindex/tree/tree_transform.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chunksmith_pageindex
|