@fastrag/pageindex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +251 -0
- package/README.zh-CN.md +251 -0
- package/dist/errors/index.d.ts +10 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +19 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/internal-types/config.d.ts +35 -0
- package/dist/internal-types/config.d.ts.map +1 -0
- package/dist/internal-types/config.js +16 -0
- package/dist/internal-types/config.js.map +1 -0
- package/dist/internal-types/document-parser.d.ts +5 -0
- package/dist/internal-types/document-parser.d.ts.map +1 -0
- package/dist/internal-types/document-parser.js +2 -0
- package/dist/internal-types/document-parser.js.map +1 -0
- package/dist/internal-types/index.d.ts +9 -0
- package/dist/internal-types/index.d.ts.map +1 -0
- package/dist/internal-types/index.js +2 -0
- package/dist/internal-types/index.js.map +1 -0
- package/dist/internal-types/llm-provider.d.ts +19 -0
- package/dist/internal-types/llm-provider.d.ts.map +1 -0
- package/dist/internal-types/llm-provider.js +2 -0
- package/dist/internal-types/llm-provider.js.map +1 -0
- package/dist/internal-types/logger.d.ts +7 -0
- package/dist/internal-types/logger.d.ts.map +1 -0
- package/dist/internal-types/logger.js +2 -0
- package/dist/internal-types/logger.js.map +1 -0
- package/dist/internal-types/page.d.ts +5 -0
- package/dist/internal-types/page.d.ts.map +1 -0
- package/dist/internal-types/page.js +2 -0
- package/dist/internal-types/page.js.map +1 -0
- package/dist/internal-types/processing.d.ts +21 -0
- package/dist/internal-types/processing.d.ts.map +1 -0
- package/dist/internal-types/processing.js +2 -0
- package/dist/internal-types/processing.js.map +1 -0
- package/dist/internal-types/tree-node.d.ts +30 -0
- package/dist/internal-types/tree-node.d.ts.map +1 -0
- package/dist/internal-types/tree-node.js +2 -0
- package/dist/internal-types/tree-node.js.map +1 -0
- package/dist/llm/index.d.ts +3 -0
- package/dist/llm/index.d.ts.map +1 -0
- package/dist/llm/index.js +3 -0
- package/dist/llm/index.js.map +1 -0
- package/dist/llm/llm-client.d.ts +26 -0
- package/dist/llm/llm-client.d.ts.map +1 -0
- package/dist/llm/llm-client.js +88 -0
- package/dist/llm/llm-client.js.map +1 -0
- package/dist/llm/prompts.d.ts +33 -0
- package/dist/llm/prompts.d.ts.map +1 -0
- package/dist/llm/prompts.js +312 -0
- package/dist/llm/prompts.js.map +1 -0
- package/dist/markdown/index.d.ts +6 -0
- package/dist/markdown/index.d.ts.map +1 -0
- package/dist/markdown/index.js +5 -0
- package/dist/markdown/index.js.map +1 -0
- package/dist/markdown/md-extractor.d.ts +14 -0
- package/dist/markdown/md-extractor.d.ts.map +1 -0
- package/dist/markdown/md-extractor.js +30 -0
- package/dist/markdown/md-extractor.js.map +1 -0
- package/dist/markdown/md-to-tree.d.ts +8 -0
- package/dist/markdown/md-to-tree.d.ts.map +1 -0
- package/dist/markdown/md-to-tree.js +20 -0
- package/dist/markdown/md-to-tree.js.map +1 -0
- package/dist/markdown/md-tree-builder.d.ts +7 -0
- package/dist/markdown/md-tree-builder.d.ts.map +1 -0
- package/dist/markdown/md-tree-builder.js +36 -0
- package/dist/markdown/md-tree-builder.js.map +1 -0
- package/dist/markdown/tree-thinning.d.ts +8 -0
- package/dist/markdown/tree-thinning.d.ts.map +1 -0
- package/dist/markdown/tree-thinning.js +42 -0
- package/dist/markdown/tree-thinning.js.map +1 -0
- package/dist/page-index.d.ts +10 -0
- package/dist/page-index.d.ts.map +1 -0
- package/dist/page-index.js +54 -0
- package/dist/page-index.js.map +1 -0
- package/dist/post-processing/doc-description.d.ts +12 -0
- package/dist/post-processing/doc-description.d.ts.map +1 -0
- package/dist/post-processing/doc-description.js +31 -0
- package/dist/post-processing/doc-description.js.map +1 -0
- package/dist/post-processing/index.d.ts +5 -0
- package/dist/post-processing/index.d.ts.map +1 -0
- package/dist/post-processing/index.js +5 -0
- package/dist/post-processing/index.js.map +1 -0
- package/dist/post-processing/node-id.d.ts +7 -0
- package/dist/post-processing/node-id.d.ts.map +1 -0
- package/dist/post-processing/node-id.js +20 -0
- package/dist/post-processing/node-id.js.map +1 -0
- package/dist/post-processing/node-text.d.ts +11 -0
- package/dist/post-processing/node-text.d.ts.map +1 -0
- package/dist/post-processing/node-text.js +37 -0
- package/dist/post-processing/node-text.js.map +1 -0
- package/dist/post-processing/summary.d.ts +7 -0
- package/dist/post-processing/summary.d.ts.map +1 -0
- package/dist/post-processing/summary.js +31 -0
- package/dist/post-processing/summary.js.map +1 -0
- package/dist/processing/index.d.ts +6 -0
- package/dist/processing/index.d.ts.map +1 -0
- package/dist/processing/index.js +6 -0
- package/dist/processing/index.js.map +1 -0
- package/dist/processing/large-node.d.ts +9 -0
- package/dist/processing/large-node.d.ts.map +1 -0
- package/dist/processing/large-node.js +40 -0
- package/dist/processing/large-node.js.map +1 -0
- package/dist/processing/meta-processor.d.ts +19 -0
- package/dist/processing/meta-processor.d.ts.map +1 -0
- package/dist/processing/meta-processor.js +91 -0
- package/dist/processing/meta-processor.js.map +1 -0
- package/dist/processing/no-toc.d.ts +10 -0
- package/dist/processing/no-toc.d.ts.map +1 -0
- package/dist/processing/no-toc.js +44 -0
- package/dist/processing/no-toc.js.map +1 -0
- package/dist/processing/toc-no-pages.d.ts +11 -0
- package/dist/processing/toc-no-pages.d.ts.map +1 -0
- package/dist/processing/toc-no-pages.js +46 -0
- package/dist/processing/toc-no-pages.js.map +1 -0
- package/dist/processing/toc-with-pages.d.ts +15 -0
- package/dist/processing/toc-with-pages.d.ts.map +1 -0
- package/dist/processing/toc-with-pages.js +151 -0
- package/dist/processing/toc-with-pages.js.map +1 -0
- package/dist/toc/index.d.ts +4 -0
- package/dist/toc/index.d.ts.map +1 -0
- package/dist/toc/index.js +4 -0
- package/dist/toc/index.js.map +1 -0
- package/dist/toc/toc-detector.d.ts +23 -0
- package/dist/toc/toc-detector.d.ts.map +1 -0
- package/dist/toc/toc-detector.js +65 -0
- package/dist/toc/toc-detector.js.map +1 -0
- package/dist/toc/toc-extractor.d.ts +13 -0
- package/dist/toc/toc-extractor.d.ts.map +1 -0
- package/dist/toc/toc-extractor.js +32 -0
- package/dist/toc/toc-extractor.js.map +1 -0
- package/dist/toc/toc-transformer.d.ts +11 -0
- package/dist/toc/toc-transformer.d.ts.map +1 -0
- package/dist/toc/toc-transformer.js +69 -0
- package/dist/toc/toc-transformer.js.map +1 -0
- package/dist/tree/index.d.ts +4 -0
- package/dist/tree/index.d.ts.map +1 -0
- package/dist/tree/index.js +4 -0
- package/dist/tree/index.js.map +1 -0
- package/dist/tree/list-to-tree.d.ts +7 -0
- package/dist/tree/list-to-tree.d.ts.map +1 -0
- package/dist/tree/list-to-tree.js +33 -0
- package/dist/tree/list-to-tree.js.map +1 -0
- package/dist/tree/post-processing.d.ts +12 -0
- package/dist/tree/post-processing.d.ts.map +1 -0
- package/dist/tree/post-processing.js +87 -0
- package/dist/tree/post-processing.js.map +1 -0
- package/dist/tree/tree-utils.d.ts +18 -0
- package/dist/tree/tree-utils.d.ts.map +1 -0
- package/dist/tree/tree-utils.js +43 -0
- package/dist/tree/tree-utils.js.map +1 -0
- package/dist/tree-parser.d.ts +30 -0
- package/dist/tree-parser.d.ts.map +1 -0
- package/dist/tree-parser.js +73 -0
- package/dist/tree-parser.js.map +1 -0
- package/dist/types.d.ts +3 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/config-loader.d.ts +15 -0
- package/dist/utils/config-loader.d.ts.map +1 -0
- package/dist/utils/config-loader.js +19 -0
- package/dist/utils/config-loader.js.map +1 -0
- package/dist/utils/index.d.ts +7 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +6 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/json-parser.d.ts +2 -0
- package/dist/utils/json-parser.d.ts.map +1 -0
- package/dist/utils/json-parser.js +76 -0
- package/dist/utils/json-parser.js.map +1 -0
- package/dist/utils/logger.d.ts +3 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +10 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/page-utils.d.ts +16 -0
- package/dist/utils/page-utils.d.ts.map +1 -0
- package/dist/utils/page-utils.js +56 -0
- package/dist/utils/page-utils.js.map +1 -0
- package/dist/utils/token-counter.d.ts +2 -0
- package/dist/utils/token-counter.d.ts.map +1 -0
- package/dist/utils/token-counter.js +5 -0
- package/dist/utils/token-counter.js.map +1 -0
- package/dist/vector-lib/adapters/in-memory-adapter.d.ts +14 -0
- package/dist/vector-lib/adapters/in-memory-adapter.d.ts.map +1 -0
- package/dist/vector-lib/adapters/in-memory-adapter.js +55 -0
- package/dist/vector-lib/adapters/in-memory-adapter.js.map +1 -0
- package/dist/vector-lib/adapters/vector-store.d.ts +10 -0
- package/dist/vector-lib/adapters/vector-store.d.ts.map +1 -0
- package/dist/vector-lib/adapters/vector-store.js +2 -0
- package/dist/vector-lib/adapters/vector-store.js.map +1 -0
- package/dist/vector-lib/chunker/tree-chunker.d.ts +8 -0
- package/dist/vector-lib/chunker/tree-chunker.d.ts.map +1 -0
- package/dist/vector-lib/chunker/tree-chunker.js +59 -0
- package/dist/vector-lib/chunker/tree-chunker.js.map +1 -0
- package/dist/vector-lib/embedder/embedder.d.ts +8 -0
- package/dist/vector-lib/embedder/embedder.d.ts.map +1 -0
- package/dist/vector-lib/embedder/embedder.js +2 -0
- package/dist/vector-lib/embedder/embedder.js.map +1 -0
- package/dist/vector-lib/index.d.ts +10 -0
- package/dist/vector-lib/index.d.ts.map +1 -0
- package/dist/vector-lib/index.js +6 -0
- package/dist/vector-lib/index.js.map +1 -0
- package/dist/vector-lib/search/hybrid-search.d.ts +19 -0
- package/dist/vector-lib/search/hybrid-search.d.ts.map +1 -0
- package/dist/vector-lib/search/hybrid-search.js +25 -0
- package/dist/vector-lib/search/hybrid-search.js.map +1 -0
- package/dist/vector-lib/search/reranker.d.ts +14 -0
- package/dist/vector-lib/search/reranker.d.ts.map +1 -0
- package/dist/vector-lib/search/reranker.js +2 -0
- package/dist/vector-lib/search/reranker.js.map +1 -0
- package/dist/vector-lib/types.d.ts +29 -0
- package/dist/vector-lib/types.d.ts.map +1 -0
- package/dist/vector-lib/types.js +2 -0
- package/dist/vector-lib/types.js.map +1 -0
- package/dist/vector-lib/vector-enhancer.d.ts +28 -0
- package/dist/vector-lib/vector-enhancer.d.ts.map +1 -0
- package/dist/vector-lib/vector-enhancer.js +54 -0
- package/dist/vector-lib/vector-enhancer.js.map +1 -0
- package/dist/vector.d.ts +5 -0
- package/dist/vector.d.ts.map +1 -0
- package/dist/vector.js +3 -0
- package/dist/vector.js.map +1 -0
- package/dist/verification/fix-toc.d.ts +13 -0
- package/dist/verification/fix-toc.d.ts.map +1 -0
- package/dist/verification/fix-toc.js +73 -0
- package/dist/verification/fix-toc.js.map +1 -0
- package/dist/verification/index.d.ts +3 -0
- package/dist/verification/index.d.ts.map +1 -0
- package/dist/verification/index.js +3 -0
- package/dist/verification/index.js.map +1 -0
- package/dist/verification/verify-toc.d.ts +17 -0
- package/dist/verification/verify-toc.d.ts.map +1 -0
- package/dist/verification/verify-toc.js +64 -0
- package/dist/verification/verify-toc.js.map +1 -0
- package/package.json +58 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 FastRAG Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# PageIndex TS SDK
|
|
2
|
+
|
|
3
|
+
[中文文档](./README.zh-CN.md)
|
|
4
|
+
|
|
5
|
+
> TypeScript rewrite of [VectifyAI/PageIndex](https://github.com/VectifyAI/PageIndex)
|
|
6
|
+
|
|
7
|
+
Traditional vector-based RAG relies on semantic *similarity* rather than true *relevance* — and similarity ≠ relevance. [PageIndex](https://pageindex.ai/blog/pageindex-intro) takes a different approach: it builds a **hierarchical tree index** (like a "table of contents") from documents and uses LLM **reasoning** to navigate that tree for retrieval. No chunking, no vector DB required — just structured, human-like document navigation.
|
|
8
|
+
|
|
9
|
+
This SDK is a TypeScript implementation of the PageIndex framework, designed for Node.js/TypeScript RAG pipelines with full runtime decoupling.
|
|
10
|
+
|
|
11
|
+
## Highlights
|
|
12
|
+
|
|
13
|
+
- **Reasoning-based retrieval** — builds a hierarchical tree index from documents, enabling LLM-driven tree search instead of vector similarity
|
|
14
|
+
- **LLM-powered TOC detection** — automatically detects table of contents and chooses the optimal processing mode
|
|
15
|
+
- **Three processing modes** — `toc_with_page_numbers`, `toc_no_page_numbers`, `no_toc` with auto-degradation
|
|
16
|
+
- **Markdown to tree** — `mdToTree()` for structured tree conversion from Markdown
|
|
17
|
+
- **Optional vector enhancement** — indexes tree structures into a vector store for fast segment retrieval within a document, and for multi-document collections where per-document LLM tree-search is impractical
|
|
18
|
+
- **Runtime decoupled** — bring your own LLM (`LlmProvider`), PDF parser (`DocumentParser`), and vector DB (`VectorStore` / `Embedder`)
|
|
19
|
+
|
|
20
|
+
## Package Layout
|
|
21
|
+
|
|
22
|
+
```text
|
|
23
|
+
@fastrag/pageindex ← document structure indexing SDK
|
|
24
|
+
exports:
|
|
25
|
+
. → core pipeline
|
|
26
|
+
./types → shared types + defaults
|
|
27
|
+
./vector → vector enhancement
|
|
28
|
+
runtime deps:
|
|
29
|
+
- gpt-tokenizer
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
| Subpath | Description |
|
|
33
|
+
| --- | --- |
|
|
34
|
+
| `@fastrag/pageindex` | Document structure indexing pipeline |
|
|
35
|
+
| `@fastrag/pageindex/types` | Shared type definitions and interface contracts |
|
|
36
|
+
| `@fastrag/pageindex/vector` | Vector enhancement: chunking, indexing, search |
|
|
37
|
+
|
|
38
|
+
## Requirements
|
|
39
|
+
|
|
40
|
+
- Node.js >= 20
|
|
41
|
+
- pnpm >= 10
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
For application usage:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pnpm add @fastrag/pageindex
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
For local development in this repository:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pnpm install
|
|
55
|
+
pnpm build
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Quick Start
|
|
59
|
+
|
|
60
|
+
### 1) Build Document Tree Index
|
|
61
|
+
|
|
62
|
+
```ts
|
|
63
|
+
import { pageIndex } from '@fastrag/pageindex';
|
|
64
|
+
import type { LlmProvider, PageContent } from '@fastrag/pageindex/types';
|
|
65
|
+
|
|
66
|
+
// pageIndex is 1-based
|
|
67
|
+
const pages: PageContent[] = [
|
|
68
|
+
{ pageIndex: 1, text: 'Table of Contents\n1 Introduction ... 1' },
|
|
69
|
+
{ pageIndex: 2, text: '1 Introduction\nThis document ...' },
|
|
70
|
+
{ pageIndex: 3, text: '2 Methods\nWe propose ...' },
|
|
71
|
+
];
|
|
72
|
+
|
|
73
|
+
const provider: LlmProvider = {
|
|
74
|
+
async chat(messages) {
|
|
75
|
+
// Plug in OpenAI / Claude / local model here
|
|
76
|
+
return { content: '{}', finishReason: 'stop' };
|
|
77
|
+
},
|
|
78
|
+
};
|
|
79
|
+
|
|
80
|
+
const result = await pageIndex(
|
|
81
|
+
pages,
|
|
82
|
+
{
|
|
83
|
+
docName: 'example.pdf',
|
|
84
|
+
addNodeId: true,
|
|
85
|
+
addNodeSummary: true,
|
|
86
|
+
addDocDescription: true,
|
|
87
|
+
},
|
|
88
|
+
provider,
|
|
89
|
+
);
|
|
90
|
+
|
|
91
|
+
console.log(result.metadata.processingMode);
|
|
92
|
+
console.log(result.structure);
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### 2) Markdown to Tree
|
|
96
|
+
|
|
97
|
+
```ts
|
|
98
|
+
import { mdToTree } from '@fastrag/pageindex';
|
|
99
|
+
|
|
100
|
+
const tree = mdToTree('# Intro\nhello\n## Background\nmore context', {
|
|
101
|
+
thinning: true,
|
|
102
|
+
minTokenThreshold: 500,
|
|
103
|
+
});
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 3) Vector Enhancement
|
|
107
|
+
|
|
108
|
+
```ts
|
|
109
|
+
import { VectorEnhancer, InMemoryAdapter, treeChunker } from '@fastrag/pageindex/vector';
|
|
110
|
+
import type { Embedder } from '@fastrag/pageindex/vector';
|
|
111
|
+
|
|
112
|
+
const store = new InMemoryAdapter();
|
|
113
|
+
|
|
114
|
+
const embedder: Embedder = {
|
|
115
|
+
dimension: 3,
|
|
116
|
+
async embed(texts: string[]) {
|
|
117
|
+
// Replace with real embedding service
|
|
118
|
+
return texts.map((t) => [t.length, t.length / 2, 1]);
|
|
119
|
+
},
|
|
120
|
+
};
|
|
121
|
+
|
|
122
|
+
const enhancer = new VectorEnhancer(
|
|
123
|
+
store,
|
|
124
|
+
embedder,
|
|
125
|
+
treeChunker, // you can replace this with a custom chunking strategy
|
|
126
|
+
{ chunkMaxTokens: 1000 },
|
|
127
|
+
);
|
|
128
|
+
|
|
129
|
+
await enhancer.index(result); // result from pageIndex()
|
|
130
|
+
const hits = await enhancer.search('introduction section');
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### 4) Hybrid Search
|
|
134
|
+
|
|
135
|
+
```ts
|
|
136
|
+
import { HybridSearch } from '@fastrag/pageindex/vector';
|
|
137
|
+
|
|
138
|
+
const hybrid = new HybridSearch(enhancer, {
|
|
139
|
+
vectorTopK: 20,
|
|
140
|
+
rerankTopK: 5,
|
|
141
|
+
});
|
|
142
|
+
|
|
143
|
+
const results = await hybrid.search('what is the main contribution?');
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Core Pipeline
|
|
147
|
+
|
|
148
|
+
```text
|
|
149
|
+
PageContent[] → TOC Detection → Mode Selection → Verify/Fix/Degrade → TreeNode[] → Post-processing → PageIndexResult
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
1. Scan first N pages for table of contents
|
|
153
|
+
2. Choose processing mode based on TOC presence and page numbers
|
|
154
|
+
3. Verify TOC accuracy, auto-fix or degrade to a simpler mode if needed
|
|
155
|
+
4. Build `TreeNode[]` tree structure
|
|
156
|
+
5. Optional post-processing: `nodeId`, `nodeText`, `summary`, `docDescription`
|
|
157
|
+
|
|
158
|
+
## Vector Enhancement
|
|
159
|
+
|
|
160
|
+
The original PageIndex uses LLM-based tree search for retrieval — the entire tree structure is sent to an LLM as a prompt, and the LLM reasons about which nodes are relevant. This works well but means every query requires an LLM call.
|
|
161
|
+
|
|
162
|
+
The `@fastrag/pageindex/vector` subpath export offers an alternative: convert the tree into vector embeddings so retrieval becomes a similarity search — no LLM needed at query time, with millisecond-level latency.
|
|
163
|
+
|
|
164
|
+
| | Indexing | Retrieval |
|
|
165
|
+
| --- | --- | --- |
|
|
166
|
+
| Tree search (original) | LLM builds tree | LLM reasons over tree per query |
|
|
167
|
+
| Vector enhancement (this SDK) | LLM builds tree → embed into vector store | Vector similarity search (no LLM) |
|
|
168
|
+
|
|
169
|
+
This covers two scenarios:
|
|
170
|
+
|
|
171
|
+
- **Within a single document** — chunk tree nodes into embeddings for fast, precise segment retrieval without LLM calls
|
|
172
|
+
- **Across many documents** — index multiple documents into a shared vector store, then search across all of them at once; avoids running LLM tree-search on every document
|
|
173
|
+
|
|
174
|
+
The flow: `PageIndexResult` → `Chunker` (default: `treeChunker`) → `Embedder` (generates vectors) → `VectorStore` (upsert/search). `treeChunker` currently uses paragraph splitting with a char-based heuristic threshold (`1 token ≈ 4 chars`). Each chunk retains its tree metadata (`docName`, `nodeId`, `title`, page range), so results stay traceable to the original document structure.
|
|
175
|
+
|
|
176
|
+
## Interface Abstractions
|
|
177
|
+
|
|
178
|
+
### LlmProvider
|
|
179
|
+
|
|
180
|
+
```ts
|
|
181
|
+
interface LlmProvider {
|
|
182
|
+
chat(messages: LlmMessage[], options?: LlmOptions): Promise<LlmResponse>;
|
|
183
|
+
chatWithSchema?(
|
|
184
|
+
messages: LlmMessage[],
|
|
185
|
+
schema: JsonSchema,
|
|
186
|
+
options?: LlmOptions,
|
|
187
|
+
): Promise<LlmResponse>;
|
|
188
|
+
}
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### DocumentParser
|
|
192
|
+
|
|
193
|
+
```ts
|
|
194
|
+
interface DocumentParser {
|
|
195
|
+
parse(input: string | ArrayBuffer | Uint8Array): Promise<PageContent[]>;
|
|
196
|
+
}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Embedder
|
|
200
|
+
|
|
201
|
+
```ts
|
|
202
|
+
interface Embedder {
|
|
203
|
+
embed(texts: string[]): Promise<number[][]>;
|
|
204
|
+
readonly dimension: number;
|
|
205
|
+
}
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### Chunker
|
|
209
|
+
|
|
210
|
+
```ts
|
|
211
|
+
type Chunker = (result: PageIndexResult, config?: VectorConfig) => Chunk[];
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
`treeChunker` is the default strategy implementation you can pass in, but `VectorEnhancer` requires explicit chunker injection so you can replace it with a model-specific strategy when needed.
|
|
215
|
+
|
|
216
|
+
## Configuration
|
|
217
|
+
|
|
218
|
+
| Field | Default | Description |
|
|
219
|
+
| --- | ---: | --- |
|
|
220
|
+
| `tocCheckPageNum` | `20` | Max pages scanned for TOC detection |
|
|
221
|
+
| `maxPageNumEachNode` | `10` | Page threshold for large-node recursive splitting |
|
|
222
|
+
| `maxTokenNumEachNode` | `20000` | Token threshold for large-node recursive splitting |
|
|
223
|
+
| `addNodeId` | `true` | Generate 4-digit node IDs |
|
|
224
|
+
| `addNodeSummary` | `true` | Generate node summaries via LLM |
|
|
225
|
+
| `addDocDescription` | `false` | Generate one-sentence document description |
|
|
226
|
+
| `addNodeText` | `false` | Keep raw node text in output |
|
|
227
|
+
| `retryConfig` | exp. backoff | LLM retry strategy (`maxRetries: 10`, `initialDelayMs: 1000`, `maxDelayMs: 30000`, `backoffMultiplier: 2`) |
|
|
228
|
+
| `onDegradation` | `undefined` | Callback when processing mode degrades |
|
|
229
|
+
| `logger` | silent | Custom logger injection |
|
|
230
|
+
|
|
231
|
+
## Output
|
|
232
|
+
|
|
233
|
+
```ts
|
|
234
|
+
interface PageIndexResult {
|
|
235
|
+
docName: string;
|
|
236
|
+
docDescription?: string;
|
|
237
|
+
structure: TreeNode[];
|
|
238
|
+
metadata: {
|
|
239
|
+
processingMode: ProcessingMode;
|
|
240
|
+
degradations: DegradationEvent[];
|
|
241
|
+
};
|
|
242
|
+
}
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## Further Reading
|
|
246
|
+
|
|
247
|
+
- [Retrieval Strategies](./docs/retrieval-strategies.md) — comparison of LLM tree search, MCTS, and vector-based retrieval, with a hybrid architecture design for large-scale document collections
|
|
248
|
+
|
|
249
|
+
## License
|
|
250
|
+
|
|
251
|
+
[MIT](./LICENSE)
|
package/README.zh-CN.md
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# PageIndex TS SDK
|
|
2
|
+
|
|
3
|
+
[English](./README.md)
|
|
4
|
+
|
|
5
|
+
> [VectifyAI/PageIndex](https://github.com/VectifyAI/PageIndex) 的 TypeScript 重写版
|
|
6
|
+
|
|
7
|
+
传统向量 RAG 依赖语义*相似度*而非真正的*相关性* — 但相似 ≠ 相关。[PageIndex](https://pageindex.ai/blog/pageindex-intro) 采用不同的思路:从文档中构建**层级树索引**(类似"目录"),然后利用 LLM **推理**在树上导航检索。无需分块,无需向量数据库 — 像人类专家一样结构化地阅读文档。
|
|
8
|
+
|
|
9
|
+
本 SDK 是 PageIndex 框架的 TypeScript 实现,面向 Node.js/TypeScript RAG 场景,完全运行时解耦。
|
|
10
|
+
|
|
11
|
+
## 特性
|
|
12
|
+
|
|
13
|
+
- **基于推理的检索** — 从文档构建层级树索引,通过 LLM 驱动的树搜索替代向量相似度匹配
|
|
14
|
+
- **LLM 驱动的 TOC 检测** — 自动检测目录并选择最优处理模式
|
|
15
|
+
- **三种处理模式** — `toc_with_page_numbers`、`toc_no_page_numbers`、`no_toc`,支持自动降级
|
|
16
|
+
- **Markdown 转树** — `mdToTree()` 将 Markdown 转换为结构化树
|
|
17
|
+
- **可选向量增强** — 将树索引结果向量化,支持单文档内的精确片段检索,也适用于多文档场景下快速定位相关节点、避免逐一 LLM 树搜索
|
|
18
|
+
- **运行时解耦** — 自带 LLM(`LlmProvider`)、PDF 解析器(`DocumentParser`)、向量数据库(`VectorStore` / `Embedder`)
|
|
19
|
+
|
|
20
|
+
## 包结构
|
|
21
|
+
|
|
22
|
+
```text
|
|
23
|
+
@fastrag/pageindex ← 文档结构索引 SDK
|
|
24
|
+
exports:
|
|
25
|
+
. → 核心流程
|
|
26
|
+
./types → 共享类型 + 默认配置
|
|
27
|
+
./vector → 向量增强
|
|
28
|
+
runtime deps:
|
|
29
|
+
- gpt-tokenizer
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
| 子路径 | 说明 |
|
|
33
|
+
| --- | --- |
|
|
34
|
+
| `@fastrag/pageindex` | 文档结构索引主流程 |
|
|
35
|
+
| `@fastrag/pageindex/types` | 共享类型定义与接口契约 |
|
|
36
|
+
| `@fastrag/pageindex/vector` | 向量增强:分块、索引、搜索 |
|
|
37
|
+
|
|
38
|
+
## 环境要求
|
|
39
|
+
|
|
40
|
+
- Node.js >= 20
|
|
41
|
+
- pnpm >= 10
|
|
42
|
+
|
|
43
|
+
## 安装
|
|
44
|
+
|
|
45
|
+
作为依赖安装:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pnpm add @fastrag/pageindex
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
在本仓库本地开发:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pnpm install
|
|
55
|
+
pnpm build
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## 快速开始
|
|
59
|
+
|
|
60
|
+
### 1) 构建文档树索引
|
|
61
|
+
|
|
62
|
+
```ts
|
|
63
|
+
import { pageIndex } from '@fastrag/pageindex';
|
|
64
|
+
import type { LlmProvider, PageContent } from '@fastrag/pageindex/types';
|
|
65
|
+
|
|
66
|
+
// pageIndex 为 1-based
|
|
67
|
+
const pages: PageContent[] = [
|
|
68
|
+
{ pageIndex: 1, text: 'Table of Contents\n1 Introduction ... 1' },
|
|
69
|
+
{ pageIndex: 2, text: '1 Introduction\nThis document ...' },
|
|
70
|
+
{ pageIndex: 3, text: '2 Methods\nWe propose ...' },
|
|
71
|
+
];
|
|
72
|
+
|
|
73
|
+
const provider: LlmProvider = {
|
|
74
|
+
async chat(messages) {
|
|
75
|
+
// 接入 OpenAI / Claude / 本地模型
|
|
76
|
+
return { content: '{}', finishReason: 'stop' };
|
|
77
|
+
},
|
|
78
|
+
};
|
|
79
|
+
|
|
80
|
+
const result = await pageIndex(
|
|
81
|
+
pages,
|
|
82
|
+
{
|
|
83
|
+
docName: 'example.pdf',
|
|
84
|
+
addNodeId: true,
|
|
85
|
+
addNodeSummary: true,
|
|
86
|
+
addDocDescription: true,
|
|
87
|
+
},
|
|
88
|
+
provider,
|
|
89
|
+
);
|
|
90
|
+
|
|
91
|
+
console.log(result.metadata.processingMode);
|
|
92
|
+
console.log(result.structure);
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### 2) Markdown 转树
|
|
96
|
+
|
|
97
|
+
```ts
|
|
98
|
+
import { mdToTree } from '@fastrag/pageindex';
|
|
99
|
+
|
|
100
|
+
const tree = mdToTree('# Intro\nhello\n## Background\nmore context', {
|
|
101
|
+
thinning: true,
|
|
102
|
+
minTokenThreshold: 500,
|
|
103
|
+
});
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 3) 向量增强
|
|
107
|
+
|
|
108
|
+
```ts
|
|
109
|
+
import { VectorEnhancer, InMemoryAdapter, treeChunker } from '@fastrag/pageindex/vector';
|
|
110
|
+
import type { Embedder } from '@fastrag/pageindex/vector';
|
|
111
|
+
|
|
112
|
+
const store = new InMemoryAdapter();
|
|
113
|
+
|
|
114
|
+
const embedder: Embedder = {
|
|
115
|
+
dimension: 3,
|
|
116
|
+
async embed(texts: string[]) {
|
|
117
|
+
// 替换为真实 embedding 服务
|
|
118
|
+
return texts.map((t) => [t.length, t.length / 2, 1]);
|
|
119
|
+
},
|
|
120
|
+
};
|
|
121
|
+
|
|
122
|
+
const enhancer = new VectorEnhancer(
|
|
123
|
+
store,
|
|
124
|
+
embedder,
|
|
125
|
+
treeChunker, // 也可以替换为自定义分片策略
|
|
126
|
+
{ chunkMaxTokens: 1000 },
|
|
127
|
+
);
|
|
128
|
+
|
|
129
|
+
await enhancer.index(result); // 来自 pageIndex() 的结果
|
|
130
|
+
const hits = await enhancer.search('introduction section');
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### 4) 混合检索
|
|
134
|
+
|
|
135
|
+
```ts
|
|
136
|
+
import { HybridSearch } from '@fastrag/pageindex/vector';
|
|
137
|
+
|
|
138
|
+
const hybrid = new HybridSearch(enhancer, {
|
|
139
|
+
vectorTopK: 20,
|
|
140
|
+
rerankTopK: 5,
|
|
141
|
+
});
|
|
142
|
+
|
|
143
|
+
const results = await hybrid.search('what is the main contribution?');
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## 核心流程
|
|
147
|
+
|
|
148
|
+
```text
|
|
149
|
+
PageContent[] → TOC 检测 → 模式选择 → 验证/修复/降级 → TreeNode[] → 后处理 → PageIndexResult
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
1. 扫描前 N 页检测目录
|
|
153
|
+
2. 根据 TOC 有无和页码信息选择处理模式
|
|
154
|
+
3. 验证 TOC 准确性,自动修复或降级到更简单的模式
|
|
155
|
+
4. 构建 `TreeNode[]` 树结构
|
|
156
|
+
5. 可选后处理:`nodeId`、`nodeText`、`summary`、`docDescription`
|
|
157
|
+
|
|
158
|
+
## 向量增强
|
|
159
|
+
|
|
160
|
+
原版 PageIndex 使用 LLM 树搜索进行检索 — 将整个树结构作为 prompt 发送给 LLM,由 LLM 推理判断哪些节点相关。这种方式效果好,但每次查询都需要调用 LLM。
|
|
161
|
+
|
|
162
|
+
`@fastrag/pageindex/vector` 子路径导出提供了另一种方案:将树结构转换为向量 embedding,检索变为相似度搜索 — 查询时无需 LLM,延迟在毫秒级。
|
|
163
|
+
|
|
164
|
+
| | 索引阶段 | 检索阶段 |
|
|
165
|
+
| --- | --- | --- |
|
|
166
|
+
| 树搜索(原版) | LLM 构建树 | 每次查询 LLM 推理遍历树 |
|
|
167
|
+
| 向量增强(本 SDK) | LLM 构建树 → embedding 入库 | 向量相似度搜索(无需 LLM) |
|
|
168
|
+
|
|
169
|
+
覆盖两种场景:
|
|
170
|
+
|
|
171
|
+
- **单文档内** — 将树节点分块为 embedding,无需 LLM 调用即可快速精确定位相关片段
|
|
172
|
+
- **跨多文档** — 将多个文档索引到同一个向量库,一次搜索覆盖所有文档;避免对每个文档逐一执行 LLM 树搜索
|
|
173
|
+
|
|
174
|
+
流程:`PageIndexResult` → `Chunker`(默认实现为 `treeChunker`)→ `Embedder`(生成向量)→ `VectorStore`(存储/检索)。`treeChunker` 当前采用“按段落切分 + 字符近似阈值”(`1 token ≈ 4 chars`)。每个 chunk 保留树结构元数据(`docName`、`nodeId`、`title`、页码范围),检索结果可追溯到原始文档结构。
|
|
175
|
+
|
|
176
|
+
## 接口抽象
|
|
177
|
+
|
|
178
|
+
### LlmProvider
|
|
179
|
+
|
|
180
|
+
```ts
|
|
181
|
+
interface LlmProvider {
|
|
182
|
+
chat(messages: LlmMessage[], options?: LlmOptions): Promise<LlmResponse>;
|
|
183
|
+
chatWithSchema?(
|
|
184
|
+
messages: LlmMessage[],
|
|
185
|
+
schema: JsonSchema,
|
|
186
|
+
options?: LlmOptions,
|
|
187
|
+
): Promise<LlmResponse>;
|
|
188
|
+
}
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### DocumentParser
|
|
192
|
+
|
|
193
|
+
```ts
|
|
194
|
+
interface DocumentParser {
|
|
195
|
+
parse(input: string | ArrayBuffer | Uint8Array): Promise<PageContent[]>;
|
|
196
|
+
}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Embedder
|
|
200
|
+
|
|
201
|
+
```ts
|
|
202
|
+
interface Embedder {
|
|
203
|
+
embed(texts: string[]): Promise<number[][]>;
|
|
204
|
+
readonly dimension: number;
|
|
205
|
+
}
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### Chunker
|
|
209
|
+
|
|
210
|
+
```ts
|
|
211
|
+
type Chunker = (result: PageIndexResult, config?: VectorConfig) => Chunk[];
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
`treeChunker` 是可直接使用的默认策略实现,但 `VectorEnhancer` 需要显式注入 chunker,因此你可以按模型分词规则替换为自定义分片策略。
|
|
215
|
+
|
|
216
|
+
## 配置项
|
|
217
|
+
|
|
218
|
+
| 字段 | 默认值 | 说明 |
|
|
219
|
+
| --- | ---: | --- |
|
|
220
|
+
| `tocCheckPageNum` | `20` | TOC 检测最多扫描页数 |
|
|
221
|
+
| `maxPageNumEachNode` | `10` | 大节点递归拆分页数阈值 |
|
|
222
|
+
| `maxTokenNumEachNode` | `20000` | 大节点递归拆分 token 阈值 |
|
|
223
|
+
| `addNodeId` | `true` | 生成 4 位节点 ID |
|
|
224
|
+
| `addNodeSummary` | `true` | 通过 LLM 生成节点摘要 |
|
|
225
|
+
| `addDocDescription` | `false` | 生成文档一句话描述 |
|
|
226
|
+
| `addNodeText` | `false` | 在输出中保留节点原文 |
|
|
227
|
+
| `retryConfig` | 指数退避 | LLM 重试策略(`maxRetries: 10`、`initialDelayMs: 1000`、`maxDelayMs: 30000`、`backoffMultiplier: 2`) |
|
|
228
|
+
| `onDegradation` | `undefined` | 处理模式降级时的回调 |
|
|
229
|
+
| `logger` | 静默 | 自定义日志注入 |
|
|
230
|
+
|
|
231
|
+
## 输出结构
|
|
232
|
+
|
|
233
|
+
```ts
|
|
234
|
+
interface PageIndexResult {
|
|
235
|
+
docName: string;
|
|
236
|
+
docDescription?: string;
|
|
237
|
+
structure: TreeNode[];
|
|
238
|
+
metadata: {
|
|
239
|
+
processingMode: ProcessingMode;
|
|
240
|
+
degradations: DegradationEvent[];
|
|
241
|
+
};
|
|
242
|
+
}
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## 延伸阅读
|
|
246
|
+
|
|
247
|
+
- [检索策略探讨](./docs/retrieval-strategies.md) — LLM 树搜索、MCTS、向量检索三种方案的对比,以及面向大规模文档集合的混合架构设计
|
|
248
|
+
|
|
249
|
+
## 许可证
|
|
250
|
+
|
|
251
|
+
[MIT](./LICENSE)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Base class for all errors thrown by the pageindex package.
 * Extends the native `Error`; each subclass overwrites `name` with its
 * own class name (see the implementation in errors/index.js).
 */
export declare class PageIndexError extends Error {
    constructor(message: string);
}
/**
 * Error subtype for LLM-related failures (`name` is `'LlmError'`).
 * NOTE(review): exact raise sites are not visible in this file — confirm
 * against LlmClient.
 */
export declare class LlmError extends PageIndexError {
    constructor(message: string);
}
/**
 * Error subtype for table-of-contents processing failures
 * (`name` is `'TocProcessingError'`).
 */
export declare class TocProcessingError extends PageIndexError {
    constructor(message: string);
}
|
|
10
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/errors/index.ts"],"names":[],"mappings":"AAAA,qBAAa,cAAe,SAAQ,KAAK;gBAC3B,OAAO,EAAE,MAAM;CAI5B;AAED,qBAAa,QAAS,SAAQ,cAAc;gBAC9B,OAAO,EAAE,MAAM;CAI5B;AAED,qBAAa,kBAAmB,SAAQ,cAAc;gBACxC,OAAO,EAAE,MAAM;CAI5B"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Base class for all errors thrown by the pageindex package.
 *
 * @param {string} message - Human-readable error description.
 * @param {{ cause?: unknown }} [options] - Optional bag; `cause` records the
 *   underlying error (ES2022 `Error.cause` convention). Optional and
 *   defaulted, so existing `new PageIndexError(msg)` callers are unaffected.
 */
export class PageIndexError extends Error {
    constructor(message, options = {}) {
        super(message);
        this.name = 'PageIndexError';
        // Attach the originating error explicitly instead of relying on the
        // engine supporting `new Error(msg, { cause })`.
        if (options.cause !== undefined) {
            this.cause = options.cause;
        }
    }
}
|
|
7
|
+
/**
 * Error subtype for LLM-related failures.
 *
 * @param {string} message - Human-readable error description.
 * @param {{ cause?: unknown }} [options] - Optional bag; `cause` records the
 *   underlying error. Set directly on `this` (not forwarded to `super`) so
 *   this class works regardless of the base constructor's signature.
 */
export class LlmError extends PageIndexError {
    constructor(message, options = {}) {
        super(message);
        this.name = 'LlmError';
        if (options.cause !== undefined) {
            this.cause = options.cause;
        }
    }
}
|
|
13
|
+
/**
 * Error subtype for table-of-contents processing failures.
 *
 * @param {string} message - Human-readable error description.
 * @param {{ cause?: unknown }} [options] - Optional bag; `cause` records the
 *   underlying error. Set directly on `this` (not forwarded to `super`) so
 *   this class works regardless of the base constructor's signature.
 */
export class TocProcessingError extends PageIndexError {
    constructor(message, options = {}) {
        super(message);
        this.name = 'TocProcessingError';
        if (options.cause !== undefined) {
            this.cause = options.cause;
        }
    }
}
|
|
19
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/errors/index.ts"],"names":[],"mappings":"AAAA,MAAM,OAAO,cAAe,SAAQ,KAAK;IACvC,YAAY,OAAe;QACzB,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,MAAM,OAAO,QAAS,SAAQ,cAAc;IAC1C,YAAY,OAAe;QACzB,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,UAAU,CAAC;IACzB,CAAC;CACF;AAED,MAAM,OAAO,kBAAmB,SAAQ,cAAc;IACpD,YAAY,OAAe;QACzB,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,oBAAoB,CAAC;IACnC,CAAC;CACF"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
// Public type surface of the package; mirrors dist/index.js
// (same grouping comments as the runtime barrel file).
// Errors
export { PageIndexError, LlmError, TocProcessingError, } from './errors/index.js';
// Utils
export { countTokens, extractJson, mergeConfig, pageListToGroupText, convertPhysicalIndexToInt, addPhysicalIndexTags, createSilentLogger, } from './utils/index.js';
export type { ResolvedConfig } from './utils/index.js';
// LLM
export { LlmClient } from './llm/index.js';
// Tree
export { listToTree, getNodes, getLeafNodes, structureToList, postProcessing } from './tree/index.js';
// Markdown
export { mdToTree } from './markdown/index.js';
export { extractNodesFromMarkdown, buildTreeFromNodes, treeThinningForIndex } from './markdown/index.js';
// Post-processing
export { writeNodeId } from './post-processing/index.js';
export { addNodeText, removeStructureText } from './post-processing/index.js';
export { generateSummariesForStructure } from './post-processing/index.js';
export { createCleanStructureForDescription, generateDocDescription } from './post-processing/index.js';
// Main entry
export { pageIndex } from './page-index.js';
export { treeParser } from './tree-parser.js';
|
|
14
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EACL,cAAc,EACd,QAAQ,EACR,kBAAkB,GACnB,MAAM,mBAAmB,CAAC;AAG3B,OAAO,EACL,WAAW,EACX,WAAW,EACX,WAAW,EACX,mBAAmB,EACnB,yBAAyB,EACzB,oBAAoB,EACpB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAC1B,YAAY,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAGvD,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAG3C,OAAO,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAGtG,OAAO,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAC;AAC/C,OAAO,EAAE,wBAAwB,EAAE,kBAAkB,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAGzG,OAAO,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAC;AACzD,OAAO,EAAE,WAAW,EAAE,mBAAmB,EAAE,MAAM,4BAA4B,CAAC;AAC9E,OAAO,EAAE,6BAA6B,EAAE,MAAM,4BAA4B,CAAC;AAC3E,OAAO,EAAE,kCAAkC,EAAE,sBAAsB,EAAE,MAAM,4BAA4B,CAAC;AAGxG,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAC5C,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
// Public runtime surface of the package. Duplicate re-export statements
// from the same module path (markdown ×2, post-processing ×4) are merged
// into one statement per module; the exported names are unchanged.
// Errors
export { PageIndexError, LlmError, TocProcessingError, } from './errors/index.js';
// Utils
export { countTokens, extractJson, mergeConfig, pageListToGroupText, convertPhysicalIndexToInt, addPhysicalIndexTags, createSilentLogger, } from './utils/index.js';
// LLM
export { LlmClient } from './llm/index.js';
// Tree
export { listToTree, getNodes, getLeafNodes, structureToList, postProcessing } from './tree/index.js';
// Markdown
export { mdToTree, extractNodesFromMarkdown, buildTreeFromNodes, treeThinningForIndex } from './markdown/index.js';
// Post-processing
export { writeNodeId, addNodeText, removeStructureText, generateSummariesForStructure, createCleanStructureForDescription, generateDocDescription, } from './post-processing/index.js';
// Main entry
export { pageIndex } from './page-index.js';
export { treeParser } from './tree-parser.js';
|
|
20
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,SAAS;AACT,OAAO,EACL,cAAc,EACd,QAAQ,EACR,kBAAkB,GACnB,MAAM,mBAAmB,CAAC;AAE3B,QAAQ;AACR,OAAO,EACL,WAAW,EACX,WAAW,EACX,WAAW,EACX,mBAAmB,EACnB,yBAAyB,EACzB,oBAAoB,EACpB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAG1B,MAAM;AACN,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAE3C,OAAO;AACP,OAAO,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAEtG,WAAW;AACX,OAAO,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAC;AAC/C,OAAO,EAAE,wBAAwB,EAAE,kBAAkB,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAEzG,kBAAkB;AAClB,OAAO,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAC;AACzD,OAAO,EAAE,WAAW,EAAE,mBAAmB,EAAE,MAAM,4BAA4B,CAAC;AAC9E,OAAO,EAAE,6BAA6B,EAAE,MAAM,4BAA4B,CAAC;AAC3E,OAAO,EAAE,kCAAkC,EAAE,sBAAsB,EAAE,MAAM,4BAA4B,CAAC;AAExG,aAAa;AACb,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAC5C,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import type { DegradationEvent } from './processing.js';
import type { Logger } from './logger.js';
/**
 * Retry policy applied to LLM calls (exponential backoff;
 * see DEFAULT_RETRY_CONFIG for the built-in values).
 */
export interface RetryConfig {
    /** Maximum number of retry attempts. */
    maxRetries: number;
    /** Delay before the first retry, in milliseconds. */
    initialDelayMs: number;
    /** Upper bound on any single retry delay, in milliseconds. */
    maxDelayMs: number;
    /** Factor the delay is multiplied by after each failed attempt. */
    backoffMultiplier: number;
}
/**
 * User-facing configuration for the PageIndex pipeline. All fields are
 * optional; the defaults noted below are the values in DEFAULT_CONFIG.
 * (Field meanings per the package README configuration table.)
 */
export interface PageIndexConfig {
    /** Max number of pages scanned when detecting a table of contents (default 20). */
    tocCheckPageNum?: number;
    /** Page-count threshold above which a large node is recursively split (default 10). */
    maxPageNumEachNode?: number;
    /** Token-count threshold above which a large node is recursively split (default 20000). */
    maxTokenNumEachNode?: number;
    /** Generate 4-digit node IDs (default true). */
    addNodeId?: boolean;
    /** Generate per-node summaries via the LLM (default true). */
    addNodeSummary?: boolean;
    /** Generate a one-sentence document description (default false). */
    addDocDescription?: boolean;
    /** Keep each node's raw text in the output (default false). */
    addNodeText?: boolean;
    /** Callback invoked when the processing mode degrades. */
    onDegradation?: (event: DegradationEvent) => void;
    /** LLM retry policy; defaults to DEFAULT_RETRY_CONFIG (exponential backoff). */
    retryConfig?: RetryConfig;
    /** Custom logger injection; silent by default. */
    logger?: Logger;
}
/** Built-in defaults for PageIndexConfig (literal, readonly values). */
export declare const DEFAULT_CONFIG: {
    readonly tocCheckPageNum: 20;
    readonly maxPageNumEachNode: 10;
    readonly maxTokenNumEachNode: 20000;
    readonly addNodeId: true;
    readonly addNodeSummary: true;
    readonly addDocDescription: false;
    readonly addNodeText: false;
};
/** Default LLM retry policy: 10 retries, 1s initial delay, 30s cap, x2 backoff. */
export declare const DEFAULT_RETRY_CONFIG: RetryConfig;
/**
 * Options for markdown tree parsing (see mdToTree).
 * NOTE(review): field semantics inferred from names — confirm against
 * treeThinningForIndex in the markdown module.
 */
export interface MdConfig {
    /** Presumably enables tree thinning — verify against the markdown module. */
    thinning?: boolean;
    /** Presumably a minimum token threshold used during thinning — verify. */
    minTokenThreshold?: number;
}
|
|
35
|
+
//# sourceMappingURL=config.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../../src/internal-types/config.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AACxD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,MAAM,WAAW,WAAW;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,EAAE,MAAM,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,eAAe;IAC9B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAC5B,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,aAAa,CAAC,EAAE,CAAC,KAAK,EAAE,gBAAgB,KAAK,IAAI,CAAC;IAClD,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,eAAO,MAAM,cAAc;;;;;;;;CAmB1B,CAAC;AAEF,eAAO,MAAM,oBAAoB,EAAE,WAKlC,CAAC;AAEF,MAAM,WAAW,QAAQ;IACvB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * Built-in defaults for the PageIndex pipeline (see PageIndexConfig).
 * Frozen so shared module-level constants cannot be mutated at runtime;
 * the matching .d.ts already declares every field `readonly`.
 */
export const DEFAULT_CONFIG = Object.freeze({
    tocCheckPageNum: 20, // max pages scanned for TOC detection
    maxPageNumEachNode: 10, // page-count threshold for splitting large nodes
    maxTokenNumEachNode: 20000, // token threshold for splitting large nodes
    addNodeId: true, // generate 4-digit node IDs
    addNodeSummary: true, // generate per-node summaries via the LLM
    addDocDescription: false, // generate a one-sentence document description
    addNodeText: false, // keep raw node text in the output
});
/**
 * Default LLM retry policy: exponential backoff starting at 1s,
 * doubled after each failed attempt, capped at 30s, up to 10 retries.
 */
export const DEFAULT_RETRY_CONFIG = Object.freeze({
    maxRetries: 10,
    initialDelayMs: 1000,
    maxDelayMs: 30000,
    backoffMultiplier: 2,
});
|
|
16
|
+
//# sourceMappingURL=config.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/internal-types/config.ts"],"names":[],"mappings":"AAuBA,MAAM,CAAC,MAAM,cAAc,GAAG;IAC5B,eAAe,EAAE,EAAE;IACnB,kBAAkB,EAAE,EAAE;IACtB,mBAAmB,EAAE,KAAK;IAC1B,SAAS,EAAE,IAAI;IACf,cAAc,EAAE,IAAI;IACpB,iBAAiB,EAAE,KAAK;IACxB,WAAW,EAAE,KAAK;CAYnB,CAAC;AAEF,MAAM,CAAC,MAAM,oBAAoB,GAAgB;IAC/C,UAAU,EAAE,EAAE;IACd,cAAc,EAAE,IAAI;IACpB,UAAU,EAAE,KAAK;IACjB,iBAAiB,EAAE,CAAC;CACrB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"document-parser.d.ts","sourceRoot":"","sources":["../../src/internal-types/document-parser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAE7C,MAAM,WAAW,cAAc;IAC7B,KAAK,CAAC,KAAK,EAAE,MAAM,GAAG,WAAW,GAAG,UAAU,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC;CACzE"}
|