@fastrag/pageindex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +251 -0
- package/README.zh-CN.md +251 -0
- package/dist/errors/index.d.ts +10 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +19 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/internal-types/config.d.ts +35 -0
- package/dist/internal-types/config.d.ts.map +1 -0
- package/dist/internal-types/config.js +16 -0
- package/dist/internal-types/config.js.map +1 -0
- package/dist/internal-types/document-parser.d.ts +5 -0
- package/dist/internal-types/document-parser.d.ts.map +1 -0
- package/dist/internal-types/document-parser.js +2 -0
- package/dist/internal-types/document-parser.js.map +1 -0
- package/dist/internal-types/index.d.ts +9 -0
- package/dist/internal-types/index.d.ts.map +1 -0
- package/dist/internal-types/index.js +2 -0
- package/dist/internal-types/index.js.map +1 -0
- package/dist/internal-types/llm-provider.d.ts +19 -0
- package/dist/internal-types/llm-provider.d.ts.map +1 -0
- package/dist/internal-types/llm-provider.js +2 -0
- package/dist/internal-types/llm-provider.js.map +1 -0
- package/dist/internal-types/logger.d.ts +7 -0
- package/dist/internal-types/logger.d.ts.map +1 -0
- package/dist/internal-types/logger.js +2 -0
- package/dist/internal-types/logger.js.map +1 -0
- package/dist/internal-types/page.d.ts +5 -0
- package/dist/internal-types/page.d.ts.map +1 -0
- package/dist/internal-types/page.js +2 -0
- package/dist/internal-types/page.js.map +1 -0
- package/dist/internal-types/processing.d.ts +21 -0
- package/dist/internal-types/processing.d.ts.map +1 -0
- package/dist/internal-types/processing.js +2 -0
- package/dist/internal-types/processing.js.map +1 -0
- package/dist/internal-types/tree-node.d.ts +30 -0
- package/dist/internal-types/tree-node.d.ts.map +1 -0
- package/dist/internal-types/tree-node.js +2 -0
- package/dist/internal-types/tree-node.js.map +1 -0
- package/dist/llm/index.d.ts +3 -0
- package/dist/llm/index.d.ts.map +1 -0
- package/dist/llm/index.js +3 -0
- package/dist/llm/index.js.map +1 -0
- package/dist/llm/llm-client.d.ts +26 -0
- package/dist/llm/llm-client.d.ts.map +1 -0
- package/dist/llm/llm-client.js +88 -0
- package/dist/llm/llm-client.js.map +1 -0
- package/dist/llm/prompts.d.ts +33 -0
- package/dist/llm/prompts.d.ts.map +1 -0
- package/dist/llm/prompts.js +312 -0
- package/dist/llm/prompts.js.map +1 -0
- package/dist/markdown/index.d.ts +6 -0
- package/dist/markdown/index.d.ts.map +1 -0
- package/dist/markdown/index.js +5 -0
- package/dist/markdown/index.js.map +1 -0
- package/dist/markdown/md-extractor.d.ts +14 -0
- package/dist/markdown/md-extractor.d.ts.map +1 -0
- package/dist/markdown/md-extractor.js +30 -0
- package/dist/markdown/md-extractor.js.map +1 -0
- package/dist/markdown/md-to-tree.d.ts +8 -0
- package/dist/markdown/md-to-tree.d.ts.map +1 -0
- package/dist/markdown/md-to-tree.js +20 -0
- package/dist/markdown/md-to-tree.js.map +1 -0
- package/dist/markdown/md-tree-builder.d.ts +7 -0
- package/dist/markdown/md-tree-builder.d.ts.map +1 -0
- package/dist/markdown/md-tree-builder.js +36 -0
- package/dist/markdown/md-tree-builder.js.map +1 -0
- package/dist/markdown/tree-thinning.d.ts +8 -0
- package/dist/markdown/tree-thinning.d.ts.map +1 -0
- package/dist/markdown/tree-thinning.js +42 -0
- package/dist/markdown/tree-thinning.js.map +1 -0
- package/dist/page-index.d.ts +10 -0
- package/dist/page-index.d.ts.map +1 -0
- package/dist/page-index.js +54 -0
- package/dist/page-index.js.map +1 -0
- package/dist/post-processing/doc-description.d.ts +12 -0
- package/dist/post-processing/doc-description.d.ts.map +1 -0
- package/dist/post-processing/doc-description.js +31 -0
- package/dist/post-processing/doc-description.js.map +1 -0
- package/dist/post-processing/index.d.ts +5 -0
- package/dist/post-processing/index.d.ts.map +1 -0
- package/dist/post-processing/index.js +5 -0
- package/dist/post-processing/index.js.map +1 -0
- package/dist/post-processing/node-id.d.ts +7 -0
- package/dist/post-processing/node-id.d.ts.map +1 -0
- package/dist/post-processing/node-id.js +20 -0
- package/dist/post-processing/node-id.js.map +1 -0
- package/dist/post-processing/node-text.d.ts +11 -0
- package/dist/post-processing/node-text.d.ts.map +1 -0
- package/dist/post-processing/node-text.js +37 -0
- package/dist/post-processing/node-text.js.map +1 -0
- package/dist/post-processing/summary.d.ts +7 -0
- package/dist/post-processing/summary.d.ts.map +1 -0
- package/dist/post-processing/summary.js +31 -0
- package/dist/post-processing/summary.js.map +1 -0
- package/dist/processing/index.d.ts +6 -0
- package/dist/processing/index.d.ts.map +1 -0
- package/dist/processing/index.js +6 -0
- package/dist/processing/index.js.map +1 -0
- package/dist/processing/large-node.d.ts +9 -0
- package/dist/processing/large-node.d.ts.map +1 -0
- package/dist/processing/large-node.js +40 -0
- package/dist/processing/large-node.js.map +1 -0
- package/dist/processing/meta-processor.d.ts +19 -0
- package/dist/processing/meta-processor.d.ts.map +1 -0
- package/dist/processing/meta-processor.js +91 -0
- package/dist/processing/meta-processor.js.map +1 -0
- package/dist/processing/no-toc.d.ts +10 -0
- package/dist/processing/no-toc.d.ts.map +1 -0
- package/dist/processing/no-toc.js +44 -0
- package/dist/processing/no-toc.js.map +1 -0
- package/dist/processing/toc-no-pages.d.ts +11 -0
- package/dist/processing/toc-no-pages.d.ts.map +1 -0
- package/dist/processing/toc-no-pages.js +46 -0
- package/dist/processing/toc-no-pages.js.map +1 -0
- package/dist/processing/toc-with-pages.d.ts +15 -0
- package/dist/processing/toc-with-pages.d.ts.map +1 -0
- package/dist/processing/toc-with-pages.js +151 -0
- package/dist/processing/toc-with-pages.js.map +1 -0
- package/dist/toc/index.d.ts +4 -0
- package/dist/toc/index.d.ts.map +1 -0
- package/dist/toc/index.js +4 -0
- package/dist/toc/index.js.map +1 -0
- package/dist/toc/toc-detector.d.ts +23 -0
- package/dist/toc/toc-detector.d.ts.map +1 -0
- package/dist/toc/toc-detector.js +65 -0
- package/dist/toc/toc-detector.js.map +1 -0
- package/dist/toc/toc-extractor.d.ts +13 -0
- package/dist/toc/toc-extractor.d.ts.map +1 -0
- package/dist/toc/toc-extractor.js +32 -0
- package/dist/toc/toc-extractor.js.map +1 -0
- package/dist/toc/toc-transformer.d.ts +11 -0
- package/dist/toc/toc-transformer.d.ts.map +1 -0
- package/dist/toc/toc-transformer.js +69 -0
- package/dist/toc/toc-transformer.js.map +1 -0
- package/dist/tree/index.d.ts +4 -0
- package/dist/tree/index.d.ts.map +1 -0
- package/dist/tree/index.js +4 -0
- package/dist/tree/index.js.map +1 -0
- package/dist/tree/list-to-tree.d.ts +7 -0
- package/dist/tree/list-to-tree.d.ts.map +1 -0
- package/dist/tree/list-to-tree.js +33 -0
- package/dist/tree/list-to-tree.js.map +1 -0
- package/dist/tree/post-processing.d.ts +12 -0
- package/dist/tree/post-processing.d.ts.map +1 -0
- package/dist/tree/post-processing.js +87 -0
- package/dist/tree/post-processing.js.map +1 -0
- package/dist/tree/tree-utils.d.ts +18 -0
- package/dist/tree/tree-utils.d.ts.map +1 -0
- package/dist/tree/tree-utils.js +43 -0
- package/dist/tree/tree-utils.js.map +1 -0
- package/dist/tree-parser.d.ts +30 -0
- package/dist/tree-parser.d.ts.map +1 -0
- package/dist/tree-parser.js +73 -0
- package/dist/tree-parser.js.map +1 -0
- package/dist/types.d.ts +3 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/config-loader.d.ts +15 -0
- package/dist/utils/config-loader.d.ts.map +1 -0
- package/dist/utils/config-loader.js +19 -0
- package/dist/utils/config-loader.js.map +1 -0
- package/dist/utils/index.d.ts +7 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +6 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/json-parser.d.ts +2 -0
- package/dist/utils/json-parser.d.ts.map +1 -0
- package/dist/utils/json-parser.js +76 -0
- package/dist/utils/json-parser.js.map +1 -0
- package/dist/utils/logger.d.ts +3 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +10 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/page-utils.d.ts +16 -0
- package/dist/utils/page-utils.d.ts.map +1 -0
- package/dist/utils/page-utils.js +56 -0
- package/dist/utils/page-utils.js.map +1 -0
- package/dist/utils/token-counter.d.ts +2 -0
- package/dist/utils/token-counter.d.ts.map +1 -0
- package/dist/utils/token-counter.js +5 -0
- package/dist/utils/token-counter.js.map +1 -0
- package/dist/vector-lib/adapters/in-memory-adapter.d.ts +14 -0
- package/dist/vector-lib/adapters/in-memory-adapter.d.ts.map +1 -0
- package/dist/vector-lib/adapters/in-memory-adapter.js +55 -0
- package/dist/vector-lib/adapters/in-memory-adapter.js.map +1 -0
- package/dist/vector-lib/adapters/vector-store.d.ts +10 -0
- package/dist/vector-lib/adapters/vector-store.d.ts.map +1 -0
- package/dist/vector-lib/adapters/vector-store.js +2 -0
- package/dist/vector-lib/adapters/vector-store.js.map +1 -0
- package/dist/vector-lib/chunker/tree-chunker.d.ts +8 -0
- package/dist/vector-lib/chunker/tree-chunker.d.ts.map +1 -0
- package/dist/vector-lib/chunker/tree-chunker.js +59 -0
- package/dist/vector-lib/chunker/tree-chunker.js.map +1 -0
- package/dist/vector-lib/embedder/embedder.d.ts +8 -0
- package/dist/vector-lib/embedder/embedder.d.ts.map +1 -0
- package/dist/vector-lib/embedder/embedder.js +2 -0
- package/dist/vector-lib/embedder/embedder.js.map +1 -0
- package/dist/vector-lib/index.d.ts +10 -0
- package/dist/vector-lib/index.d.ts.map +1 -0
- package/dist/vector-lib/index.js +6 -0
- package/dist/vector-lib/index.js.map +1 -0
- package/dist/vector-lib/search/hybrid-search.d.ts +19 -0
- package/dist/vector-lib/search/hybrid-search.d.ts.map +1 -0
- package/dist/vector-lib/search/hybrid-search.js +25 -0
- package/dist/vector-lib/search/hybrid-search.js.map +1 -0
- package/dist/vector-lib/search/reranker.d.ts +14 -0
- package/dist/vector-lib/search/reranker.d.ts.map +1 -0
- package/dist/vector-lib/search/reranker.js +2 -0
- package/dist/vector-lib/search/reranker.js.map +1 -0
- package/dist/vector-lib/types.d.ts +29 -0
- package/dist/vector-lib/types.d.ts.map +1 -0
- package/dist/vector-lib/types.js +2 -0
- package/dist/vector-lib/types.js.map +1 -0
- package/dist/vector-lib/vector-enhancer.d.ts +28 -0
- package/dist/vector-lib/vector-enhancer.d.ts.map +1 -0
- package/dist/vector-lib/vector-enhancer.js +54 -0
- package/dist/vector-lib/vector-enhancer.js.map +1 -0
- package/dist/vector.d.ts +5 -0
- package/dist/vector.d.ts.map +1 -0
- package/dist/vector.js +3 -0
- package/dist/vector.js.map +1 -0
- package/dist/verification/fix-toc.d.ts +13 -0
- package/dist/verification/fix-toc.d.ts.map +1 -0
- package/dist/verification/fix-toc.js +73 -0
- package/dist/verification/fix-toc.js.map +1 -0
- package/dist/verification/index.d.ts +3 -0
- package/dist/verification/index.d.ts.map +1 -0
- package/dist/verification/index.js +3 -0
- package/dist/verification/index.js.map +1 -0
- package/dist/verification/verify-toc.d.ts +17 -0
- package/dist/verification/verify-toc.d.ts.map +1 -0
- package/dist/verification/verify-toc.js +64 -0
- package/dist/verification/verify-toc.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { tocTransformer, tocIndexExtractor } from '../toc/toc-transformer.js';
|
|
3
|
+
import { convertPhysicalIndexToInt, addPhysicalIndexTags } from '../utils/page-utils.js';
|
|
4
|
+
/**
 * Builds the TOC entry list with physical page indices for a table of
 * contents that carries printed page numbers.
 *
 * Pipeline, in the order the code executes it:
 *  1. `tocTransformer` turns the raw TOC text into structured entries.
 *  2. A copy holding only `structure` and `title` is prepared (page removed).
 *  3. The pages after the last TOC page are tagged with `addPhysicalIndexTags`,
 *     numbering from `mainStartIdx + 1`.
 *  4. `tocIndexExtractor` is asked to locate the page-less entries in the
 *     tagged content; string `physicalIndex` values are converted to numbers.
 *  5. Located indices are aligned with the TOC entries, (page, physicalIndex)
 *     pairs are collected and the most common difference becomes the offset.
 *  6. The offset is applied to every entry that has a page.
 *  7. Entries still lacking a `physicalIndex` receive the located value.
 *
 * @param tocContent  Raw TOC text handed to `tocTransformer`.
 * @param pageList    All document pages; sliced after the TOC pages.
 * @param tocPageList Indices (into `pageList`) of the TOC pages; only the last
 *                    one is read here. May be empty.
 * @param llmClient   Client forwarded to the LLM-backed helpers.
 * @returns The TOC entries, each with a `physicalIndex` field (possibly null).
 */
export async function processTocWithPageNumbers(tocContent, pageList, tocPageList, llmClient) {
    // Structured TOC entries (expected to carry `structure`, `title`, `page`).
    const tocItems = await tocTransformer(tocContent, llmClient);

    // Same entries without the page field.
    const pagelessToc = tocItems.map((entry) => ({
        structure: entry.structure,
        title: entry.title,
    }));

    // Main content begins right after the last TOC page, or at 0 without a TOC.
    let mainStartIdx = 0;
    if (tocPageList.length > 0) {
        mainStartIdx = tocPageList[tocPageList.length - 1] + 1;
    }
    // NOTE(review): the second argument looks like the 1-based index of the
    // first tagged page - confirm against addPhysicalIndexTags.
    const taggedContent = addPhysicalIndexTags(pageList.slice(mainStartIdx), mainStartIdx + 1);

    const located = await tocIndexExtractor(pagelessToc, taggedContent, llmClient);

    // The LLM may return physical_index as a string; normalise to a number and
    // fall back to null when the conversion throws.
    for (const entry of located) {
        if (typeof entry.physicalIndex !== 'string') {
            continue;
        }
        try {
            entry.physicalIndex = convertPhysicalIndexToInt(entry.physicalIndex);
        }
        catch {
            entry.physicalIndex = null;
        }
    }

    // Align located indices with the TOC entries and derive the page offset.
    const resolved = resolvePhysicalIndices(tocItems, located);
    const offset = calculatePageOffset(extractMatchingPagePairs(tocItems, resolved));

    const result = addPageOffsetToTocJson(tocItems, offset);
    // Fill entries that still have no physicalIndex after the offset step.
    processNonePageNumbers(result, resolved);
    return result;
}
|
|
50
|
+
/**
 * Collects `{ page, physicalIndex }` pairs for every TOC position where both
 * the entry's `page` and the positionally matching resolved physical index
 * are present (neither null nor undefined).
 *
 * @param tocItems                TOC entries; only `page` is read.
 * @param resolvedPhysicalIndices Values aligned by index with `tocItems`.
 * @returns Pairs in TOC order; positions missing either value are skipped.
 */
function extractMatchingPagePairs(tocItems, resolvedPhysicalIndices) {
    const matched = [];
    for (const [position, entry] of tocItems.entries()) {
        const page = entry.page;
        const physicalIndex = resolvedPhysicalIndices[position];
        if (page != null && physicalIndex != null) {
            matched.push({ page, physicalIndex });
        }
    }
    return matched;
}
|
|
61
|
+
/**
 * Computes the offset between printed page numbers and physical page indices
 * as the most frequent value of `physicalIndex - page` across the pairs.
 *
 * Differences that are not finite numbers (for example when `page` is a
 * non-numeric string such as a roman numeral) are ignored. Without this, all
 * NaN differences would share one Map key, could win the vote, and would turn
 * every computed physical index into NaN.
 *
 * @param pairs Array of `{ page, physicalIndex }` objects.
 * @returns The most common finite difference; on a tie the difference that
 *          was encountered first wins. Returns 0 when there is no usable pair.
 */
function calculatePageOffset(pairs) {
    const tally = new Map();
    for (const pair of pairs) {
        const diff = pair.physicalIndex - pair.page;
        if (!Number.isFinite(diff)) {
            // Unusable pair: skip it instead of letting NaN take part in the vote.
            continue;
        }
        tally.set(diff, (tally.get(diff) ?? 0) + 1);
    }

    let bestCount = 0;
    let mode = 0;
    // Map iteration follows insertion order, so a strict '>' keeps the first
    // difference seen when several share the highest count.
    for (const [diff, count] of tally) {
        if (count > bestCount) {
            bestCount = count;
            mode = diff;
        }
    }
    return mode;
}
|
|
79
|
+
/**
 * Returns a copy of the TOC where each entry's `physicalIndex` is derived
 * from its printed `page` plus `offset`.
 *
 * `page` is normalised before the addition: a finite number is used as is and
 * a string of decimal digits is parsed. This prevents the `+` operator from
 * concatenating when the page arrives as a string (`'12' + 2` would be
 * `'122'`). Any other value (missing, NaN, non-numeric text) is treated as
 * "no page": the entry keeps its existing `physicalIndex`, or gets null.
 *
 * The input array and its entries are not mutated; all other fields,
 * including the original `page` value, are copied through unchanged.
 *
 * @param toc    TOC entries.
 * @param offset Value added to each usable page number.
 * @returns New array of entries with `physicalIndex` set.
 */
function addPageOffsetToTocJson(toc, offset) {
    // Converts a raw page value to a finite number, or null when unusable.
    const toPageNumber = (page) => {
        if (typeof page === 'number') {
            return Number.isFinite(page) ? page : null;
        }
        if (typeof page === 'string' && /^\d+$/.test(page.trim())) {
            return Number.parseInt(page.trim(), 10);
        }
        return null;
    };

    return toc.map((item) => {
        const page = toPageNumber(item.page);
        return {
            ...item,
            physicalIndex: page != null ? page + offset : item.physicalIndex ?? null,
        };
    });
}
|
|
85
|
+
/**
 * Fills in missing physical indices in place.
 *
 * For each position, an entry whose `physicalIndex` is null or undefined is
 * assigned the value at the same position in `resolvedPhysicalIndices`,
 * provided that value is itself neither null nor undefined. Entries that
 * already have a `physicalIndex` are left untouched.
 *
 * @param result                  TOC entries; mutated in place.
 * @param resolvedPhysicalIndices Fallback values aligned by index with `result`.
 */
function processNonePageNumbers(result, resolvedPhysicalIndices) {
    for (const [position, entry] of result.entries()) {
        const fallback = resolvedPhysicalIndices[position];
        const needsValue = entry.physicalIndex == null;
        if (needsValue && fallback != null) {
            entry.physicalIndex = fallback;
        }
    }
}
|
|
92
|
+
/**
 * Aligns the physical indices found by the extractor with the TOC entries.
 *
 * Two lookups are built from `withPhysical`, using only entries whose
 * `physicalIndex` is a number:
 *  - by `structure` (a later entry with the same structure overwrites an
 *    earlier one), and
 *  - by `title`, as a FIFO queue so repeated titles are handed out in order.
 *
 * Each TOC entry is then resolved by structure first; on a structure hit the
 * matching value is also removed from the title queue so that a later
 * title-only fallback does not reuse it. Otherwise the head of the title
 * queue is taken. Entries that match neither way resolve to null.
 *
 * @param tocItems     TOC entries (`structure` and `title` are read).
 * @param withPhysical Extractor output (`structure`, `title`, `physicalIndex`).
 * @returns Array aligned by index with `tocItems`: a number or null per entry.
 */
function resolvePhysicalIndices(tocItems, withPhysical) {
    const indexByStructure = new Map();
    const pendingByTitle = new Map();

    withPhysical.forEach((entry) => {
        const physical = entry.physicalIndex;
        // null/undefined and non-numeric values are all rejected by this check.
        if (typeof physical !== 'number') {
            return;
        }
        if (entry.structure) {
            indexByStructure.set(entry.structure, physical);
        }
        if (entry.title) {
            if (!pendingByTitle.has(entry.title)) {
                pendingByTitle.set(entry.title, []);
            }
            pendingByTitle.get(entry.title).push(physical);
        }
    });

    const resolved = [];
    for (const { structure, title } of tocItems) {
        if (structure && indexByStructure.has(structure)) {
            const hit = indexByStructure.get(structure) ?? null;
            resolved.push(hit);
            // Keep the title queue consistent with structure-based matches.
            consumeTitleQueueValue(pendingByTitle, title, hit);
            continue;
        }
        const pending = title ? pendingByTitle.get(title) : undefined;
        const hasPending = Boolean(pending) && pending.length > 0;
        resolved.push(hasPending ? pending.shift() ?? null : null);
    }
    return resolved;
}
|
|
132
|
+
/**
 * Removes one value from the queue stored under `title`, if there is one.
 *
 * Nothing happens when `title` is falsy, `value` is null/undefined, or the
 * title has no non-empty queue. Otherwise the first occurrence of `value` is
 * removed; when the queue does not contain `value`, its head is dropped
 * instead. A queue that becomes empty is deleted from the map.
 *
 * @param byTitleQueue Map from title to an array of pending values; mutated.
 * @param title        Title whose queue should be consumed.
 * @param value        Value that was just used elsewhere.
 */
function consumeTitleQueueValue(byTitleQueue, title, value) {
    if (title && value != null) {
        const pending = byTitleQueue.get(title);
        if (pending && pending.length !== 0) {
            const foundAt = pending.indexOf(value);
            // Remove the matching value, or the head when the value is absent.
            pending.splice(foundAt < 0 ? 0 : foundAt, 1);
            if (pending.length === 0) {
                byTitleQueue.delete(title);
            }
        }
    }
}
|
|
151
|
+
//# sourceMappingURL=toc-with-pages.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-with-pages.js","sourceRoot":"","sources":["../../src/processing/toc-with-pages.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAC9E,OAAO,EAAE,yBAAyB,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAEzF;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,yBAAyB,CAC7C,UAAkB,EAClB,QAAiC,EACjC,WAAqB,EACrB,SAAoB;IAEpB,gCAAgC;IAChC,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,UAAU,EAAE,SAAS,CAAC,CAAC;IAE7D,8CAA8C;IAC9C,MAAM,SAAS,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC,CAAC;QACxD,SAAS;QACT,KAAK;KACN,CAAC,CAAC,CAAC;IAEJ,6CAA6C;IAC7C,MAAM,YAAY,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC;QACzC,CAAC,CAAC,WAAW,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC;QACzC,CAAC,CAAC,CAAC,CAAC;IACN,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;IAC/C,MAAM,aAAa,GAAG,oBAAoB,CAAC,SAAS,EAAE,YAAY,GAAG,CAAC,CAAC,CAAC;IAExE,mCAAmC;IACnC,MAAM,YAAY,GAAG,MAAM,iBAAiB,CAC1C,SAAsB,EAAE,aAAa,EAAE,SAAS,CACjD,CAAC;IAEF,2EAA2E;IAC3E,KAAK,MAAM,IAAI,IAAI,YAAY,EAAE,CAAC;QAChC,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI,IAAI,OAAO,IAAI,CAAC,aAAa,KAAK,QAAQ,EAAE,CAAC;YACzE,IAAI,CAAC;gBACH,IAAI,CAAC,aAAa,GAAG,yBAAyB,CAAC,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC;YAC7E,CAAC;YAAC,MAAM,CAAC;gBACP,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;YAC5B,CAAC;QACH,CAAC;IACH,CAAC;IAED,2CAA2C;IAC3C,MAAM,uBAAuB,GAAG,sBAAsB,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAC/E,MAAM,KAAK,GAAG,wBAAwB,CAAC,QAAQ,EAAE,uBAAuB,CAAC,CAAC;IAC1E,MAAM,MAAM,GAAG,mBAAmB,CAAC,KAAK,CAAC,CAAC;IAE1C,uBAAuB;IACvB,MAAM,MAAM,GAAG,sBAAsB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAExD,kDAAkD;IAClD,sBAAsB,CAAC,MAAM,EAAE,uBAAuB,CAAC,CAAC;IAExD,OAAO,MAAM,CAAC;AAChB,CAAC;AAOD,SAAS,wBAAwB,CAC/B,QAAmB,EACnB,uBAA6C;IAE7C,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACzC,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QAC9B,MAAM,aAAa,GAAG,uBAAuB,CAAC,CAAC,CAAC,CAAC;QACjD,IAAI,IAAI,IAAI,IAAI,IAAI,aAAa,IAAI,IAAI;YAAE,SAAS;QACpD,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,aAAa,EAAE,CAA
C,CAAC;IACtC,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,mBAAmB,CAAC,KAAiB;IAC5C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAEjC,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;IACzD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1C,CAAC;IAED,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;QACnC,IAAI,KAAK,GAAG,QAAQ,EAAE,CAAC;YACrB,QAAQ,GAAG,KAAK,CAAC;YACjB,IAAI,GAAG,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,sBAAsB,CAAC,GAAc,EAAE,MAAc;IAC5D,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACxB,GAAG,IAAI;QACP,aAAa,EAAE,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,IAAI,IAAI;KACnF,CAAC,CAAC,CAAC;AACN,CAAC;AAED,SAAS,sBAAsB,CAC7B,MAAiB,EACjB,uBAA6C;IAE7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,aAAa,IAAI,IAAI,IAAI,uBAAuB,CAAC,CAAC,CAAC,IAAI,IAAI,EAAE,CAAC;YAC1E,MAAM,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,uBAAuB,CAAC,CAAC,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,sBAAsB,CAC7B,QAAmB,EACnB,YAAuB;IAEvB,MAAM,WAAW,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC9C,MAAM,YAAY,GAAG,IAAI,GAAG,EAAoB,CAAC;IAEjD,KAAK,MAAM,IAAI,IAAI,YAAY,EAAE,CAAC;QAChC,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI;YAAE,SAAS;QACzC,IAAI,OAAO,IAAI,CAAC,aAAa,KAAK,QAAQ;YAAE,SAAS;QAErD,qEAAqE;QACrE,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YACnB,WAAW,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;QACtD,CAAC;QAED,wEAAwE;QACxE,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC;QACzB,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YAC5C,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YAC/B,YAAY,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAyB,EAAE,CAAC;IAC1C,KAAK,MAAM,
OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,IAAI,OAAO,CAAC,SAAS,IAAI,WAAW,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;YAC5D,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,CAAC,IAAI,IAAI,CAAC;YAC3D,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACvB,0EAA0E;YAC1E,2EAA2E;YAC3E,sBAAsB,CAAC,YAAY,EAAE,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;YAC7D,SAAS;QACX,CAAC;QAED,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QAC5B,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAC1D,IAAI,KAAK,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9B,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,IAAI,IAAI,CAAC,CAAC;YACrC,SAAS;QACX,CAAC;QAED,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtB,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,sBAAsB,CAC7B,YAAmC,EACnC,KAAyB,EACzB,KAAoB;IAEpB,IAAI,CAAC,KAAK,IAAI,KAAK,IAAI,IAAI;QAAE,OAAO;IACpC,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IACtC,IAAI,CAAC,KAAK,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAEzC,6EAA6E;IAC7E,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACnC,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;QACf,KAAK,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACzB,CAAC;SAAM,CAAC;QACN,iDAAiD;QACjD,KAAK,CAAC,KAAK,EAAE,CAAC;IAChB,CAAC;IAED,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,YAAY,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAC7B,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/toc/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAClF,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/toc/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAClF,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { TocCheckResult, Logger } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
 * Entry point of TOC detection.
 *
 * Scans from the first page for TOC pages, extracts the TOC text and checks
 * whether it contains page numbers. When it does not, scanning continues
 * after the pages already found, looking for further TOC pages that do.
 *
 * @param pageList        Document pages; only `text` is required.
 * @param tocCheckPageNum Page-index limit for the search.
 * @param llmClient       Client used for the LLM calls.
 * @param _logger         Logger; the underscore prefix suggests it is
 *                        currently unused.
 * @returns The TOC check result.
 */
export declare function checkToc(
    pageList: { text: string }[],
    tocCheckPageNum: number,
    llmClient: LlmClient,
    _logger: Logger,
): Promise<TocCheckResult>;
|
|
13
|
+
/**
 * Walks the pages one by one from `startPageIndex` and collects TOC pages.
 *
 * @param startPageIndex  Index in `pageList` where scanning starts.
 * @param pageList        Document pages; only `text` is required.
 * @param tocCheckPageNum Page-index limit for the search.
 * @param llmClient       Client used for the per-page detection call.
 * @returns Indices of the pages identified as TOC pages.
 */
export declare function findTocPages(
    startPageIndex: number,
    pageList: { text: string }[],
    tocCheckPageNum: number,
    llmClient: LlmClient,
): Promise<number[]>;
|
|
19
|
+
/**
 * Decides whether one page contains a table of contents.
 *
 * @param content   Text of the page to classify.
 * @param llmClient Client used for the detection call.
 * @returns `'yes'` when a TOC was detected, otherwise `'no'`.
 */
export declare function tocDetectorSinglePage(
    content: string,
    llmClient: LlmClient,
): Promise<'yes' | 'no'>;
|
|
23
|
+
//# sourceMappingURL=toc-detector.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-detector.d.ts","sourceRoot":"","sources":["../../src/toc/toc-detector.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAC1D,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAIjD;;;;;;GAMG;AACH,wBAAsB,QAAQ,CAC5B,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,eAAe,EAAE,MAAM,EACvB,SAAS,EAAE,SAAS,EACpB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,cAAc,CAAC,CAgCzB;AAED;;GAEG;AACH,wBAAsB,YAAY,CAChC,cAAc,EAAE,MAAM,EACtB,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,eAAe,EAAE,MAAM,EACvB,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,MAAM,EAAE,CAAC,CAqBnB;AAED;;GAEG;AACH,wBAAsB,qBAAqB,CACzC,OAAO,EAAE,MAAM,EACf,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,KAAK,GAAG,IAAI,CAAC,CAMvB"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { tocDetectorPrompt } from '../llm/prompts.js';
|
|
3
|
+
import { tocExtractor } from './toc-extractor.js';
|
|
4
|
+
/**
 * Entry point of TOC detection.
 *
 * Flow:
 *  1. Scan from page 0 for TOC pages. If none are found, return an empty
 *     result (`tocContent: null`, no pages, `pageIndexGivenInToc: false`).
 *  2. Extract the TOC from those pages; return it when it has page numbers.
 *  3. Otherwise keep scanning after the last TOC page found, while the cursor
 *     is inside both the document and the first `tocCheckPageNum` pages.
 *     Each additional group is extracted together with the initial TOC pages
 *     and returned as soon as page numbers are detected.
 *  4. If no attempt yields page numbers, return the initial extraction.
 *
 * @param pageList        Document pages (objects with a `text` field).
 * @param tocCheckPageNum Page-index limit for the search.
 * @param llmClient       Client forwarded to the detection/extraction helpers.
 * @param _logger         Not used by this function.
 * @returns Object with `tocContent`, `tocPageList` and `pageIndexGivenInToc`.
 */
export async function checkToc(pageList, tocCheckPageNum, llmClient, _logger) {
    const initialPages = await findTocPages(0, pageList, tocCheckPageNum, llmClient);
    if (initialPages.length === 0) {
        return { tocContent: null, tocPageList: [], pageIndexGivenInToc: false };
    }

    const initialResult = await tocExtractor(pageList, initialPages, llmClient);
    if (initialResult.pageIndexGivenInToc) {
        return initialResult;
    }

    // Look further into the document for TOC pages that do carry page numbers.
    let cursor = initialPages[initialPages.length - 1] + 1;
    while (cursor < pageList.length && cursor < tocCheckPageNum) {
        const extraPages = await findTocPages(cursor, pageList, tocCheckPageNum, llmClient);
        if (extraPages.length === 0) {
            break;
        }
        // Always combined with the initial TOC pages, not with earlier extras.
        const combinedPages = initialPages.concat(extraPages);
        const candidate = await tocExtractor(pageList, combinedPages, llmClient);
        if (candidate.pageIndexGivenInToc) {
            return candidate;
        }
        cursor = extraPages[extraPages.length - 1] + 1;
    }

    // No variant had page numbers: fall back to the first extraction.
    return initialResult;
}
|
|
36
|
+
/**
 * Walks the pages one by one from `startPageIndex` and collects the indices
 * of pages classified as TOC pages.
 *
 * Scanning stops when the page list is exhausted, when the index reaches
 * `tocCheckPageNum` before any TOC page has been seen, or at the first
 * non-TOC page after at least one TOC page. Once a TOC page has been found,
 * the `tocCheckPageNum` limit no longer stops the scan.
 *
 * @param startPageIndex  Index in `pageList` where scanning starts.
 * @param pageList        Document pages (objects with a `text` field).
 * @param tocCheckPageNum Page-index limit that applies until a TOC page is found.
 * @param llmClient       Client forwarded to `tocDetectorSinglePage`.
 * @returns Indices of the detected TOC pages, in ascending order.
 */
export async function findTocPages(startPageIndex, pageList, tocCheckPageNum, llmClient) {
    const found = [];
    let seenTocPage = false;
    for (let pageIdx = startPageIndex; pageIdx < pageList.length; pageIdx++) {
        if (!seenTocPage && pageIdx >= tocCheckPageNum) {
            break;
        }
        const verdict = await tocDetectorSinglePage(pageList[pageIdx].text, llmClient);
        if (verdict === 'yes') {
            found.push(pageIdx);
            seenTocPage = true;
            continue;
        }
        if (verdict === 'no' && seenTocPage) {
            // First non-TOC page after a TOC page: the TOC has ended.
            break;
        }
    }
    return found;
}
|
|
58
|
+
/**
 * Decides whether one page contains a table of contents.
 *
 * Sends the prompt built by `tocDetectorPrompt` as a single user message via
 * `llmClient.chatJson` and inspects the `toc_detected` field of the reply.
 *
 * @param content   Text of the page to classify.
 * @param llmClient Client providing `chatJson`.
 * @returns `'yes'` only when `toc_detected` is exactly `'yes'`; any other
 *          value maps to `'no'`.
 */
export async function tocDetectorSinglePage(content, llmClient) {
    const messages = [{ role: 'user', content: tocDetectorPrompt(content) }];
    const reply = await llmClient.chatJson(messages);
    if (reply.toc_detected === 'yes') {
        return 'yes';
    }
    return 'no';
}
|
|
65
|
+
//# sourceMappingURL=toc-detector.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-detector.js","sourceRoot":"","sources":["../../src/toc/toc-detector.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,QAAiC,EACjC,eAAuB,EACvB,SAAoB,EACpB,OAAe;IAEf,MAAM,WAAW,GAAG,MAAM,YAAY,CAAC,CAAC,EAAE,QAAQ,EAAE,eAAe,EAAE,SAAS,CAAC,CAAC;IAEhF,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC7B,OAAO,EAAE,UAAU,EAAE,IAAI,EAAE,WAAW,EAAE,EAAE,EAAE,mBAAmB,EAAE,KAAK,EAAE,CAAC;IAC3E,CAAC;IAED,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC,QAAQ,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC;IAEpE,IAAI,MAAM,CAAC,mBAAmB,EAAE,CAAC;QAC/B,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,yDAAyD;IACzD,IAAI,QAAQ,GAAG,WAAW,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;IACvD,OAAO,QAAQ,GAAG,QAAQ,CAAC,MAAM,IAAI,QAAQ,GAAG,eAAe,EAAE,CAAC;QAChE,MAAM,YAAY,GAAG,MAAM,YAAY,CACrC,QAAQ,EAAE,QAAQ,EAAE,eAAe,EAAE,SAAS,CAC/C,CAAC;QACF,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;YAAE,MAAM;QAErC,MAAM,WAAW,GAAG,CAAC,GAAG,WAAW,EAAE,GAAG,YAAY,CAAC,CAAC;QACtD,MAAM,SAAS,GAAG,MAAM,YAAY,CAAC,QAAQ,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC;QACvE,IAAI,SAAS,CAAC,mBAAmB,EAAE,CAAC;YAClC,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,QAAQ,GAAG,YAAY,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;IACvD,CAAC;IAED,kCAAkC;IAClC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,cAAsB,EACtB,QAAiC,EACjC,eAAuB,EACvB,SAAoB;IAEpB,IAAI,aAAa,GAAG,KAAK,CAAC;IAC1B,MAAM,WAAW,GAAa,EAAE,CAAC;IACjC,IAAI,CAAC,GAAG,cAAc,CAAC;IAEvB,OAAO,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC;QAC3B,IAAI,CAAC,IAAI,eAAe,IAAI,CAAC,aAAa;YAAE,MAAM;QAElD,MAAM,MAAM,GAAG,MAAM,qBAAqB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;QAExE,IAAI,MAAM,KAAK,KAAK,EAAE,CAAC;YACrB,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACpB,aAAa,GAAG,IAAI,CAAC;QACvB,CAAC;aAAM,IAAI,MAAM,KAAK,IAAI,IAAI,aAAa,EAAE,CAAC;YAC5C,MAAM,CAAC,YAAY;QACrB,CAAC;QAED,CAAC,EAAE,CAAC;IACN,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,OAAe,EACf,SAAoB;IAE
pB,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,QAAQ,CAEpC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,iBAAiB,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC;IAE5D,OAAO,MAAM,CAAC,YAAY,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;AACtD,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { TocCheckResult } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
 * Extracts the TOC text from the given TOC pages and reports whether it
 * contains page numbers.
 *
 * @param pageList    Document pages; only `text` is required.
 * @param tocPageList Indices into `pageList` of the pages holding the TOC.
 * @param llmClient   Client used for the LLM calls.
 * @returns The TOC check result.
 */
export declare function tocExtractor(
    pageList: { text: string }[],
    tocPageList: number[],
    llmClient: LlmClient,
): Promise<TocCheckResult>;
|
|
9
|
+
/**
 * Reports whether the given TOC text contains page numbers.
 *
 * @param tocContent TOC text to inspect.
 * @param llmClient  Client used for the detection call.
 * @returns `true` when page numbers were detected, otherwise `false`.
 */
export declare function detectPageIndex(
    tocContent: string,
    llmClient: LlmClient,
): Promise<boolean>;
|
|
13
|
+
//# sourceMappingURL=toc-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-extractor.d.ts","sourceRoot":"","sources":["../../src/toc/toc-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAClD,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAOjD;;GAEG;AACH,wBAAsB,YAAY,CAChC,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,WAAW,EAAE,MAAM,EAAE,EACrB,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,cAAc,CAAC,CAyBzB;AAED;;GAEG;AACH,wBAAsB,eAAe,CACnC,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,OAAO,CAAC,CAMlB"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { extractTocContentPrompt, EXTRACT_TOC_CONTINUE_PROMPT, detectPageIndexPrompt, } from '../llm/prompts.js';
|
|
3
|
+
/**
 * Builds the table-of-contents text for a document and reports whether that
 * text carries page numbers.
 *
 * Steps, in order:
 *  1. Join the `text` of every page referenced by `tocPageList` with newlines.
 *  2. Replace dot leaders (runs of five or more dots, with or without single
 *     spaces between them) by `': '`.
 *  3. Ask the LLM to extract the TOC via `chatWithContinuation`.
 *  4. Run `detectPageIndex` on the extracted text.
 *
 * @param pageList    Document pages; only the `text` property is read.
 * @param tocPageList Indices into `pageList` identifying the TOC pages.
 * @param llmClient   Client exposing `chatWithContinuation` (and `chatJson`,
 *                    used by `detectPageIndex`).
 * @returns `{ tocContent, tocPageList, pageIndexGivenInToc }`; `tocPageList`
 *          is the same array that was passed in.
 */
export async function tocExtractor(pageList, tocPageList, llmClient) {
    // Dot leaders such as "Chapter 1 ........ 12" or "Chapter 1 . . . . . . 12".
    const solidLeader = /\.{5,}/g;
    const spacedLeader = /(?:\. ){5,}\.?/g;
    const joinedPages = tocPageList.map((pageIdx) => pageList[pageIdx].text).join('\n');
    const withoutLeaders = joinedPages
        .replace(solidLeader, ': ')
        .replace(spacedLeader, ': ');
    // Single user message; the continuation prompt is handed to the client as-is.
    const messages = [{ role: 'user', content: extractTocContentPrompt(withoutLeaders) }];
    const tocContent = await llmClient.chatWithContinuation(messages, EXTRACT_TOC_CONTINUE_PROMPT);
    const pageIndexGivenInToc = await detectPageIndex(tocContent, llmClient);
    return { tocContent, tocPageList, pageIndexGivenInToc };
}
|
|
25
|
+
/**
 * Asks the LLM whether the given TOC text contains page numbers.
 *
 * @param tocContent TOC text to inspect.
 * @param llmClient  Client exposing `chatJson`.
 * @returns `true` only when the JSON reply's `page_index_given_in_toc` field
 *          is exactly the string `'yes'`.
 */
export async function detectPageIndex(tocContent, llmClient) {
    const prompt = detectPageIndexPrompt(tocContent);
    const reply = await llmClient.chatJson([{ role: 'user', content: prompt }]);
    const verdict = reply.page_index_given_in_toc;
    return verdict === 'yes';
}
|
|
32
|
+
//# sourceMappingURL=toc-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-extractor.js","sourceRoot":"","sources":["../../src/toc/toc-extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EACL,uBAAuB,EACvB,2BAA2B,EAC3B,qBAAqB,GACtB,MAAM,mBAAmB,CAAC;AAE3B;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,QAAiC,EACjC,WAAqB,EACrB,SAAoB;IAEpB,6BAA6B;IAC7B,IAAI,aAAa,GAAG,WAAW;SAC5B,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC;SAChC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,sCAAsC;IACtC,aAAa,GAAG,aAAa;SAC1B,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC;SACxB,OAAO,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAC;IAEpC,oCAAoC;IACpC,MAAM,UAAU,GAAG,MAAM,SAAS,CAAC,oBAAoB,CACrD,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,uBAAuB,CAAC,aAAa,CAAC,EAAE,CAAC,EACnE,2BAA2B,CAC5B,CAAC;IAEF,qCAAqC;IACrC,MAAM,YAAY,GAAG,MAAM,eAAe,CAAC,UAAU,EAAE,SAAS,CAAC,CAAC;IAElE,OAAO;QACL,UAAU;QACV,WAAW;QACX,mBAAmB,EAAE,YAAY;KAClC,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAkB,EAClB,SAAoB;IAEpB,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,QAAQ,CAEpC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,qBAAqB,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC;IAEnE,OAAO,MAAM,CAAC,uBAAuB,KAAK,KAAK,CAAC;AAClD,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { TocItem } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
|
|
4
|
+
* Transforms TOC text into structured JSON with structure/title/page.
|
|
5
|
+
*/
|
|
6
|
+
export declare function tocTransformer(tocContent: string, llmClient: LlmClient): Promise<TocItem[]>;
|
|
7
|
+
/**
|
|
8
|
+
* Extracts physical page indices by matching TOC entries against document pages.
|
|
9
|
+
*/
|
|
10
|
+
export declare function tocIndexExtractor(toc: TocItem[], content: string, llmClient: LlmClient): Promise<TocItem[]>;
|
|
11
|
+
//# sourceMappingURL=toc-transformer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-transformer.d.ts","sourceRoot":"","sources":["../../src/toc/toc-transformer.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AASjD;;GAEG;AACH,wBAAsB,cAAc,CAClC,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,OAAO,EAAE,CAAC,CAapB;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CACrC,GAAG,EAAE,OAAO,EAAE,EACd,OAAO,EAAE,MAAM,EACf,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,OAAO,EAAE,CAAC,CA0CpB"}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { tocTransformerPrompt, TOC_TRANSFORMER_CONTINUE_PROMPT, tocIndexExtractorPrompt, } from '../llm/prompts.js';
|
|
3
|
+
import { extractJson } from '../utils/json-parser.js';
|
|
4
|
+
import { convertPhysicalIndexToInt } from '../utils/page-utils.js';
|
|
5
|
+
/**
 * Transforms TOC text into structured JSON with structure/title/page.
 *
 * The LLM reply is parsed with `extractJson` and accepted in two shapes:
 *  - a bare array, returned as-is;
 *  - an object whose `table_of_contents` property is an array.
 * Anything else (primitives, `null`, objects without an array under
 * `table_of_contents`) yields an empty list.
 *
 * @param tocContent TOC text to transform.
 * @param llmClient  Client exposing `chatWithContinuation`.
 * @returns The parsed TOC entries, or `[]` when the reply has no usable list.
 */
export async function tocTransformer(tocContent, llmClient) {
    const raw = await llmClient.chatWithContinuation([{ role: 'user', content: tocTransformerPrompt(tocContent) }], TOC_TRANSFORMER_CONTINUE_PROMPT);
    const parsed = extractJson(raw);
    if (Array.isArray(parsed))
        return parsed;
    // Only probe real objects: the `in` operator throws a TypeError when its
    // right-hand side is a primitive such as a string or a number.
    if (parsed !== null && typeof parsed === 'object') {
        const wrapped = parsed.table_of_contents;
        // Callers expect an array; ignore a wrapper holding anything else.
        if (Array.isArray(wrapped))
            return wrapped;
    }
    return [];
}
|
|
17
|
+
/**
 * Extracts physical page indices by matching TOC entries against document pages.
 *
 * Sends the TOC (reduced to `structure` and `title`) together with `content`
 * to the LLM and normalises each entry of the JSON reply:
 *  - `structure` and `title` are coerced to strings (`null`/`undefined` → `''`);
 *  - the page is read from `physicalIndex`, falling back to `physical_index`,
 *    and passed through `normalizePhysicalIndex`;
 *  - entries that are not objects, or that end up with an empty title, are
 *    dropped.
 *
 * @param toc       TOC entries; only `structure` and `title` are sent to the LLM.
 * @param content   Document text handed to the prompt builder.
 * @param llmClient Client exposing `chatJson`.
 * @returns Normalised `{ structure, title, physicalIndex }` entries, or `[]`
 *          when the reply is not an array.
 */
export async function tocIndexExtractor(toc, content, llmClient) {
    // Remove page field for matching
    const tocNoPage = toc.map(({ structure, title }) => ({
        structure,
        title,
    }));
    const result = await llmClient.chatJson([{
            role: 'user',
            content: tocIndexExtractorPrompt(JSON.stringify(tocNoPage), content),
        }]);
    if (!Array.isArray(result))
        return [];
    const extracted = [];
    for (const entry of result) {
        // LLM output is untrusted: a null or primitive element would make the
        // property reads below throw (null) or yield meaningless values.
        if (entry === null || typeof entry !== 'object')
            continue;
        const structure = typeof entry['structure'] === 'string'
            ? entry['structure']
            : String(entry['structure'] ?? '');
        const title = typeof entry['title'] === 'string'
            ? entry['title']
            : String(entry['title'] ?? '');
        if (!title)
            continue;
        const rawPhysicalIndex = entry['physicalIndex'] ?? entry['physical_index'];
        extracted.push({
            structure,
            title,
            physicalIndex: normalizePhysicalIndex(rawPhysicalIndex),
        });
    }
    return extracted;
}
|
|
53
|
+
/**
 * Coerces a raw physical-index value from LLM output into a usable value.
 *
 * - finite numbers are returned unchanged; `NaN` and ±Infinity become `null`;
 * - strings are delegated to `convertPhysicalIndexToInt`; if that call throws,
 *   the result is `null`;
 * - every other input (including `null` and `undefined`) becomes `null`.
 *
 * @param value Raw value read from an LLM reply entry.
 * @returns The normalised index, or `null` when it cannot be determined.
 */
function normalizePhysicalIndex(value) {
    switch (typeof value) {
        case 'number':
            return Number.isFinite(value) ? value : null;
        case 'string':
            try {
                return convertPhysicalIndexToInt(value);
            }
            catch {
                return null;
            }
        default:
            // Covers null (typeof 'object'), undefined, booleans, objects, etc.
            return null;
    }
}
|
|
69
|
+
//# sourceMappingURL=toc-transformer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-transformer.js","sourceRoot":"","sources":["../../src/toc/toc-transformer.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EACL,oBAAoB,EACpB,+BAA+B,EAC/B,uBAAuB,GACxB,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,yBAAyB,EAAE,MAAM,wBAAwB,CAAC;AAEnE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,UAAkB,EAClB,SAAoB;IAEpB,MAAM,GAAG,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAC9C,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,oBAAoB,CAAC,UAAU,CAAC,EAAE,CAAC,EAC7D,+BAA+B,CAChC,CAAC;IAEF,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,CAEjB,CAAC;IAEd,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IACzC,IAAI,MAAM,IAAI,mBAAmB,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC,iBAAiB,CAAC;IAC7E,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,GAAc,EACd,OAAe,EACf,SAAoB;IAEpB,iCAAiC;IACjC,MAAM,SAAS,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC,CAAC;QACnD,SAAS;QACT,KAAK;KACN,CAAC,CAAC,CAAC;IAEJ,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,QAAQ,CACrC,CAAC;YACC,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE,uBAAuB,CAC9B,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,EACzB,OAAO,CACR;SACF,CAAC,CACH,CAAC;IAEF,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC;QAAE,OAAO,EAAE,CAAC;IAEtC,OAAO,MAAM;SACV,GAAG,CAAC,CAAC,GAAG,EAAkB,EAAE;QAC3B,MAAM,IAAI,GAAG,GAA8B,CAAC;QAE5C,MAAM,SAAS,GAAG,OAAO,IAAI,CAAC,WAAW,CAAC,KAAK,QAAQ;YACrD,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC;YACnB,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC;QACpC,MAAM,KAAK,GAAG,OAAO,IAAI,CAAC,OAAO,CAAC,KAAK,QAAQ;YAC7C,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC;YACf,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;QAEhC,MAAM,gBAAgB,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,IAAI,CAAC,gBAAgB,CAAC,CAAC;QACzE,MAAM,aAAa,GAAG,sBAAsB,CAAC,gBAAgB,CAAC,CAAC;QAE/D,IAAI,CAAC,KAAK;YAAE,OAAO,IAAI,CAAC;QAExB,OAAO;YACL,SAAS;YACT,KAAK;YACL,aAAa;SACd,CAAC;IACJ,CAAC,CAAC;SACD,MAAM,CAAC,CAAC,CAAC,EAAgB,EAAE,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC;AAC5C,CAAC;AAED,SAAS,sBAAsB,CAAC,KAAc;IAC5C,IAAI,KAAK,IAAI,IAAI;QAAE,OAAO,IAAI,CAAC;IAC/B,IAAI,OAAO,KAAK,K
AAK,QAAQ,EAAE,CAAC;QAC9B,OAAO,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;IAC/C,CAAC;IACD,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,IAAI,CAAC;YACH,OAAO,yBAAyB,CAAC,KAAK,CAAC,CAAC;QAC1C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/tree/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/tree/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { TreeNode, TocItem } from '../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Converts a flat TOC list to a nested tree structure.
|
|
4
|
+
* Uses the `structure` field (e.g., "1", "1.1", "1.2.3") to determine hierarchy.
|
|
5
|
+
*/
|
|
6
|
+
export declare function listToTree(data: TocItem[]): TreeNode[];
|
|
7
|
+
//# sourceMappingURL=list-to-tree.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"list-to-tree.d.ts","sourceRoot":"","sources":["../../src/tree/list-to-tree.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAErD;;;GAGG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,OAAO,EAAE,GAAG,QAAQ,EAAE,CA6BtD"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Converts a flat TOC list to a nested tree structure.
|
|
3
|
+
* Uses the `structure` field (e.g., "1", "1.1", "1.2.3") to determine hierarchy.
|
|
4
|
+
*/
|
|
5
|
+
export function listToTree(data) {
|
|
6
|
+
const nodes = new Map();
|
|
7
|
+
const rootNodes = [];
|
|
8
|
+
for (const item of data) {
|
|
9
|
+
const node = {
|
|
10
|
+
title: item.title,
|
|
11
|
+
startIndex: item.physicalIndex ?? undefined,
|
|
12
|
+
endIndex: undefined,
|
|
13
|
+
nodes: [],
|
|
14
|
+
};
|
|
15
|
+
nodes.set(item.structure, node);
|
|
16
|
+
const parts = item.structure.split('.');
|
|
17
|
+
if (parts.length > 1) {
|
|
18
|
+
const parentStructure = parts.slice(0, -1).join('.');
|
|
19
|
+
const parent = nodes.get(parentStructure);
|
|
20
|
+
if (parent) {
|
|
21
|
+
parent.nodes.push(node);
|
|
22
|
+
}
|
|
23
|
+
else {
|
|
24
|
+
rootNodes.push(node);
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
else {
|
|
28
|
+
rootNodes.push(node);
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
return rootNodes;
|
|
32
|
+
}
|
|
33
|
+
//# sourceMappingURL=list-to-tree.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"list-to-tree.js","sourceRoot":"","sources":["../../src/tree/list-to-tree.ts"],"names":[],"mappings":"AAEA;;;GAGG;AACH,MAAM,UAAU,UAAU,CAAC,IAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAoB,CAAC;IAC1C,MAAM,SAAS,GAAe,EAAE,CAAC;IAEjC,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QACxB,MAAM,IAAI,GAAa;YACrB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,UAAU,EAAE,IAAI,CAAC,aAAa,IAAI,SAAS;YAC3C,QAAQ,EAAE,SAAS;YACnB,KAAK,EAAE,EAAE;SACV,CAAC;QAEF,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;QAEhC,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACxC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrB,MAAM,eAAe,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACrD,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC;YAC1C,IAAI,MAAM,EAAE,CAAC;gBACX,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC1B,CAAC;iBAAM,CAAC;gBACN,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACvB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvB,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { TreeNode, TocItem } from '../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Converts a flat TOC list to a tree with page range calculation.
|
|
4
|
+
*
|
|
5
|
+
* Algorithm:
|
|
6
|
+
* 1. For each item, compute startIndex and endIndex based on physicalIndex
|
|
7
|
+
* and the next item's appearStart field.
|
|
8
|
+
* 2. Call listToTree to build the nested structure.
|
|
9
|
+
* 3. If tree is empty, return the original flat list as single-level nodes.
|
|
10
|
+
*/
|
|
11
|
+
export declare function postProcessing(structure: TocItem[], endPhysicalIndex: number): TreeNode[];
|
|
12
|
+
//# sourceMappingURL=post-processing.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"post-processing.d.ts","sourceRoot":"","sources":["../../src/tree/post-processing.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD;;;;;;;;GAQG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,OAAO,EAAE,EACpB,gBAAgB,EAAE,MAAM,GACvB,QAAQ,EAAE,CAkDZ"}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { listToTree } from './list-to-tree.js';
|
|
2
|
+
/**
 * Converts a flat TOC list to a tree with page range calculation.
 *
 * Algorithm:
 * 1. For each item that has a `physicalIndex`, compute `endIndex` from the
 *    nearest later item that also has a `physicalIndex`:
 *      - if that item's `appearStart` is `'yes'`, the range ends on the page
 *        before it; otherwise it ends on that item's page;
 *      - the result is never smaller than the item's own `physicalIndex`;
 *      - when no later item has a `physicalIndex`, the range ends at
 *        `endPhysicalIndex`.
 *    Items without a `physicalIndex` are skipped (a `null` value is rewritten
 *    to `undefined`).
 * 2. Call listToTree to build the nested structure.
 * 3. If tree is empty, return the original flat list as single-level nodes.
 *
 * Note: `structure` is mutated in place (`physicalIndex` normalisation and
 * `endIndex` assignment).
 *
 * @param structure        Flat TOC items in document order.
 * @param endPhysicalIndex Physical index used as the end of the last range.
 * @returns Tree nodes with `startIndex`/`endIndex` filled where known.
 */
export function postProcessing(structure, endPhysicalIndex) {
    if (structure.length === 0)
        return [];
    // Walk backwards so that `following` is always the nearest later item whose
    // physical page is known; un-located items in between are stepped over
    // instead of stretching the current range to the end of the document.
    let following;
    for (let i = structure.length - 1; i >= 0; i--) {
        const item = structure[i];
        item.physicalIndex = item.physicalIndex ?? undefined;
        if (item.physicalIndex == null)
            continue;
        if (following === undefined) {
            // Nothing located after this item: it runs to the end of the document.
            item.endIndex = endPhysicalIndex;
        }
        else {
            const boundary = following.appearStart === 'yes'
                ? following.physicalIndex - 1
                : following.physicalIndex;
            // Guard against an inverted range, e.g. when the next section is
            // reported as starting at the top of this item's own page.
            item.endIndex = Math.max(item.physicalIndex, boundary);
        }
        following = item;
    }
    const tree = listToTree(structure);
    if (tree.length === 0) {
        // Fallback: return flat list as single-level nodes
        return structure
            .filter((item) => item.physicalIndex != null)
            .map((item) => ({
            title: item.title,
            startIndex: item.physicalIndex,
            endIndex: item.endIndex ?? endPhysicalIndex,
            nodes: [],
        }));
    }
    // Apply endIndex from TocItem to TreeNode
    applyEndIndex(tree, structure);
    return tree;
}
|
|
58
|
+
function applyEndIndex(nodes, items) {
|
|
59
|
+
// Use title-only map as fallback, but prefer matching by title + physicalIndex
|
|
60
|
+
const titleOnlyMap = new Map();
|
|
61
|
+
for (const item of items) {
|
|
62
|
+
if (!titleOnlyMap.has(item.title)) {
|
|
63
|
+
titleOnlyMap.set(item.title, item);
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
for (const node of nodes) {
|
|
67
|
+
// Match by startIndex + title for precision (handles duplicate titles)
|
|
68
|
+
let matched;
|
|
69
|
+
for (const item of items) {
|
|
70
|
+
if (item.title === node.title && item.physicalIndex === node.startIndex) {
|
|
71
|
+
matched = item;
|
|
72
|
+
break;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
// Fall back to title-only lookup
|
|
76
|
+
if (!matched) {
|
|
77
|
+
matched = titleOnlyMap.get(node.title);
|
|
78
|
+
}
|
|
79
|
+
if (matched?.endIndex != null) {
|
|
80
|
+
node.endIndex = matched.endIndex;
|
|
81
|
+
}
|
|
82
|
+
if (node.nodes.length > 0) {
|
|
83
|
+
applyEndIndex(node.nodes, items);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
//# sourceMappingURL=post-processing.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"post-processing.js","sourceRoot":"","sources":["../../src/tree/post-processing.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAE/C;;;;;;;;GAQG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAoB,EACpB,gBAAwB;IAExB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEtC,kDAAkD;IAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;QAC1B,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,SAAS,CAAC;QAErD,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI;YAAE,SAAS;QAEzC,IAAI,CAAC,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7B,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YAClC,IAAI,QAAQ,CAAC,aAAa,IAAI,IAAI,EAAE,CAAC;gBACnC,IAAI,QAAQ,CAAC,WAAW,KAAK,KAAK,EAAE,CAAC;oBACnC,IAAI,CAAC,QAAQ,GAAI,QAAQ,CAAC,aAAwB,GAAG,CAAC,CAAC;gBACzD,CAAC;qBAAM,CAAC;oBACN,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC,aAAuB,CAAC;gBACnD,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,YAAY;YACZ,IAAI,CAAC,QAAQ,GAAG,gBAAgB,CAAC;QACnC,CAAC;IACH,CAAC;IAED,8CAA8C;IAC9C,KAAK,IAAI,CAAC,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/C,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,QAAQ,IAAI,IAAI,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,aAAa,IAAI,IAAI,EAAE,CAAC;YACxE,SAAS,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,gBAAgB,CAAC;QAC3C,CAAC;IACH,CAAC;IAED,MAAM,IAAI,GAAG,UAAU,CAAC,SAAS,CAAC,CAAC;IAEnC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,mDAAmD;QACnD,OAAO,SAAS;aACb,MAAM,CAAC,CAAC,IAAI,EAAmD,EAAE,CAAC,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC;aAC7F,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;YACd,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,UAAU,EAAE,IAAI,CAAC,aAAa;YAC9B,QAAQ,EAAE,IAAI,CAAC,QAAQ,IAAI,gBAAgB;YAC3C,KAAK,EAAE,EAAE;SACV,CAAC,CAAC,CAAC;IACR,CAAC;IAED,0CAA0C;IAC1C,aAAa,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;IAE/B,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,aAAa,CAAC,KAAiB,EAAE,KAAgB;IACxD,+EAA+E;IAC/E,MAAM,YAAY,GAAG,IAAI,GAAG,EAAmB,CAAC;IAChD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAClC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,CA
AC;QACrC,CAAC;IACH,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,uEAAuE;QACvE,IAAI,OAA4B,CAAC;QACjC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,aAAa,KAAK,IAAI,CAAC,UAAU,EAAE,CAAC;gBACxE,OAAO,GAAG,IAAI,CAAC;gBACf,MAAM;YACR,CAAC;QACH,CAAC;QACD,iCAAiC;QACjC,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,GAAG,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzC,CAAC;QACD,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI,EAAE,CAAC;YAC9B,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;QACnC,CAAC;QACD,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,aAAa,CAAC,IAAI,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { TreeNode } from '../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Recursively collects all nodes in the tree into a flat array.
|
|
4
|
+
*/
|
|
5
|
+
export declare function getNodes(tree: TreeNode[]): TreeNode[];
|
|
6
|
+
/**
|
|
7
|
+
* Recursively collects all leaf nodes (nodes with no children).
|
|
8
|
+
*/
|
|
9
|
+
export declare function getLeafNodes(tree: TreeNode[]): TreeNode[];
|
|
10
|
+
/**
|
|
11
|
+
* Converts a tree back to a flat list with structure indices.
|
|
12
|
+
*/
|
|
13
|
+
export declare function structureToList(tree: TreeNode[], prefix?: string): Array<{
|
|
14
|
+
structure: string;
|
|
15
|
+
title: string;
|
|
16
|
+
node: TreeNode;
|
|
17
|
+
}>;
|
|
18
|
+
//# sourceMappingURL=tree-utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tree-utils.d.ts","sourceRoot":"","sources":["../../src/tree/tree-utils.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE5C;;GAEG;AACH,wBAAgB,QAAQ,CAAC,IAAI,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE,CASrD;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE,CAUzD;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,IAAI,EAAE,QAAQ,EAAE,EAChB,MAAM,SAAK,GACV,KAAK,CAAC;IAAE,SAAS,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,QAAQ,CAAA;CAAE,CAAC,CAU7D"}
|