@fastrag/pageindex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +251 -0
- package/README.zh-CN.md +251 -0
- package/dist/errors/index.d.ts +10 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +19 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/internal-types/config.d.ts +35 -0
- package/dist/internal-types/config.d.ts.map +1 -0
- package/dist/internal-types/config.js +16 -0
- package/dist/internal-types/config.js.map +1 -0
- package/dist/internal-types/document-parser.d.ts +5 -0
- package/dist/internal-types/document-parser.d.ts.map +1 -0
- package/dist/internal-types/document-parser.js +2 -0
- package/dist/internal-types/document-parser.js.map +1 -0
- package/dist/internal-types/index.d.ts +9 -0
- package/dist/internal-types/index.d.ts.map +1 -0
- package/dist/internal-types/index.js +2 -0
- package/dist/internal-types/index.js.map +1 -0
- package/dist/internal-types/llm-provider.d.ts +19 -0
- package/dist/internal-types/llm-provider.d.ts.map +1 -0
- package/dist/internal-types/llm-provider.js +2 -0
- package/dist/internal-types/llm-provider.js.map +1 -0
- package/dist/internal-types/logger.d.ts +7 -0
- package/dist/internal-types/logger.d.ts.map +1 -0
- package/dist/internal-types/logger.js +2 -0
- package/dist/internal-types/logger.js.map +1 -0
- package/dist/internal-types/page.d.ts +5 -0
- package/dist/internal-types/page.d.ts.map +1 -0
- package/dist/internal-types/page.js +2 -0
- package/dist/internal-types/page.js.map +1 -0
- package/dist/internal-types/processing.d.ts +21 -0
- package/dist/internal-types/processing.d.ts.map +1 -0
- package/dist/internal-types/processing.js +2 -0
- package/dist/internal-types/processing.js.map +1 -0
- package/dist/internal-types/tree-node.d.ts +30 -0
- package/dist/internal-types/tree-node.d.ts.map +1 -0
- package/dist/internal-types/tree-node.js +2 -0
- package/dist/internal-types/tree-node.js.map +1 -0
- package/dist/llm/index.d.ts +3 -0
- package/dist/llm/index.d.ts.map +1 -0
- package/dist/llm/index.js +3 -0
- package/dist/llm/index.js.map +1 -0
- package/dist/llm/llm-client.d.ts +26 -0
- package/dist/llm/llm-client.d.ts.map +1 -0
- package/dist/llm/llm-client.js +88 -0
- package/dist/llm/llm-client.js.map +1 -0
- package/dist/llm/prompts.d.ts +33 -0
- package/dist/llm/prompts.d.ts.map +1 -0
- package/dist/llm/prompts.js +312 -0
- package/dist/llm/prompts.js.map +1 -0
- package/dist/markdown/index.d.ts +6 -0
- package/dist/markdown/index.d.ts.map +1 -0
- package/dist/markdown/index.js +5 -0
- package/dist/markdown/index.js.map +1 -0
- package/dist/markdown/md-extractor.d.ts +14 -0
- package/dist/markdown/md-extractor.d.ts.map +1 -0
- package/dist/markdown/md-extractor.js +30 -0
- package/dist/markdown/md-extractor.js.map +1 -0
- package/dist/markdown/md-to-tree.d.ts +8 -0
- package/dist/markdown/md-to-tree.d.ts.map +1 -0
- package/dist/markdown/md-to-tree.js +20 -0
- package/dist/markdown/md-to-tree.js.map +1 -0
- package/dist/markdown/md-tree-builder.d.ts +7 -0
- package/dist/markdown/md-tree-builder.d.ts.map +1 -0
- package/dist/markdown/md-tree-builder.js +36 -0
- package/dist/markdown/md-tree-builder.js.map +1 -0
- package/dist/markdown/tree-thinning.d.ts +8 -0
- package/dist/markdown/tree-thinning.d.ts.map +1 -0
- package/dist/markdown/tree-thinning.js +42 -0
- package/dist/markdown/tree-thinning.js.map +1 -0
- package/dist/page-index.d.ts +10 -0
- package/dist/page-index.d.ts.map +1 -0
- package/dist/page-index.js +54 -0
- package/dist/page-index.js.map +1 -0
- package/dist/post-processing/doc-description.d.ts +12 -0
- package/dist/post-processing/doc-description.d.ts.map +1 -0
- package/dist/post-processing/doc-description.js +31 -0
- package/dist/post-processing/doc-description.js.map +1 -0
- package/dist/post-processing/index.d.ts +5 -0
- package/dist/post-processing/index.d.ts.map +1 -0
- package/dist/post-processing/index.js +5 -0
- package/dist/post-processing/index.js.map +1 -0
- package/dist/post-processing/node-id.d.ts +7 -0
- package/dist/post-processing/node-id.d.ts.map +1 -0
- package/dist/post-processing/node-id.js +20 -0
- package/dist/post-processing/node-id.js.map +1 -0
- package/dist/post-processing/node-text.d.ts +11 -0
- package/dist/post-processing/node-text.d.ts.map +1 -0
- package/dist/post-processing/node-text.js +37 -0
- package/dist/post-processing/node-text.js.map +1 -0
- package/dist/post-processing/summary.d.ts +7 -0
- package/dist/post-processing/summary.d.ts.map +1 -0
- package/dist/post-processing/summary.js +31 -0
- package/dist/post-processing/summary.js.map +1 -0
- package/dist/processing/index.d.ts +6 -0
- package/dist/processing/index.d.ts.map +1 -0
- package/dist/processing/index.js +6 -0
- package/dist/processing/index.js.map +1 -0
- package/dist/processing/large-node.d.ts +9 -0
- package/dist/processing/large-node.d.ts.map +1 -0
- package/dist/processing/large-node.js +40 -0
- package/dist/processing/large-node.js.map +1 -0
- package/dist/processing/meta-processor.d.ts +19 -0
- package/dist/processing/meta-processor.d.ts.map +1 -0
- package/dist/processing/meta-processor.js +91 -0
- package/dist/processing/meta-processor.js.map +1 -0
- package/dist/processing/no-toc.d.ts +10 -0
- package/dist/processing/no-toc.d.ts.map +1 -0
- package/dist/processing/no-toc.js +44 -0
- package/dist/processing/no-toc.js.map +1 -0
- package/dist/processing/toc-no-pages.d.ts +11 -0
- package/dist/processing/toc-no-pages.d.ts.map +1 -0
- package/dist/processing/toc-no-pages.js +46 -0
- package/dist/processing/toc-no-pages.js.map +1 -0
- package/dist/processing/toc-with-pages.d.ts +15 -0
- package/dist/processing/toc-with-pages.d.ts.map +1 -0
- package/dist/processing/toc-with-pages.js +151 -0
- package/dist/processing/toc-with-pages.js.map +1 -0
- package/dist/toc/index.d.ts +4 -0
- package/dist/toc/index.d.ts.map +1 -0
- package/dist/toc/index.js +4 -0
- package/dist/toc/index.js.map +1 -0
- package/dist/toc/toc-detector.d.ts +23 -0
- package/dist/toc/toc-detector.d.ts.map +1 -0
- package/dist/toc/toc-detector.js +65 -0
- package/dist/toc/toc-detector.js.map +1 -0
- package/dist/toc/toc-extractor.d.ts +13 -0
- package/dist/toc/toc-extractor.d.ts.map +1 -0
- package/dist/toc/toc-extractor.js +32 -0
- package/dist/toc/toc-extractor.js.map +1 -0
- package/dist/toc/toc-transformer.d.ts +11 -0
- package/dist/toc/toc-transformer.d.ts.map +1 -0
- package/dist/toc/toc-transformer.js +69 -0
- package/dist/toc/toc-transformer.js.map +1 -0
- package/dist/tree/index.d.ts +4 -0
- package/dist/tree/index.d.ts.map +1 -0
- package/dist/tree/index.js +4 -0
- package/dist/tree/index.js.map +1 -0
- package/dist/tree/list-to-tree.d.ts +7 -0
- package/dist/tree/list-to-tree.d.ts.map +1 -0
- package/dist/tree/list-to-tree.js +33 -0
- package/dist/tree/list-to-tree.js.map +1 -0
- package/dist/tree/post-processing.d.ts +12 -0
- package/dist/tree/post-processing.d.ts.map +1 -0
- package/dist/tree/post-processing.js +87 -0
- package/dist/tree/post-processing.js.map +1 -0
- package/dist/tree/tree-utils.d.ts +18 -0
- package/dist/tree/tree-utils.d.ts.map +1 -0
- package/dist/tree/tree-utils.js +43 -0
- package/dist/tree/tree-utils.js.map +1 -0
- package/dist/tree-parser.d.ts +30 -0
- package/dist/tree-parser.d.ts.map +1 -0
- package/dist/tree-parser.js +73 -0
- package/dist/tree-parser.js.map +1 -0
- package/dist/types.d.ts +3 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/config-loader.d.ts +15 -0
- package/dist/utils/config-loader.d.ts.map +1 -0
- package/dist/utils/config-loader.js +19 -0
- package/dist/utils/config-loader.js.map +1 -0
- package/dist/utils/index.d.ts +7 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +6 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/json-parser.d.ts +2 -0
- package/dist/utils/json-parser.d.ts.map +1 -0
- package/dist/utils/json-parser.js +76 -0
- package/dist/utils/json-parser.js.map +1 -0
- package/dist/utils/logger.d.ts +3 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +10 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/page-utils.d.ts +16 -0
- package/dist/utils/page-utils.d.ts.map +1 -0
- package/dist/utils/page-utils.js +56 -0
- package/dist/utils/page-utils.js.map +1 -0
- package/dist/utils/token-counter.d.ts +2 -0
- package/dist/utils/token-counter.d.ts.map +1 -0
- package/dist/utils/token-counter.js +5 -0
- package/dist/utils/token-counter.js.map +1 -0
- package/dist/vector-lib/adapters/in-memory-adapter.d.ts +14 -0
- package/dist/vector-lib/adapters/in-memory-adapter.d.ts.map +1 -0
- package/dist/vector-lib/adapters/in-memory-adapter.js +55 -0
- package/dist/vector-lib/adapters/in-memory-adapter.js.map +1 -0
- package/dist/vector-lib/adapters/vector-store.d.ts +10 -0
- package/dist/vector-lib/adapters/vector-store.d.ts.map +1 -0
- package/dist/vector-lib/adapters/vector-store.js +2 -0
- package/dist/vector-lib/adapters/vector-store.js.map +1 -0
- package/dist/vector-lib/chunker/tree-chunker.d.ts +8 -0
- package/dist/vector-lib/chunker/tree-chunker.d.ts.map +1 -0
- package/dist/vector-lib/chunker/tree-chunker.js +59 -0
- package/dist/vector-lib/chunker/tree-chunker.js.map +1 -0
- package/dist/vector-lib/embedder/embedder.d.ts +8 -0
- package/dist/vector-lib/embedder/embedder.d.ts.map +1 -0
- package/dist/vector-lib/embedder/embedder.js +2 -0
- package/dist/vector-lib/embedder/embedder.js.map +1 -0
- package/dist/vector-lib/index.d.ts +10 -0
- package/dist/vector-lib/index.d.ts.map +1 -0
- package/dist/vector-lib/index.js +6 -0
- package/dist/vector-lib/index.js.map +1 -0
- package/dist/vector-lib/search/hybrid-search.d.ts +19 -0
- package/dist/vector-lib/search/hybrid-search.d.ts.map +1 -0
- package/dist/vector-lib/search/hybrid-search.js +25 -0
- package/dist/vector-lib/search/hybrid-search.js.map +1 -0
- package/dist/vector-lib/search/reranker.d.ts +14 -0
- package/dist/vector-lib/search/reranker.d.ts.map +1 -0
- package/dist/vector-lib/search/reranker.js +2 -0
- package/dist/vector-lib/search/reranker.js.map +1 -0
- package/dist/vector-lib/types.d.ts +29 -0
- package/dist/vector-lib/types.d.ts.map +1 -0
- package/dist/vector-lib/types.js +2 -0
- package/dist/vector-lib/types.js.map +1 -0
- package/dist/vector-lib/vector-enhancer.d.ts +28 -0
- package/dist/vector-lib/vector-enhancer.d.ts.map +1 -0
- package/dist/vector-lib/vector-enhancer.js +54 -0
- package/dist/vector-lib/vector-enhancer.js.map +1 -0
- package/dist/vector.d.ts +5 -0
- package/dist/vector.d.ts.map +1 -0
- package/dist/vector.js +3 -0
- package/dist/vector.js.map +1 -0
- package/dist/verification/fix-toc.d.ts +13 -0
- package/dist/verification/fix-toc.d.ts.map +1 -0
- package/dist/verification/fix-toc.js +73 -0
- package/dist/verification/fix-toc.js.map +1 -0
- package/dist/verification/index.d.ts +3 -0
- package/dist/verification/index.d.ts.map +1 -0
- package/dist/verification/index.js +3 -0
- package/dist/verification/index.js.map +1 -0
- package/dist/verification/verify-toc.d.ts +17 -0
- package/dist/verification/verify-toc.d.ts.map +1 -0
- package/dist/verification/verify-toc.js +64 -0
- package/dist/verification/verify-toc.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { generateDocDescriptionPrompt } from '../llm/prompts.js';
|
|
3
|
+
/**
 * Projects a node tree onto the minimal shape used for description prompts.
 *
 * Each output entry carries exactly five keys, in this order: `title`,
 * `node_id` (from `nodeId`), `summary`, `prefix_summary` (from
 * `prefixSummary`) and `children` (the same projection applied to `nodes`).
 * All other node properties are dropped. The input tree is not modified.
 *
 * @param nodes Array of tree nodes; every node must expose a `nodes` array.
 * @returns A new array of plain objects mirroring the tree hierarchy.
 */
export function createCleanStructureForDescription(nodes) {
    return nodes.map(({ title, nodeId, summary, prefixSummary, nodes: children }) => {
        // Declare every key up front so the serialized key order stays stable.
        const entry = {
            title,
            node_id: nodeId,
            summary,
            prefix_summary: prefixSummary,
            children: [],
        };
        if (children.length > 0) {
            entry.children = createCleanStructureForDescription(children);
        }
        return entry;
    });
}
|
|
18
|
+
/**
 * Asks the LLM for a description of the whole document.
 *
 * The tree is first reduced with `createCleanStructureForDescription`,
 * serialized to JSON and wrapped by `generateDocDescriptionPrompt`; the result
 * is sent as a single `user` message.
 *
 * @param nodes Root nodes of the document tree.
 * @param llmClient Client exposing `chat(messages)` that resolves to an object with a string `content`.
 * @returns The reply's `content` with surrounding whitespace trimmed.
 */
export async function generateDocDescription(nodes, llmClient) {
    const outline = createCleanStructureForDescription(nodes);
    const { content } = await llmClient.chat([
        {
            role: 'user',
            content: generateDocDescriptionPrompt(JSON.stringify(outline)),
        },
    ]);
    return content.trim();
}
|
|
31
|
+
//# sourceMappingURL=doc-description.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"doc-description.js","sourceRoot":"","sources":["../../src/post-processing/doc-description.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,4BAA4B,EAAE,MAAM,mBAAmB,CAAC;AAEjE;;;GAGG;AACH,MAAM,UAAU,kCAAkC,CAChD,KAAiB;IAEjB,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC1B,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,OAAO,EAAE,IAAI,CAAC,MAAM;QACpB,OAAO,EAAE,IAAI,CAAC,OAAO;QACrB,cAAc,EAAE,IAAI,CAAC,aAAa;QAClC,QAAQ,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC;YAC7B,CAAC,CAAC,kCAAkC,CAAC,IAAI,CAAC,KAAK,CAAC;YAChD,CAAC,CAAC,EAAE;KACP,CAAC,CAAC,CAAC;AACN,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAC1C,KAAiB,EACjB,SAAoB;IAEpB,MAAM,cAAc,GAAG,kCAAkC,CAAC,KAAK,CAAC,CAAC;IACjE,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CAAC;QACpC;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE,4BAA4B,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;SACtE;KACF,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;AACjC,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export { writeNodeId } from './node-id.js';
|
|
2
|
+
export { addNodeText, removeStructureText } from './node-text.js';
|
|
3
|
+
export { generateSummariesForStructure } from './summary.js';
|
|
4
|
+
export { createCleanStructureForDescription, generateDocDescription, } from './doc-description.js';
|
|
5
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/post-processing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAC3C,OAAO,EAAE,WAAW,EAAE,mBAAmB,EAAE,MAAM,gBAAgB,CAAC;AAClE,OAAO,EAAE,6BAA6B,EAAE,MAAM,cAAc,CAAC;AAC7D,OAAO,EACL,kCAAkC,EAClC,sBAAsB,GACvB,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export { writeNodeId } from './node-id.js';
|
|
2
|
+
export { addNodeText, removeStructureText } from './node-text.js';
|
|
3
|
+
export { generateSummariesForStructure } from './summary.js';
|
|
4
|
+
export { createCleanStructureForDescription, generateDocDescription, } from './doc-description.js';
|
|
5
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/post-processing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAC3C,OAAO,EAAE,WAAW,EAAE,mBAAmB,EAAE,MAAM,gBAAgB,CAAC;AAClE,OAAO,EAAE,6BAA6B,EAAE,MAAM,cAAc,CAAC;AAC7D,OAAO,EACL,kCAAkC,EAClC,sBAAsB,GACvB,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { TreeNode } from '../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Recursively assigns 4-digit nodeId to each node in the tree.
|
|
4
|
+
* Returns the next available nodeId.
|
|
5
|
+
*/
|
|
6
|
+
// Ids are written in place in pre-order (a node first, then its children) as
// strings zero-padded to at least four characters. `nodeId` is the first number
// to use and defaults to 0 in the implementation.
export declare function writeNodeId(nodes: TreeNode | TreeNode[], nodeId?: number): number;
|
|
7
|
+
//# sourceMappingURL=node-id.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"node-id.d.ts","sourceRoot":"","sources":["../../src/post-processing/node-id.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE5C;;;GAGG;AACH,wBAAgB,WAAW,CACzB,KAAK,EAAE,QAAQ,GAAG,QAAQ,EAAE,EAC5B,MAAM,SAAI,GACT,MAAM,CAiBR"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
 * Numbers every node of a tree (or forest) in pre-order.
 *
 * Each node receives `nodeId` as a decimal string left-padded with zeros to at
 * least four characters ("0000", "0001", ...). Nodes are mutated in place.
 *
 * @param nodes A single node or an array of nodes; each node must expose a `nodes` array.
 * @param nodeId First number to assign (defaults to 0).
 * @returns The next unused number, i.e. the start value plus the count of nodes visited.
 */
export function writeNodeId(nodes, nodeId = 0) {
    if (Array.isArray(nodes)) {
        // Thread the running counter through the siblings from left to right.
        return nodes.reduce((next, entry) => writeNodeId(entry, next), nodeId);
    }
    const current = nodes;
    let counter = nodeId;
    current.nodeId = String(counter).padStart(4, '0');
    counter++;
    const children = current.nodes;
    return children.length > 0 ? writeNodeId(children, counter) : counter;
}
|
|
20
|
+
//# sourceMappingURL=node-id.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"node-id.js","sourceRoot":"","sources":["../../src/post-processing/node-id.ts"],"names":[],"mappings":"AAEA;;;GAGG;AACH,MAAM,UAAU,WAAW,CACzB,KAA4B,EAC5B,MAAM,GAAG,CAAC;IAEV,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,GAAG,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;QACrC,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,cAAc;IACd,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;IAC/C,MAAM,EAAE,CAAC;IAET,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAC5C,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { TreeNode } from '../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Recursively fills node.text by concatenating page texts
|
|
4
|
+
* from startIndex to endIndex (1-based).
|
|
5
|
+
*/
|
|
6
|
+
// Mutates the tree in place: a node with both startIndex and endIndex set gets
// `text` = the page texts for that inclusive 1-based range joined with '\n'.
// Nodes missing either index keep whatever `text` they already had.
export declare function addNodeText(nodes: TreeNode | TreeNode[], pageTexts: string[]): void;
|
|
7
|
+
/**
|
|
8
|
+
* Recursively removes text from all nodes.
|
|
9
|
+
*/
|
|
10
|
+
// Mutates the tree in place by deleting the `text` property from every node
// and from all of its descendants.
export declare function removeStructureText(nodes: TreeNode | TreeNode[]): void;
|
|
11
|
+
//# sourceMappingURL=node-text.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"node-text.d.ts","sourceRoot":"","sources":["../../src/post-processing/node-text.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE5C;;;GAGG;AACH,wBAAgB,WAAW,CACzB,KAAK,EAAE,QAAQ,GAAG,QAAQ,EAAE,EAC5B,SAAS,EAAE,MAAM,EAAE,GAClB,IAAI,CAkBN;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,QAAQ,GAAG,QAAQ,EAAE,GAAG,IAAI,CAYtE"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
 * Recursively fills `node.text` with the text of the pages a node covers.
 *
 * `startIndex` and `endIndex` are 1-based and inclusive. A node's text is the
 * page texts in that range joined with '\n'. Nodes missing either index are
 * left untouched, but their children are still visited. Nodes are mutated in
 * place.
 *
 * Out-of-range lower bounds are clamped: `Array.prototype.slice` treats
 * negative arguments as offsets from the END of the array, so an unclamped
 * `startIndex <= 0` or negative `endIndex` would silently pick the wrong pages.
 * Indices past the last page are already truncated by `slice` itself.
 *
 * @param nodes A single node or an array of nodes; each node must expose a `nodes` array.
 * @param pageTexts Page texts in document order (index 0 is page 1).
 */
export function addNodeText(nodes, pageTexts) {
    if (Array.isArray(nodes)) {
        for (const node of nodes) {
            addNodeText(node, pageTexts);
        }
        return;
    }
    // Single node
    if (nodes.startIndex != null && nodes.endIndex != null) {
        // Convert the 1-based start to 0-based; never let it go negative.
        const start = Math.max(nodes.startIndex - 1, 0);
        // endIndex is inclusive while slice's end is exclusive, so it is used as is.
        const end = Math.max(nodes.endIndex, 0);
        nodes.text = pageTexts.slice(start, end).join('\n');
    }
    if (nodes.nodes.length > 0) {
        addNodeText(nodes.nodes, pageTexts);
    }
}
|
|
22
|
+
/**
 * Strips the `text` property from a node tree.
 *
 * Accepts a single node or an array of nodes and walks all descendants,
 * deleting `text` in place wherever it exists.
 *
 * @param nodes A single node or an array of nodes; each node must expose a `nodes` array.
 */
export function removeStructureText(nodes) {
    if (Array.isArray(nodes)) {
        nodes.forEach((entry) => removeStructureText(entry));
        return;
    }
    delete nodes.text;
    const { nodes: children } = nodes;
    if (children.length > 0) {
        removeStructureText(children);
    }
}
|
|
37
|
+
//# sourceMappingURL=node-text.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"node-text.js","sourceRoot":"","sources":["../../src/post-processing/node-text.ts"],"names":[],"mappings":"AAEA;;;GAGG;AACH,MAAM,UAAU,WAAW,CACzB,KAA4B,EAC5B,SAAmB;IAEnB,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,WAAW,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;QAC/B,CAAC;QACD,OAAO;IACT,CAAC;IAED,cAAc;IACd,IAAI,KAAK,CAAC,UAAU,IAAI,IAAI,IAAI,KAAK,CAAC,QAAQ,IAAI,IAAI,EAAE,CAAC;QACvD,MAAM,KAAK,GAAG,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC,qBAAqB;QACzD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,CAAC,4CAA4C;QACxE,KAAK,CAAC,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtD,CAAC;IAED,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,WAAW,CAAC,KAAK,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IACtC,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAA4B;IAC9D,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,mBAAmB,CAAC,IAAI,CAAC,CAAC;QAC5B,CAAC;QACD,OAAO;IACT,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC;IAClB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,mBAAmB,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACnC,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { TreeNode } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
|
|
4
|
+
* Recursively generates summaries for all nodes that have text.
|
|
5
|
+
*/
|
|
6
|
+
// Mutates the tree in place: a node with non-empty `text` gets `summary` from one
// LLM chat call; a node with children gets `prefixSummary` = its children's
// non-empty summaries joined with '; '. Root nodes and sibling subtrees are
// processed concurrently via Promise.all.
export declare function generateSummariesForStructure(nodes: TreeNode[], llmClient: LlmClient): Promise<void>;
|
|
7
|
+
//# sourceMappingURL=summary.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"summary.d.ts","sourceRoot":"","sources":["../../src/post-processing/summary.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAGjD;;GAEG;AACH,wBAAsB,6BAA6B,CACjD,KAAK,EAAE,QAAQ,EAAE,EACjB,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,IAAI,CAAC,CAGf"}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { generateNodeSummaryPrompt } from '../llm/prompts.js';
|
|
3
|
+
/**
 * Generates summaries for every root node and, through `generateNodeSummary`,
 * for all of their descendants.
 *
 * All root nodes are started at once and awaited together; the promise rejects
 * as soon as any of them rejects.
 *
 * @param nodes Root nodes of the tree.
 * @param llmClient Client handed through to `generateNodeSummary`.
 * @returns Resolves with no value once every root node has been processed.
 */
export async function generateSummariesForStructure(nodes, llmClient) {
    const pending = [];
    for (const root of nodes) {
        pending.push(generateNodeSummary(root, llmClient));
    }
    await Promise.all(pending);
}
|
|
10
|
+
/**
 * Summarizes one node and then its subtree.
 *
 * 1. If the node has non-empty `text`, one chat request is sent and the
 *    trimmed reply is stored in `node.summary`.
 * 2. Afterwards all children are processed concurrently.
 * 3. For a node with children, `node.prefixSummary` becomes the children's
 *    non-empty summaries joined with '; '. It is left untouched when no child
 *    has a summary.
 *
 * @param node Tree node to update in place; must expose a `nodes` array.
 * @param llmClient Client exposing `chat(messages)` that resolves to an object with a string `content`.
 */
async function generateNodeSummary(node, llmClient) {
    if (node.text) {
        const { content } = await llmClient.chat([
            { role: 'user', content: generateNodeSummaryPrompt(node.text) },
        ]);
        node.summary = content.trim();
    }
    // Leaf nodes have nothing further to do.
    if (!(node.nodes.length > 0)) {
        return;
    }
    await Promise.all(node.nodes.map((child) => generateNodeSummary(child, llmClient)));
    // For non-leaf nodes, create a prefix summary from children
    const parts = [];
    for (const child of node.nodes) {
        if (child.summary) {
            parts.push(child.summary);
        }
    }
    const combined = parts.join('; ');
    if (combined) {
        node.prefixSummary = combined;
    }
}
|
|
31
|
+
//# sourceMappingURL=summary.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"summary.js","sourceRoot":"","sources":["../../src/post-processing/summary.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,yBAAyB,EAAE,MAAM,mBAAmB,CAAC;AAE9D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,6BAA6B,CACjD,KAAiB,EACjB,SAAoB;IAEpB,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,mBAAmB,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC;IAC3E,MAAM,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;AAC9B,CAAC;AAED,KAAK,UAAU,mBAAmB,CAChC,IAAc,EACd,SAAoB;IAEpB,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;QACd,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CAAC;YACpC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,yBAAyB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;SAChE,CAAC,CAAC;QACH,IAAI,CAAC,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;IACzC,CAAC;IAED,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,OAAO,CAAC,GAAG,CACf,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,mBAAmB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC,CACjE,CAAC;QAEF,4DAA4D;QAC5D,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK;iBAC9B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;iBACrB,MAAM,CAAC,OAAO,CAAC;iBACf,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,IAAI,cAAc,EAAE,CAAC;gBACnB,IAAI,CAAC,aAAa,GAAG,cAAc,CAAC;YACtC,CAAC;QACH,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { metaProcessor } from './meta-processor.js';
|
|
2
|
+
export { processTocWithPageNumbers } from './toc-with-pages.js';
|
|
3
|
+
export { processTocNoPageNumbers } from './toc-no-pages.js';
|
|
4
|
+
export { processNoToc } from './no-toc.js';
|
|
5
|
+
export { processLargeNodeRecursively } from './large-node.js';
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/processing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,yBAAyB,EAAE,MAAM,qBAAqB,CAAC;AAChE,OAAO,EAAE,uBAAuB,EAAE,MAAM,mBAAmB,CAAC;AAC5D,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,2BAA2B,EAAE,MAAM,iBAAiB,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { metaProcessor } from './meta-processor.js';
|
|
2
|
+
export { processTocWithPageNumbers } from './toc-with-pages.js';
|
|
3
|
+
export { processTocNoPageNumbers } from './toc-no-pages.js';
|
|
4
|
+
export { processNoToc } from './no-toc.js';
|
|
5
|
+
export { processLargeNodeRecursively } from './large-node.js';
|
|
6
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/processing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,yBAAyB,EAAE,MAAM,qBAAqB,CAAC;AAChE,OAAO,EAAE,uBAAuB,EAAE,MAAM,mBAAmB,CAAC;AAC5D,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,2BAA2B,EAAE,MAAM,iBAAiB,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { TreeNode, Logger } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
|
|
4
|
+
* Recursively splits large nodes by generating sub-structure.
|
|
5
|
+
*/
|
|
6
|
+
// A node is split only when its inclusive page span (endIndex - startIndex + 1)
// exceeds maxPageNumEachNode AND its token count is at least maxTokenNumEachNode.
// Nodes missing startIndex or endIndex are left untouched. When a split yields
// sub-items, they replace node.nodes and each new child is processed recursively.
export declare function processLargeNodeRecursively(node: TreeNode, pageList: Array<{
|
|
7
|
+
text: string;
|
|
8
|
+
}>, llmClient: LlmClient, logger: Logger, maxPageNumEachNode: number, maxTokenNumEachNode: number): Promise<void>;
|
|
9
|
+
//# sourceMappingURL=large-node.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"large-node.d.ts","sourceRoot":"","sources":["../../src/processing/large-node.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACpD,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAMjD;;GAEG;AACH,wBAAsB,2BAA2B,CAC/C,IAAI,EAAE,QAAQ,EACd,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,SAAS,EAAE,SAAS,EACpB,MAAM,EAAE,MAAM,EACd,kBAAkB,EAAE,MAAM,EAC1B,mBAAmB,EAAE,MAAM,GAC1B,OAAO,CAAC,IAAI,CAAC,CAoDf"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { metaProcessor } from './meta-processor.js';
|
|
3
|
+
import { postProcessing } from '../tree/post-processing.js';
|
|
4
|
+
import { checkTitleAppearanceInStartConcurrent } from '../tree-parser.js';
|
|
5
|
+
import { countTokens } from '../utils/token-counter.js';
|
|
6
|
+
/**
 * Splits an oversized node into child nodes, then recurses into the children.
 *
 * A node counts as oversized when its inclusive page span
 * (`endIndex - startIndex + 1`) exceeds `maxPageNumEachNode` AND the token
 * count of its pages is at least `maxTokenNumEachNode`. For such a node a
 * sub-structure is generated with `metaProcessor` in `'process_no_toc'` mode
 * over the node's own pages. Items without a `physicalIndex` are dropped, and a
 * leading item whose title equals the node's title is skipped. Whatever remains
 * is turned into `node.nodes` via `postProcessing`.
 *
 * Nodes missing `startIndex` or `endIndex`, and nodes that are not oversized,
 * are left untouched.
 *
 * @param node Node to inspect; mutated in place when it is split.
 * @param pageList All pages of the document (index 0 is page 1), each with a `text`.
 * @param llmClient Client passed through to the processing helpers.
 * @param logger Logger; `info` is called once for every node that is split.
 * @param maxPageNumEachNode Page-span threshold (exclusive).
 * @param maxTokenNumEachNode Token threshold (inclusive).
 */
export async function processLargeNodeRecursively(node, pageList, llmClient, logger, maxPageNumEachNode, maxTokenNumEachNode) {
    const { startIndex, endIndex } = node;
    if (startIndex == null || endIndex == null) {
        return;
    }
    // startIndex/endIndex are 1-based and inclusive.
    const nodePageList = pageList.slice(startIndex - 1, endIndex);
    let tokenNum = 0;
    for (const page of nodePageList) {
        tokenNum += countTokens(page.text);
    }
    // Inclusive span: start=1,end=1 → 1 page
    const pageSpan = endIndex - startIndex + 1;
    const isOversized = pageSpan > maxPageNumEachNode && tokenNum >= maxTokenNumEachNode;
    if (!isOversized) {
        return;
    }
    logger.info(`Splitting large node: ${node.title}`, {
        pages: pageSpan,
        tokens: tokenNum,
    });
    const { items } = await metaProcessor(nodePageList, 'process_no_toc', llmClient, logger, { startIndex: node.startIndex });
    // Check title appearance at start
    await checkTitleAppearanceInStartConcurrent(items, pageList, llmClient);
    // Keep only items that were located on a page.
    const validItems = items.filter((item) => item.physicalIndex != null);
    if (validItems.length === 0) {
        return;
    }
    // A first item that repeats this node's own title is not a child.
    const subItems = validItems[0].title === node.title ? validItems.slice(1) : validItems;
    if (subItems.length === 0) {
        return;
    }
    node.nodes = postProcessing(subItems, node.endIndex);
    // Recursively process children
    const childRuns = node.nodes.map((child) => processLargeNodeRecursively(child, pageList, llmClient, logger, maxPageNumEachNode, maxTokenNumEachNode));
    await Promise.all(childRuns);
}
|
|
40
|
+
//# sourceMappingURL=large-node.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"large-node.js","sourceRoot":"","sources":["../../src/processing/large-node.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,qCAAqC,EAAE,MAAM,mBAAmB,CAAC;AAC1E,OAAO,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AAExD;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,2BAA2B,CAC/C,IAAc,EACd,QAAiC,EACjC,SAAoB,EACpB,MAAc,EACd,kBAA0B,EAC1B,mBAA2B;IAE3B,IAAI,IAAI,CAAC,UAAU,IAAI,IAAI,IAAI,IAAI,CAAC,QAAQ,IAAI,IAAI;QAAE,OAAO;IAE7D,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,GAAG,CAAC,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;IACxE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAClC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CACzC,CAAC;IACF,yCAAyC;IACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,UAAU,GAAG,CAAC,CAAC;IAErD,IAAI,QAAQ,GAAG,kBAAkB,IAAI,QAAQ,IAAI,mBAAmB,EAAE,CAAC;QACrE,MAAM,CAAC,IAAI,CAAC,yBAAyB,IAAI,CAAC,KAAK,EAAE,EAAE;YACjD,KAAK,EAAE,QAAQ;YACf,MAAM,EAAE,QAAQ;SACjB,CAAC,CAAC;QAEH,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,aAAa,CACnC,YAAY,EACZ,gBAAgB,EAChB,SAAS,EACT,MAAM,EACN,EAAE,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,CAChC,CAAC;QAEF,kCAAkC;QAClC,MAAM,qCAAqC,CAAC,KAAK,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC;QAExE,4BAA4B;QAC5B,MAAM,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,CAAC;QAEtE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,oDAAoD;YACpD,IAAI,QAAQ,GAAG,UAAU,CAAC;YAC1B,IAAI,UAAU,CAAC,CAAC,CAAC,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,EAAE,CAAC;gBACvC,QAAQ,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YACjC,CAAC;YAED,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,IAAI,CAAC,KAAK,GAAG,cAAc,CAAC,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;gBAErD,+BAA+B;gBAC/B,MAAM,OAAO,CAAC,GAAG,CACf,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CACvB,2BAA2B,CACzB,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAClC,kBAAkB,EAAE,mBAAmB,CACxC,CACF,CACF,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { TocItem, ProcessingMode, DegradationEvent, Logger } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
|
|
4
|
+
* Meta processor: dispatches to the appropriate processing mode,
|
|
5
|
+
* verifies results, and handles degradation.
|
|
6
|
+
*/
|
|
7
|
+
// NOTE(review): based on the part of meta-processor.js visible alongside this
// declaration. The loop runs at most 5 iterations. Each iteration processes the
// pages in the current mode, drops items whose physicalIndex is null, validates
// the indices against the page count and verifies the result. Accuracy of
// exactly 1.0 returns the items as is; accuracy above 0.6 returns them after
// fixIncorrectTocWithRetries; otherwise a degraded mode is selected.
// TODO confirm: what is returned or thrown once no further degradation is possible.
export declare function metaProcessor(pageList: Array<{
|
|
8
|
+
text: string;
|
|
9
|
+
}>, mode: ProcessingMode, llmClient: LlmClient, logger: Logger, options?: {
|
|
10
|
+
tocContent?: string | null;
|
|
11
|
+
tocPageList?: number[];
|
|
12
|
+
startIndex?: number;
|
|
13
|
+
onDegradation?: (event: DegradationEvent) => void;
|
|
14
|
+
}): Promise<{
|
|
15
|
+
items: TocItem[];
|
|
16
|
+
finalMode: ProcessingMode;
|
|
17
|
+
degradations: DegradationEvent[];
|
|
18
|
+
}>;
|
|
19
|
+
//# sourceMappingURL=meta-processor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"meta-processor.d.ts","sourceRoot":"","sources":["../../src/processing/meta-processor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,cAAc,EAAE,gBAAgB,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrF,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAQjD;;;GAGG;AACH,wBAAsB,aAAa,CACjC,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,IAAI,EAAE,cAAc,EACpB,SAAS,EAAE,SAAS,EACpB,MAAM,EAAE,MAAM,EACd,OAAO,GAAE;IACP,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IACvB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,CAAC,KAAK,EAAE,gBAAgB,KAAK,IAAI,CAAC;CAC9C,GACL,OAAO,CAAC;IAAE,KAAK,EAAE,OAAO,EAAE,CAAC;IAAC,SAAS,EAAE,cAAc,CAAC;IAAC,YAAY,EAAE,gBAAgB,EAAE,CAAA;CAAE,CAAC,CA8D5F"}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { TocProcessingError } from '../errors/index.js';
|
|
3
|
+
import { processTocWithPageNumbers } from './toc-with-pages.js';
|
|
4
|
+
import { processTocNoPageNumbers } from './toc-no-pages.js';
|
|
5
|
+
import { processNoToc } from './no-toc.js';
|
|
6
|
+
import { verifyToc } from '../verification/verify-toc.js';
|
|
7
|
+
import { fixIncorrectTocWithRetries } from '../verification/fix-toc.js';
|
|
8
|
+
/**
 * Meta processor: dispatches to the appropriate processing mode,
 * verifies results, and handles degradation.
 *
 * Each pass runs the current mode, drops items without a physical index,
 * clamps the remaining indices to the page window, and verifies the result:
 *  - accuracy of exactly 1.0 returns the items unchanged;
 *  - accuracy above 0.6 runs the fix-up step and returns its output;
 *  - anything lower records a degradation event and retries in the next mode.
 *
 * @param pageList Pages to process; passed through to the processing,
 *   verification and fix-up steps.
 * @param mode Processing mode to start with.
 * @param llmClient LLM client passed through to every step.
 * @param logger Receives one info message per verification and one warning
 *   per degradation.
 * @param options Optional `tocContent`, `tocPageList`, `startIndex`, and an
 *   `onDegradation` callback invoked for every degradation event.
 * @returns `{ items, finalMode, degradations }`.
 * @throws {TocProcessingError} When accuracy is 0.6 or lower and no further
 *   degraded mode exists, or when the iteration limit is exhausted.
 */
export async function metaProcessor(pageList, mode, llmClient, logger, options = {}) {
    const degradations = [];
    const MAX_ITERATIONS = 5;
    let currentMode = mode;
    let items = [];
    for (let iteration = 0; iteration < MAX_ITERATIONS; iteration++) {
        // Step 1: Process based on current mode
        items = await processByMode(currentMode, pageList, llmClient, options);
        // Step 2: Filter null physicalIndex
        items = items.filter((item) => item.physicalIndex != null);
        // Step 3: Validate and truncate.
        // Only process_no_toc is handed options.startIndex, and it numbers its
        // pages from that value. Clamping against pageList.length alone would
        // collapse every index of a window that does not start at page 1, so
        // the upper bound is the last index of the window instead. The other
        // modes are not given startIndex and keep the original bound.
        // NOTE(review): verifyToc is called without a start index; confirm it
        // resolves offset physical indices against pageList correctly.
        const firstPageIndex = currentMode === 'process_no_toc' ? (options.startIndex ?? 1) : 1;
        validateAndTruncatePhysicalIndices(items, firstPageIndex + pageList.length - 1);
        // Step 4: Verify
        const { accuracy, incorrectResults } = await verifyToc(items, pageList, llmClient);
        logger.info(`Verification accuracy: ${accuracy}`, { mode: currentMode });
        // Step 5: Handle based on accuracy
        if (accuracy === 1.0) {
            return { items, finalMode: currentMode, degradations };
        }
        if (accuracy > 0.6) {
            // Good enough to repair in place rather than start over.
            items = await fixIncorrectTocWithRetries(items, pageList, incorrectResults, llmClient);
            return { items, finalMode: currentMode, degradations };
        }
        // Step 6: Degrade
        const nextMode = getDegradedMode(currentMode);
        if (!nextMode) {
            throw new TocProcessingError(`Processing failed with accuracy ${accuracy} in mode ${currentMode}, no further degradation possible`);
        }
        const event = {
            fromMode: currentMode,
            toMode: nextMode,
            accuracy,
            reason: `Accuracy ${accuracy} below threshold 0.6`,
        };
        degradations.push(event);
        options.onDegradation?.(event);
        logger.warn(`Degrading from ${currentMode} to ${nextMode}`, { accuracy });
        currentMode = nextMode;
    }
    throw new TocProcessingError(`Processing failed: exceeded maximum ${MAX_ITERATIONS} degradation iterations`);
}
|
|
53
|
+
/**
 * Runs the processing routine that corresponds to `mode`.
 *
 * @param mode One of 'process_toc_with_page_numbers',
 *   'process_toc_no_page_numbers' or 'process_no_toc'.
 * @param pageList Pages of the document, passed through to the routine.
 * @param llmClient LLM client passed through to the routine.
 * @param options Supplies `tocContent` (required by both TOC modes),
 *   `tocPageList` (defaults to an empty list) and `startIndex`.
 * @returns Whatever the selected routine resolves to.
 * @throws {TocProcessingError} When a TOC mode is requested without
 *   `tocContent`, or when `mode` is not one of the supported values.
 */
async function processByMode(mode, pageList, llmClient, options) {
    switch (mode) {
        case 'process_toc_with_page_numbers':
            if (!options.tocContent) {
                throw new TocProcessingError('tocContent is required for process_toc_with_page_numbers mode');
            }
            return processTocWithPageNumbers(options.tocContent, pageList, options.tocPageList ?? [], llmClient);
        case 'process_toc_no_page_numbers':
            if (!options.tocContent) {
                throw new TocProcessingError('tocContent is required for process_toc_no_page_numbers mode');
            }
            return processTocNoPageNumbers(options.tocContent, pageList, llmClient);
        case 'process_no_toc':
            return processNoToc(pageList, llmClient, options.startIndex);
        default:
            // Without this branch an unrecognised mode resolved to undefined
            // and the caller failed later with an unrelated TypeError.
            throw new TocProcessingError(`Unsupported processing mode: ${String(mode)}`);
    }
}
|
|
69
|
+
/**
 * Returns the mode to fall back to after `mode` failed verification.
 *
 * The chain is: with page numbers -> without page numbers -> no TOC.
 *
 * @param mode The mode that just failed.
 * @returns The next mode in the chain, or `null` when there is none
 *   (including when `mode` is not a recognised value).
 */
function getDegradedMode(mode) {
    switch (mode) {
        case 'process_toc_with_page_numbers':
            return 'process_toc_no_page_numbers';
        case 'process_toc_no_page_numbers':
            return 'process_no_toc';
        case 'process_no_toc':
            return null;
        default:
            // Keep the "nothing left to try" sentinel consistent instead of
            // falling off the switch and returning undefined.
            return null;
    }
}
|
|
79
|
+
/**
 * Clamps the `physicalIndex` of every item into the range `[1, totalPages]`.
 *
 * Items are mutated in place; items whose `physicalIndex` is `null` or
 * `undefined` are left untouched. Nothing is returned.
 *
 * @param items Items to adjust.
 * @param totalPages Largest physical index that is allowed.
 */
function validateAndTruncatePhysicalIndices(items, totalPages) {
    for (const entry of items) {
        if (entry.physicalIndex == null) {
            continue;
        }
        // Cap at the upper bound first, then raise to 1, so the lower bound
        // wins if totalPages itself is below 1.
        if (entry.physicalIndex > totalPages) {
            entry.physicalIndex = totalPages;
        }
        if (entry.physicalIndex < 1) {
            entry.physicalIndex = 1;
        }
    }
}
|
|
91
|
+
//# sourceMappingURL=meta-processor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"meta-processor.js","sourceRoot":"","sources":["../../src/processing/meta-processor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AACxD,OAAO,EAAE,yBAAyB,EAAE,MAAM,qBAAqB,CAAC;AAChE,OAAO,EAAE,uBAAuB,EAAE,MAAM,mBAAmB,CAAC;AAC5D,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,SAAS,EAAE,MAAM,+BAA+B,CAAC;AAC1D,OAAO,EAAE,0BAA0B,EAAE,MAAM,4BAA4B,CAAC;AAExE;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAiC,EACjC,IAAoB,EACpB,SAAoB,EACpB,MAAc,EACd,UAKI,EAAE;IAEN,MAAM,YAAY,GAAuB,EAAE,CAAC;IAE5C,MAAM,cAAc,GAAG,CAAC,CAAC;IACzB,IAAI,WAAW,GAAG,IAAI,CAAC;IACvB,IAAI,KAAK,GAAc,EAAE,CAAC;IAE1B,KAAK,IAAI,SAAS,GAAG,CAAC,EAAE,SAAS,GAAG,cAAc,EAAE,SAAS,EAAE,EAAE,CAAC;QAChE,wCAAwC;QACxC,KAAK,GAAG,MAAM,aAAa,CACzB,WAAW,EAAE,QAAQ,EAAE,SAAS,EAAE,OAAO,CAC1C,CAAC;QAEF,oCAAoC;QACpC,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,CAAC;QAE3D,gCAAgC;QAChC,kCAAkC,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;QAE3D,iBAAiB;QACjB,MAAM,EAAE,QAAQ,EAAE,gBAAgB,EAAE,GAAG,MAAM,SAAS,CACpD,KAAK,EAAE,QAAQ,EAAE,SAAS,CAC3B,CAAC;QAEF,MAAM,CAAC,IAAI,CAAC,0BAA0B,QAAQ,EAAE,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;QAEzE,mCAAmC;QACnC,IAAI,QAAQ,KAAK,GAAG,EAAE,CAAC;YACrB,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,YAAY,EAAE,CAAC;QACzD,CAAC;QAED,IAAI,QAAQ,GAAG,GAAG,EAAE,CAAC;YACnB,KAAK,GAAG,MAAM,0BAA0B,CACtC,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,SAAS,CAC7C,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,YAAY,EAAE,CAAC;QACzD,CAAC;QAED,kBAAkB;QAClB,MAAM,QAAQ,GAAG,eAAe,CAAC,WAAW,CAAC,CAAC;QAC9C,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,IAAI,kBAAkB,CAC1B,mCAAmC,QAAQ,YAAY,WAAW,mCAAmC,CACtG,CAAC;QACJ,CAAC;QAED,MAAM,KAAK,GAAqB;YAC9B,QAAQ,EAAE,WAAW;YACrB,MAAM,EAAE,QAAQ;YAChB,QAAQ;YACR,MAAM,EAAE,YAAY,QAAQ,sBAAsB;SACnD,CAAC;QACF,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzB,OAAO,CAAC,aAAa,EAAE,CAAC,KAAK,CAAC,CAAC;QAC/B,MAAM,CAAC,IAAI,CAAC,kBAAkB,WAAW,OAAO,QAAQ,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;QAE1E,WAAW,GAAG,QAAQ,CAAC;IACzB,CAAC;IAE
D,MAAM,IAAI,kBAAkB,CAC1B,uCAAuC,cAAc,yBAAyB,CAC/E,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,aAAa,CAC1B,IAAoB,EACpB,QAAiC,EACjC,SAAoB,EACpB,OAIC;IAED,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,+BAA+B;YAClC,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC;gBACxB,MAAM,IAAI,kBAAkB,CAAC,+DAA+D,CAAC,CAAC;YAChG,CAAC;YACD,OAAO,yBAAyB,CAC9B,OAAO,CAAC,UAAU,EAAE,QAAQ,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE,EAAE,SAAS,CACnE,CAAC;QACJ,KAAK,6BAA6B;YAChC,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC;gBACxB,MAAM,IAAI,kBAAkB,CAAC,6DAA6D,CAAC,CAAC;YAC9F,CAAC;YACD,OAAO,uBAAuB,CAC5B,OAAO,CAAC,UAAU,EAAE,QAAQ,EAAE,SAAS,CACxC,CAAC;QACJ,KAAK,gBAAgB;YACnB,OAAO,YAAY,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC;IACjE,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,IAAoB;IAC3C,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,+BAA+B;YAClC,OAAO,6BAA6B,CAAC;QACvC,KAAK,6BAA6B;YAChC,OAAO,gBAAgB,CAAC;QAC1B,KAAK,gBAAgB;YACnB,OAAO,IAAI,CAAC;IAChB,CAAC;AACH,CAAC;AAED,SAAS,kCAAkC,CACzC,KAAgB,EAChB,UAAkB;IAElB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI,EAAE,CAAC;YAC/B,IAAI,IAAI,CAAC,aAAa,GAAG,UAAU,EAAE,CAAC;gBACpC,IAAI,CAAC,aAAa,GAAG,UAAU,CAAC;YAClC,CAAC;YACD,IAAI,IAAI,CAAC,aAAa,GAAG,CAAC,EAAE,CAAC;gBAC3B,IAAI,CAAC,aAAa,GAAG,CAAC,CAAC;YACzB,CAAC;QACH,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { TocItem } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
 * Builds TOC items for a document that has no table of contents by
 * scanning the page content group by group.
 *
 * @param pageList - Pages of the document; each entry carries the page text.
 * @param llmClient - LLM client used to generate the items.
 * @param startIndex - Physical index assigned to the first page.
 * @returns The generated TOC items.
 */
export declare function processNoToc(
    pageList: Array<{ text: string }>,
    llmClient: LlmClient,
    startIndex?: number
): Promise<TocItem[]>;
|
|
10
|
+
//# sourceMappingURL=no-toc.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"no-toc.d.ts","sourceRoot":"","sources":["../../src/processing/no-toc.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAWjD;;;GAGG;AACH,wBAAsB,YAAY,CAChC,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,SAAS,EAAE,SAAS,EACpB,UAAU,SAAI,GACb,OAAO,CAAC,OAAO,EAAE,CAAC,CA6CpB"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { generateTocInitPrompt, generateTocContinuePrompt, } from '../llm/prompts.js';
|
|
3
|
+
import { pageListToGroupText, convertPhysicalIndexToInt, } from '../utils/page-utils.js';
|
|
4
|
+
import { countTokens } from '../utils/token-counter.js';
|
|
5
|
+
/**
 * Processes documents without any TOC.
 *
 * Every page is wrapped in a `<physical_index_N>` marker (N counts up from
 * `startIndex`), the tagged pages are grouped by token length, and the LLM
 * is asked for TOC entries group by group: the first group starts the list,
 * each later group is sent together with the entries collected so far.
 * Replies that are not arrays are ignored.
 *
 * @param pageList Pages of the document; `text` is read from each entry.
 * @param llmClient Client whose `chatJson` is called once per group.
 * @param startIndex Physical index of the first page (default 1).
 * @returns Items shaped `{ structure, title, physicalIndex }`, where
 *   `physicalIndex` is `null` when the entry's `physical_index` is falsy.
 */
export async function processNoToc(pageList, llmClient, startIndex = 1) {
    // Tag every page with its physical index marker
    const taggedPages = pageList.map((page, offset) => {
        const marker = `<physical_index_${startIndex + offset}>`;
        return `${marker}\n${page.text}\n${marker}`;
    });
    const tokenCounts = taggedPages.map((tagged) => countTokens(tagged));
    // Split the tagged pages into groups
    const groups = pageListToGroupText(taggedPages, tokenCounts);
    if (groups.length === 0) {
        return [];
    }
    // Sends one user prompt and returns the reply only if it is an array
    const requestEntries = async (prompt) => {
        const reply = await llmClient.chatJson([{ role: 'user', content: prompt }]);
        return Array.isArray(reply) ? reply : null;
    };
    // The first group seeds the list
    let collected = (await requestEntries(generateTocInitPrompt(groups[0]))) ?? [];
    // Every later group extends it, seeing what has been collected so far
    for (let groupIndex = 1; groupIndex < groups.length; groupIndex++) {
        const prompt = generateTocContinuePrompt(groups[groupIndex], JSON.stringify(collected));
        const additions = await requestEntries(prompt);
        if (additions !== null) {
            collected = [...collected, ...additions];
        }
    }
    // Turn the physical_index markers into numbers
    return collected.map((entry) => {
        let physicalIndex = null;
        if (entry.physical_index) {
            physicalIndex = convertPhysicalIndexToInt(entry.physical_index);
        }
        return {
            structure: entry.structure,
            title: entry.title,
            physicalIndex,
        };
    });
}
|
|
44
|
+
//# sourceMappingURL=no-toc.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"no-toc.js","sourceRoot":"","sources":["../../src/processing/no-toc.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EACL,qBAAqB,EACrB,yBAAyB,GAC1B,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EACL,mBAAmB,EACnB,yBAAyB,GAC1B,MAAM,wBAAwB,CAAC;AAChC,OAAO,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AAExD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,QAAiC,EACjC,SAAoB,EACpB,UAAU,GAAG,CAAC;IAEd,0BAA0B;IAC1B,MAAM,YAAY,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACzC,MAAM,GAAG,GAAG,UAAU,GAAG,CAAC,CAAC;QAC3B,OAAO,mBAAmB,GAAG,MAAM,CAAC,CAAC,IAAI,qBAAqB,GAAG,GAAG,CAAC;IACvE,CAAC,CAAC,CAAC;IACH,MAAM,YAAY,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7D,cAAc;IACd,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,EAAE,YAAY,CAAC,CAAC;IAE/D,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEnC,wCAAwC;IACxC,MAAM,UAAU,GAAG,MAAM,SAAS,CAAC,QAAQ,CAEzC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,qBAAqB,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAEjE,IAAI,QAAQ,GACV,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC;IAE9C,iCAAiC;IACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,cAAc,GAAG,MAAM,SAAS,CAAC,QAAQ,CAE7C;YACA;gBACE,IAAI,EAAE,MAAM;gBACZ,OAAO,EAAE,yBAAyB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;aACxE;SACF,CAAC,CAAC;QAEH,IAAI,KAAK,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,CAAC;YAClC,QAAQ,GAAG,CAAC,GAAG,QAAQ,EAAE,GAAG,cAAc,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,4CAA4C;IAC5C,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC7B,SAAS,EAAE,IAAI,CAAC,SAAS;QACzB,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,aAAa,EAAE,IAAI,CAAC,cAAc;YAChC,CAAC,CAAC,yBAAyB,CAAC,IAAI,CAAC,cAAc,CAAC;YAChD,CAAC,CAAC,IAAI;KACT,CAAC,CAAC,CAAC;AACN,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { TocItem } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
 * Processes a TOC that carries no page numbers.
 *
 * The TOC supplies the structure; the physical index of each entry is
 * filled in by matching the entries against the document content.
 *
 * @param tocContent - Raw TOC text.
 * @param pageList - Pages of the document; each entry carries the page text.
 * @param llmClient - LLM client used for the transformation and matching.
 * @returns The TOC items with their physical indices.
 */
export declare function processTocNoPageNumbers(
    tocContent: string,
    pageList: Array<{ text: string }>,
    llmClient: LlmClient
): Promise<TocItem[]>;
|
|
11
|
+
//# sourceMappingURL=toc-no-pages.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-no-pages.d.ts","sourceRoot":"","sources":["../../src/processing/toc-no-pages.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AASjD;;;;GAIG;AACH,wBAAsB,uBAAuB,CAC3C,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,OAAO,EAAE,CAAC,CAiDpB"}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
2
|
+
import { tocTransformer } from '../toc/toc-transformer.js';
|
|
3
|
+
import { convertPhysicalIndexToInt, pageListToGroupText, } from '../utils/page-utils.js';
|
|
4
|
+
import { countTokens } from '../utils/token-counter.js';
|
|
5
|
+
import { addPageNumberToTocPrompt } from '../llm/prompts.js';
|
|
6
|
+
/**
 * Processes TOC without page numbers.
 *
 * The TOC text is transformed into structured entries, every page is
 * wrapped in a `<physical_index_N>` marker (N counts up from 1), and the
 * tagged pages are grouped by token length. The LLM then sees each group in
 * turn together with the current entry list serialized as JSON; an array
 * reply replaces that list, any other reply is ignored.
 *
 * @param tocContent Raw TOC text handed to `tocTransformer`.
 * @param pageList Pages of the document; `text` is read from each entry.
 * @param llmClient Client used by `tocTransformer` and for one `chatJson`
 *   call per group.
 * @returns Items shaped `{ structure, title, physicalIndex }`, where
 *   `physicalIndex` is `null` when the entry's `physical_index` is falsy.
 */
export async function processTocNoPageNumbers(tocContent, pageList, llmClient) {
    // Structured TOC entries from the raw TOC text
    const tocEntries = await tocTransformer(tocContent, llmClient);
    // Tag every page with its 1-based physical index marker
    const taggedPages = pageList.map((page, offset) => {
        const marker = `<physical_index_${offset + 1}>`;
        return `${marker}\n${page.text}\n${marker}`;
    });
    const tokenCounts = taggedPages.map((tagged) => countTokens(tagged));
    // Split the tagged pages into groups
    const groups = pageListToGroupText(taggedPages, tokenCounts);
    // Start with every entry unmatched
    const unmatched = tocEntries.map((entry) => ({
        structure: entry.structure,
        title: entry.title,
        start: 'no',
        physical_index: null,
    }));
    let serialized = JSON.stringify(unmatched);
    // Let the LLM fill in matches one group at a time
    for (const group of groups) {
        const reply = await llmClient.chatJson([
            { role: 'user', content: addPageNumberToTocPrompt(group, serialized) },
        ]);
        if (!Array.isArray(reply)) {
            continue;
        }
        serialized = JSON.stringify(reply);
    }
    // Decode the final list and turn the markers into numbers
    const matched = JSON.parse(serialized);
    return matched.map((entry) => {
        let physicalIndex = null;
        if (entry.physical_index) {
            physicalIndex = convertPhysicalIndexToInt(entry.physical_index);
        }
        return {
            structure: entry.structure,
            title: entry.title,
            physicalIndex,
        };
    });
}
|
|
46
|
+
//# sourceMappingURL=toc-no-pages.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-no-pages.js","sourceRoot":"","sources":["../../src/processing/toc-no-pages.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAC3D,OAAO,EACL,yBAAyB,EACzB,mBAAmB,GACpB,MAAM,wBAAwB,CAAC;AAChC,OAAO,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AACxD,OAAO,EAAE,wBAAwB,EAAE,MAAM,mBAAmB,CAAC;AAE7D;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC3C,UAAkB,EAClB,QAAiC,EACjC,SAAoB;IAEpB,mCAAmC;IACnC,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,UAAU,EAAE,SAAS,CAAC,CAAC;IAE7D,uCAAuC;IACvC,MAAM,YAAY,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACzC,OAAO,mBAAmB,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,IAAI,qBAAqB,CAAC,GAAG,CAAC,GAAG,CAAC;IAC3E,CAAC,CAAC,CAAC;IACH,MAAM,YAAY,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7D,cAAc;IACd,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,EAAE,YAAY,CAAC,CAAC;IAE/D,+CAA+C;IAC/C,IAAI,gBAAgB,GAAG,IAAI,CAAC,SAAS,CACnC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC,CAAC;QACtC,SAAS;QACT,KAAK;QACL,KAAK,EAAE,IAAI;QACX,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC,CACJ,CAAC;IAEF,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,QAAQ,CAErC;YACA,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,wBAAwB,CAAC,KAAK,EAAE,gBAAgB,CAAC,EAAE;SAC7E,CAAC,CAAC;QAEH,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,gBAAgB,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED,kDAAkD;IAClD,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAI7C,CAAC;IAEH,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAChC,SAAS,EAAE,IAAI,CAAC,SAAS;QACzB,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,aAAa,EAAE,IAAI,CAAC,cAAc;YAChC,CAAC,CAAC,yBAAyB,CAAC,IAAI,CAAC,cAAc,CAAC;YAChD,CAAC,CAAC,IAAI;KACT,CAAC,CAAC,CAAC;AACN,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { TocItem } from '../types.js';
|
|
2
|
+
import { LlmClient } from '../llm/llm-client.js';
|
|
3
|
+
/**
 * Processes a TOC that carries page numbers.
 *
 * Algorithm:
 *  1. tocTransformer -> structured JSON with a page field
 *  2. Remove page -> tocNoPageNumber
 *  3. Extract physical indices from the main content
 *  4. Match page/physicalIndex pairs -> calculate the offset
 *  5. Apply the offset to all entries
 *
 * @param tocContent - Raw TOC text.
 * @param pageList - Pages of the document; each entry carries the page text.
 * @param tocPageList - Page numbers associated with the TOC.
 * @param llmClient - LLM client used during processing.
 * @returns The TOC items with their physical indices.
 */
export declare function processTocWithPageNumbers(
    tocContent: string,
    pageList: Array<{ text: string }>,
    tocPageList: number[],
    llmClient: LlmClient
): Promise<TocItem[]>;
|
|
15
|
+
//# sourceMappingURL=toc-with-pages.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toc-with-pages.d.ts","sourceRoot":"","sources":["../../src/processing/toc-with-pages.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAIjD;;;;;;;;GAQG;AACH,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,WAAW,EAAE,MAAM,EAAE,EACrB,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,OAAO,EAAE,CAAC,CA6CpB"}
|