@fastrag/pageindex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +251 -0
  3. package/README.zh-CN.md +251 -0
  4. package/dist/errors/index.d.ts +10 -0
  5. package/dist/errors/index.d.ts.map +1 -0
  6. package/dist/errors/index.js +19 -0
  7. package/dist/errors/index.js.map +1 -0
  8. package/dist/index.d.ts +14 -0
  9. package/dist/index.d.ts.map +1 -0
  10. package/dist/index.js +20 -0
  11. package/dist/index.js.map +1 -0
  12. package/dist/internal-types/config.d.ts +35 -0
  13. package/dist/internal-types/config.d.ts.map +1 -0
  14. package/dist/internal-types/config.js +16 -0
  15. package/dist/internal-types/config.js.map +1 -0
  16. package/dist/internal-types/document-parser.d.ts +5 -0
  17. package/dist/internal-types/document-parser.d.ts.map +1 -0
  18. package/dist/internal-types/document-parser.js +2 -0
  19. package/dist/internal-types/document-parser.js.map +1 -0
  20. package/dist/internal-types/index.d.ts +9 -0
  21. package/dist/internal-types/index.d.ts.map +1 -0
  22. package/dist/internal-types/index.js +2 -0
  23. package/dist/internal-types/index.js.map +1 -0
  24. package/dist/internal-types/llm-provider.d.ts +19 -0
  25. package/dist/internal-types/llm-provider.d.ts.map +1 -0
  26. package/dist/internal-types/llm-provider.js +2 -0
  27. package/dist/internal-types/llm-provider.js.map +1 -0
  28. package/dist/internal-types/logger.d.ts +7 -0
  29. package/dist/internal-types/logger.d.ts.map +1 -0
  30. package/dist/internal-types/logger.js +2 -0
  31. package/dist/internal-types/logger.js.map +1 -0
  32. package/dist/internal-types/page.d.ts +5 -0
  33. package/dist/internal-types/page.d.ts.map +1 -0
  34. package/dist/internal-types/page.js +2 -0
  35. package/dist/internal-types/page.js.map +1 -0
  36. package/dist/internal-types/processing.d.ts +21 -0
  37. package/dist/internal-types/processing.d.ts.map +1 -0
  38. package/dist/internal-types/processing.js +2 -0
  39. package/dist/internal-types/processing.js.map +1 -0
  40. package/dist/internal-types/tree-node.d.ts +30 -0
  41. package/dist/internal-types/tree-node.d.ts.map +1 -0
  42. package/dist/internal-types/tree-node.js +2 -0
  43. package/dist/internal-types/tree-node.js.map +1 -0
  44. package/dist/llm/index.d.ts +3 -0
  45. package/dist/llm/index.d.ts.map +1 -0
  46. package/dist/llm/index.js +3 -0
  47. package/dist/llm/index.js.map +1 -0
  48. package/dist/llm/llm-client.d.ts +26 -0
  49. package/dist/llm/llm-client.d.ts.map +1 -0
  50. package/dist/llm/llm-client.js +88 -0
  51. package/dist/llm/llm-client.js.map +1 -0
  52. package/dist/llm/prompts.d.ts +33 -0
  53. package/dist/llm/prompts.d.ts.map +1 -0
  54. package/dist/llm/prompts.js +312 -0
  55. package/dist/llm/prompts.js.map +1 -0
  56. package/dist/markdown/index.d.ts +6 -0
  57. package/dist/markdown/index.d.ts.map +1 -0
  58. package/dist/markdown/index.js +5 -0
  59. package/dist/markdown/index.js.map +1 -0
  60. package/dist/markdown/md-extractor.d.ts +14 -0
  61. package/dist/markdown/md-extractor.d.ts.map +1 -0
  62. package/dist/markdown/md-extractor.js +30 -0
  63. package/dist/markdown/md-extractor.js.map +1 -0
  64. package/dist/markdown/md-to-tree.d.ts +8 -0
  65. package/dist/markdown/md-to-tree.d.ts.map +1 -0
  66. package/dist/markdown/md-to-tree.js +20 -0
  67. package/dist/markdown/md-to-tree.js.map +1 -0
  68. package/dist/markdown/md-tree-builder.d.ts +7 -0
  69. package/dist/markdown/md-tree-builder.d.ts.map +1 -0
  70. package/dist/markdown/md-tree-builder.js +36 -0
  71. package/dist/markdown/md-tree-builder.js.map +1 -0
  72. package/dist/markdown/tree-thinning.d.ts +8 -0
  73. package/dist/markdown/tree-thinning.d.ts.map +1 -0
  74. package/dist/markdown/tree-thinning.js +42 -0
  75. package/dist/markdown/tree-thinning.js.map +1 -0
  76. package/dist/page-index.d.ts +10 -0
  77. package/dist/page-index.d.ts.map +1 -0
  78. package/dist/page-index.js +54 -0
  79. package/dist/page-index.js.map +1 -0
  80. package/dist/post-processing/doc-description.d.ts +12 -0
  81. package/dist/post-processing/doc-description.d.ts.map +1 -0
  82. package/dist/post-processing/doc-description.js +31 -0
  83. package/dist/post-processing/doc-description.js.map +1 -0
  84. package/dist/post-processing/index.d.ts +5 -0
  85. package/dist/post-processing/index.d.ts.map +1 -0
  86. package/dist/post-processing/index.js +5 -0
  87. package/dist/post-processing/index.js.map +1 -0
  88. package/dist/post-processing/node-id.d.ts +7 -0
  89. package/dist/post-processing/node-id.d.ts.map +1 -0
  90. package/dist/post-processing/node-id.js +20 -0
  91. package/dist/post-processing/node-id.js.map +1 -0
  92. package/dist/post-processing/node-text.d.ts +11 -0
  93. package/dist/post-processing/node-text.d.ts.map +1 -0
  94. package/dist/post-processing/node-text.js +37 -0
  95. package/dist/post-processing/node-text.js.map +1 -0
  96. package/dist/post-processing/summary.d.ts +7 -0
  97. package/dist/post-processing/summary.d.ts.map +1 -0
  98. package/dist/post-processing/summary.js +31 -0
  99. package/dist/post-processing/summary.js.map +1 -0
  100. package/dist/processing/index.d.ts +6 -0
  101. package/dist/processing/index.d.ts.map +1 -0
  102. package/dist/processing/index.js +6 -0
  103. package/dist/processing/index.js.map +1 -0
  104. package/dist/processing/large-node.d.ts +9 -0
  105. package/dist/processing/large-node.d.ts.map +1 -0
  106. package/dist/processing/large-node.js +40 -0
  107. package/dist/processing/large-node.js.map +1 -0
  108. package/dist/processing/meta-processor.d.ts +19 -0
  109. package/dist/processing/meta-processor.d.ts.map +1 -0
  110. package/dist/processing/meta-processor.js +91 -0
  111. package/dist/processing/meta-processor.js.map +1 -0
  112. package/dist/processing/no-toc.d.ts +10 -0
  113. package/dist/processing/no-toc.d.ts.map +1 -0
  114. package/dist/processing/no-toc.js +44 -0
  115. package/dist/processing/no-toc.js.map +1 -0
  116. package/dist/processing/toc-no-pages.d.ts +11 -0
  117. package/dist/processing/toc-no-pages.d.ts.map +1 -0
  118. package/dist/processing/toc-no-pages.js +46 -0
  119. package/dist/processing/toc-no-pages.js.map +1 -0
  120. package/dist/processing/toc-with-pages.d.ts +15 -0
  121. package/dist/processing/toc-with-pages.d.ts.map +1 -0
  122. package/dist/processing/toc-with-pages.js +151 -0
  123. package/dist/processing/toc-with-pages.js.map +1 -0
  124. package/dist/toc/index.d.ts +4 -0
  125. package/dist/toc/index.d.ts.map +1 -0
  126. package/dist/toc/index.js +4 -0
  127. package/dist/toc/index.js.map +1 -0
  128. package/dist/toc/toc-detector.d.ts +23 -0
  129. package/dist/toc/toc-detector.d.ts.map +1 -0
  130. package/dist/toc/toc-detector.js +65 -0
  131. package/dist/toc/toc-detector.js.map +1 -0
  132. package/dist/toc/toc-extractor.d.ts +13 -0
  133. package/dist/toc/toc-extractor.d.ts.map +1 -0
  134. package/dist/toc/toc-extractor.js +32 -0
  135. package/dist/toc/toc-extractor.js.map +1 -0
  136. package/dist/toc/toc-transformer.d.ts +11 -0
  137. package/dist/toc/toc-transformer.d.ts.map +1 -0
  138. package/dist/toc/toc-transformer.js +69 -0
  139. package/dist/toc/toc-transformer.js.map +1 -0
  140. package/dist/tree/index.d.ts +4 -0
  141. package/dist/tree/index.d.ts.map +1 -0
  142. package/dist/tree/index.js +4 -0
  143. package/dist/tree/index.js.map +1 -0
  144. package/dist/tree/list-to-tree.d.ts +7 -0
  145. package/dist/tree/list-to-tree.d.ts.map +1 -0
  146. package/dist/tree/list-to-tree.js +33 -0
  147. package/dist/tree/list-to-tree.js.map +1 -0
  148. package/dist/tree/post-processing.d.ts +12 -0
  149. package/dist/tree/post-processing.d.ts.map +1 -0
  150. package/dist/tree/post-processing.js +87 -0
  151. package/dist/tree/post-processing.js.map +1 -0
  152. package/dist/tree/tree-utils.d.ts +18 -0
  153. package/dist/tree/tree-utils.d.ts.map +1 -0
  154. package/dist/tree/tree-utils.js +43 -0
  155. package/dist/tree/tree-utils.js.map +1 -0
  156. package/dist/tree-parser.d.ts +30 -0
  157. package/dist/tree-parser.d.ts.map +1 -0
  158. package/dist/tree-parser.js +73 -0
  159. package/dist/tree-parser.js.map +1 -0
  160. package/dist/types.d.ts +3 -0
  161. package/dist/types.d.ts.map +1 -0
  162. package/dist/types.js +2 -0
  163. package/dist/types.js.map +1 -0
  164. package/dist/utils/config-loader.d.ts +15 -0
  165. package/dist/utils/config-loader.d.ts.map +1 -0
  166. package/dist/utils/config-loader.js +19 -0
  167. package/dist/utils/config-loader.js.map +1 -0
  168. package/dist/utils/index.d.ts +7 -0
  169. package/dist/utils/index.d.ts.map +1 -0
  170. package/dist/utils/index.js +6 -0
  171. package/dist/utils/index.js.map +1 -0
  172. package/dist/utils/json-parser.d.ts +2 -0
  173. package/dist/utils/json-parser.d.ts.map +1 -0
  174. package/dist/utils/json-parser.js +76 -0
  175. package/dist/utils/json-parser.js.map +1 -0
  176. package/dist/utils/logger.d.ts +3 -0
  177. package/dist/utils/logger.d.ts.map +1 -0
  178. package/dist/utils/logger.js +10 -0
  179. package/dist/utils/logger.js.map +1 -0
  180. package/dist/utils/page-utils.d.ts +16 -0
  181. package/dist/utils/page-utils.d.ts.map +1 -0
  182. package/dist/utils/page-utils.js +56 -0
  183. package/dist/utils/page-utils.js.map +1 -0
  184. package/dist/utils/token-counter.d.ts +2 -0
  185. package/dist/utils/token-counter.d.ts.map +1 -0
  186. package/dist/utils/token-counter.js +5 -0
  187. package/dist/utils/token-counter.js.map +1 -0
  188. package/dist/vector-lib/adapters/in-memory-adapter.d.ts +14 -0
  189. package/dist/vector-lib/adapters/in-memory-adapter.d.ts.map +1 -0
  190. package/dist/vector-lib/adapters/in-memory-adapter.js +55 -0
  191. package/dist/vector-lib/adapters/in-memory-adapter.js.map +1 -0
  192. package/dist/vector-lib/adapters/vector-store.d.ts +10 -0
  193. package/dist/vector-lib/adapters/vector-store.d.ts.map +1 -0
  194. package/dist/vector-lib/adapters/vector-store.js +2 -0
  195. package/dist/vector-lib/adapters/vector-store.js.map +1 -0
  196. package/dist/vector-lib/chunker/tree-chunker.d.ts +8 -0
  197. package/dist/vector-lib/chunker/tree-chunker.d.ts.map +1 -0
  198. package/dist/vector-lib/chunker/tree-chunker.js +59 -0
  199. package/dist/vector-lib/chunker/tree-chunker.js.map +1 -0
  200. package/dist/vector-lib/embedder/embedder.d.ts +8 -0
  201. package/dist/vector-lib/embedder/embedder.d.ts.map +1 -0
  202. package/dist/vector-lib/embedder/embedder.js +2 -0
  203. package/dist/vector-lib/embedder/embedder.js.map +1 -0
  204. package/dist/vector-lib/index.d.ts +10 -0
  205. package/dist/vector-lib/index.d.ts.map +1 -0
  206. package/dist/vector-lib/index.js +6 -0
  207. package/dist/vector-lib/index.js.map +1 -0
  208. package/dist/vector-lib/search/hybrid-search.d.ts +19 -0
  209. package/dist/vector-lib/search/hybrid-search.d.ts.map +1 -0
  210. package/dist/vector-lib/search/hybrid-search.js +25 -0
  211. package/dist/vector-lib/search/hybrid-search.js.map +1 -0
  212. package/dist/vector-lib/search/reranker.d.ts +14 -0
  213. package/dist/vector-lib/search/reranker.d.ts.map +1 -0
  214. package/dist/vector-lib/search/reranker.js +2 -0
  215. package/dist/vector-lib/search/reranker.js.map +1 -0
  216. package/dist/vector-lib/types.d.ts +29 -0
  217. package/dist/vector-lib/types.d.ts.map +1 -0
  218. package/dist/vector-lib/types.js +2 -0
  219. package/dist/vector-lib/types.js.map +1 -0
  220. package/dist/vector-lib/vector-enhancer.d.ts +28 -0
  221. package/dist/vector-lib/vector-enhancer.d.ts.map +1 -0
  222. package/dist/vector-lib/vector-enhancer.js +54 -0
  223. package/dist/vector-lib/vector-enhancer.js.map +1 -0
  224. package/dist/vector.d.ts +5 -0
  225. package/dist/vector.d.ts.map +1 -0
  226. package/dist/vector.js +3 -0
  227. package/dist/vector.js.map +1 -0
  228. package/dist/verification/fix-toc.d.ts +13 -0
  229. package/dist/verification/fix-toc.d.ts.map +1 -0
  230. package/dist/verification/fix-toc.js +73 -0
  231. package/dist/verification/fix-toc.js.map +1 -0
  232. package/dist/verification/index.d.ts +3 -0
  233. package/dist/verification/index.d.ts.map +1 -0
  234. package/dist/verification/index.js +3 -0
  235. package/dist/verification/index.js.map +1 -0
  236. package/dist/verification/verify-toc.d.ts +17 -0
  237. package/dist/verification/verify-toc.d.ts.map +1 -0
  238. package/dist/verification/verify-toc.js +64 -0
  239. package/dist/verification/verify-toc.js.map +1 -0
  240. package/package.json +58 -0
@@ -0,0 +1,59 @@
1
/**
 * Splits a PageIndexResult tree into chunks suitable for vector indexing.
 *
 * The tree is walked depth-first in document order. Every node that carries a
 * non-empty `text` (at any depth, not only leaves) is passed through
 * `splitText` and yields one chunk per returned piece.
 *
 * @param result - Object exposing `docName` and a `structure` array of root nodes.
 * @param config - Optional settings; only `chunkMaxTokens` (default 1000) is read here.
 * @returns Flat array of `{ id, text, metadata }` chunks.
 */
export const treeChunker = (result, config = {}) => {
    const maxChunkTokens = config.chunkMaxTokens ?? 1000;
    const chunks = [];
    const docName = result.docName;
    function processNode(node) {
        if (node.text) {
            // Simple chunking: split by paragraphs if text is too long
            const chunkTexts = splitText(node.text, maxChunkTokens);
            for (let i = 0; i < chunkTexts.length; i++) {
                chunks.push({
                    // NOTE(review): every node without a nodeId shares the 'unknown'
                    // segment, so such nodes produce colliding chunk ids.
                    id: `${docName}:${node.nodeId ?? 'unknown'}:${i}`,
                    text: chunkTexts[i],
                    metadata: {
                        docName,
                        nodeId: node.nodeId ?? '',
                        title: node.title,
                        startIndex: node.startIndex,
                        endIndex: node.endIndex,
                        chunkIndex: i,
                    },
                });
            }
        }
        // A node with no `nodes` array is treated as having no children
        // instead of raising "undefined is not iterable".
        for (const child of node.nodes ?? []) {
            processNode(child);
        }
    }
    for (const node of result.structure) {
        processNode(node);
    }
    return chunks;
};
38
/**
 * Splits `text` into pieces that each fit a rough character budget.
 *
 * The budget is `maxTokens * 4` characters (1 token is estimated as 4 chars).
 * Text within the budget is returned unchanged as a single piece. Otherwise
 * paragraphs (separated by one or more blank lines) are packed greedily, and a
 * paragraph that alone exceeds the budget is cut into fixed-size slices so no
 * oversized piece is emitted.
 *
 * @param text - Text to split.
 * @param maxTokens - Approximate token budget per piece.
 * @returns Non-empty array of pieces; falls back to `[text]` when nothing else was produced.
 */
function splitText(text, maxTokens) {
    // Rough estimate: 1 token ≈ 4 chars
    const maxChars = maxTokens * 4;
    if (text.length <= maxChars) {
        return [text];
    }
    // Slicing needs a positive whole step; without one, oversized paragraphs
    // are kept intact rather than looping forever on a zero-width slice.
    const sliceSize = Math.floor(maxChars);
    const canHardSplit = Number.isFinite(sliceSize) && sliceSize >= 1;
    const chunks = [];
    let current = '';
    const flush = () => {
        const trimmed = current.trim();
        // Whitespace-only buffers carry nothing worth embedding.
        if (trimmed) {
            chunks.push(trimmed);
        }
        current = '';
    };
    for (const para of text.split(/\n\n+/)) {
        if (canHardSplit && para.length > maxChars) {
            // The paragraph cannot fit in any chunk: emit what is buffered,
            // then cut the paragraph itself into budget-sized slices.
            flush();
            for (let offset = 0; offset < para.length; offset += sliceSize) {
                const piece = para.slice(offset, offset + sliceSize).trim();
                if (piece) {
                    chunks.push(piece);
                }
            }
            continue;
        }
        if (current.length > 0 && current.length + para.length > maxChars) {
            flush();
        }
        current += (current ? '\n\n' : '') + para;
    }
    flush();
    return chunks.length > 0 ? chunks : [text];
}
59
+ //# sourceMappingURL=tree-chunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tree-chunker.js","sourceRoot":"","sources":["../../../src/vector-lib/chunker/tree-chunker.ts"],"names":[],"mappings":"AAGA;;;GAGG;AACH,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,CAC1B,MAAuB,EACvB,SAAuB,EAAE,EAChB,EAAE;IACX,MAAM,cAAc,GAAG,MAAM,CAAC,cAAc,IAAI,IAAI,CAAC;IACrD,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC;IAE/B,SAAS,WAAW,CAAC,IAAc;QACjC,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YACd,2DAA2D;YAC3D,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC;YACvB,MAAM,UAAU,GAAG,SAAS,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;YAEnD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC3C,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,GAAG,OAAO,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,IAAI,CAAC,EAAE;oBACjD,IAAI,EAAE,UAAU,CAAC,CAAC,CAAC;oBACnB,QAAQ,EAAE;wBACR,OAAO;wBACP,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,EAAE;wBACzB,KAAK,EAAE,IAAI,CAAC,KAAK;wBACjB,UAAU,EAAE,IAAI,CAAC,UAAU;wBAC3B,QAAQ,EAAE,IAAI,CAAC,QAAQ;wBACvB,UAAU,EAAE,CAAC;qBACd;iBACF,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YAC/B,WAAW,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACpC,WAAW,CAAC,IAAI,CAAC,CAAC;IACpB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC,CAAmB,CAAC;AAErB,SAAS,SAAS,CAAC,IAAY,EAAE,SAAiB;IAChD,oCAAoC;IACpC,MAAM,QAAQ,GAAG,SAAS,GAAG,CAAC,CAAC;IAE/B,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ,EAAE,CAAC;QAC5B,OAAO,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC;IAED,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IACvC,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,OAAO,GAAG,EAAE,CAAC;IAEjB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;YAC5B,OAAO,GAAG,EAAE,CAAC;QACf,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC;IAC5C,CAAC;IAED,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QACnB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IAC9B,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,
CAAC,CAAC,IAAI,CAAC,CAAC;AAC7C,CAAC"}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Embedder interface — user implements this.
+ * VectorEnhancer calls `embed` with all chunk texts of a document at once
+ * (and with a single-element array for a query) and pairs result `i` with input `i`.
3
+ */
4
+ export interface Embedder {
5
+ embed(texts: string[]): Promise<number[][]>; // callers expect one vector per input text, in input order
6
+ readonly dimension: number; // length of the zero vector VectorEnhancer.deleteDocument builds for its filter-only search
7
+ }
8
+ //# sourceMappingURL=embedder.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedder.d.ts","sourceRoot":"","sources":["../../../src/vector-lib/embedder/embedder.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;IAC5C,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=embedder.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedder.js","sourceRoot":"","sources":["../../../src/vector-lib/embedder/embedder.ts"],"names":[],"mappings":""}
@@ -0,0 +1,10 @@
1
+ export type { VectorConfig, SearchResult, Chunk, VectorRecord, Chunker, } from './types.js';
2
+ export type { Embedder } from './embedder/embedder.js';
3
+ export type { VectorStore } from './adapters/vector-store.js';
4
+ export type { Reranker } from './search/reranker.js';
5
+ export { InMemoryAdapter } from './adapters/in-memory-adapter.js';
6
+ export { treeChunker } from './chunker/tree-chunker.js';
7
+ export { VectorEnhancer } from './vector-enhancer.js';
8
+ export { HybridSearch } from './search/hybrid-search.js';
9
+ export type { HybridSearchConfig } from './search/hybrid-search.js';
10
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/vector-lib/index.ts"],"names":[],"mappings":"AACA,YAAY,EACV,YAAY,EACZ,YAAY,EACZ,KAAK,EACL,YAAY,EACZ,OAAO,GACR,MAAM,YAAY,CAAC;AAGpB,YAAY,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAC;AACvD,YAAY,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAC;AAC9D,YAAY,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAGrD,OAAO,EAAE,eAAe,EAAE,MAAM,iCAAiC,CAAC;AAClE,OAAO,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AACxD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AACzD,YAAY,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC"}
@@ -0,0 +1,6 @@
1
+ // Implementations
2
+ export { InMemoryAdapter } from './adapters/in-memory-adapter.js';
3
+ export { treeChunker } from './chunker/tree-chunker.js';
4
+ export { VectorEnhancer } from './vector-enhancer.js';
5
+ export { HybridSearch } from './search/hybrid-search.js';
6
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/vector-lib/index.ts"],"names":[],"mappings":"AAcA,kBAAkB;AAClB,OAAO,EAAE,eAAe,EAAE,MAAM,iCAAiC,CAAC;AAClE,OAAO,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AACxD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC"}
@@ -0,0 +1,19 @@
1
+ import type { SearchResult } from '../types.js';
2
+ import { VectorEnhancer } from '../vector-enhancer.js';
3
+ export interface HybridSearchConfig { // optional tuning for HybridSearch; omitted fields get defaults in its constructor
4
+ vectorTopK?: number; // number of candidates requested from VectorEnhancer.search (constructor default: 20)
5
+ rerankTopK?: number; // maximum number of results HybridSearch.search returns (constructor default: 5)
6
+ }
7
+ /**
8
+ * Hybrid search combining vector similarity with tree structure awareness.
+ * NOTE(review): the compiled implementation in this package version only
+ * truncates the vector results; no tree-aware step is visible there.
9
+ */
10
+ export declare class HybridSearch {
11
+ private readonly enhancer; // VectorEnhancer that performs the actual vector query
12
+ private readonly config; // resolved HybridSearchConfig with defaults applied
13
+ constructor(enhancer: VectorEnhancer, config?: HybridSearchConfig);
14
+ /**
15
+ * Search with vector retrieval, then rerank/filter to top results.
+ * Requests `vectorTopK` candidates and returns at most `rerankTopK` of them,
+ * in the order the enhancer returned them.
16
+ */
17
+ search(query: string, filter?: Record<string, unknown>): Promise<SearchResult[]>;
18
+ }
19
+ //# sourceMappingURL=hybrid-search.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hybrid-search.d.ts","sourceRoot":"","sources":["../../../src/vector-lib/search/hybrid-search.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAEvD,MAAM,WAAW,kBAAkB;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAiB;IAC1C,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA+B;gBAE1C,QAAQ,EAAE,cAAc,EAAE,MAAM,GAAE,kBAAuB;IAQrE;;OAEG;IACG,MAAM,CACV,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC/B,OAAO,CAAC,YAAY,EAAE,CAAC;CAW3B"}
@@ -0,0 +1,25 @@
1
+ import { VectorEnhancer } from '../vector-enhancer.js';
2
/**
 * Thin search front-end over a VectorEnhancer.
 *
 * It asks the enhancer for a wider candidate set (`vectorTopK`) and hands back
 * only the first `rerankTopK` entries of what the enhancer returned.
 */
export class HybridSearch {
    enhancer;
    config;
    /**
     * @param enhancer - Object whose `search(query, topK, filter)` performs the vector query.
     * @param config - Optional `{ vectorTopK, rerankTopK }`; defaults are 20 and 5.
     */
    constructor(enhancer, config = {}) {
        const { vectorTopK, rerankTopK } = config;
        this.enhancer = enhancer;
        this.config = {
            vectorTopK: vectorTopK ?? 20,
            rerankTopK: rerankTopK ?? 5,
        };
    }
    /**
     * Runs the vector query and truncates the candidate list.
     *
     * @param query - Query text forwarded to the enhancer.
     * @param filter - Optional filter forwarded unchanged to the enhancer.
     * @returns The first `rerankTopK` candidates, in the enhancer's order.
     */
    async search(query, filter) {
        const candidates = await this.enhancer.search(query, this.config.vectorTopK, filter);
        // No reranker model is applied here: the enhancer's ordering is kept
        // and the list is simply cut to the configured size.
        const limit = this.config.rerankTopK;
        return candidates.slice(0, limit);
    }
}
25
+ //# sourceMappingURL=hybrid-search.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hybrid-search.js","sourceRoot":"","sources":["../../../src/vector-lib/search/hybrid-search.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAOvD;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,QAAQ,CAAiB;IACzB,MAAM,CAA+B;IAEtD,YAAY,QAAwB,EAAE,SAA6B,EAAE;QACnE,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,MAAM,GAAG;YACZ,UAAU,EAAE,MAAM,CAAC,UAAU,IAAI,EAAE;YACnC,UAAU,EAAE,MAAM,CAAC,UAAU,IAAI,CAAC;SACnC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM,CACV,KAAa,EACb,MAAgC;QAEhC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,MAAM,CACxC,KAAK,EACL,IAAI,CAAC,MAAM,CAAC,UAAU,EACtB,MAAM,CACP,CAAC;QAEF,mDAAmD;QACnD,mDAAmD;QACnD,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;IAClD,CAAC;CACF"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Reranker interface — user implements this for custom reranking.
+ * Takes the query, candidate documents with their current scores, and a
+ * result limit; resolves to `{ id, score }` pairs.
+ * NOTE(review): the HybridSearch implementation in this package version does
+ * not call a Reranker; wiring is presumably left to the caller — confirm.
3
+ */
4
+ export interface Reranker {
5
+ rerank(query: string, documents: Array<{
6
+ id: string;
7
+ text: string;
8
+ score: number;
9
+ }>, topK: number): Promise<Array<{
10
+ id: string;
11
+ score: number;
12
+ }>>;
13
+ }
14
+ //# sourceMappingURL=reranker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reranker.d.ts","sourceRoot":"","sources":["../../../src/vector-lib/search/reranker.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,MAAM,CACJ,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,KAAK,CAAC;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,EAC7D,IAAI,EAAE,MAAM,GACX,OAAO,CAAC,KAAK,CAAC;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC,CAAC;CAClD"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=reranker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reranker.js","sourceRoot":"","sources":["../../../src/vector-lib/search/reranker.ts"],"names":[],"mappings":""}
@@ -0,0 +1,29 @@
1
+ import type { PageIndexResult } from '../types.js';
2
+ export interface VectorRecord { // one entry written to the vector store by VectorEnhancer.index
3
+ id: string; // set from the chunk id
4
+ vector: number[]; // embedding returned by the Embedder for the chunk text
5
+ payload: Record<string, unknown>; // set from the chunk metadata
6
+ }
7
+ export interface SearchResult { // one hit as returned by VectorEnhancer.search and HybridSearch.search
8
+ id: string; // id of the matching record
9
+ score: number; // produced by the store; presumably higher means more similar — confirm against the VectorStore adapter
10
+ payload: Record<string, unknown>; // payload stored with the record
11
+ }
12
+ export interface VectorConfig { // options VectorEnhancer forwards to its chunker
13
+ chunkMaxTokens?: number; // treeChunker default is 1000; converted to a character budget at roughly 4 chars per token
14
+ chunkOverlap?: number; // NOTE(review): not read by the treeChunker implementation in this package version
15
+ }
16
+ export interface Chunk { // unit of text handed to the Embedder; produced by a Chunker
17
+ id: string; // treeChunker builds this as `${docName}:${nodeId ?? 'unknown'}:${chunkIndex}`
18
+ text: string; // chunk text that gets embedded
19
+ metadata: { // stored as the vector record payload by VectorEnhancer.index
20
+ docName: string; // copied from the PageIndexResult
21
+ nodeId: string; // treeChunker writes '' when the source node has no nodeId
22
+ title: string; // copied from the source node
23
+ startIndex?: number; // copied from the source node
24
+ endIndex?: number; // copied from the source node
25
+ chunkIndex: number; // zero-based position of this chunk within its source node
26
+ };
27
+ }
28
+ export type Chunker = (result: PageIndexResult, config?: VectorConfig) => Chunk[]; // synchronous chunking strategy; VectorEnhancer.index calls it with its own config
29
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/vector-lib/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAEnD,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAClC;AAED,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAClC;AAED,MAAM,WAAW,YAAY;IAC3B,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,KAAK;IACpB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE;QACR,OAAO,EAAE,MAAM,CAAC;QAChB,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,EAAE,MAAM,CAAC;QACd,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAED,MAAM,MAAM,OAAO,GAAG,CACpB,MAAM,EAAE,eAAe,EACvB,MAAM,CAAC,EAAE,YAAY,KAClB,KAAK,EAAE,CAAC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/vector-lib/types.ts"],"names":[],"mappings":""}
@@ -0,0 +1,28 @@
1
+ import type { PageIndexResult } from '../types.js';
2
+ import type { Embedder } from './embedder/embedder.js';
3
+ import type { VectorStore } from './adapters/vector-store.js';
4
+ import type { SearchResult, VectorConfig, Chunker } from './types.js';
5
+ /**
6
+ * VectorEnhancer: indexes PageIndexResult into a vector store
7
+ * and provides search capabilities.
+ * The store, embedder and chunker are all supplied by the caller.
8
+ */
9
+ export declare class VectorEnhancer {
10
+ private readonly store; // VectorStore used for upsert, search and delete
11
+ private readonly embedder; // turns chunk texts and queries into vectors
12
+ private readonly chunker; // turns a PageIndexResult into chunks
13
+ private readonly config; // passed to the chunker on every index call
14
+ constructor(store: VectorStore, embedder: Embedder, chunker: Chunker, config?: VectorConfig);
15
+ /**
16
+ * Index a PageIndexResult into the vector store.
+ * Resolves to the number of chunks produced (0 when the chunker yields none).
17
+ */
18
+ index(result: PageIndexResult): Promise<number>;
19
+ /**
20
+ * Search the vector store with a text query.
+ * `topK` defaults to 5 in the implementation; `filter` is forwarded to the store.
21
+ */
22
+ search(query: string, topK?: number, filter?: Record<string, unknown>): Promise<SearchResult[]>;
23
+ /**
24
+ * Delete all chunks for a document.
+ * Implemented as a `{ docName }`-filtered store search followed by a delete
+ * of the returned ids, so it relies on the store honouring that filter.
25
+ */
26
+ deleteDocument(docName: string): Promise<void>;
27
+ }
28
+ //# sourceMappingURL=vector-enhancer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vector-enhancer.d.ts","sourceRoot":"","sources":["../../src/vector-lib/vector-enhancer.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACnD,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAC;AACvD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAC;AAC9D,OAAO,KAAK,EAAE,YAAY,EAAE,YAAY,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAEtE;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAc;IACpC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAW;IACpC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAU;IAClC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAe;gBAGpC,KAAK,EAAE,WAAW,EAClB,QAAQ,EAAE,QAAQ,EAClB,OAAO,EAAE,OAAO,EAChB,MAAM,GAAE,YAAiB;IAQ3B;;OAEG;IACG,KAAK,CAAC,MAAM,EAAE,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC;IAkBrD;;OAEG;IACG,MAAM,CACV,KAAK,EAAE,MAAM,EACb,IAAI,SAAI,EACR,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC/B,OAAO,CAAC,YAAY,EAAE,CAAC;IAK1B;;OAEG;IACG,cAAc,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;CAcrD"}
@@ -0,0 +1,54 @@
1
/**
 * VectorEnhancer: indexes PageIndexResult into a vector store
 * and provides search capabilities.
 *
 * All collaborators are injected: `store` (upsert/search/delete), `embedder`
 * (`embed(texts)` plus `dimension`) and `chunker` (result -> chunks).
 */
export class VectorEnhancer {
    store;
    embedder;
    chunker;
    config;
    /**
     * @param store - Vector store used for upsert, search and delete.
     * @param embedder - Produces one vector per input text.
     * @param chunker - Function turning a PageIndexResult into chunks.
     * @param config - Options handed to the chunker on every `index` call.
     */
    constructor(store, embedder, chunker, config = {}) {
        this.store = store;
        this.embedder = embedder;
        this.chunker = chunker;
        this.config = config;
    }
    /**
     * Index a PageIndexResult into the vector store.
     *
     * @param result - Document tree to chunk, embed and upsert.
     * @returns Number of chunks written (0 when the chunker yields none).
     * @throws Error when the embedder does not return exactly one vector per chunk.
     */
    async index(result) {
        const chunks = this.chunker(result, this.config);
        if (chunks.length === 0)
            return 0;
        const texts = chunks.map((c) => c.text);
        const embeddings = await this.embedder.embed(texts);
        // Records are paired with embeddings by position; a count mismatch
        // would otherwise upsert records whose vector is undefined.
        if (embeddings?.length !== chunks.length) {
            throw new Error(`Embedder returned ${embeddings?.length ?? 0} vectors for ${chunks.length} chunks`);
        }
        const records = chunks.map((chunk, i) => ({
            id: chunk.id,
            vector: embeddings[i],
            payload: chunk.metadata,
        }));
        await this.store.upsert(records);
        return chunks.length;
    }
    /**
     * Search the vector store with a text query.
     *
     * @param query - Text to embed and search for.
     * @param topK - Maximum number of results requested from the store (default 5).
     * @param filter - Optional filter forwarded unchanged to the store.
     * @returns Whatever the store's `search` resolves to.
     */
    async search(query, topK = 5, filter) {
        const [queryVector] = await this.embedder.embed([query]);
        return this.store.search(queryVector, topK, filter);
    }
    /**
     * Delete all chunks for a document.
     *
     * The store interface has no delete-by-filter, so matching ids are found
     * with a `{ docName }`-filtered search using a zero vector as a placeholder
     * query, then deleted. The search is repeated in batches so documents with
     * more matches than one batch are removed completely.
     *
     * @param docName - Document whose chunks should be removed.
     */
    async deleteDocument(docName) {
        const batchSize = 10000;
        const probe = new Array(this.embedder.dimension).fill(0);
        // Ids already sent to delete. If the store keeps returning them, stop
        // instead of looping forever.
        const attempted = new Set();
        for (;;) {
            const results = await this.store.search(probe, batchSize, { docName });
            const ids = results.map((r) => r.id).filter((id) => !attempted.has(id));
            if (ids.length === 0)
                return;
            await this.store.delete(ids);
            for (const id of ids)
                attempted.add(id);
            // A short batch means the store had nothing further to return.
            if (results.length < batchSize)
                return;
        }
    }
}
54
+ //# sourceMappingURL=vector-enhancer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vector-enhancer.js","sourceRoot":"","sources":["../../src/vector-lib/vector-enhancer.ts"],"names":[],"mappings":"AAKA;;;GAGG;AACH,MAAM,OAAO,cAAc;IACR,KAAK,CAAc;IACnB,QAAQ,CAAW;IACnB,OAAO,CAAU;IACjB,MAAM,CAAe;IAEtC,YACE,KAAkB,EAClB,QAAkB,EAClB,OAAgB,EAChB,SAAuB,EAAE;QAEzB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,MAAuB;QACjC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QAEjD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,CAAC,CAAC;QAElC,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACxC,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAEpD,MAAM,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YACxC,EAAE,EAAE,KAAK,CAAC,EAAE;YACZ,MAAM,EAAE,UAAU,CAAC,CAAC,CAAC;YACrB,OAAO,EAAE,KAAK,CAAC,QAA8C;SAC9D,CAAC,CAAC,CAAC;QAEJ,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACjC,OAAO,MAAM,CAAC,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM,CACV,KAAa,EACb,IAAI,GAAG,CAAC,EACR,MAAgC;QAEhC,MAAM,CAAC,WAAW,CAAC,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC;QACzD,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;IACtD,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,cAAc,CAAC,OAAe;QAClC,0DAA0D;QAC1D,8DAA8D;QAC9D,sCAAsC;QACtC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CACrC,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAC1C,KAAK,EACL,EAAE,OAAO,EAAE,CACZ,CAAC;QACF,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACrC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnB,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC/B,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,5 @@
1
+ export type { VectorConfig, SearchResult, Chunk, VectorRecord, Chunker, } from './vector-lib/index.js';
2
+ export type { Embedder, VectorStore, Reranker } from './vector-lib/index.js';
3
+ export { InMemoryAdapter, treeChunker, VectorEnhancer, HybridSearch, } from './vector-lib/index.js';
4
+ export type { HybridSearchConfig } from './vector-lib/index.js';
5
+ //# sourceMappingURL=vector.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vector.d.ts","sourceRoot":"","sources":["../src/vector.ts"],"names":[],"mappings":"AACA,YAAY,EACV,YAAY,EACZ,YAAY,EACZ,KAAK,EACL,YAAY,EACZ,OAAO,GACR,MAAM,uBAAuB,CAAC;AAG/B,YAAY,EAAE,QAAQ,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AAG7E,OAAO,EACL,eAAe,EACf,WAAW,EACX,cAAc,EACd,YAAY,GACb,MAAM,uBAAuB,CAAC;AAC/B,YAAY,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAC"}
package/dist/vector.js ADDED
@@ -0,0 +1,3 @@
1
+ // Implementations
2
+ export { InMemoryAdapter, treeChunker, VectorEnhancer, HybridSearch, } from './vector-lib/index.js';
3
+ //# sourceMappingURL=vector.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vector.js","sourceRoot":"","sources":["../src/vector.ts"],"names":[],"mappings":"AAYA,kBAAkB;AAClB,OAAO,EACL,eAAe,EACf,WAAW,EACX,cAAc,EACd,YAAY,GACb,MAAM,uBAAuB,CAAC"}
@@ -0,0 +1,13 @@
1
+ import type { TocItem } from '../types.js';
2
+ import { LlmClient } from '../llm/llm-client.js';
3
/**
 * Repairs TOC entries that were flagged as incorrect, running the repair pass
 * repeatedly until nothing is left to fix or the attempt budget is used up.
 *
 * @param toc - TOC items to repair
 * @param pageList - Document pages, each exposing its `text`
 * @param incorrectResults - Entries to repair; `index` is the position of the
 *   entry inside `toc`
 * @param llmClient - Client used for the LLM calls
 * @param maxAttempts - Optional upper bound on the number of repair passes
 * @returns The TOC after the last repair pass
 */
export declare function fixIncorrectTocWithRetries(
    toc: TocItem[],
    pageList: { text: string }[],
    incorrectResults: { index: number; title: string; physicalIndex: number }[],
    llmClient: LlmClient,
    maxAttempts?: number
): Promise<TocItem[]>;
13
+ //# sourceMappingURL=fix-toc.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-toc.d.ts","sourceRoot":"","sources":["../../src/verification/fix-toc.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAKjD;;GAEG;AACH,wBAAsB,0BAA0B,CAC9C,GAAG,EAAE,OAAO,EAAE,EACd,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,gBAAgB,EAAE,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,MAAM,CAAA;CAAE,CAAC,EAChF,SAAS,EAAE,SAAS,EACpB,WAAW,SAAI,GACd,OAAO,CAAC,OAAO,EAAE,CAAC,CAcpB"}
@@ -0,0 +1,73 @@
1
+ import { LlmClient } from '../llm/llm-client.js';
2
+ import { singleTocItemIndexFixerPrompt } from '../llm/prompts.js';
3
+ import { convertPhysicalIndexToInt, addPhysicalIndexTags } from '../utils/page-utils.js';
4
+ import { checkTitleAppearance } from './verify-toc.js';
5
/**
 * Repairs TOC entries that were flagged as incorrect, running the repair pass
 * repeatedly until nothing is left to fix or the attempt budget is used up.
 *
 * @param toc - TOC items to repair; handed to every repair pass
 * @param pageList - Document pages, each exposing its `text`
 * @param incorrectResults - Entries to repair on the first pass
 * @param llmClient - Client forwarded to the repair pass
 * @param maxAttempts - Upper bound on the number of repair passes (default 3)
 * @returns The TOC returned by the last repair pass, or `toc` itself when no
 *   pass was run
 */
export async function fixIncorrectTocWithRetries(toc, pageList, incorrectResults, llmClient, maxAttempts = 3) {
    // Entries still waiting for a successful repair
    let pending = incorrectResults;
    for (let attempt = 0; pending.length > 0 && attempt < maxAttempts; attempt++) {
        const pass = await fixIncorrectToc(toc, pageList, pending, llmClient);
        toc = pass.updatedToc;
        pending = pass.stillIncorrect;
    }
    return toc;
}
19
/**
 * Runs one repair pass over the TOC entries flagged as incorrect.
 *
 * For each flagged entry, the pages between the nearest trusted neighbours are
 * passed through `addPhysicalIndexTags` and sent to the LLM, which is asked for
 * the correct page (reply field `physical_index`). A usable answer is written
 * to `toc` in place and then checked with `checkTitleAppearance`.
 *
 * @param toc - TOC items; mutated in place
 * @param pageList - Document pages (physical index 1 is `pageList[0]`)
 * @param incorrectResults - Entries to repair; `index` is a position in `toc`
 * @param llmClient - Client used for the fixer prompt and the verification
 * @returns `updatedToc` (the same array as `toc`) and `stillIncorrect`, the
 *   entries that could not be repaired in this pass
 */
async function fixIncorrectToc(toc, pageList, incorrectResults, llmClient) {
    const stillIncorrect = [];
    // Entries that are themselves flagged carry page numbers known to be
    // wrong, so they must not be used as anchors for the search window.
    const flagged = new Set(incorrectResults.map((entry) => entry.index));
    const anchorPage = (i) => (flagged.has(i) ? null : toc[i].physicalIndex);
    for (const incorrect of incorrectResults) {
        const { index, title } = incorrect;
        // Nearest trusted entry before this one (defaults to the first page)
        let prevCorrectIdx = 1;
        for (let i = index - 1; i >= 0; i--) {
            const pi = anchorPage(i);
            if (pi != null) {
                prevCorrectIdx = pi;
                break;
            }
        }
        // Nearest trusted entry after this one (defaults to the last page)
        let nextCorrectIdx = pageList.length;
        for (let i = index + 1; i < toc.length; i++) {
            const pi = anchorPage(i);
            if (pi != null) {
                nextCorrectIdx = pi;
                break;
            }
        }
        // Clamp to the document: a start below 1 would make slice() count
        // from the end of the array instead of the beginning.
        const firstPage = Math.max(1, prevCorrectIdx);
        const lastPage = Math.min(pageList.length, nextCorrectIdx);
        const rangePages = pageList.slice(firstPage - 1, lastPage);
        if (rangePages.length === 0) {
            // Inverted or empty window: there is nothing to show the LLM
            stillIncorrect.push(incorrect);
            continue;
        }
        const taggedContent = addPhysicalIndexTags(rangePages, firstPage);
        // Ask LLM to find the correct page
        const result = await llmClient.chatJson([
            {
                role: 'user',
                content: singleTocItemIndexFixerPrompt(title, taggedContent),
            },
        ]);
        if (!result.physical_index) {
            stillIncorrect.push(incorrect);
            continue;
        }
        try {
            const newIdx = convertPhysicalIndexToInt(result.physical_index);
            const pageIdx = newIdx - 1;
            // An answer that does not point at an existing page cannot be
            // verified; keep the entry flagged and leave the TOC untouched.
            if (!Number.isInteger(newIdx) || pageIdx < 0 || pageIdx >= pageList.length) {
                stillIncorrect.push(incorrect);
                continue;
            }
            toc[index].physicalIndex = newIdx;
            // Verify the fix
            const isCorrect = await checkTitleAppearance(title, pageList[pageIdx].text, llmClient);
            if (!isCorrect) {
                stillIncorrect.push({ index, title, physicalIndex: newIdx });
            }
        }
        catch {
            stillIncorrect.push(incorrect);
        }
    }
    return { updatedToc: toc, stillIncorrect };
}
73
+ //# sourceMappingURL=fix-toc.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fix-toc.js","sourceRoot":"","sources":["../../src/verification/fix-toc.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,6BAA6B,EAAE,MAAM,mBAAmB,CAAC;AAClE,OAAO,EAAE,yBAAyB,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AACzF,OAAO,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAEvD;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,0BAA0B,CAC9C,GAAc,EACd,QAAiC,EACjC,gBAAgF,EAChF,SAAoB,EACpB,WAAW,GAAG,CAAC;IAEf,IAAI,gBAAgB,GAAG,gBAAgB,CAAC;IACxC,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,OAAO,gBAAgB,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,GAAG,WAAW,EAAE,CAAC;QAC/D,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,GAAG,MAAM,eAAe,CAC1D,GAAG,EAAE,QAAQ,EAAE,gBAAgB,EAAE,SAAS,CAC3C,CAAC;QACF,GAAG,GAAG,UAAU,CAAC;QACjB,gBAAgB,GAAG,cAAc,CAAC;QAClC,UAAU,EAAE,CAAC;IACf,CAAC;IAED,OAAO,GAAG,CAAC;AACb,CAAC;AAED,KAAK,UAAU,eAAe,CAC5B,GAAc,EACd,QAAiC,EACjC,gBAAgF,EAChF,SAAoB;IAKpB,MAAM,cAAc,GAAmE,EAAE,CAAC;IAE1F,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,SAAS,CAAC;QAEnC,2CAA2C;QAC3C,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,KAAK,IAAI,CAAC,GAAG,KAAK,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACpC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;YAChC,IAAI,EAAE,IAAI,IAAI,EAAE,CAAC;gBACf,cAAc,GAAG,EAAE,CAAC;gBACpB,MAAM;YACR,CAAC;QACH,CAAC;QAED,IAAI,cAAc,GAAG,QAAQ,CAAC,MAAM,CAAC;QACrC,KAAK,IAAI,CAAC,GAAG,KAAK,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;YAChC,IAAI,EAAE,IAAI,IAAI,EAAE,CAAC;gBACf,cAAc,GAAG,EAAE,CAAC;gBACpB,MAAM;YACR,CAAC;QACH,CAAC;QAED,gCAAgC;QAChC,MAAM,UAAU,GAAG,QAAQ,CAAC,KAAK,CAAC,cAAc,GAAG,CAAC,EAAE,cAAc,CAAC,CAAC;QACtE,MAAM,aAAa,GAAG,oBAAoB,CAAC,UAAU,EAAE,cAAc,CAAC,CAAC;QAEvE,mCAAmC;QACnC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,QAAQ,CAA6B;YAClE;gBACE,IAAI,EAAE,MAAM;gBACZ,OAAO,EAAE,6BAA6B,CAAC,KAAK,EAAE,aAAa,CAAC;aAC7D;SACF,CAAC,CAAC;QAEH,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;YAC1B,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,yBAAyB,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC;gBAChE,GAAG,CAAC,KAAK,CAAC,CAAC,aAAa,GAAG,MAAM,CAAC;gBA
ElC,iBAAiB;gBACjB,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,CAAC;gBAC3B,IAAI,OAAO,IAAI,CAAC,IAAI,OAAO,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC;oBAC9C,MAAM,SAAS,GAAG,MAAM,oBAAoB,CAC1C,KAAK,EAAE,QAAQ,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,SAAS,CACzC,CAAC;oBACF,IAAI,CAAC,SAAS,EAAE,CAAC;wBACf,cAAc,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,aAAa,EAAE,MAAM,EAAE,CAAC,CAAC;oBAC/D,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACjC,CAAC;QACH,CAAC;aAAM,CAAC;YACN,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;IAED,OAAO,EAAE,UAAU,EAAE,GAAG,EAAE,cAAc,EAAE,CAAC;AAC7C,CAAC"}
@@ -0,0 +1,3 @@
1
+ export { verifyToc, checkTitleAppearance } from './verify-toc.js';
2
+ export { fixIncorrectTocWithRetries } from './fix-toc.js';
3
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/verification/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAClE,OAAO,EAAE,0BAA0B,EAAE,MAAM,cAAc,CAAC"}
@@ -0,0 +1,3 @@
1
+ export { verifyToc, checkTitleAppearance } from './verify-toc.js';
2
+ export { fixIncorrectTocWithRetries } from './fix-toc.js';
3
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/verification/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAClE,OAAO,EAAE,0BAA0B,EAAE,MAAM,cAAc,CAAC"}
@@ -0,0 +1,17 @@
1
+ import type { TocItem, VerificationResult } from '../types.js';
2
+ import { LlmClient } from '../llm/llm-client.js';
3
/**
 * Measures how accurate a TOC is by checking whether each title appears on the
 * page it is assigned to.
 *
 * @param toc - The TOC items to verify
 * @param pageList - All document pages
 * @param llmClient - Client used for the title checks
 * @param sampleSize - Number of items to sample (null = check all)
 * @returns The verification result for the checked items
 */
export declare function verifyToc(
    toc: TocItem[],
    pageList: { text: string }[],
    llmClient: LlmClient,
    sampleSize?: number | null
): Promise<VerificationResult>;
13
/**
 * Uses the LLM to decide whether a title appears on a given page.
 *
 * @param title - The title to look for
 * @param pageText - Text of the page to inspect
 * @param llmClient - Client used for the LLM call
 * @returns A promise resolving to the yes/no verdict as a boolean
 */
export declare function checkTitleAppearance(
    title: string,
    pageText: string,
    llmClient: LlmClient
): Promise<boolean>;
17
+ //# sourceMappingURL=verify-toc.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"verify-toc.d.ts","sourceRoot":"","sources":["../../src/verification/verify-toc.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAC/D,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAGjD;;;;;;GAMG;AACH,wBAAsB,SAAS,CAC7B,GAAG,EAAE,OAAO,EAAE,EACd,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,EACjC,SAAS,EAAE,SAAS,EACpB,UAAU,GAAE,MAAM,GAAG,IAAW,GAC/B,OAAO,CAAC,kBAAkB,CAAC,CAsD7B;AAED;;GAEG;AACH,wBAAsB,oBAAoB,CACxC,KAAK,EAAE,MAAM,EACb,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,SAAS,GACnB,OAAO,CAAC,OAAO,CAAC,CAKlB"}
@@ -0,0 +1,64 @@
1
+ import { LlmClient } from '../llm/llm-client.js';
2
+ import { checkTitleAppearancePrompt } from '../llm/prompts.js';
3
/**
 * Measures how accurate a TOC is by checking whether each title appears on the
 * page it is assigned to.
 *
 * @param toc - The TOC items to verify
 * @param pageList - All document pages (physical index 1 is `pageList[0]`)
 * @param llmClient - Client forwarded to `checkTitleAppearance`
 * @param sampleSize - Number of items to sample (null = check all)
 * @returns `accuracy` (fraction of checked items that were correct, 1 when
 *   nothing was checked) and `incorrectResults` (the items that failed)
 */
export async function verifyToc(toc, pageList, llmClient, sampleSize = null) {
    // Walk backwards to the last entry that carries a page number
    let lastPhysicalIndex = -1;
    let cursor = toc.length;
    while (cursor-- > 0) {
        const candidate = toc[cursor].physicalIndex;
        if (candidate != null) {
            lastPhysicalIndex = candidate;
            break;
        }
    }
    // A TOC whose last page number falls in the first half of the document
    // is treated as wrong without running any check
    const endsTooEarly = lastPhysicalIndex >= 0 && lastPhysicalIndex < pageList.length / 2;
    if (endsTooEarly) {
        return { accuracy: 0, incorrectResults: [] };
    }
    // Only entries that have a page number can be verified
    const candidates = [];
    toc.forEach((item, index) => {
        if (item.physicalIndex != null) {
            candidates.push({ item, index });
        }
    });
    let toVerify = candidates;
    if (sampleSize != null && sampleSize < candidates.length) {
        // Fisher-Yates shuffle on a copy, then keep the first sampleSize items
        const pool = candidates.slice();
        let i = pool.length;
        while (--i > 0) {
            const j = Math.floor(Math.random() * (i + 1));
            const held = pool[i];
            pool[i] = pool[j];
            pool[j] = held;
        }
        toVerify = pool.slice(0, sampleSize);
    }
    // Checks a single entry; an out-of-range page counts as incorrect
    const checkOne = async ({ item, index }) => {
        const physicalIndex = item.physicalIndex;
        const pageIdx = physicalIndex - 1; // 1-based page number -> 0-based slot
        const inRange = pageIdx >= 0 && pageIdx < pageList.length;
        const correct = inRange
            ? await checkTitleAppearance(item.title, pageList[pageIdx].text, llmClient)
            : false;
        return { index, title: item.title, physicalIndex, correct };
    };
    // All checks are started together and awaited as a group
    const results = await Promise.all(toVerify.map(checkOne));
    let correctCount = 0;
    const incorrectResults = [];
    for (const { index, title, physicalIndex, correct } of results) {
        if (correct) {
            correctCount += 1;
        }
        else {
            incorrectResults.push({ index, title, physicalIndex });
        }
    }
    const accuracy = toVerify.length > 0 ? correctCount / toVerify.length : 1;
    return { accuracy, incorrectResults };
}
55
/**
 * Uses the LLM to decide whether a title appears on a given page.
 *
 * @param title - The title to look for
 * @param pageText - Text of the page to inspect
 * @param llmClient - Client whose `chatJson` method receives the prompt
 * @returns `true` only when the reply's `answer` field is exactly `'yes'`
 */
export async function checkTitleAppearance(title, pageText, llmClient) {
    const reply = await llmClient.chatJson([
        { role: 'user', content: checkTitleAppearancePrompt(title, pageText) },
    ]);
    const { answer } = reply;
    return answer === 'yes';
}
64
+ //# sourceMappingURL=verify-toc.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"verify-toc.js","sourceRoot":"","sources":["../../src/verification/verify-toc.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AACjD,OAAO,EAAE,0BAA0B,EAAE,MAAM,mBAAmB,CAAC;AAE/D;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,GAAc,EACd,QAAiC,EACjC,SAAoB,EACpB,aAA4B,IAAI;IAEhC,mCAAmC;IACnC,IAAI,iBAAiB,GAAG,CAAC,CAAC,CAAC;IAC3B,KAAK,IAAI,CAAC,GAAG,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACzC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;QAChC,IAAI,EAAE,IAAI,IAAI,EAAE,CAAC;YACf,iBAAiB,GAAG,EAAE,CAAC;YACvB,MAAM;QACR,CAAC;IACH,CAAC;IAED,uEAAuE;IACvE,IAAI,iBAAiB,IAAI,CAAC,IAAI,iBAAiB,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtE,OAAO,EAAE,QAAQ,EAAE,CAAC,EAAE,gBAAgB,EAAE,EAAE,EAAE,CAAC;IAC/C,CAAC;IAED,yBAAyB;IACzB,MAAM,cAAc,GAAG,GAAG;SACvB,GAAG,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;SACvC,MAAM,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,CAAC;IAEpD,IAAI,QAAQ,GAAG,cAAc,CAAC;IAC9B,IAAI,UAAU,IAAI,IAAI,IAAI,UAAU,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC;QAC7D,mDAAmD;QACnD,MAAM,QAAQ,GAAG,CAAC,GAAG,cAAc,CAAC,CAAC;QACrC,KAAK,IAAI,CAAC,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1D,CAAC;QACD,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;IAC3C,CAAC;IAED,sBAAsB;IACtB,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE;QACrC,MAAM,aAAa,GAAG,IAAI,CAAC,aAAuB,CAAC,CAAC,6BAA6B;QACjF,MAAM,OAAO,GAAG,aAAa,GAAG,CAAC,CAAC,CAAC,qBAAqB;QACxD,IAAI,OAAO,GAAG,CAAC,IAAI,OAAO,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;YAC9C,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,aAAa,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;QACrE,CAAC;QACD,MAAM,OAAO,GAAG,MAAM,oBAAoB,CACxC,IAAI,CAAC,KA
AK,EAAE,QAAQ,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,SAAS,CAC9C,CAAC;QACF,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,aAAa,EAAE,OAAO,EAAE,CAAC;IAC9D,CAAC,CAAC,CACH,CAAC;IAEF,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;IAC7D,MAAM,QAAQ,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1E,MAAM,gBAAgB,GAAG,OAAO;SAC7B,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;SACzB,GAAG,CAAC,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,aAAa,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,aAAa,EAAE,CAAC,CAAC,CAAC;IAE/E,OAAO,EAAE,QAAQ,EAAE,gBAAgB,EAAE,CAAC;AACxC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,KAAa,EACb,QAAgB,EAChB,SAAoB;IAEpB,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,QAAQ,CAAqB;QAC1D,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,0BAA0B,CAAC,KAAK,EAAE,QAAQ,CAAC,EAAE;KACvE,CAAC,CAAC;IACH,OAAO,MAAM,CAAC,MAAM,KAAK,KAAK,CAAC;AACjC,CAAC"}
package/package.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "name": "@fastrag/pageindex",
3
+ "version": "0.1.0",
4
+ "description": "TypeScript SDK for PageIndex hierarchical document indexing and optional vector enhancement.",
5
+ "license": "MIT",
6
+ "keywords": [
7
+ "document-indexing",
8
+ "rag",
9
+ "vector-search",
10
+ "chunking",
11
+ "pageindex"
12
+ ],
13
+ "type": "module",
14
+ "exports": {
15
+ ".": {
16
+ "types": "./dist/index.d.ts",
17
+ "import": "./dist/index.js"
18
+ },
19
+ "./types": {
20
+ "types": "./dist/types.d.ts",
21
+ "import": "./dist/types.js"
22
+ },
23
+ "./vector": {
24
+ "types": "./dist/vector.d.ts",
25
+ "import": "./dist/vector.js"
26
+ }
27
+ },
28
+ "main": "./dist/index.js",
29
+ "types": "./dist/index.d.ts",
30
+ "files": [
31
+ "dist"
32
+ ],
33
+ "engines": {
34
+ "node": ">=20"
35
+ },
36
+ "packageManager": "pnpm@10.4.1",
37
+ "scripts": {
38
+ "build": "rm -rf dist && tsc -p tsconfig.build.json",
39
+ "test": "vitest --run",
40
+ "lint": "eslint src/**/*.ts",
41
+ "typecheck": "tsc --noEmit",
42
+ "prepack": "pnpm build"
43
+ },
44
+ "publishConfig": {
45
+ "access": "public"
46
+ },
47
+ "dependencies": {
48
+ "gpt-tokenizer": "^2.8.0"
49
+ },
50
+ "devDependencies": {
51
+ "@types/node": "^22.0.0",
52
+ "@vitest/coverage-v8": "^3.2.4",
53
+ "eslint": "^9.0.0",
54
+ "typescript": "^5.7.0",
55
+ "typescript-eslint": "^8.0.0",
56
+ "vitest": "^3.0.0"
57
+ }
58
+ }