treedex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +523 -0
- package/dist/index.cjs +1213 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +419 -0
- package/dist/index.d.ts +419 -0
- package/dist/index.js +1138 -0
- package/dist/index.js.map +1 -0
- package/package.json +95 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,1213 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __esm = (fn, res) => function __init() {
|
|
9
|
+
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
|
|
10
|
+
};
|
|
11
|
+
var __export = (target, all) => {
|
|
12
|
+
for (var name in all)
|
|
13
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
14
|
+
};
|
|
15
|
+
var __copyProps = (to, from, except, desc) => {
|
|
16
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
17
|
+
for (let key of __getOwnPropNames(from))
|
|
18
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
19
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
20
|
+
}
|
|
21
|
+
return to;
|
|
22
|
+
};
|
|
23
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
24
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
25
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
26
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
27
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
28
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
29
|
+
mod
|
|
30
|
+
));
|
|
31
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
32
|
+
|
|
33
|
+
// src/pdf-parser.ts
// Lazily-initialized export namespace for the pdf-parser module; populated
// with live getters so the bindings resolve after init_pdf_parser() runs.
var pdf_parser_exports = {};
__export(pdf_parser_exports, {
  countTokens: () => countTokens,
  extractPages: () => extractPages,
  groupPages: () => groupPages,
  pagesToTaggedText: () => pagesToTaggedText
});
|
|
41
|
+
/**
 * Count GPT tokens in `text` using the lazily-required gpt-tokenizer.
 * init_pdf_parser() must have run so import_gpt_tokenizer is populated.
 */
function countTokens(text) {
  const { encode } = import_gpt_tokenizer;
  return encode(text).length;
}
|
|
44
|
+
/**
 * Extract text from every page of a PDF.
 *
 * @param pdfPath - Filesystem path to the PDF file.
 * @returns Array of { page_num (0-based), text, token_count } records.
 */
async function extractPages(pdfPath) {
  const fs = await import("fs/promises");
  const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
  const data = await fs.readFile(pdfPath);
  const doc = await pdfjs.getDocument({ data }).promise;
  const pages = [];
  // pdfjs numbers pages from 1; the records we emit are 0-based.
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    const page = await doc.getPage(pageNo);
    const content = await page.getTextContent();
    const pieces = [];
    for (const item of content.items) {
      const obj = item;
      // Some items (e.g. marked-content markers) carry no `str`.
      pieces.push(typeof obj.str === "string" ? obj.str : "");
    }
    const text = pieces.join(" ");
    pages.push({
      page_num: pageNo - 1,
      text,
      token_count: countTokens(text)
    });
  }
  return pages;
}
|
|
65
|
+
/**
 * Render pages[start..end] (inclusive) as newline-joined
 * <physical_index_N>...</physical_index_N> tagged spans.
 */
function pagesToTaggedText(pages, start, end) {
  return pages
    .slice(start, end + 1)
    .map((page) => {
      const n = page.page_num;
      return `<physical_index_${n}>${page.text}</physical_index_${n}>`;
    })
    .join("\n");
}
|
|
73
|
+
/**
 * Split pages into tagged-text groups of at most ~maxTokens tokens each.
 * Consecutive groups share `overlap` trailing pages for context.
 *
 * @param pages - Page records with page_num / text / token_count.
 * @param maxTokens - Token budget per group (default 20000).
 * @param overlap - Pages of context repeated between adjacent groups.
 * @returns Array of strings produced by pagesToTaggedText.
 */
function groupPages(pages, maxTokens = 2e4, overlap = 1) {
  // Robustness: an empty document yields no groups (previously [""]).
  if (pages.length === 0) {
    return [];
  }
  const totalTokens = pages.reduce((sum, p) => sum + p.token_count, 0);
  if (totalTokens <= maxTokens) {
    // Everything fits in a single group.
    return [pagesToTaggedText(pages, 0, pages.length - 1)];
  }
  const groups = [];
  let groupStart = 0;
  while (groupStart < pages.length) {
    // Greedily extend the group until adding the next page would exceed
    // the budget. A single oversized page still forms its own group
    // (the groupEnd > groupStart guard).
    let running = 0;
    let groupEnd = groupStart;
    while (groupEnd < pages.length) {
      const pageTokens = pages[groupEnd].token_count;
      if (running + pageTokens > maxTokens && groupEnd > groupStart) {
        groupEnd -= 1;
        break;
      }
      running += pageTokens;
      groupEnd += 1;
    }
    // Clamp once; the original clamped twice with identical effect.
    groupEnd = Math.min(groupEnd, pages.length - 1);
    groups.push(pagesToTaggedText(pages, groupStart, groupEnd));
    if (groupEnd >= pages.length - 1) {
      break;
    }
    // Step back by `overlap` pages for shared context, but always advance
    // by at least one page so the loop terminates.
    const nextStart = groupEnd + 1 - overlap;
    groupStart = Math.max(nextStart, groupStart + 1);
  }
  return groups;
}
|
|
105
|
+
// Holds the gpt-tokenizer module once init_pdf_parser() has run.
var import_gpt_tokenizer;
// One-shot initializer for src/pdf-parser.ts: requires gpt-tokenizer on
// first use (via __esm's run-once semantics).
var init_pdf_parser = __esm({
  "src/pdf-parser.ts"() {
    "use strict";
    import_gpt_tokenizer = require("gpt-tokenizer");
  }
});
|
|
112
|
+
|
|
113
|
+
// src/index.ts
// Public API surface of the package: every symbol below is exposed as a
// live getter on module.exports via __export/__toCommonJS.
var index_exports = {};
__export(index_exports, {
  BaseLLM: () => BaseLLM,
  CerebrasLLM: () => CerebrasLLM,
  ClaudeLLM: () => ClaudeLLM,
  CohereLLM: () => CohereLLM,
  DOCXLoader: () => DOCXLoader,
  DeepSeekLLM: () => DeepSeekLLM,
  FireworksLLM: () => FireworksLLM,
  FunctionLLM: () => FunctionLLM,
  GeminiLLM: () => GeminiLLM,
  GroqLLM: () => GroqLLM,
  HTMLLoader: () => HTMLLoader,
  HuggingFaceLLM: () => HuggingFaceLLM,
  MistralLLM: () => MistralLLM,
  OllamaLLM: () => OllamaLLM,
  OpenAICompatibleLLM: () => OpenAICompatibleLLM,
  OpenAILLM: () => OpenAILLM,
  OpenRouterLLM: () => OpenRouterLLM,
  PDFLoader: () => PDFLoader,
  QueryResult: () => QueryResult,
  SambanovaLLM: () => SambanovaLLM,
  TextLoader: () => TextLoader,
  TogetherLLM: () => TogetherLLM,
  TreeDex: () => TreeDex,
  assignNodeIds: () => assignNodeIds,
  assignPageRanges: () => assignPageRanges,
  autoLoader: () => autoLoader,
  collectNodeTexts: () => collectNodeTexts,
  countNodes: () => countNodes,
  countTokens: () => countTokens,
  createNodeMapping: () => createNodeMapping,
  embedTextInTree: () => embedTextInTree,
  extractJson: () => extractJson,
  extractPages: () => extractPages,
  findLargeNodes: () => findLargeNodes,
  getLeafNodes: () => getLeafNodes,
  groupPages: () => groupPages,
  listToTree: () => listToTree,
  pagesToTaggedText: () => pagesToTaggedText,
  printTree: () => printTree,
  retrievalPrompt: () => retrievalPrompt,
  stripTextFromTree: () => stripTextFromTree,
  structureContinuePrompt: () => structureContinuePrompt,
  structureExtractionPrompt: () => structureExtractionPrompt,
  textToPages: () => textToPages,
  treeToFlatList: () => treeToFlatList
});
// Flag the CJS exports object as an ES module and wire up the bindings.
module.exports = __toCommonJS(index_exports);
|
|
163
|
+
|
|
164
|
+
// src/loaders.ts
// Eagerly initialize the pdf-parser module (loads gpt-tokenizer) so that
// countTokens below is usable.
init_pdf_parser();
|
|
166
|
+
/**
 * Split raw text into fixed-size synthetic "pages" of charsPerPage chars,
 * each annotated with its token count.
 */
function textToPages(text, charsPerPage = 3e3) {
  const pages = [];
  let offset = 0;
  while (offset < text.length) {
    const chunk = text.slice(offset, offset + charsPerPage);
    pages.push({
      page_num: pages.length,
      text: chunk,
      token_count: countTokens(chunk)
    });
    offset += charsPerPage;
  }
  return pages;
}
|
|
178
|
+
// Loads PDFs via the lazily-initialized pdf-parser module.
var PDFLoader = class {
  /** Extract all pages of the PDF at `path`. */
  async load(path) {
    // Dynamic import shim (esbuild pattern): ensure pdf-parser is
    // initialized, then use its extractPages export.
    const mod = await Promise.resolve().then(() => (init_pdf_parser(), pdf_parser_exports));
    return mod.extractPages(path);
  }
};
|
|
184
|
+
// Loads plain-text files, paginating them into fixed-size chunks.
var TextLoader = class {
  charsPerPage;
  /** @param charsPerPage - Characters per synthetic page (default 3000). */
  constructor(charsPerPage = 3e3) {
    this.charsPerPage = charsPerPage;
  }
  /** Read the file as UTF-8 and paginate it. */
  async load(path) {
    const { readFile } = await import("fs/promises");
    const contents = await readFile(path, "utf-8");
    return textToPages(contents, this.charsPerPage);
  }
};
|
|
195
|
+
// Loads HTML files: strips markup to plain text, then paginates.
var HTMLLoader = class {
  // Characters per synthetic page.
  charsPerPage;
  constructor(charsPerPage = 3e3) {
    this.charsPerPage = charsPerPage;
  }
  /** Read the HTML file, strip tags, and paginate the resulting text. */
  async load(path) {
    const fs = await import("fs/promises");
    const html = await fs.readFile(path, "utf-8");
    const text = await this.stripHtml(html);
    return textToPages(text, this.charsPerPage);
  }
  /**
   * Convert HTML to plain text. Prefers htmlparser2 (skips script/style
   * content, inserts newlines after block-level elements); falls back to
   * regex-based stripping when htmlparser2 is unavailable.
   */
  async stripHtml(html) {
    try {
      const { Parser } = await import("htmlparser2");
      return new Promise((resolve) => {
        const parts = [];
        // True while inside a <script> or <style> element, whose text
        // content must be discarded.
        let skip = false;
        const parser = new Parser({
          onopentag(name) {
            if (name === "script" || name === "style") skip = true;
          },
          onclosetag(name) {
            if (name === "script" || name === "style") skip = false;
            // Block-level elements end a visual line; emit a newline.
            if (["p", "div", "br", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr"].includes(name)) {
              parts.push("\n");
            }
          },
          ontext(data) {
            if (!skip) parts.push(data);
          },
          onend() {
            const raw = parts.join("");
            // Collapse runs of 3+ newlines into paragraph breaks.
            resolve(raw.replace(/\n{3,}/g, "\n\n").trim());
          }
        });
        parser.write(html);
        parser.end();
      });
    } catch {
      // Fallback: drop script/style blocks, then all tags, then collapse
      // whitespace. Less accurate but dependency-free.
      return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
    }
  }
};
|
|
238
|
+
// Loads .docx files via mammoth's raw-text extraction, then paginates.
var DOCXLoader = class {
  charsPerPage;
  /** @param charsPerPage - Characters per synthetic page (default 3000). */
  constructor(charsPerPage = 3e3) {
    this.charsPerPage = charsPerPage;
  }
  /** Extract the document's raw text and paginate it. */
  async load(path) {
    const { readFile } = await import("fs/promises");
    const mammoth = await import("mammoth");
    const docBuffer = await readFile(path);
    const { value } = await mammoth.extractRawText({ buffer: docBuffer });
    return textToPages(value, this.charsPerPage);
  }
};
|
|
251
|
+
// File-extension -> loader-class dispatch table used by autoLoader.
var EXTENSION_MAP = {
  ".pdf": PDFLoader,
  ".txt": TextLoader,
  ".md": TextLoader,
  ".html": HTMLLoader,
  ".htm": HTMLLoader,
  ".docx": DOCXLoader
};
|
|
259
|
+
/**
 * Pick a loader by file extension and load the document with it.
 *
 * @throws Error when the extension has no registered loader.
 */
async function autoLoader(filePath) {
  const { extname } = await import("path");
  const ext = extname(filePath).toLowerCase();
  const LoaderClass = EXTENSION_MAP[ext];
  if (LoaderClass === void 0) {
    const supported = Object.keys(EXTENSION_MAP).join(", ");
    throw new Error(
      `Unsupported file extension '${ext}'. Supported: ${supported}`
    );
  }
  return new LoaderClass().load(filePath);
}
|
|
272
|
+
|
|
273
|
+
// src/core.ts
// Ensure the pdf-parser module (tokenizer) is initialized for core.ts too;
// __esm makes this a no-op after the first call.
init_pdf_parser();
|
|
275
|
+
|
|
276
|
+
// src/tree-builder.ts
|
|
277
|
+
/**
 * Build a tree from a flat section list using dotted structure IDs
 * ("1", "1.1", "1.1.2"). A node whose parent ID has not been seen yet
 * becomes a root (orphans are promoted rather than dropped).
 */
function listToTree(flatList) {
  const byStructure = {};
  const roots = [];
  for (const entry of flatList) {
    const node = { ...entry, nodes: [] };
    const id = node.structure;
    byStructure[id] = node;
    const segments = id.split(".");
    if (segments.length === 1) {
      // Top-level section.
      roots.push(node);
      continue;
    }
    const parentId = segments.slice(0, -1).join(".");
    const parent = byStructure[parentId];
    if (parent === void 0) {
      // Parent missing from earlier input: promote to root.
      roots.push(node);
    } else {
      parent.nodes.push(node);
    }
  }
  return roots;
}
|
|
299
|
+
/**
 * Recursively set start_index/end_index on sibling nodes: each node starts
 * at its physical_index and ends one page before the next sibling starts;
 * the last sibling ends at `boundaryEnd` (the parent's end).
 */
function assignRanges(nodes, boundaryEnd) {
  nodes.forEach((node, idx) => {
    node.start_index = node.physical_index ?? 0;
    const next = nodes[idx + 1];
    node.end_index = next === void 0 ? boundaryEnd : (next.physical_index ?? 0) - 1;
    if (node.nodes.length > 0) {
      assignRanges(node.nodes, node.end_index);
    }
  });
}
|
|
313
|
+
/**
 * Assign page ranges across the whole tree; the document's last valid
 * page index is totalPages - 1 (0-based). Returns the same tree.
 */
function assignPageRanges(tree, totalPages) {
  const lastPage = totalPages - 1;
  assignRanges(tree, lastPage);
  return tree;
}
|
|
317
|
+
/**
 * Assign sequential zero-padded node_ids ("0001", "0002", ...) in
 * depth-first preorder. Mutates and returns the tree.
 */
function assignNodeIds(tree) {
  let nextId = 0;
  const visit = (nodes) => {
    for (const node of nodes) {
      nextId += 1;
      node.node_id = String(nextId).padStart(4, "0");
      visit(node.nodes);
    }
  };
  visit(tree);
  return tree;
}
|
|
329
|
+
/**
 * Collect nodes whose page span exceeds maxPages, or — when `pages` is
 * supplied — whose summed token count exceeds maxTokens. Children of a
 * large node are still visited (they may be large too).
 */
function findLargeNodes(tree, options = {}) {
  const { maxPages = 10, maxTokens = 2e4, pages = null } = options;
  const oversized = [];
  const visit = (nodes) => {
    for (const node of nodes) {
      const start = node.start_index ?? 0;
      const end = node.end_index ?? 0;
      let tooBig = end - start + 1 > maxPages;
      if (!tooBig && pages !== null) {
        // Token check only runs when the page-count check passed.
        let tokenSum = 0;
        for (const p of pages) {
          if (p.page_num >= start && p.page_num <= end) tokenSum += p.token_count;
        }
        tooBig = tokenSum > maxTokens;
      }
      if (tooBig) oversized.push(node);
      visit(node.nodes);
    }
  };
  visit(tree);
  return oversized;
}
|
|
351
|
+
/**
 * Write each node's page text into node.text by concatenating the pages
 * inside its [start_index, end_index] range. Mutates and returns the tree.
 */
function embedTextInTree(tree, pages) {
  const visit = (nodes) => {
    for (const node of nodes) {
      const start = node.start_index ?? 0;
      const end = node.end_index ?? 0;
      const inRange = pages.filter((p) => p.page_num >= start && p.page_num <= end);
      node.text = inRange.map((p) => p.text).join("\n");
      visit(node.nodes);
    }
  };
  visit(tree);
  return tree;
}
|
|
363
|
+
|
|
364
|
+
// src/tree-utils.ts
|
|
365
|
+
/**
 * Build a node_id -> node lookup over the whole tree. Nodes without a
 * node_id are skipped (their children are still indexed).
 */
function createNodeMapping(tree) {
  const mapping = {};
  const register = (nodes) => {
    for (const node of nodes) {
      if (node.node_id !== void 0) {
        mapping[node.node_id] = node;
      }
      register(node.nodes);
    }
  };
  register(tree);
  return mapping;
}
|
|
378
|
+
/**
 * Return a deep copy of the tree with every node's `text` removed.
 * The JSON round-trip is intentional: it also drops functions/undefined,
 * matching what save()/load() would serialize anyway.
 */
function stripTextFromTree(tree) {
  const copy = JSON.parse(JSON.stringify(tree));
  const scrub = (nodes) => {
    for (const node of nodes) {
      delete node.text;
      scrub(node.nodes);
    }
  };
  scrub(copy);
  return copy;
}
|
|
389
|
+
/**
 * Concatenate the texts of the given node IDs into one context string.
 * Each section is rendered as "[structure: title]\ntext"; unknown IDs
 * are silently skipped; sections are separated by blank lines.
 */
function collectNodeTexts(nodeIds, nodeMap) {
  const chunks = [];
  for (const nid of nodeIds) {
    const node = nodeMap[nid];
    if (node === void 0) continue;
    const title = node.title ?? "Untitled";
    const structure = node.structure ?? "";
    const body = node.text ?? "";
    const header = structure ? `[${structure}: ${title}]` : `[${title}]`;
    chunks.push(header + "\n" + body);
  }
  return chunks.join("\n\n");
}
|
|
403
|
+
/** Count all nodes in the tree, including every descendant. */
function countNodes(tree) {
  return tree.reduce((acc, node) => acc + 1 + countNodes(node.nodes), 0);
}
|
|
411
|
+
/** Collect every leaf (childless) node in depth-first order. */
function getLeafNodes(tree) {
  const leaves = [];
  const visit = (nodes) => {
    for (const node of nodes) {
      if (node.nodes.length > 0) {
        visit(node.nodes);
      } else {
        leaves.push(node);
      }
    }
  };
  visit(tree);
  return leaves;
}
|
|
425
|
+
/**
 * Flatten the tree to a preorder list of nodes with their `nodes` arrays
 * removed (shallow copies; other fields are shared with the tree).
 */
function treeToFlatList(tree) {
  const flat = [];
  const visit = (nodes) => {
    for (const node of nodes) {
      const { nodes: children, ...rest } = node;
      flat.push(rest);
      visit(children);
    }
  };
  visit(tree);
  return flat;
}
|
|
437
|
+
/**
 * Best-effort JSON extraction from LLM output. Tries, in order:
 *  1. parse the whole text;
 *  2. parse the first ``` fenced code block (with a trailing-comma
 *     cleanup retry);
 *  3. scan for the first balanced {...} or [...] span (brace counting
 *     is naive: braces inside string literals will miscount).
 * Throws when nothing parses.
 */
function extractJson(text) {
  try {
    return JSON.parse(text);
  } catch {
  }
  // Stage 2: fenced code block (```json ... ``` or plain ``` ... ```).
  const blockMatch = text.match(/```(?:json)?\s*\n?(.*?)```/s);
  if (blockMatch) {
    const block = blockMatch[1].trim();
    try {
      return JSON.parse(block);
    } catch {
      // Retry after removing trailing commas before } or ].
      const cleaned = block.replace(/,\s*([}\]])/g, "$1");
      try {
        return JSON.parse(cleaned);
      } catch {
      }
    }
  }
  // Stage 3: first balanced object, then first balanced array.
  for (const [startChar, endChar] of [
    ["{", "}"],
    ["[", "]"]
  ]) {
    const start = text.indexOf(startChar);
    if (start === -1) continue;
    let depth = 0;
    for (let i = start; i < text.length; i++) {
      if (text[i] === startChar) depth++;
      else if (text[i] === endChar) {
        depth--;
        if (depth === 0) {
          const candidate = text.slice(start, i + 1);
          try {
            return JSON.parse(candidate);
          } catch {
            // Same trailing-comma cleanup as stage 2.
            const cleaned = candidate.replace(/,\s*([}\]])/g, "$1");
            try {
              return JSON.parse(cleaned);
            } catch {
              // Give up on this delimiter pair; try the next one.
              break;
            }
          }
        }
      }
    }
  }
  throw new Error(
    `Could not extract JSON from text: ${text.slice(0, 200)}...`
  );
}
|
|
486
|
+
/**
 * Print the tree to stdout, one node per line, indenting children by one
 * extra space per depth level. Missing fields render as placeholders.
 */
function printTree(tree, indent = 0) {
  const pad = " ".repeat(indent);
  tree.forEach((node) => {
    const line = `${pad}[${node.node_id ?? "????"}] ${node.structure ?? ""}: ${node.title ?? "Untitled"} (pages ${node.start_index ?? "?"}-${node.end_index ?? "?"})`;
    console.log(line);
    printTree(node.nodes, indent + 1);
  });
}
|
|
500
|
+
|
|
501
|
+
// src/prompts.ts
|
|
502
|
+
/**
 * Build the first-pass structure-extraction prompt for a tagged page group.
 * The template text is part of the runtime contract with the LLM — keep it
 * byte-identical when editing.
 */
function structureExtractionPrompt(text) {
  return `You are a document structure analyzer. Given the following document text with physical page index tags, extract the hierarchical structure (table of contents).

Return a JSON list of objects, each with:
- "structure": hierarchical numbering like "1", "1.1", "1.2.3"
- "title": the section/chapter title
- "physical_index": the page number (from the <physical_index_N> tag) where this section starts

Rules:
- Use the physical_index tags to determine page numbers
- Create a logical hierarchy: chapters -> sections -> subsections
- Every section must have a unique structure ID
- Return ONLY valid JSON \u2014 no extra text

Document text:
${text}

JSON output:
`;
}
|
|
522
|
+
/**
 * Build the continuation prompt for subsequent page groups: shows the
 * structure extracted so far and asks for NEW sections only, continuing
 * the numbering. Template text is part of the LLM contract.
 */
function structureContinuePrompt(previousStructure, text) {
  return `You are continuing to extract the hierarchical structure of a document.

Here is the structure extracted so far:
${previousStructure}

Now extract the structure from the next portion of the document. Continue the numbering from where the previous structure left off. If a section from the previous portion continues into this portion, do NOT duplicate it.

Return a JSON list of NEW sections only (same format as before).

Document text:
${text}

JSON output:
`;
}
|
|
538
|
+
/**
 * Build the retrieval prompt: given the (text-stripped) tree JSON and a
 * user query, the LLM must return { node_ids, reasoning }. Template text
 * is part of the LLM contract.
 */
function retrievalPrompt(treeStructure, query) {
  return `You are a document retrieval system. Given a document's tree structure and a user query, select the most relevant sections that would contain the answer.

Document structure:
${treeStructure}

User query: ${query}

Return a JSON object with:
- "node_ids": list of node IDs (strings like "0001", "0005") that are most relevant to the query
- "reasoning": brief explanation of why these sections were selected

Select the smallest set of sections that fully covers the answer. Prefer leaf nodes over parent nodes when the leaf contains the specific content. Return ONLY valid JSON.

JSON output:
`;
}
|
|
555
|
+
|
|
556
|
+
// src/core.ts
|
|
557
|
+
// Result of TreeDex.query(): the assembled context plus provenance.
var QueryResult = class {
  context;    // concatenated section texts
  nodeIds;    // selected node IDs, e.g. ["0001", "0005"]
  pageRanges; // [start, end] pairs (0-based, inclusive)
  reasoning;  // LLM's selection rationale
  constructor(context, nodeIds, pageRanges, reasoning) {
    this.context = context;
    this.nodeIds = nodeIds;
    this.pageRanges = pageRanges;
    this.reasoning = reasoning;
  }
  /** Human-readable page ranges like 'pages 5-8, 12-15' (1-based). */
  get pagesStr() {
    if (this.pageRanges.length === 0) return "no pages";
    const rendered = this.pageRanges.map(
      ([start, end]) => start === end ? String(start + 1) : `${start + 1}-${end + 1}`
    );
    return "pages " + rendered.join(", ");
  }
  toString() {
    return `QueryResult(nodes=${JSON.stringify(this.nodeIds)}, ${this.pagesStr}, context_len=${this.context.length})`;
  }
};
|
|
585
|
+
// Main index class: a hierarchical (tree) index over a paginated document,
// built and queried via an LLM backend.
var TreeDex = class _TreeDex {
  tree;     // hierarchical section tree with page ranges and embedded text
  pages;    // flat page records ({ page_num, text, token_count })
  llm;      // default LLM backend (null when queries must supply one)
  _nodeMap; // node_id -> node lookup built from `tree`
  constructor(tree, pages, llm = null) {
    this.tree = tree;
    this.pages = pages;
    this.llm = llm;
    this._nodeMap = createNodeMapping(tree);
  }
  /**
   * Build a TreeDex index from a file.
   *
   * @param path - Path to document (PDF, TXT, HTML, DOCX)
   * @param llm - LLM backend with .generate(prompt) method
   * @param options - Optional configuration ({ loader, maxTokens, overlap, verbose })
   */
  static async fromFile(path, llm, options) {
    const {
      loader,
      maxTokens = 2e4,
      overlap = 1,
      verbose = true
    } = options ?? {};
    if (verbose) {
      const { basename } = await import("path");
      console.log(`Loading: ${basename(path)}`);
    }
    let pages;
    // Explicit loader wins; otherwise dispatch by file extension.
    if (loader) {
      pages = await loader.load(path);
    } else {
      pages = await autoLoader(path);
    }
    if (verbose) {
      const totalTokens = pages.reduce((s, p) => s + p.token_count, 0);
      console.log(` ${pages.length} pages, ${totalTokens.toLocaleString()} tokens`);
    }
    return _TreeDex.fromPages(pages, llm, { maxTokens, overlap, verbose });
  }
  /** Build a TreeDex index from pre-extracted pages (LLM structure pass). */
  static async fromPages(pages, llm, options) {
    const { maxTokens = 2e4, overlap = 1, verbose = true } = options ?? {};
    const groups = groupPages(pages, maxTokens, overlap);
    if (verbose) {
      console.log(` ${groups.length} page group(s) for structure extraction`);
    }
    const allSections = [];
    // One LLM call per group; later groups see the structure found so far.
    for (let i = 0; i < groups.length; i++) {
      if (verbose) {
        console.log(
          ` Extracting structure from group ${i + 1}/${groups.length}...`
        );
      }
      let prompt;
      if (i === 0) {
        prompt = structureExtractionPrompt(groups[i]);
      } else {
        const prevJson = JSON.stringify(allSections, null, 2);
        prompt = structureContinuePrompt(prevJson, groups[i]);
      }
      const response = await llm.generate(prompt);
      const sections = extractJson(response);
      // Accept either a bare array or a { sections: [...] } wrapper;
      // any other shape is silently ignored.
      if (Array.isArray(sections)) {
        allSections.push(
          ...sections
        );
      } else if (sections !== null && typeof sections === "object" && "sections" in sections) {
        allSections.push(
          ...sections.sections
        );
      }
    }
    if (verbose) {
      console.log(` Extracted ${allSections.length} sections`);
    }
    const tree = listToTree(allSections);
    assignPageRanges(tree, pages.length);
    assignNodeIds(tree);
    embedTextInTree(tree, pages);
    if (verbose) {
      console.log(` Tree: ${countNodes(tree)} nodes`);
    }
    return new _TreeDex(tree, pages, llm);
  }
  /** Create a TreeDex from an existing tree and pages (no LLM pass). */
  static fromTree(tree, pages, llm = null) {
    return new _TreeDex(tree, pages, llm);
  }
  /**
   * Query the index and return relevant context.
   *
   * @param question - The user's question
   * @param llm - Optional LLM override. Uses this.llm if not provided.
   * @throws Error when no LLM is available.
   */
  async query(question, llm) {
    const activeLlm = llm ?? this.llm;
    if (!activeLlm) {
      throw new Error(
        "No LLM provided. Pass llm to query() or TreeDex constructor."
      );
    }
    // The LLM sees the tree WITHOUT page text (structure only).
    const stripped = stripTextFromTree(this.tree);
    const treeJson = JSON.stringify(stripped, null, 2);
    const prompt = retrievalPrompt(treeJson, question);
    const response = await activeLlm.generate(prompt);
    const result = extractJson(response);
    const nodeIds = result.node_ids ?? [];
    const reasoning = result.reasoning ?? "";
    const context = collectNodeTexts(nodeIds, this._nodeMap);
    const pageRanges = [];
    for (const nid of nodeIds) {
      const node = this._nodeMap[nid];
      // Unknown IDs returned by the LLM are dropped.
      if (node) {
        const start = node.start_index ?? 0;
        const end = node.end_index ?? 0;
        pageRanges.push([start, end]);
      }
    }
    return new QueryResult(context, nodeIds, pageRanges, reasoning);
  }
  /** Save the index (tree without embedded text, plus pages) to a JSON file. */
  async save(path) {
    const fs = await import("fs/promises");
    const stripped = stripTextFromTree(this.tree);
    const data = {
      version: "1.0",
      framework: "TreeDex",
      tree: stripped,
      pages: this.pages
    };
    await fs.writeFile(path, JSON.stringify(data, null, 2), "utf-8");
    return path;
  }
  /** Load a TreeDex index from a JSON file; re-derives ranges and text. */
  static async load(path, llm) {
    const fs = await import("fs/promises");
    const raw = await fs.readFile(path, "utf-8");
    const data = JSON.parse(raw);
    const tree = data.tree;
    const pages = data.pages;
    assignPageRanges(tree, pages.length);
    embedTextInTree(tree, pages);
    return new _TreeDex(tree, pages, llm ?? null);
  }
  /** Pretty-print the tree structure to stdout. */
  showTree() {
    printTree(this.tree);
  }
  /** Return index statistics (pages, tokens, node counts). */
  stats() {
    const totalTokens = this.pages.reduce((s, p) => s + p.token_count, 0);
    const leaves = getLeafNodes(this.tree);
    return {
      total_pages: this.pages.length,
      total_tokens: totalTokens,
      total_nodes: countNodes(this.tree),
      leaf_nodes: leaves.length,
      root_sections: this.tree.length
    };
  }
  /** Find sections that exceed size thresholds (pages and/or tokens). */
  findLargeSections(options) {
    return findLargeNodes(this.tree, {
      maxPages: options?.maxPages ?? 10,
      maxTokens: options?.maxTokens ?? 2e4,
      pages: this.pages
    });
  }
};
|
|
756
|
+
|
|
757
|
+
// src/llm-backends.ts
|
|
758
|
+
// Base class for LLM backends; subclasses implement generate(prompt).
var BaseLLM = class {
  /** Default representation, e.g. "BaseLLM()" (uses the subclass name). */
  toString() {
    const { name } = this.constructor;
    return `${name}()`;
  }
};
|
|
763
|
+
// Google Gemini backend (lazy-loads @google/generative-ai on first use).
var GeminiLLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null; // cached generative model instance
  constructor(apiKey, model = "gemini-2.0-flash") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  /** Lazily construct and cache the Gemini model client. */
  async getClient() {
    if (this._client !== null) return this._client;
    const { GoogleGenerativeAI } = await import("@google/generative-ai");
    const genai = new GoogleGenerativeAI(this.apiKey);
    this._client = genai.getGenerativeModel({ model: this.modelName });
    return this._client;
  }
  /** Send the prompt and return the model's text response. */
  async generate(prompt) {
    const model = await this.getClient();
    const result = await model.generateContent(prompt);
    return result.response.text();
  }
  toString() {
    return `GeminiLLM(model=${JSON.stringify(this.modelName)})`;
  }
};
|
|
789
|
+
// OpenAI chat-completions backend (lazy-loads the openai SDK).
var OpenAILLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null; // cached OpenAI client
  constructor(apiKey, model = "gpt-4o") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  /** Lazily construct and cache the OpenAI client. */
  async getClient() {
    if (this._client !== null) return this._client;
    const { default: OpenAI } = await import("openai");
    this._client = new OpenAI({ apiKey: this.apiKey });
    return this._client;
  }
  /** Send the prompt as a single user message and return the reply text. */
  async generate(prompt) {
    const client = await this.getClient();
    const completion = await client.chat.completions.create({
      model: this.modelName,
      messages: [{ role: "user", content: prompt }]
    });
    return completion.choices[0].message.content;
  }
  toString() {
    return `OpenAILLM(model=${JSON.stringify(this.modelName)})`;
  }
};
|
|
// Anthropic Claude provider. The @anthropic-ai/sdk package is imported
// lazily on first use and memoized.
var ClaudeLLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null;
  /**
   * @param {string} apiKey - Anthropic API key.
   * @param {string} [model="claude-sonnet-4-20250514"] - Model identifier.
   */
  constructor(apiKey, model = "claude-sonnet-4-20250514") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  // Lazily construct and memoize the SDK client.
  async getClient() {
    if (this._client === null) {
      const { default: Anthropic } = await import("@anthropic-ai/sdk");
      this._client = new Anthropic({ apiKey: this.apiKey });
    }
    return this._client;
  }
  /**
   * Single-turn completion; returns the first text content block.
   * @param {string} prompt
   * @returns {Promise<string>}
   * @throws {Error} when the response contains no text block.
   */
  async generate(prompt) {
    const client = await this.getClient();
    const response = await client.messages.create({
      model: this.modelName,
      max_tokens: 4096,
      messages: [{ role: "user", content: prompt }]
    });
    // BUG FIX: content[0] is not guaranteed to be a text block (Anthropic
    // responses may lead with thinking/tool_use blocks), so reading
    // content[0].text could yield undefined. Select the first text block.
    const textBlock = response.content.find((block) => block.type === "text");
    if (textBlock === undefined) {
      throw new Error(
        `Claude response contained no text block (model=${this.modelName})`
      );
    }
    return textBlock.text;
  }
  toString() {
    return `ClaudeLLM(model=${JSON.stringify(this.modelName)})`;
  }
};
// Mistral AI provider; the @mistralai/mistralai SDK is imported on demand.
var MistralLLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null;
  /**
   * @param {string} apiKey - Mistral API key.
   * @param {string} [model="mistral-large-latest"] - Model identifier.
   */
  constructor(apiKey, model = "mistral-large-latest") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  // Memoized lazy SDK client.
  async getClient() {
    if (this._client !== null) {
      return this._client;
    }
    const { Mistral } = await import("@mistralai/mistralai");
    this._client = new Mistral({ apiKey: this.apiKey });
    return this._client;
  }
  // Single-turn chat completion returning the first choice's message text.
  async generate(prompt) {
    const client = await this.getClient();
    const messages = [{ role: "user", content: prompt }];
    const result = await client.chat.complete({ model: this.modelName, messages });
    const [firstChoice] = result.choices;
    return firstChoice.message.content;
  }
  toString() {
    return `MistralLLM(model=${JSON.stringify(this.modelName)})`;
  }
};
// Cohere provider using the v2 chat client; cohere-ai is imported on demand.
var CohereLLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null;
  /**
   * @param {string} apiKey - Cohere API key (passed as `token` to the SDK).
   * @param {string} [model="command-r-plus"] - Model identifier.
   */
  constructor(apiKey, model = "command-r-plus") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  // Memoized lazy SDK client.
  async getClient() {
    if (this._client !== null) {
      return this._client;
    }
    const { CohereClientV2 } = await import("cohere-ai");
    this._client = new CohereClientV2({ token: this.apiKey });
    return this._client;
  }
  // Single-turn chat; returns the text of the first response content block.
  async generate(prompt) {
    const client = await this.getClient();
    const result = await client.chat({
      model: this.modelName,
      messages: [{ role: "user", content: prompt }]
    });
    const [firstBlock] = result.message.content;
    return firstBlock.text;
  }
  toString() {
    return `CohereLLM(model=${JSON.stringify(this.modelName)})`;
  }
};
// Generic client for any OpenAI-compatible /chat/completions endpoint.
// Provider subclasses (Together, Fireworks, OpenRouter, DeepSeek, Cerebras,
// Sambanova) configure a fixed baseUrl and default model.
var OpenAICompatibleLLM = class extends BaseLLM {
  baseUrl;
  model;
  apiKey;
  maxTokens;
  temperature;
  extraHeaders;
  /**
   * @param {object} options
   * @param {string} options.baseUrl - API root, e.g. "https://api.example.com/v1".
   * @param {string} options.model - Model identifier sent in the request body.
   * @param {string} [options.apiKey] - Bearer token; Authorization header is omitted when absent.
   * @param {number} [options.maxTokens=4096] - `max_tokens` request field.
   * @param {number} [options.temperature=0] - `temperature` request field.
   * @param {object} [options.extraHeaders] - Extra HTTP headers; override the defaults.
   */
  constructor(options) {
    super();
    // Strip trailing slashes so path joining below is predictable.
    this.baseUrl = options.baseUrl.replace(/\/+$/, "");
    this.model = options.model;
    this.apiKey = options.apiKey ?? null;
    this.maxTokens = options.maxTokens ?? 4096;
    this.temperature = options.temperature ?? 0;
    this.extraHeaders = options.extraHeaders ?? {};
  }
  // Build request headers; extraHeaders are applied last and win.
  buildHeaders() {
    const headers = {
      "Content-Type": "application/json",
      "User-Agent": "TreeDex/0.1"
    };
    if (this.apiKey) {
      headers["Authorization"] = `Bearer ${this.apiKey}`;
    }
    Object.assign(headers, this.extraHeaders);
    return headers;
  }
  /**
   * Single-turn completion against `${baseUrl}/chat/completions`.
   * @param {string} prompt
   * @returns {Promise<string>} the assistant message text.
   * @throws {Error} on non-2xx responses or when the body has no message content.
   */
  async generate(prompt) {
    const url = `${this.baseUrl}/chat/completions`;
    const payload = {
      model: this.model,
      messages: [{ role: "user", content: prompt }],
      max_tokens: this.maxTokens,
      temperature: this.temperature
    };
    const resp = await fetch(url, {
      method: "POST",
      headers: this.buildHeaders(),
      body: JSON.stringify(payload),
      // 120-second request timeout.
      signal: AbortSignal.timeout(12e4)
    });
    if (!resp.ok) {
      const errorBody = await resp.text();
      throw new Error(
        `API request failed (${resp.status}): ${errorBody}`
      );
    }
    const body = await resp.json();
    // BUG FIX: guard against empty `choices` or null `content` so a malformed
    // 200 response surfaces a descriptive error instead of an opaque
    // TypeError (and so callers never receive a non-string result).
    const content = body?.choices?.[0]?.message?.content;
    if (typeof content !== "string") {
      throw new Error(
        `API response from ${url} contained no message content`
      );
    }
    return content;
  }
  toString() {
    return `OpenAICompatibleLLM(baseUrl=${JSON.stringify(this.baseUrl)}, model=${JSON.stringify(this.model)})`;
  }
};
// Groq provider; groq-sdk is imported lazily on the first request.
var GroqLLM = class extends BaseLLM {
  apiKey;
  model;
  _client = null;
  /**
   * @param {string} apiKey - Groq API key.
   * @param {string} [model="llama-3.3-70b-versatile"] - Model identifier.
   */
  constructor(apiKey, model = "llama-3.3-70b-versatile") {
    super();
    this.apiKey = apiKey;
    this.model = model;
  }
  // Memoized lazy SDK client.
  async getClient() {
    if (this._client !== null) {
      return this._client;
    }
    const { default: Groq } = await import("groq-sdk");
    this._client = new Groq({ apiKey: this.apiKey });
    return this._client;
  }
  // Single-turn chat completion; returns the first choice's message text.
  async generate(prompt) {
    const client = await this.getClient();
    const completion = await client.chat.completions.create({
      model: this.model,
      messages: [{ role: "user", content: prompt }]
    });
    const [choice] = completion.choices;
    return choice.message.content;
  }
  toString() {
    return `GroqLLM(model=${JSON.stringify(this.model)})`;
  }
};
// Together AI: OpenAI-compatible endpoint with a fixed base URL.
var TogetherLLM = class extends OpenAICompatibleLLM {
  /**
   * @param {string} apiKey - Together API key.
   * @param {string} [model="meta-llama/Llama-3-70b-chat-hf"]
   * @param {object} [options] - Extra OpenAICompatibleLLM options; keys here win.
   */
  constructor(apiKey, model = "meta-llama/Llama-3-70b-chat-hf", options) {
    super(Object.assign(
      { baseUrl: "https://api.together.xyz/v1", model, apiKey },
      options
    ));
  }
  toString() {
    return `TogetherLLM(model=${JSON.stringify(this.model)})`;
  }
};
// Fireworks AI: OpenAI-compatible endpoint with a fixed base URL.
var FireworksLLM = class extends OpenAICompatibleLLM {
  /**
   * @param {string} apiKey - Fireworks API key.
   * @param {string} [model="accounts/fireworks/models/llama-v3p1-70b-instruct"]
   * @param {object} [options] - Extra OpenAICompatibleLLM options; keys here win.
   */
  constructor(apiKey, model = "accounts/fireworks/models/llama-v3p1-70b-instruct", options) {
    super(Object.assign(
      { baseUrl: "https://api.fireworks.ai/inference/v1", model, apiKey },
      options
    ));
  }
  toString() {
    return `FireworksLLM(model=${JSON.stringify(this.model)})`;
  }
};
// OpenRouter: OpenAI-compatible endpoint with a fixed base URL.
var OpenRouterLLM = class extends OpenAICompatibleLLM {
  /**
   * @param {string} apiKey - OpenRouter API key.
   * @param {string} [model="anthropic/claude-sonnet-4"]
   * @param {object} [options] - Extra OpenAICompatibleLLM options; keys here win.
   */
  constructor(apiKey, model = "anthropic/claude-sonnet-4", options) {
    super(Object.assign(
      { baseUrl: "https://openrouter.ai/api/v1", model, apiKey },
      options
    ));
  }
  toString() {
    return `OpenRouterLLM(model=${JSON.stringify(this.model)})`;
  }
};
// DeepSeek: OpenAI-compatible endpoint with a fixed base URL.
var DeepSeekLLM = class extends OpenAICompatibleLLM {
  /**
   * @param {string} apiKey - DeepSeek API key.
   * @param {string} [model="deepseek-chat"]
   * @param {object} [options] - Extra OpenAICompatibleLLM options; keys here win.
   */
  constructor(apiKey, model = "deepseek-chat", options) {
    super(Object.assign(
      { baseUrl: "https://api.deepseek.com/v1", model, apiKey },
      options
    ));
  }
  toString() {
    return `DeepSeekLLM(model=${JSON.stringify(this.model)})`;
  }
};
// Cerebras: OpenAI-compatible endpoint with a fixed base URL.
var CerebrasLLM = class extends OpenAICompatibleLLM {
  /**
   * @param {string} apiKey - Cerebras API key.
   * @param {string} [model="llama-3.3-70b"]
   * @param {object} [options] - Extra OpenAICompatibleLLM options; keys here win.
   */
  constructor(apiKey, model = "llama-3.3-70b", options) {
    super(Object.assign(
      { baseUrl: "https://api.cerebras.ai/v1", model, apiKey },
      options
    ));
  }
  toString() {
    return `CerebrasLLM(model=${JSON.stringify(this.model)})`;
  }
};
// SambaNova: OpenAI-compatible endpoint with a fixed base URL.
var SambanovaLLM = class extends OpenAICompatibleLLM {
  /**
   * @param {string} apiKey - SambaNova API key.
   * @param {string} [model="Meta-Llama-3.1-70B-Instruct"]
   * @param {object} [options] - Extra OpenAICompatibleLLM options; keys here win.
   */
  constructor(apiKey, model = "Meta-Llama-3.1-70B-Instruct", options) {
    super(Object.assign(
      { baseUrl: "https://api.sambanova.ai/v1", model, apiKey },
      options
    ));
  }
  toString() {
    return `SambanovaLLM(model=${JSON.stringify(this.model)})`;
  }
};
// Hugging Face serverless Inference API via its OpenAI-compatible chat route.
// Uses fetch directly; no SDK dependency.
var HuggingFaceLLM = class extends BaseLLM {
  apiKey;
  model;
  maxTokens;
  /**
   * @param {string} apiKey - Hugging Face access token.
   * @param {string} [model="mistralai/Mistral-7B-Instruct-v0.3"] - Repo id of the model.
   * @param {number} [maxTokens=4096] - `max_tokens` request field.
   */
  constructor(apiKey, model = "mistralai/Mistral-7B-Instruct-v0.3", maxTokens = 4096) {
    super();
    this.apiKey = apiKey;
    this.model = model;
    this.maxTokens = maxTokens;
  }
  // Single-turn chat completion; throws on non-2xx responses.
  async generate(prompt) {
    const endpoint = `https://api-inference.huggingface.co/models/${this.model}/v1/chat/completions`;
    const request = {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "User-Agent": "TreeDex/0.1",
        Authorization: `Bearer ${this.apiKey}`
      },
      body: JSON.stringify({
        model: this.model,
        messages: [{ role: "user", content: prompt }],
        max_tokens: this.maxTokens
      }),
      // 120-second request timeout.
      signal: AbortSignal.timeout(12e4)
    };
    const resp = await fetch(endpoint, request);
    if (!resp.ok) {
      const errorBody = await resp.text();
      throw new Error(
        `HuggingFace request failed (${resp.status}): ${errorBody}`
      );
    }
    const body = await resp.json();
    return body.choices[0].message.content;
  }
  toString() {
    return `HuggingFaceLLM(model=${JSON.stringify(this.model)})`;
  }
};
// Local Ollama server provider using the non-streaming /api/generate endpoint.
// No API key required.
var OllamaLLM = class extends BaseLLM {
  model;
  baseUrl;
  /**
   * @param {string} [model="llama3"] - Local model name.
   * @param {string} [baseUrl="http://localhost:11434"] - Ollama server root.
   */
  constructor(model = "llama3", baseUrl = "http://localhost:11434") {
    super();
    this.model = model;
    // Strip trailing slashes so path joining below is predictable.
    this.baseUrl = baseUrl.replace(/\/+$/, "");
  }
  // Single-shot generation (stream: false) returning the full response text.
  async generate(prompt) {
    const endpoint = `${this.baseUrl}/api/generate`;
    const resp = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "User-Agent": "TreeDex/0.1"
      },
      body: JSON.stringify({ model: this.model, prompt, stream: false }),
      // 120-second request timeout.
      signal: AbortSignal.timeout(12e4)
    });
    if (!resp.ok) {
      const errorBody = await resp.text();
      throw new Error(
        `Ollama request failed (${resp.status}): ${errorBody}`
      );
    }
    const body = await resp.json();
    return body.response;
  }
  toString() {
    return `OllamaLLM(model=${JSON.stringify(this.model)})`;
  }
};
// Adapter that wraps a plain (prompt) => string | Promise<string> function as
// an LLM, for testing or custom backends.
var FunctionLLM = class extends BaseLLM {
  _fn;
  /**
   * @param {Function} fn - Callable receiving the prompt string.
   * @throws {TypeError} when fn is not a function.
   */
  constructor(fn) {
    super();
    if (typeof fn !== "function") {
      throw new TypeError(`Expected a function, got ${typeof fn}`);
    }
    this._fn = fn;
  }
  /**
   * Invoke the wrapped function (awaiting if it returns a promise).
   * @throws {TypeError} when the resolved result is not a string.
   */
  async generate(prompt) {
    const result = await this._fn(prompt);
    if (typeof result === "string") {
      return result;
    }
    throw new TypeError(
      `LLM function must return string, got ${typeof result}`
    );
  }
  toString() {
    return `FunctionLLM(fn=${this._fn.name || "anonymous"})`;
  }
};

// src/index.ts
// Module-load side effect: runs the deferred initializer for the pdf-parser
// chunk (esbuild __esm lazy-init pattern) — presumably registers the PDF
// loader before any export is used; TODO(review): confirm against src/index.ts.
init_pdf_parser();
// Annotate the CommonJS export names for ESM import in node:
// NOTE: the `0 && (...)` expression below is intentionally dead code emitted
// by the bundler. Node's static CJS analysis reads it to expose these named
// exports when this file is `import`ed from ESM. Do not remove or "simplify".
0 && (module.exports = {
  BaseLLM,
  CerebrasLLM,
  ClaudeLLM,
  CohereLLM,
  DOCXLoader,
  DeepSeekLLM,
  FireworksLLM,
  FunctionLLM,
  GeminiLLM,
  GroqLLM,
  HTMLLoader,
  HuggingFaceLLM,
  MistralLLM,
  OllamaLLM,
  OpenAICompatibleLLM,
  OpenAILLM,
  OpenRouterLLM,
  PDFLoader,
  QueryResult,
  SambanovaLLM,
  TextLoader,
  TogetherLLM,
  TreeDex,
  assignNodeIds,
  assignPageRanges,
  autoLoader,
  collectNodeTexts,
  countNodes,
  countTokens,
  createNodeMapping,
  embedTextInTree,
  extractJson,
  extractPages,
  findLargeNodes,
  getLeafNodes,
  groupPages,
  listToTree,
  pagesToTaggedText,
  printTree,
  retrievalPrompt,
  stripTextFromTree,
  structureContinuePrompt,
  structureExtractionPrompt,
  textToPages,
  treeToFlatList
});
//# sourceMappingURL=index.cjs.map