treedex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1213 @@
1
"use strict";
// esbuild CommonJS interop runtime helpers.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Wrap a single-entry initializer object into an idempotent init function:
// the first call runs the initializer (with `this` unbound) and caches its
// return value; later calls return the cached value.
var __esm = (fn, res) => function __init() {
  if (fn) {
    const initializer = fn[__getOwnPropNames(fn)[0]];
    fn = 0;
    res = initializer(0);
  }
  return res;
};
// Define each export as an enumerable live getter on `target`.
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};
// Copy own properties of `from` onto `to` as live getters, skipping keys that
// `to` already owns and the single `except` key; preserves enumerability.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (const key of __getOwnPropNames(from)) {
      if (__hasOwnProp.call(to, key) || key === except) continue;
      desc = __getOwnPropDesc(from, key);
      __defProp(to, key, {
        get: () => from[key],
        enumerable: !desc || desc.enumerable
      });
    }
  }
  return to;
};
// Adapt a CommonJS module object for ESM-style consumption.
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const base = isNodeMode || !mod || !mod.__esModule
    ? __defProp(target, "default", { value: mod, enumerable: true })
    : target;
  return __copyProps(base, mod);
};
// Mark a namespace object as an ES module and mirror its members onto it.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
32
+
33
// src/pdf-parser.ts
// Namespace object for the pdf-parser module; members are exposed as live
// getters registered through __export.
var pdf_parser_exports = {};
__export(pdf_parser_exports, {
  countTokens: () => countTokens,
  extractPages: () => extractPages,
  groupPages: () => groupPages,
  pagesToTaggedText: () => pagesToTaggedText
});
// Count tokens in a string with the gpt-tokenizer encoder.
// The (0, fn) form calls the encoder with `this` unbound.
function countTokens(text) {
  const tokens = (0, import_gpt_tokenizer.encode)(text);
  return tokens.length;
}
44
// Extract the text of every page in a PDF via pdf.js.
// Returns [{ page_num, text, token_count }] with 0-based page_num.
async function extractPages(pdfPath) {
  const fs = await import("fs/promises");
  const pdfjs = await import("pdfjs-dist/legacy/build/pdf.mjs");
  const data = await fs.readFile(pdfPath);
  const doc = await pdfjs.getDocument({ data }).promise;
  const pages = [];
  for (let pageIdx = 0; pageIdx < doc.numPages; pageIdx++) {
    // pdf.js pages are 1-based; our page_num stays 0-based.
    const page = await doc.getPage(pageIdx + 1);
    const content = await page.getTextContent();
    const fragments = [];
    for (const item of content.items) {
      fragments.push(typeof item.str === "string" ? item.str : "");
    }
    const text = fragments.join(" ");
    pages.push({
      page_num: pageIdx,
      text,
      token_count: countTokens(text)
    });
  }
  return pages;
}
65
// Render pages[start..end] (inclusive) as <physical_index_N>...</physical_index_N>
// tagged text, one page per line.
function pagesToTaggedText(pages, start, end) {
  return pages
    .slice(start, end + 1)
    .map(({ page_num, text }) => `<physical_index_${page_num}>${text}</physical_index_${page_num}>`)
    .join("\n");
}
// Partition pages into tagged-text groups of at most ~maxTokens tokens,
// overlapping consecutive groups by `overlap` pages for context continuity.
// A single page larger than maxTokens still forms its own group.
function groupPages(pages, maxTokens = 2e4, overlap = 1) {
  const totalTokens = pages.reduce((sum, p) => sum + p.token_count, 0);
  if (totalTokens <= maxTokens) {
    return [pagesToTaggedText(pages, 0, pages.length - 1)];
  }
  const groups = [];
  let start = 0;
  while (start < pages.length) {
    // Grow the window until the budget would be exceeded.
    let used = 0;
    let end = start;
    while (end < pages.length) {
      const cost = pages[end].token_count;
      if (used + cost > maxTokens && end > start) {
        end -= 1;
        break;
      }
      used += cost;
      end += 1;
    }
    end = Math.min(end, pages.length - 1);
    groups.push(pagesToTaggedText(pages, start, end));
    if (end >= pages.length - 1) {
      break;
    }
    // Step forward, re-including `overlap` pages but always making progress.
    start = Math.max(end + 1 - overlap, start + 1);
  }
  return groups;
}
105
// Deferred binding for the gpt-tokenizer CommonJS module; assigned on first init.
var import_gpt_tokenizer;
// Idempotent module initializer: the first call requires gpt-tokenizer,
// later calls are no-ops (see __esm).
var init_pdf_parser = __esm({
  "src/pdf-parser.ts"() {
    "use strict";
    import_gpt_tokenizer = require("gpt-tokenizer");
  }
});
112
+
113
// src/index.ts
// Root export table: every name below becomes an enumerable live getter on
// the CommonJS exports object (via __export and __toCommonJS).
var index_exports = {};
__export(index_exports, {
  BaseLLM: () => BaseLLM,
  CerebrasLLM: () => CerebrasLLM,
  ClaudeLLM: () => ClaudeLLM,
  CohereLLM: () => CohereLLM,
  DOCXLoader: () => DOCXLoader,
  DeepSeekLLM: () => DeepSeekLLM,
  FireworksLLM: () => FireworksLLM,
  FunctionLLM: () => FunctionLLM,
  GeminiLLM: () => GeminiLLM,
  GroqLLM: () => GroqLLM,
  HTMLLoader: () => HTMLLoader,
  HuggingFaceLLM: () => HuggingFaceLLM,
  MistralLLM: () => MistralLLM,
  OllamaLLM: () => OllamaLLM,
  OpenAICompatibleLLM: () => OpenAICompatibleLLM,
  OpenAILLM: () => OpenAILLM,
  OpenRouterLLM: () => OpenRouterLLM,
  PDFLoader: () => PDFLoader,
  QueryResult: () => QueryResult,
  SambanovaLLM: () => SambanovaLLM,
  TextLoader: () => TextLoader,
  TogetherLLM: () => TogetherLLM,
  TreeDex: () => TreeDex,
  assignNodeIds: () => assignNodeIds,
  assignPageRanges: () => assignPageRanges,
  autoLoader: () => autoLoader,
  collectNodeTexts: () => collectNodeTexts,
  countNodes: () => countNodes,
  countTokens: () => countTokens,
  createNodeMapping: () => createNodeMapping,
  embedTextInTree: () => embedTextInTree,
  extractJson: () => extractJson,
  extractPages: () => extractPages,
  findLargeNodes: () => findLargeNodes,
  getLeafNodes: () => getLeafNodes,
  groupPages: () => groupPages,
  listToTree: () => listToTree,
  pagesToTaggedText: () => pagesToTaggedText,
  printTree: () => printTree,
  retrievalPrompt: () => retrievalPrompt,
  stripTextFromTree: () => stripTextFromTree,
  structureContinuePrompt: () => structureContinuePrompt,
  structureExtractionPrompt: () => structureExtractionPrompt,
  textToPages: () => textToPages,
  treeToFlatList: () => treeToFlatList
});
module.exports = __toCommonJS(index_exports);
163
+
164
// src/loaders.ts
init_pdf_parser();
/**
 * Split raw text into fixed-size pseudo-pages.
 *
 * @param text - Full document text.
 * @param charsPerPage - Characters per page; must be a positive finite number.
 * @returns Array of { page_num, text, token_count } with 0-based page_num.
 * @throws Error if charsPerPage is zero, negative, or not finite — previously
 *         such values made the loop below never advance (infinite loop).
 */
function textToPages(text, charsPerPage = 3e3) {
  if (!Number.isFinite(charsPerPage) || charsPerPage <= 0) {
    throw new Error(`charsPerPage must be a positive number, got ${charsPerPage}`);
  }
  const pages = [];
  for (let i = 0; i < text.length; i += charsPerPage) {
    const chunk = text.slice(i, i + charsPerPage);
    pages.push({
      page_num: pages.length,
      text: chunk,
      token_count: countTokens(chunk)
    });
  }
  return pages;
}
178
// Loader for PDF files; defers to the pdf-parser module's extractPages.
var PDFLoader = class {
  /** Load a PDF from disk and return its pages. */
  async load(path) {
    const parser = await Promise.resolve().then(() => (init_pdf_parser(), pdf_parser_exports));
    return parser.extractPages(path);
  }
};
184
// Loader for plain-text files: reads UTF-8 and paginates by character count.
var TextLoader = class {
  charsPerPage;
  constructor(charsPerPage = 3e3) {
    this.charsPerPage = charsPerPage;
  }
  /** Read the file as UTF-8 and split into pseudo-pages. */
  async load(path) {
    const fs = await import("fs/promises");
    const raw = await fs.readFile(path, "utf-8");
    return textToPages(raw, this.charsPerPage);
  }
};
195
// Loader for HTML files: strips markup to plain text, then paginates.
var HTMLLoader = class {
  charsPerPage;
  constructor(charsPerPage = 3e3) {
    this.charsPerPage = charsPerPage;
  }
  /** Read, strip tags, and split into pseudo-pages. */
  async load(path) {
    const fs = await import("fs/promises");
    const markup = await fs.readFile(path, "utf-8");
    const plain = await this.stripHtml(markup);
    return textToPages(plain, this.charsPerPage);
  }
  /**
   * Convert HTML to plain text. Prefers htmlparser2 when installed;
   * otherwise falls back to a regex-based tag stripper.
   */
  async stripHtml(html) {
    try {
      const { Parser } = await import("htmlparser2");
      return new Promise((resolve) => {
        const chunks = [];
        let suppress = false; // inside <script>/<style>
        const parser = new Parser({
          onopentag(tag) {
            if (tag === "script" || tag === "style") suppress = true;
          },
          onclosetag(tag) {
            if (tag === "script" || tag === "style") suppress = false;
            // Block-level elements end a line.
            if (["p", "div", "br", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr"].includes(tag)) {
              chunks.push("\n");
            }
          },
          ontext(chunk) {
            if (!suppress) chunks.push(chunk);
          },
          onend() {
            const raw = chunks.join("");
            resolve(raw.replace(/\n{3,}/g, "\n\n").trim());
          }
        });
        parser.write(html);
        parser.end();
      });
    } catch {
      // htmlparser2 unavailable: best-effort regex stripping.
      return html
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
        .replace(/<[^>]+>/g, " ")
        .replace(/\s+/g, " ")
        .trim();
    }
  }
};
238
// Loader for .docx files: extracts raw text with mammoth, then paginates.
var DOCXLoader = class {
  charsPerPage;
  constructor(charsPerPage = 3e3) {
    this.charsPerPage = charsPerPage;
  }
  /** Extract raw text from the DOCX and split into pseudo-pages. */
  async load(path) {
    const fs = await import("fs/promises");
    const mammoth = await import("mammoth");
    const buffer = await fs.readFile(path);
    const extracted = await mammoth.extractRawText({ buffer });
    return textToPages(extracted.value, this.charsPerPage);
  }
};
251
// Lowercased file extension -> loader class.
var EXTENSION_MAP = {
  ".pdf": PDFLoader,
  ".txt": TextLoader,
  ".md": TextLoader,
  ".html": HTMLLoader,
  ".htm": HTMLLoader,
  ".docx": DOCXLoader
};
// Select a loader from the file extension and load the document.
// Throws for extensions without a registered loader.
async function autoLoader(filePath) {
  const { extname } = await import("path");
  const ext = extname(filePath).toLowerCase();
  const LoaderClass = EXTENSION_MAP[ext];
  if (LoaderClass === undefined) {
    const supported = Object.keys(EXTENSION_MAP).join(", ");
    throw new Error(
      `Unsupported file extension '${ext}'. Supported: ${supported}`
    );
  }
  return new LoaderClass().load(filePath);
}
272
+
273
+ // src/core.ts
274
+ init_pdf_parser();
275
+
276
// src/tree-builder.ts
// Build a tree from a flat section list using dotted "structure" IDs
// ("1", "1.1", "1.1.2"). A node whose parent ID is unknown (or who has a
// single-segment ID) becomes a root.
function listToTree(flatList) {
  const byStructure = {};
  const roots = [];
  for (const entry of flatList) {
    const node = { ...entry, nodes: [] };
    byStructure[node.structure] = node;
    const segments = node.structure.split(".");
    const parent = segments.length > 1
      ? byStructure[segments.slice(0, -1).join(".")]
      : undefined;
    if (parent === undefined) {
      roots.push(node);
    } else {
      parent.nodes.push(node);
    }
  }
  return roots;
}
299
// Recursively assign start_index/end_index to siblings: each node starts at
// its physical_index and ends just before the next sibling starts (the last
// sibling ends at boundaryEnd). Children are bounded by their parent's end.
function assignRanges(nodes, boundaryEnd) {
  nodes.forEach((node, i) => {
    node.start_index = node.physical_index ?? 0;
    node.end_index = i + 1 < nodes.length
      ? (nodes[i + 1].physical_index ?? 0) - 1
      : boundaryEnd;
    if (node.nodes.length > 0) {
      assignRanges(node.nodes, node.end_index);
    }
  });
}
// Assign page ranges across the whole tree given the total page count.
function assignPageRanges(tree, totalPages) {
  assignRanges(tree, totalPages - 1);
  return tree;
}
317
// Assign depth-first sequential node_ids, zero-padded to 4 digits
// ("0001", "0002", ...). Mutates and returns the tree.
function assignNodeIds(tree) {
  let next = 0;
  const visit = (nodes) => {
    for (const node of nodes) {
      next += 1;
      node.node_id = String(next).padStart(4, "0");
      visit(node.nodes);
    }
  };
  visit(tree);
  return tree;
}
329
// Collect (depth-first) every node whose page span exceeds maxPages, or —
// when `pages` is supplied — whose summed token_count exceeds maxTokens.
function findLargeNodes(tree, options = {}) {
  const { maxPages = 10, maxTokens = 2e4, pages = null } = options;
  const large = [];
  const visit = (nodes) => {
    for (const node of nodes) {
      const start = node.start_index ?? 0;
      const end = node.end_index ?? 0;
      let oversized = end - start + 1 > maxPages;
      if (!oversized && pages !== null) {
        let tokenSum = 0;
        for (const p of pages) {
          if (p.page_num >= start && p.page_num <= end) tokenSum += p.token_count;
        }
        oversized = tokenSum > maxTokens;
      }
      if (oversized) {
        large.push(node);
      }
      visit(node.nodes);
    }
  };
  visit(tree);
  return large;
}
351
// Attach the concatenated page text (newline-joined) of each node's page
// range as node.text. Mutates and returns the tree.
function embedTextInTree(tree, pages) {
  const visit = (nodes) => {
    for (const node of nodes) {
      const start = node.start_index ?? 0;
      const end = node.end_index ?? 0;
      const inRange = pages.filter((p) => p.page_num >= start && p.page_num <= end);
      node.text = inRange.map((p) => p.text).join("\n");
      visit(node.nodes);
    }
  };
  visit(tree);
  return tree;
}
363
+
364
// src/tree-utils.ts
// Build a node_id -> node lookup over the whole tree; nodes without a
// node_id are skipped.
function createNodeMapping(tree) {
  const mapping = {};
  const visit = (nodes) => {
    for (const node of nodes) {
      if (node.node_id !== undefined) {
        mapping[node.node_id] = node;
      }
      visit(node.nodes);
    }
  };
  visit(tree);
  return mapping;
}
378
// Return a deep copy of the tree with every node's `text` removed; the
// original tree is left untouched. (JSON round-trip: nodes are assumed
// JSON-serializable here.)
function stripTextFromTree(tree) {
  const clone = JSON.parse(JSON.stringify(tree));
  const scrub = (nodes) => {
    for (const node of nodes) {
      delete node.text;
      scrub(node.nodes);
    }
  };
  scrub(clone);
  return clone;
}
389
// Concatenate the texts of the given node IDs, each prefixed with a
// "[structure: title]" header; unknown IDs are skipped silently.
function collectNodeTexts(nodeIds, nodeMap) {
  const sections = [];
  for (const id of nodeIds) {
    const node = nodeMap[id];
    if (node === undefined) continue;
    const title = node.title ?? "Untitled";
    const structure = node.structure ?? "";
    const body = node.text ?? "";
    const header = structure ? `[${structure}: ${title}]` : `[${title}]`;
    sections.push(`${header}\n${body}`);
  }
  return sections.join("\n\n");
}
403
// Count every node in the tree, including all descendants.
function countNodes(tree) {
  return tree.reduce((total, node) => total + 1 + countNodes(node.nodes), 0);
}
411
// Collect (depth-first) every node that has no children.
function getLeafNodes(tree) {
  const leaves = [];
  const visit = (nodes) => {
    for (const node of nodes) {
      if (node.nodes.length === 0) {
        leaves.push(node);
      } else {
        visit(node.nodes);
      }
    }
  };
  visit(tree);
  return leaves;
}
425
// Flatten the tree depth-first into a list of nodes without their
// `nodes` arrays.
function treeToFlatList(tree) {
  const flat = [];
  const visit = (nodes) => {
    for (const node of nodes) {
      const { nodes: children, ...rest } = node;
      flat.push(rest);
      visit(children);
    }
  };
  visit(tree);
  return flat;
}
437
// Pull a JSON value out of free-form LLM output.
// Strategy: direct parse; then a fenced code block (```json ... ```);
// then the first balanced {...} or [...] span. Retries tolerate trailing
// commas before } or ]. Throws when nothing parseable is found.
function extractJson(text) {
  const MISS = Symbol("unparseable");
  const parseOr = (candidate) => {
    try {
      return JSON.parse(candidate);
    } catch {
      return MISS;
    }
  };
  // Parse, retrying once with trailing commas (",}" / ",]") removed.
  const parseLenient = (candidate) => {
    const direct = parseOr(candidate);
    if (direct !== MISS) return direct;
    return parseOr(candidate.replace(/,\s*([}\]])/g, "$1"));
  };
  const whole = parseOr(text);
  if (whole !== MISS) return whole;
  const fence = text.match(/```(?:json)?\s*\n?(.*?)```/s);
  if (fence) {
    const fenced = parseLenient(fence[1].trim());
    if (fenced !== MISS) return fenced;
  }
  for (const [open, close] of [
    ["{", "}"],
    ["[", "]"]
  ]) {
    const begin = text.indexOf(open);
    if (begin === -1) continue;
    let depth = 0;
    for (let i = begin; i < text.length; i++) {
      if (text[i] === open) {
        depth += 1;
      } else if (text[i] === close) {
        depth -= 1;
        if (depth === 0) {
          const found = parseLenient(text.slice(begin, i + 1));
          if (found !== MISS) return found;
          break; // first balanced span unparseable: try the other bracket kind
        }
      }
    }
  }
  throw new Error(
    `Could not extract JSON from text: ${text.slice(0, 200)}...`
  );
}
486
// Print the tree to the console, one node per line, one space of indent
// per depth level: "[id] structure: title (pages start-end)".
function printTree(tree, indent = 0) {
  const pad = " ".repeat(indent);
  for (const node of tree) {
    const id = node.node_id ?? "????";
    const numbering = node.structure ?? "";
    const label = node.title ?? "Untitled";
    const first = node.start_index ?? "?";
    const last = node.end_index ?? "?";
    console.log(
      `${pad}[${id}] ${numbering}: ${label} (pages ${first}-${last})`
    );
    printTree(node.nodes, indent + 1);
  }
}
500
+
501
// src/prompts.ts
// NOTE: the template bodies below are the exact strings sent to the LLM —
// do not reformat or translate them.
/**
 * Prompt for the first structure-extraction pass over a tagged page group.
 * Expects `text` to contain <physical_index_N> tags (see pagesToTaggedText).
 */
function structureExtractionPrompt(text) {
  return `You are a document structure analyzer. Given the following document text with physical page index tags, extract the hierarchical structure (table of contents).

Return a JSON list of objects, each with:
- "structure": hierarchical numbering like "1", "1.1", "1.2.3"
- "title": the section/chapter title
- "physical_index": the page number (from the <physical_index_N> tag) where this section starts

Rules:
- Use the physical_index tags to determine page numbers
- Create a logical hierarchy: chapters -> sections -> subsections
- Every section must have a unique structure ID
- Return ONLY valid JSON \u2014 no extra text

Document text:
${text}

JSON output:
`;
}
/**
 * Prompt for subsequent page groups: continues numbering from the JSON of
 * previously extracted sections and asks for NEW sections only.
 */
function structureContinuePrompt(previousStructure, text) {
  return `You are continuing to extract the hierarchical structure of a document.

Here is the structure extracted so far:
${previousStructure}

Now extract the structure from the next portion of the document. Continue the numbering from where the previous structure left off. If a section from the previous portion continues into this portion, do NOT duplicate it.

Return a JSON list of NEW sections only (same format as before).

Document text:
${text}

JSON output:
`;
}
/**
 * Prompt asking the LLM to pick the most relevant node IDs from a
 * (text-stripped) tree JSON for a user query; the reply is parsed by
 * extractJson into { node_ids, reasoning }.
 */
function retrievalPrompt(treeStructure, query) {
  return `You are a document retrieval system. Given a document's tree structure and a user query, select the most relevant sections that would contain the answer.

Document structure:
${treeStructure}

User query: ${query}

Return a JSON object with:
- "node_ids": list of node IDs (strings like "0001", "0005") that are most relevant to the query
- "reasoning": brief explanation of why these sections were selected

Select the smallest set of sections that fully covers the answer. Prefer leaf nodes over parent nodes when the leaf contains the specific content. Return ONLY valid JSON.

JSON output:
`;
}
555
+
556
// src/core.ts
// Result of TreeDex.query(): the assembled context text, the selected node
// IDs, their 0-based [start, end] page ranges, and the LLM's reasoning.
var QueryResult = class {
  context;
  nodeIds;
  pageRanges;
  reasoning;
  constructor(context, nodeIds, pageRanges, reasoning) {
    this.context = context;
    this.nodeIds = nodeIds;
    this.pageRanges = pageRanges;
    this.reasoning = reasoning;
  }
  /** Human-readable page ranges like 'pages 5-8, 12-15'. */
  get pagesStr() {
    if (this.pageRanges.length === 0) return "no pages";
    // Internal indices are 0-based; display as 1-based.
    const spans = this.pageRanges.map(([first, last]) =>
      first === last ? String(first + 1) : `${first + 1}-${last + 1}`
    );
    return "pages " + spans.join(", ");
  }
  toString() {
    return `QueryResult(nodes=${JSON.stringify(this.nodeIds)}, ${this.pagesStr}, context_len=${this.context.length})`;
  }
};
585
// Main index class: holds the section tree, the extracted pages, and an
// optional LLM backend used for structure extraction and retrieval.
var TreeDex = class _TreeDex {
  tree;     // hierarchical section nodes (with start/end indices and text)
  pages;    // flat [{ page_num, text, token_count }] list
  llm;      // default LLM backend, or null
  _nodeMap; // node_id -> node lookup built from `tree`
  constructor(tree, pages, llm = null) {
    this.tree = tree;
    this.pages = pages;
    this.llm = llm;
    this._nodeMap = createNodeMapping(tree);
  }
  /**
   * Build a TreeDex index from a file.
   *
   * @param path - Path to document (PDF, TXT, HTML, DOCX)
   * @param llm - LLM backend with .generate(prompt) method
   * @param options - Optional configuration
   */
  static async fromFile(path, llm, options) {
    const {
      loader,
      maxTokens = 2e4,
      overlap = 1,
      verbose = true
    } = options ?? {};
    if (verbose) {
      const { basename } = await import("path");
      console.log(`Loading: ${basename(path)}`);
    }
    let pages;
    // Explicit loader wins; otherwise pick by file extension.
    if (loader) {
      pages = await loader.load(path);
    } else {
      pages = await autoLoader(path);
    }
    if (verbose) {
      const totalTokens = pages.reduce((s, p) => s + p.token_count, 0);
      console.log(` ${pages.length} pages, ${totalTokens.toLocaleString()} tokens`);
    }
    return _TreeDex.fromPages(pages, llm, { maxTokens, overlap, verbose });
  }
  /** Build a TreeDex index from pre-extracted pages. */
  static async fromPages(pages, llm, options) {
    const { maxTokens = 2e4, overlap = 1, verbose = true } = options ?? {};
    const groups = groupPages(pages, maxTokens, overlap);
    if (verbose) {
      console.log(` ${groups.length} page group(s) for structure extraction`);
    }
    const allSections = [];
    // One LLM call per page group; later groups see the accumulated
    // structure so numbering continues without duplicates.
    for (let i = 0; i < groups.length; i++) {
      if (verbose) {
        console.log(
          ` Extracting structure from group ${i + 1}/${groups.length}...`
        );
      }
      let prompt;
      if (i === 0) {
        prompt = structureExtractionPrompt(groups[i]);
      } else {
        const prevJson = JSON.stringify(allSections, null, 2);
        prompt = structureContinuePrompt(prevJson, groups[i]);
      }
      const response = await llm.generate(prompt);
      const sections = extractJson(response);
      // Accept either a bare array or a { sections: [...] } wrapper;
      // any other shape from the LLM is silently ignored.
      if (Array.isArray(sections)) {
        allSections.push(
          ...sections
        );
      } else if (sections !== null && typeof sections === "object" && "sections" in sections) {
        allSections.push(
          ...sections.sections
        );
      }
    }
    if (verbose) {
      console.log(` Extracted ${allSections.length} sections`);
    }
    const tree = listToTree(allSections);
    assignPageRanges(tree, pages.length);
    assignNodeIds(tree);
    embedTextInTree(tree, pages);
    if (verbose) {
      console.log(` Tree: ${countNodes(tree)} nodes`);
    }
    return new _TreeDex(tree, pages, llm);
  }
  /** Create a TreeDex from an existing tree and pages. */
  static fromTree(tree, pages, llm = null) {
    return new _TreeDex(tree, pages, llm);
  }
  /**
   * Query the index and return relevant context.
   *
   * @param question - The user's question
   * @param llm - Optional LLM override. Uses this.llm if not provided.
   */
  async query(question, llm) {
    const activeLlm = llm ?? this.llm;
    if (!activeLlm) {
      throw new Error(
        "No LLM provided. Pass llm to query() or TreeDex constructor."
      );
    }
    // Send the structure only (text stripped) to keep the prompt small.
    const stripped = stripTextFromTree(this.tree);
    const treeJson = JSON.stringify(stripped, null, 2);
    const prompt = retrievalPrompt(treeJson, question);
    const response = await activeLlm.generate(prompt);
    const result = extractJson(response);
    const nodeIds = result.node_ids ?? [];
    const reasoning = result.reasoning ?? "";
    const context = collectNodeTexts(nodeIds, this._nodeMap);
    const pageRanges = [];
    for (const nid of nodeIds) {
      const node = this._nodeMap[nid];
      if (node) {
        const start = node.start_index ?? 0;
        const end = node.end_index ?? 0;
        pageRanges.push([start, end]);
      }
    }
    return new QueryResult(context, nodeIds, pageRanges, reasoning);
  }
  /** Save the index to a JSON file. */
  async save(path) {
    const fs = await import("fs/promises");
    // Persist without node text; load() re-embeds it from `pages`.
    const stripped = stripTextFromTree(this.tree);
    const data = {
      version: "1.0",
      framework: "TreeDex",
      tree: stripped,
      pages: this.pages
    };
    await fs.writeFile(path, JSON.stringify(data, null, 2), "utf-8");
    return path;
  }
  /** Load a TreeDex index from a JSON file. */
  static async load(path, llm) {
    const fs = await import("fs/promises");
    const raw = await fs.readFile(path, "utf-8");
    const data = JSON.parse(raw);
    const tree = data.tree;
    const pages = data.pages;
    // Recompute ranges and re-embed text dropped by save().
    assignPageRanges(tree, pages.length);
    embedTextInTree(tree, pages);
    return new _TreeDex(tree, pages, llm ?? null);
  }
  /** Pretty-print the tree structure. */
  showTree() {
    printTree(this.tree);
  }
  /** Return index statistics. */
  stats() {
    const totalTokens = this.pages.reduce((s, p) => s + p.token_count, 0);
    const leaves = getLeafNodes(this.tree);
    return {
      total_pages: this.pages.length,
      total_tokens: totalTokens,
      total_nodes: countNodes(this.tree),
      leaf_nodes: leaves.length,
      root_sections: this.tree.length
    };
  }
  /** Find sections that exceed size thresholds. */
  findLargeSections(options) {
    return findLargeNodes(this.tree, {
      maxPages: options?.maxPages ?? 10,
      maxTokens: options?.maxTokens ?? 2e4,
      pages: this.pages
    });
  }
};
756
+
757
// src/llm-backends.ts
// Common base for the LLM wrappers below; each subclass provides
// generate(prompt).
var BaseLLM = class {
  /** Render as "SubclassName()" for logging. */
  toString() {
    const name = this.constructor.name;
    return name + "()";
  }
};
763
// Google Gemini backend (@google/generative-ai SDK).
var GeminiLLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null; // cached SDK model handle
  constructor(apiKey, model = "gemini-2.0-flash") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  // Lazily construct the SDK client on first use.
  async getClient() {
    if (this._client !== null) return this._client;
    const { GoogleGenerativeAI } = await import("@google/generative-ai");
    const genai = new GoogleGenerativeAI(this.apiKey);
    this._client = genai.getGenerativeModel({ model: this.modelName });
    return this._client;
  }
  // Single-turn prompt; returns the completion text.
  async generate(prompt) {
    const model = await this.getClient();
    const result = await model.generateContent(prompt);
    return result.response.text();
  }
  toString() {
    return `GeminiLLM(model=${JSON.stringify(this.modelName)})`;
  }
};
// OpenAI backend (official openai SDK).
var OpenAILLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null;
  constructor(apiKey, model = "gpt-4o") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  async getClient() {
    if (this._client !== null) return this._client;
    const { default: OpenAI } = await import("openai");
    this._client = new OpenAI({ apiKey: this.apiKey });
    return this._client;
  }
  async generate(prompt) {
    const client = await this.getClient();
    const completion = await client.chat.completions.create({
      messages: [{ role: "user", content: prompt }],
      model: this.modelName
    });
    return completion.choices[0].message.content;
  }
  toString() {
    return `OpenAILLM(model=${JSON.stringify(this.modelName)})`;
  }
};
// Anthropic Claude backend (@anthropic-ai/sdk).
var ClaudeLLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null;
  constructor(apiKey, model = "claude-sonnet-4-20250514") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  async getClient() {
    if (this._client !== null) return this._client;
    const { default: Anthropic } = await import("@anthropic-ai/sdk");
    this._client = new Anthropic({ apiKey: this.apiKey });
    return this._client;
  }
  async generate(prompt) {
    const client = await this.getClient();
    const message = await client.messages.create({
      messages: [{ role: "user", content: prompt }],
      max_tokens: 4096,
      model: this.modelName
    });
    return message.content[0].text;
  }
  toString() {
    return `ClaudeLLM(model=${JSON.stringify(this.modelName)})`;
  }
};
// Mistral backend (@mistralai/mistralai).
var MistralLLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null;
  constructor(apiKey, model = "mistral-large-latest") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  async getClient() {
    if (this._client !== null) return this._client;
    const { Mistral } = await import("@mistralai/mistralai");
    this._client = new Mistral({ apiKey: this.apiKey });
    return this._client;
  }
  async generate(prompt) {
    const client = await this.getClient();
    const completion = await client.chat.complete({
      messages: [{ role: "user", content: prompt }],
      model: this.modelName
    });
    return completion.choices[0].message.content;
  }
  toString() {
    return `MistralLLM(model=${JSON.stringify(this.modelName)})`;
  }
};
// Cohere backend (cohere-ai v2 client).
var CohereLLM = class extends BaseLLM {
  apiKey;
  modelName;
  _client = null;
  constructor(apiKey, model = "command-r-plus") {
    super();
    this.apiKey = apiKey;
    this.modelName = model;
  }
  async getClient() {
    if (this._client !== null) return this._client;
    const { CohereClientV2 } = await import("cohere-ai");
    this._client = new CohereClientV2({ token: this.apiKey });
    return this._client;
  }
  async generate(prompt) {
    const client = await this.getClient();
    const reply = await client.chat({
      messages: [{ role: "user", content: prompt }],
      model: this.modelName
    });
    return reply.message.content[0].text;
  }
  toString() {
    return `CohereLLM(model=${JSON.stringify(this.modelName)})`;
  }
};
902
// Generic backend for any OpenAI-compatible /chat/completions HTTP API.
var OpenAICompatibleLLM = class extends BaseLLM {
  baseUrl;
  model;
  apiKey;
  maxTokens;
  temperature;
  extraHeaders;
  constructor(options) {
    super();
    this.baseUrl = options.baseUrl.replace(/\/+$/, ""); // drop trailing slashes
    this.model = options.model;
    this.apiKey = options.apiKey ?? null;
    this.maxTokens = options.maxTokens ?? 4096;
    this.temperature = options.temperature ?? 0;
    this.extraHeaders = options.extraHeaders ?? {};
  }
  // Request headers: JSON content type, optional bearer auth, caller extras.
  buildHeaders() {
    return {
      "Content-Type": "application/json",
      "User-Agent": "TreeDex/0.1",
      ...(this.apiKey ? { Authorization: `Bearer ${this.apiKey}` } : {}),
      ...this.extraHeaders
    };
  }
  // POST a single-turn chat completion; 120s timeout via AbortSignal.
  async generate(prompt) {
    const endpoint = `${this.baseUrl}/chat/completions`;
    const payload = {
      model: this.model,
      messages: [{ role: "user", content: prompt }],
      max_tokens: this.maxTokens,
      temperature: this.temperature
    };
    const resp = await fetch(endpoint, {
      method: "POST",
      headers: this.buildHeaders(),
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(12e4)
    });
    if (!resp.ok) {
      const errorBody = await resp.text();
      throw new Error(
        `API request failed (${resp.status}): ${errorBody}`
      );
    }
    const parsed = await resp.json();
    return parsed.choices[0].message.content;
  }
  toString() {
    return `OpenAICompatibleLLM(baseUrl=${JSON.stringify(this.baseUrl)}, model=${JSON.stringify(this.model)})`;
  }
};
956
// Groq backend (groq-sdk).
var GroqLLM = class extends BaseLLM {
  apiKey;
  model;
  _client = null;
  constructor(apiKey, model = "llama-3.3-70b-versatile") {
    super();
    this.apiKey = apiKey;
    this.model = model;
  }
  // Lazily construct the SDK client on first use.
  async getClient() {
    if (this._client !== null) return this._client;
    const { default: Groq } = await import("groq-sdk");
    this._client = new Groq({ apiKey: this.apiKey });
    return this._client;
  }
  async generate(prompt) {
    const client = await this.getClient();
    const completion = await client.chat.completions.create({
      messages: [{ role: "user", content: prompt }],
      model: this.model
    });
    return completion.choices[0].message.content;
  }
  toString() {
    return `GroqLLM(model=${JSON.stringify(this.model)})`;
  }
};
984
var TogetherLLM = class extends OpenAICompatibleLLM {
  /**
   * Together AI backend (OpenAI-compatible chat-completions endpoint).
   * @param {string} apiKey - Together API key.
   * @param {string} [model="meta-llama/Llama-3-70b-chat-hf"] - Model id.
   * @param {object} [options] - Extra OpenAICompatibleLLM options; merged
   *   last, so it may override baseUrl/model/apiKey.
   */
  constructor(apiKey, model = "meta-llama/Llama-3-70b-chat-hf", options) {
    const config = Object.assign(
      { baseUrl: "https://api.together.xyz/v1", model, apiKey },
      options
    );
    super(config);
  }
  /** @returns {string} Human-readable description of this backend. */
  toString() {
    return `TogetherLLM(model=${JSON.stringify(this.model)})`;
  }
};
997
var FireworksLLM = class extends OpenAICompatibleLLM {
  /**
   * Fireworks AI backend (OpenAI-compatible inference endpoint).
   * @param {string} apiKey - Fireworks API key.
   * @param {string} [model] - Model id; defaults to Llama 3.1 70B Instruct.
   * @param {object} [options] - Extra OpenAICompatibleLLM options; merged
   *   last, so it may override baseUrl/model/apiKey.
   */
  constructor(apiKey, model = "accounts/fireworks/models/llama-v3p1-70b-instruct", options) {
    const defaults = {
      baseUrl: "https://api.fireworks.ai/inference/v1",
      model,
      apiKey
    };
    super({ ...defaults, ...options });
  }
  /** @returns {string} Human-readable description of this backend. */
  toString() {
    return `FireworksLLM(model=${JSON.stringify(this.model)})`;
  }
};
1010
var OpenRouterLLM = class extends OpenAICompatibleLLM {
  /**
   * OpenRouter backend (OpenAI-compatible routing endpoint).
   * @param {string} apiKey - OpenRouter API key.
   * @param {string} [model="anthropic/claude-sonnet-4"] - Routed model id.
   * @param {object} [options] - Extra OpenAICompatibleLLM options; merged
   *   last, so it may override baseUrl/model/apiKey.
   */
  constructor(apiKey, model = "anthropic/claude-sonnet-4", options) {
    const config = Object.assign(
      { baseUrl: "https://openrouter.ai/api/v1", model, apiKey },
      options
    );
    super(config);
  }
  /** @returns {string} Human-readable description of this backend. */
  toString() {
    return `OpenRouterLLM(model=${JSON.stringify(this.model)})`;
  }
};
1023
var DeepSeekLLM = class extends OpenAICompatibleLLM {
  /**
   * DeepSeek backend (OpenAI-compatible chat endpoint).
   * @param {string} apiKey - DeepSeek API key.
   * @param {string} [model="deepseek-chat"] - Model id.
   * @param {object} [options] - Extra OpenAICompatibleLLM options; merged
   *   last, so it may override baseUrl/model/apiKey.
   */
  constructor(apiKey, model = "deepseek-chat", options) {
    const defaults = {
      baseUrl: "https://api.deepseek.com/v1",
      model,
      apiKey
    };
    super({ ...defaults, ...options });
  }
  /** @returns {string} Human-readable description of this backend. */
  toString() {
    return `DeepSeekLLM(model=${JSON.stringify(this.model)})`;
  }
};
1036
var CerebrasLLM = class extends OpenAICompatibleLLM {
  /**
   * Cerebras backend (OpenAI-compatible inference endpoint).
   * @param {string} apiKey - Cerebras API key.
   * @param {string} [model="llama-3.3-70b"] - Model id.
   * @param {object} [options] - Extra OpenAICompatibleLLM options; merged
   *   last, so it may override baseUrl/model/apiKey.
   */
  constructor(apiKey, model = "llama-3.3-70b", options) {
    const config = Object.assign(
      { baseUrl: "https://api.cerebras.ai/v1", model, apiKey },
      options
    );
    super(config);
  }
  /** @returns {string} Human-readable description of this backend. */
  toString() {
    return `CerebrasLLM(model=${JSON.stringify(this.model)})`;
  }
};
1049
var SambanovaLLM = class extends OpenAICompatibleLLM {
  /**
   * SambaNova backend (OpenAI-compatible inference endpoint).
   * @param {string} apiKey - SambaNova API key.
   * @param {string} [model="Meta-Llama-3.1-70B-Instruct"] - Model id.
   * @param {object} [options] - Extra OpenAICompatibleLLM options; merged
   *   last, so it may override baseUrl/model/apiKey.
   */
  constructor(apiKey, model = "Meta-Llama-3.1-70B-Instruct", options) {
    const defaults = {
      baseUrl: "https://api.sambanova.ai/v1",
      model,
      apiKey
    };
    super({ ...defaults, ...options });
  }
  /** @returns {string} Human-readable description of this backend. */
  toString() {
    return `SambanovaLLM(model=${JSON.stringify(this.model)})`;
  }
};
1062
var HuggingFaceLLM = class extends BaseLLM {
  // Hugging Face API token sent as a Bearer credential.
  apiKey;
  // Repo-style model id (e.g. "org/model"); interpolated into the URL path.
  model;
  // Completion token cap passed through as `max_tokens`.
  maxTokens;
  /**
   * @param {string} apiKey - Hugging Face API token.
   * @param {string} [model="mistralai/Mistral-7B-Instruct-v0.3"] - Model id.
   * @param {number} [maxTokens=4096] - Maximum tokens to generate.
   */
  constructor(apiKey, model = "mistralai/Mistral-7B-Instruct-v0.3", maxTokens = 4096) {
    super();
    this.apiKey = apiKey;
    this.model = model;
    this.maxTokens = maxTokens;
  }
  /**
   * POST a single-turn chat completion to the serverless Inference API.
   * @param {string} prompt - User message content.
   * @returns {Promise<string>} The first choice's message content.
   * @throws {Error} When the HTTP response is not ok (body text included).
   */
  async generate(prompt) {
    const endpoint = `https://api-inference.huggingface.co/models/${this.model}/v1/chat/completions`;
    const requestBody = JSON.stringify({
      model: this.model,
      messages: [{ role: "user", content: prompt }],
      max_tokens: this.maxTokens
    });
    // 120s cap on the whole request via AbortSignal.timeout.
    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "User-Agent": "TreeDex/0.1",
        Authorization: `Bearer ${this.apiKey}`
      },
      body: requestBody,
      signal: AbortSignal.timeout(12e4)
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(
        `HuggingFace request failed (${response.status}): ${detail}`
      );
    }
    const data = await response.json();
    return data.choices[0].message.content;
  }
  /** @returns {string} Human-readable description of this backend. */
  toString() {
    return `HuggingFaceLLM(model=${JSON.stringify(this.model)})`;
  }
};
1102
var OllamaLLM = class extends BaseLLM {
  // Ollama model name (e.g. "llama3").
  model;
  // Server root URL, stored with any trailing slashes stripped.
  baseUrl;
  /**
   * @param {string} [model="llama3"] - Local model name.
   * @param {string} [baseUrl="http://localhost:11434"] - Ollama server URL.
   */
  constructor(model = "llama3", baseUrl = "http://localhost:11434") {
    super();
    this.model = model;
    this.baseUrl = baseUrl.replace(/\/+$/, "");
  }
  /**
   * POST a non-streaming completion to the /api/generate endpoint.
   * @param {string} prompt - Raw prompt text.
   * @returns {Promise<string>} The `response` field of the JSON body.
   * @throws {Error} When the HTTP response is not ok (body text included).
   */
  async generate(prompt) {
    // stream:false makes Ollama return one JSON object instead of NDJSON.
    const requestBody = JSON.stringify({
      model: this.model,
      prompt,
      stream: false
    });
    // 120s cap on the whole request via AbortSignal.timeout.
    const response = await fetch(`${this.baseUrl}/api/generate`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "User-Agent": "TreeDex/0.1"
      },
      body: requestBody,
      signal: AbortSignal.timeout(12e4)
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(
        `Ollama request failed (${response.status}): ${detail}`
      );
    }
    const data = await response.json();
    return data.response;
  }
  /** @returns {string} Human-readable description of this backend. */
  toString() {
    return `OllamaLLM(model=${JSON.stringify(this.model)})`;
  }
};
1139
var FunctionLLM = class extends BaseLLM {
  // User-supplied callable: (prompt: string) => string | Promise<string>.
  _fn;
  /**
   * Adapter that turns a plain function into an LLM backend.
   * @param {Function} fn - Callable invoked with the prompt.
   * @throws {TypeError} If `fn` is not a function.
   */
  constructor(fn) {
    super();
    if (typeof fn !== "function") {
      throw new TypeError(`Expected a function, got ${typeof fn}`);
    }
    this._fn = fn;
  }
  /**
   * Invoke the wrapped function (awaiting any returned promise).
   * @param {string} prompt - Prompt forwarded to the function.
   * @returns {Promise<string>} The function's string result.
   * @throws {TypeError} If the resolved result is not a string.
   */
  async generate(prompt) {
    const output = await this._fn(prompt);
    if (typeof output === "string") {
      return output;
    }
    throw new TypeError(
      `LLM function must return string, got ${typeof output}`
    );
  }
  /** @returns {string} Description including the wrapped function's name. */
  toString() {
    return `FunctionLLM(fn=${this._fn.name || "anonymous"})`;
  }
};
1162
+
1163
+ // src/index.ts
1164
+ init_pdf_parser();
1165
+ // Annotate the CommonJS export names for ESM import in node:
1166
+ 0 && (module.exports = {
1167
+ BaseLLM,
1168
+ CerebrasLLM,
1169
+ ClaudeLLM,
1170
+ CohereLLM,
1171
+ DOCXLoader,
1172
+ DeepSeekLLM,
1173
+ FireworksLLM,
1174
+ FunctionLLM,
1175
+ GeminiLLM,
1176
+ GroqLLM,
1177
+ HTMLLoader,
1178
+ HuggingFaceLLM,
1179
+ MistralLLM,
1180
+ OllamaLLM,
1181
+ OpenAICompatibleLLM,
1182
+ OpenAILLM,
1183
+ OpenRouterLLM,
1184
+ PDFLoader,
1185
+ QueryResult,
1186
+ SambanovaLLM,
1187
+ TextLoader,
1188
+ TogetherLLM,
1189
+ TreeDex,
1190
+ assignNodeIds,
1191
+ assignPageRanges,
1192
+ autoLoader,
1193
+ collectNodeTexts,
1194
+ countNodes,
1195
+ countTokens,
1196
+ createNodeMapping,
1197
+ embedTextInTree,
1198
+ extractJson,
1199
+ extractPages,
1200
+ findLargeNodes,
1201
+ getLeafNodes,
1202
+ groupPages,
1203
+ listToTree,
1204
+ pagesToTaggedText,
1205
+ printTree,
1206
+ retrievalPrompt,
1207
+ stripTextFromTree,
1208
+ structureContinuePrompt,
1209
+ structureExtractionPrompt,
1210
+ textToPages,
1211
+ treeToFlatList
1212
+ });
1213
+ //# sourceMappingURL=index.cjs.map