react-native-pageindex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +25 -0
  2. package/LICENSE +21 -0
  3. package/README.md +405 -0
  4. package/dist/config.d.ts +4 -0
  5. package/dist/config.d.ts.map +1 -0
  6. package/dist/config.js +22 -0
  7. package/dist/config.js.map +1 -0
  8. package/dist/index.d.ts +49 -0
  9. package/dist/index.d.ts.map +1 -0
  10. package/dist/index.js +75 -0
  11. package/dist/index.js.map +1 -0
  12. package/dist/pageIndex.d.ts +48 -0
  13. package/dist/pageIndex.d.ts.map +1 -0
  14. package/dist/pageIndex.js +962 -0
  15. package/dist/pageIndex.js.map +1 -0
  16. package/dist/pageIndexDocument.d.ts +85 -0
  17. package/dist/pageIndexDocument.d.ts.map +1 -0
  18. package/dist/pageIndexDocument.js +145 -0
  19. package/dist/pageIndexDocument.js.map +1 -0
  20. package/dist/pageIndexMd.d.ts +31 -0
  21. package/dist/pageIndexMd.d.ts.map +1 -0
  22. package/dist/pageIndexMd.js +260 -0
  23. package/dist/pageIndexMd.js.map +1 -0
  24. package/dist/parsers/csv.d.ts +17 -0
  25. package/dist/parsers/csv.d.ts.map +1 -0
  26. package/dist/parsers/csv.js +147 -0
  27. package/dist/parsers/csv.js.map +1 -0
  28. package/dist/parsers/docx.d.ts +20 -0
  29. package/dist/parsers/docx.d.ts.map +1 -0
  30. package/dist/parsers/docx.js +134 -0
  31. package/dist/parsers/docx.js.map +1 -0
  32. package/dist/parsers/xlsx.d.ts +19 -0
  33. package/dist/parsers/xlsx.d.ts.map +1 -0
  34. package/dist/parsers/xlsx.js +121 -0
  35. package/dist/parsers/xlsx.js.map +1 -0
  36. package/dist/reverseIndex.d.ts +39 -0
  37. package/dist/reverseIndex.d.ts.map +1 -0
  38. package/dist/reverseIndex.js +248 -0
  39. package/dist/reverseIndex.js.map +1 -0
  40. package/dist/types.d.ts +190 -0
  41. package/dist/types.d.ts.map +1 -0
  42. package/dist/types.js +4 -0
  43. package/dist/types.js.map +1 -0
  44. package/dist/utils/json.d.ts +13 -0
  45. package/dist/utils/json.d.ts.map +1 -0
  46. package/dist/utils/json.js +69 -0
  47. package/dist/utils/json.js.map +1 -0
  48. package/dist/utils/pdf.d.ts +20 -0
  49. package/dist/utils/pdf.d.ts.map +1 -0
  50. package/dist/utils/pdf.js +96 -0
  51. package/dist/utils/pdf.js.map +1 -0
  52. package/dist/utils/progress.d.ts +29 -0
  53. package/dist/utils/progress.d.ts.map +1 -0
  54. package/dist/utils/progress.js +59 -0
  55. package/dist/utils/progress.js.map +1 -0
  56. package/dist/utils/tokens.d.ts +7 -0
  57. package/dist/utils/tokens.d.ts.map +1 -0
  58. package/dist/utils/tokens.js +12 -0
  59. package/dist/utils/tokens.js.map +1 -0
  60. package/dist/utils/tree.d.ts +88 -0
  61. package/dist/utils/tree.d.ts.map +1 -0
  62. package/dist/utils/tree.js +365 -0
  63. package/dist/utils/tree.js.map +1 -0
  64. package/package.json +76 -0
@@ -0,0 +1,248 @@
1
"use strict";
/**
 * Reverse (inverted) index — maps terms → tree nodes that contain them.
 *
 * Two modes:
 *   'keyword' — fast, no LLM. Extracts stopword-filtered terms with TF scoring.
 *   'llm'     — slower, semantic. Uses LLM to extract concept terms per node.
 */
Object.defineProperty(exports, "__esModule", { value: true });
// CommonJS exports (compiled from TS); assignments precede the declarations
// below and rely on function-declaration hoisting.
exports.buildReverseIndex = buildReverseIndex;
exports.searchReverseIndex = searchReverseIndex;
const tree_1 = require("./utils/tree");
// ─── Stopwords ────────────────────────────────────────────────────────────────
// English function words excluded from both keyword extraction and query
// tokenisation. All entries are lowercase; tokenise() lowercases its input,
// so membership checks are effectively case-insensitive.
const STOPWORDS = new Set([
    'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
    'would', 'could', 'should', 'may', 'might', 'shall', 'can', 'not',
    'this', 'that', 'these', 'those', 'it', 'its', 'we', 'our', 'you',
    'your', 'he', 'she', 'they', 'their', 'my', 'his', 'her', 'which',
    'who', 'what', 'when', 'where', 'how', 'if', 'then', 'so', 'than',
    'more', 'also', 'about', 'into', 'up', 'out', 'no', 'all', 'each',
    'any', 'some', 'other', 'new', 'one', 'two', 'such', 'only', 'over',
    'after', 'before', 'between', 'through', 'during', 'including', 'without',
    'within', 'along', 'following', 'across', 'behind', 'beyond', 'plus',
    'except', 'however', 'therefore', 'thus', 'hence', 'while', 'although',
    'because', 'since', 'unless', 'until', 'whether', 'both', 'either',
    'neither', 'per', 'via', 'etc', 'ie', 'eg',
]);
30
// ─── Keyword Extraction ───────────────────────────────────────────────────────
/**
 * Lowercases `text` and splits it into word tokens.
 *
 * Punctuation is replaced with spaces; hyphens inside words are preserved
 * ("state-of-the-art") while leading/trailing hyphens are stripped.
 * Uses Unicode property escapes so accented and non-Latin letters survive —
 * the previous `\w`-based class mangled e.g. "café" into "caf". For ASCII
 * input the behavior is unchanged (`\p{L}\p{N}_` ⊇ `\w`).
 *
 * @param text Raw text to tokenise.
 * @returns Non-empty, lowercased tokens.
 */
function tokenise(text) {
    return text
        .toLowerCase()
        // Keep letters, digits, underscore, whitespace, hyphen; blank the rest.
        .replace(/[^\p{L}\p{N}_\s-]/gu, ' ')
        .split(/\s+/)
        .map((t) => t.replace(/^-+|-+$/g, '').trim())
        .filter(Boolean);
}
39
/**
 * Builds a raw term-frequency map for `text`.
 *
 * Tokens shorter than `minLength`, stopwords, and purely numeric tokens are
 * skipped. Returns a Map of term → occurrence count.
 */
function extractKeywords(text, minLength) {
    const counts = new Map();
    const isIndexable = (token) =>
        token.length >= minLength &&
        !STOPWORDS.has(token) &&
        !/^\d+$/.test(token); // skip pure numbers
    for (const token of tokenise(text)) {
        if (isIndexable(token)) {
            counts.set(token, (counts.get(token) ?? 0) + 1);
        }
    }
    return counts;
}
53
// Normalise TF into 0-1 score (log-normalised).
// Maps a raw count onto [0, 1] relative to the node's most frequent term;
// a zero maxCount (empty node) yields 0 rather than dividing by zero.
function normaliseTf(count, maxCount) {
    return maxCount === 0 ? 0 : Math.log1p(count) / Math.log1p(maxCount);
}
59
// ─── LLM Prompt ───────────────────────────────────────────────────────────────
/**
 * Renders the concept-extraction prompt sent to the LLM for one tree node.
 * The wording must stay stable: extractLlmTerms() parses the JSON-array reply.
 */
function buildLlmPrompt(nodeTitle, nodeSummary, maxTerms) {
    const lines = [
        `Extract up to ${maxTerms} key concepts, named entities, or important terms from the section below.`,
        'Return a JSON array of strings only — short terms or phrases (1–4 words each), no duplicates, no stopwords.',
        '',
        `Section title: ${nodeTitle}`,
        `Section summary: ${nodeSummary}`,
        '',
        'Respond with ONLY a JSON array, e.g.: ["machine learning", "gradient descent", "neural network"]',
    ];
    return lines.join('\n');
}
69
/**
 * Asks the LLM for up to `maxTerms` concept terms for one tree node.
 *
 * Context is the node's title plus its best available text
 * (summary → prefix_summary → text). Returns [] for empty nodes, for
 * malformed or non-array replies, and on any LLM error — a failed node
 * simply contributes no terms rather than aborting the whole build.
 *
 * @param node     Tree node to extract terms from.
 * @param llm      LLM provider callback.
 * @param maxTerms Upper bound on returned terms.
 * @returns Lowercased, trimmed, de-duplicated terms (length ≥ 2).
 */
async function extractLlmTerms(node, llm, maxTerms) {
    const title = node.title ?? '';
    const summary = node.summary ?? node.prefix_summary ?? node.text ?? '';
    if (!title && !summary)
        return [];
    const prompt = buildLlmPrompt(title, summary, maxTerms);
    try {
        const result = await llm(prompt);
        // Grab the outermost JSON array; the model may wrap it in prose.
        const match = result.content.match(/\[[\s\S]*\]/);
        if (!match)
            return [];
        const parsed = JSON.parse(match[0]);
        if (!Array.isArray(parsed))
            return [];
        const cleaned = parsed
            .filter((t) => typeof t === 'string')
            .map((t) => t.toLowerCase().trim())
            .filter((t) => t.length >= 2);
        // De-duplicate (first occurrence wins): the prompt forbids duplicates but
        // models return them anyway, and a duplicate term would push the same
        // node into that term's posting list twice.
        return [...new Set(cleaned)].slice(0, maxTerms);
    }
    catch {
        // Bad JSON or a failed LLM call degrades to "no terms for this node".
        return [];
    }
}
93
// ─── Node Text for Keyword Mode ───────────────────────────────────────────────
/**
 * Concatenates the searchable text of a tree node for keyword extraction.
 * The title is listed twice so title terms weigh 2x relative to body terms.
 */
function nodeTextForKeywords(node) {
    const pieces = [
        node.title,
        node.title, // counted twice → double term weight
        node.summary,
        node.prefix_summary,
        node.text,
    ];
    return pieces.filter(Boolean).join(' ');
}
106
// ─── Build Reverse Index ──────────────────────────────────────────────────────
/**
 * Builds an inverted index from a `PageIndexResult`.
 *
 * - 'keyword' mode (default): stopword-filtered TF scoring over each node's
 *   title/summary/text — fast, makes no LLM calls.
 * - 'llm' mode: the LLM extracts semantic concept terms from each node's
 *   title + summary — slower, but can surface synonyms and concepts.
 *
 * @param input.result  Forward-index output from `pageIndex()` / `pageIndexMd()`
 * @param input.llm     LLM provider (required when mode is 'llm')
 * @param input.options Index options (mode, minTermLength, maxTermsPerNode, onProgress)
 * @returns Reverse index whose per-term posting lists are sorted by score desc.
 * @throws Error when mode is 'llm' and no LLM provider was supplied.
 */
async function buildReverseIndex(input) {
    const { result, llm, options = {} } = input;
    const { mode = 'keyword', minTermLength = 3, maxTermsPerNode = 10, onProgress, } = options;
    if (mode === 'llm' && !llm) {
        throw new Error('[PageIndex] LLM provider is required when mode is "llm"');
    }
    // Flatten the tree — every node (branch and leaf) gets posting entries.
    const nodes = (0, tree_1.getNodes)(result.structure);
    const total = nodes.length;
    const terms = {};
    // Appends a scored posting for `term`, creating the list on first use.
    const addPosting = (term, baseEntry, score) => {
        if (!terms[term])
            terms[term] = [];
        terms[term].push({ ...baseEntry, score });
    };
    for (let idx = 0; idx < total; idx++) {
        const node = nodes[idx];
        onProgress?.({
            step: mode === 'llm' ? 'Extracting concepts via LLM' : 'Extracting keywords',
            percent: Math.round((idx / total) * 90),
            detail: `Node ${idx + 1} / ${total}: ${node.title ?? ''}`,
        });
        const baseEntry = {
            nodeId: node.node_id,
            nodeTitle: node.title ?? '',
            startIndex: node.start_index,
            endIndex: node.end_index,
            score: 0, // per-term score is assigned in addPosting
        };
        if (mode === 'keyword') {
            const tf = extractKeywords(nodeTextForKeywords(node), minTermLength);
            if (tf.size === 0)
                continue; // nothing indexable on this node
            const maxCount = Math.max(...tf.values());
            // Keep only the node's top-N terms by raw count.
            const topTerms = [...tf.entries()]
                .sort((a, b) => b[1] - a[1])
                .slice(0, maxTermsPerNode);
            for (const [term, count] of topTerms) {
                addPosting(term, baseEntry, normaliseTf(count, maxCount));
            }
        }
        else {
            // LLM mode
            const termList = await extractLlmTerms(node, llm, maxTermsPerNode);
            termList.forEach((term, rank) => {
                // Earlier terms matter more: score decays linearly with rank.
                addPosting(term, baseEntry, 1 - rank / termList.length);
            });
        }
    }
    // Highest-scoring nodes first within every posting list.
    for (const list of Object.values(terms)) {
        list.sort((a, b) => b.score - a.score);
    }
    onProgress?.({
        step: 'Reverse index complete',
        percent: 100,
        detail: `${Object.keys(terms).length} terms across ${total} nodes`,
    });
    return {
        docName: result.doc_name,
        terms,
        stats: {
            totalTerms: Object.keys(terms).length,
            totalNodes: total,
            indexMode: mode,
            indexedAt: new Date().toISOString(),
        },
    };
}
197
// ─── Search ───────────────────────────────────────────────────────────────────
/**
 * Queries the reverse index with a free-text string.
 *
 * The query is tokenised and stopword-filtered; each remaining term is looked
 * up exactly and as a substring of indexed terms (partial hits are penalised
 * ×0.6). A node matched by several terms accumulates a combined `totalScore`,
 * which determines the final ranking; `score`/`matchedTerm` reflect its best
 * single-term hit.
 *
 * @param index The reverse index (from `buildReverseIndex`)
 * @param query Free-text query string
 * @param topK  Max results to return (default: 10)
 */
function searchReverseIndex(index, query, topK = 10) {
    const queryTerms = tokenise(query).filter((t) => t.length >= 2 && !STOPWORDS.has(t));
    if (queryTerms.length === 0)
        return [];
    // Keyed by nodeId (falling back to nodeTitle) → merged result.
    const merged = new Map();
    const fold = (hit, qTerm) => {
        const key = hit.nodeId ?? hit.nodeTitle;
        const prev = merged.get(key);
        if (!prev) {
            merged.set(key, { ...hit, matchedTerm: qTerm, totalScore: hit.score });
            return;
        }
        prev.totalScore += hit.score;
        // Track the strongest individual term match.
        if (hit.score > prev.score) {
            prev.score = hit.score;
            prev.matchedTerm = qTerm;
        }
    };
    for (const qTerm of queryTerms) {
        // Exact matches first, at full score.
        for (const hit of index.terms[qTerm] ?? []) {
            fold(hit, qTerm);
        }
        // Then prefix/substring matches, discounted.
        for (const [term, entries] of Object.entries(index.terms)) {
            if (term === qTerm || !term.includes(qTerm))
                continue;
            for (const entry of entries) {
                fold({ ...entry, score: entry.score * 0.6 }, qTerm);
            }
        }
    }
    return [...merged.values()]
        .sort((a, b) => b.totalScore - a.totalScore)
        .slice(0, topK);
}
248
+ //# sourceMappingURL=reverseIndex.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reverseIndex.js","sourceRoot":"","sources":["../src/reverseIndex.ts"],"names":[],"mappings":";AAAA;;;;;;GAMG;;AAoIH,8CAiGC;AAaD,gDAoDC;AA1RD,uCAAwC;AAExC,iFAAiF;AAEjF,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC;IACxB,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK;IACnE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI;IAClE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAClE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK;IACjE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK;IACjE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,OAAO;IACjE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM;IACjE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM;IACjE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IACnE,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,EAAE,QAAQ,EAAE,WAAW,EAAE,SAAS;IACzE,QAAQ,EAAE,OAAO,EAAE,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM;IACpE,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,UAAU;IACtE,SAAS,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,QAAQ;IAClE,SAAS,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI;CAC3C,CAAC,CAAC;AAEH,iFAAiF;AAEjF,SAAS,QAAQ,CAAC,IAAY;IAC5B,OAAO,IAAI;SACR,WAAW,EAAE;SACb,OAAO,CAAC,WAAW,EAAE,GAAG,CAAC;SACzB,KAAK,CAAC,KAAK,CAAC;SACZ,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;SAC5C,MAAM,CAAC,OAAO,CAAC,CAAC;AACrB,CAAC;AAED,SAAS,eAAe,CACtB,IAAY,EACZ,SAAiB;IAEjB,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9B,MAAM,EAAE,GAAG,IAAI,GAAG,EAAkB,CAAC;IAErC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,KAAK,CAAC,MAAM,GAAG,SAAS;YAAE,SAAS;QACvC,IAAI,SAAS,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,SAAS;QACnC,IAAI,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC;YAAE
,SAAS,CAAC,eAAe;QAClD,EAAE,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1C,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,+CAA+C;AAC/C,SAAS,WAAW,CAAC,KAAa,EAAE,QAAgB;IAClD,IAAI,QAAQ,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAC7B,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;AAClD,CAAC;AAED,iFAAiF;AAEjF,SAAS,cAAc,CAAC,SAAiB,EAAE,WAAmB,EAAE,QAAgB;IAC9E,OAAO,iBAAiB,QAAQ;;;iBAGjB,SAAS;mBACP,WAAW;;iGAEmE,CAAC;AAClG,CAAC;AAED,KAAK,UAAU,eAAe,CAC5B,IAAc,EACd,GAAgB,EAChB,QAAgB;IAEhB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;IAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;IACvE,IAAI,CAAC,KAAK,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,CAAC;IAElC,MAAM,MAAM,GAAG,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC;IACxD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,MAAM,CAAC,CAAC;QACjC,MAAM,KAAK,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAClD,IAAI,CAAC,KAAK;YAAE,OAAO,EAAE,CAAC;QACtB,MAAM,MAAM,GAAY,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC;YAAE,OAAO,EAAE,CAAC;QACtC,OAAQ,MAAoB;aACzB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;aACpC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAE,CAAY,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;aAC9C,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;aAC5B,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,iFAAiF;AAEjF,SAAS,mBAAmB,CAAC,IAAc;IACzC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,IAAI,CAAC,KAAK;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,kBAAkB;IACtE,IAAI,IAAI,CAAC,OAAO;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC3C,IAAI,IAAI,CAAC,cAAc;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IACzD,IAAI,IAAI,CAAC,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrC,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAED,iFAAiF;AAEjF;;;;;;;;;;;;;GAaG;AACI,KAAK,UAAU,iBAAiB,CAAC,KAKvC;IACC,MAAM,EAAE,
MAAM,EAAE,GAAG,EAAE,OAAO,GAAG,EAAE,EAAE,GAAG,KAAK,CAAC;IAC5C,MAAM,EACJ,IAAI,GAAG,SAAS,EAChB,aAAa,GAAG,CAAC,EACjB,eAAe,GAAG,EAAE,EACpB,UAAU,GACX,GAAG,OAAO,CAAC;IAEZ,IAAI,IAAI,KAAK,KAAK,IAAI,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CAAC,yDAAyD,CAAC,CAAC;IAC7E,CAAC;IAED,oDAAoD;IACpD,MAAM,KAAK,GAAG,IAAA,eAAQ,EAAC,MAAM,CAAC,SAAS,CAAC,CAAC;IAEzC,MAAM,KAAK,GAAwC,EAAE,CAAC;IAEtD,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC;IAE3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAEtB,UAAU,EAAE,CAAC;YACX,IAAI,EAAE,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC,6BAA6B,CAAC,CAAC,CAAC,qBAAqB;YAC5E,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,GAAG,EAAE,CAAC;YACrC,MAAM,EAAE,QAAQ,CAAC,GAAG,CAAC,MAAM,KAAK,KAAK,IAAI,CAAC,KAAK,IAAI,EAAE,EAAE;SACxD,CAAC,CAAC;QAEH,MAAM,KAAK,GAAsB;YAC/B,MAAM,EAAE,IAAI,CAAC,OAAO;YACpB,SAAS,EAAE,IAAI,CAAC,KAAK,IAAI,EAAE;YAC3B,UAAU,EAAE,IAAI,CAAC,WAAW;YAC5B,QAAQ,EAAE,IAAI,CAAC,SAAS;YACxB,KAAK,EAAE,CAAC,EAAE,kBAAkB;SAC7B,CAAC;QAEF,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,MAAM,IAAI,GAAG,mBAAmB,CAAC,IAAI,CAAC,CAAC;YACvC,MAAM,EAAE,GAAG,eAAe,CAAC,IAAI,EAAE,aAAa,CAAC,CAAC;YAEhD,IAAI,EAAE,CAAC,IAAI,KAAK,CAAC;gBAAE,SAAS;YAE5B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;YAE1C,sBAAsB;YACtB,MAAM,MAAM,GAAG,CAAC,GAAG,EAAE,CAAC,OAAO,EAAE,CAAC;iBAC7B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;iBAC3B,KAAK,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;YAE7B,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;gBACnC,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;gBAC3C,MAAM,cAAc,GAAsB,EAAE,GAAG,KAAK,EAAE,KAAK,EAAE,CAAC;gBAC9D,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC;gBACnC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;aAAM,CAAC;YACN,WAAW;YACX,MAAM,QAAQ,GAAG,MAAM,eAAe,CAAC,IAAI,EAAE,GAAI,EAAE,eAAe,CAAC,CAAC;YAEpE,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC;gBAClD,MAAM,IAAI,GA
AG,QAAQ,CAAC,IAAI,CAAC,CAAC;gBAC5B,yDAAyD;gBACzD,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC;gBACzC,MAAM,cAAc,GAAsB,EAAE,GAAG,KAAK,EAAE,KAAK,EAAE,CAAC;gBAC9D,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC;gBACnC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;IACH,CAAC;IAED,+CAA+C;IAC/C,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACtC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IAChD,CAAC;IAED,UAAU,EAAE,CAAC;QACX,IAAI,EAAE,wBAAwB;QAC9B,OAAO,EAAE,GAAG;QACZ,MAAM,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,iBAAiB,KAAK,QAAQ;KACnE,CAAC,CAAC;IAEH,OAAO;QACL,OAAO,EAAE,MAAM,CAAC,QAAQ;QACxB,KAAK;QACL,KAAK,EAAE;YACL,UAAU,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM;YACrC,UAAU,EAAE,KAAK;YACjB,SAAS,EAAE,IAAI;YACf,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;KACF,CAAC;AACJ,CAAC;AAED,iFAAiF;AAEjF;;;;;;;;GAQG;AACH,SAAgB,kBAAkB,CAChC,KAAmB,EACnB,KAAa,EACb,IAAI,GAAG,EAAE;IAET,MAAM,UAAU,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,MAAM,CACvC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAC1C,CAAC;IAEF,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEvC,sDAAsD;IACtD,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAwB,CAAC;IAEjD,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;QAC/B,cAAc;QACd,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;QAC3C,yBAAyB;QACzB,MAAM,WAAW,GAAwB,EAAE,CAAC;QAC5C,KAAK,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1D,IAAI,IAAI,KAAK,KAAK,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;gBAC3C,WAAW,CAAC,IAAI,CACd,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,GAAG,GAAG,EAAE,CAAC,CAAC,CACxD,CAAC;YACJ,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,CAAC,GAAG,SAAS,EAAE,GAAG,WAAW,CAAC,CAAC;QAE/C,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;YAC1B,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,SAAS,CAAC;YACxC,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC
,GAAG,CAAC,CAAC;YACnC,IAAI,QAAQ,EAAE,CAAC;gBACb,QAAQ,CAAC,UAAU,IAAI,GAAG,CAAC,KAAK,CAAC;gBACjC,sCAAsC;gBACtC,IAAI,GAAG,CAAC,KAAK,GAAG,QAAQ,CAAC,KAAK,EAAE,CAAC;oBAC/B,QAAQ,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;oBAC3B,QAAQ,CAAC,WAAW,GAAG,KAAK,CAAC;gBAC/B,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE;oBAChB,GAAG,GAAG;oBACN,WAAW,EAAE,KAAK;oBAClB,UAAU,EAAE,GAAG,CAAC,KAAK;iBACtB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC;SAC1B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC;SAC3C,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;AACpB,CAAC"}
@@ -0,0 +1,190 @@
1
/** A single message in a chat conversation */
export interface LLMMessage {
    role: 'user' | 'assistant';
    content: string;
}
/** The finish reason returned by the LLM */
export type LLMFinishReason = 'stop' | 'length' | string;
/** The result of an LLM call */
export interface LLMResult {
    content: string;
    finishReason: LLMFinishReason;
}
/**
 * Provider-agnostic LLM callback.
 * Wire up OpenAI, Anthropic, Ollama, or any other provider here.
 *
 * @example
 * const llm: LLMProvider = async (prompt, opts) => {
 *   const response = await openai.chat.completions.create({
 *     model: 'gpt-4o',
 *     messages: [
 *       ...(opts?.chatHistory ?? []),
 *       { role: 'user', content: prompt },
 *     ],
 *   });
 *   return {
 *     content: response.choices[0].message.content ?? '',
 *     finishReason: response.choices[0].finish_reason ?? 'stop',
 *   };
 * };
 */
export type LLMProvider = (prompt: string, options?: {
    chatHistory?: LLMMessage[];
}) => Promise<LLMResult>;
/** Emitted at every major milestone during processing */
export interface ProgressInfo {
    /** Human-readable description of the current step */
    step: string;
    /** Overall progress 0–100 */
    percent: number;
    /** Optional extra detail, e.g. "Page 3 / 45" or "Node 2 / 8" */
    detail?: string;
}
/** Receives a ProgressInfo at each processing milestone */
export type ProgressCallback = (info: ProgressInfo) => void;
/**
 * Returns the approximate number of tokens in `text`.
 * Default implementation: Math.ceil(text.length / 4)
 * Plug in `js-tiktoken` or similar for exact counts.
 */
export type TokenCounter = (text: string) => number;
/** Text and token count for a single document page */
export interface PageData {
    /** Raw text of the page */
    text: string;
    /** Token count for `text`, as computed by the active TokenCounter */
    tokenCount: number;
}
/** A node in the hierarchical tree index */
export interface TreeNode {
    title: string;
    /** Sequential node id (present when `ifAddNodeId` is enabled) */
    node_id?: string;
    /** First page covered by this node (1-based) */
    start_index?: number;
    /** Last page covered by this node (1-based) */
    end_index?: number;
    /** LLM-generated summary (present when `ifAddNodeSummary` is enabled) */
    summary?: string;
    prefix_summary?: string;
    /** Raw node text (present when `ifAddNodeText` is enabled) */
    text?: string;
    /** Child nodes */
    nodes?: TreeNode[];
    structure?: string;
    physical_index?: number | null;
    appear_start?: string;
    list_index?: number;
    page?: number | null;
}
/** The final output of pageIndex() or pageIndexMd() */
export interface PageIndexResult {
    doc_name: string;
    /** One-sentence description (present when `ifAddDocDescription` is enabled) */
    doc_description?: string;
    /** Root nodes of the hierarchical tree index */
    structure: TreeNode[];
}
/** File formats supported by pageIndexDocument() */
export type DocumentFileType = 'pdf' | 'docx' | 'csv' | 'xlsx' | 'md';
/** Options for CSV parsing */
export interface CsvParseOptions {
    /** Column delimiter (default: auto-detect from ',', ';', '\t') */
    delimiter?: string;
    /** Rows per page-chunk (default: 100) */
    rowsPerPage?: number;
    /** Treat first row as header (default: true) */
    hasHeader?: boolean;
}
/** Options for XLSX parsing */
export interface XlsxParseOptions {
    /** Sheet names to include (default: all sheets) */
    sheets?: string[];
    /** Max rows per sheet-chunk (default: 200) */
    rowsPerChunk?: number;
}
/** A single reverse-index hit — points back to the tree node that contains the term */
export interface ReverseIndexEntry {
    nodeId?: string;
    nodeTitle: string;
    /** Start page of the node (1-based) */
    startIndex?: number;
    /** End page of the node (1-based) */
    endIndex?: number;
    /**
     * Relevance score 0–1.
     * Keyword mode: TF-based score.
     * LLM mode: LLM-assigned importance.
     */
    score: number;
}
/** The complete reverse (inverted) index for one document */
export interface ReverseIndex {
    docName: string;
    /**
     * Maps every indexed term → list of nodes that contain it, sorted by score desc.
     * Keys are lowercase, normalised terms.
     */
    terms: Record<string, ReverseIndexEntry[]>;
    stats: {
        totalTerms: number;
        totalNodes: number;
        indexMode: 'keyword' | 'llm';
        /** ISO-8601 timestamp of when the index was built */
        indexedAt: string;
    };
}
/** A ranked search result returned by searchReverseIndex() */
export interface SearchResult extends ReverseIndexEntry {
    /** The term that matched the query */
    matchedTerm: string;
    /** Combined score when multiple terms match */
    totalScore: number;
}
/** Options for building the reverse index */
export interface ReverseIndexOptions {
    /**
     * 'keyword' — fast, no LLM calls, extracts terms via stopword-filtered TF
     * 'llm' — slower, semantic; uses LLM to extract concepts per node
     * default: 'keyword'
     */
    mode?: 'keyword' | 'llm';
    /** Minimum term length to index (default: 3) */
    minTermLength?: number;
    /** Max number of terms to extract per node in LLM mode (default: 10) */
    maxTermsPerNode?: number;
    /** Called at each major milestone */
    onProgress?: ProgressCallback;
}
/** Options for the PDF pipeline */
export interface PageIndexOptions {
    /** Number of pages to scan for an existing Table of Contents (default: 20) */
    tocCheckPageNum?: number;
    /** Max pages a single tree node may span before sub-indexing (default: 10) */
    maxPageNumEachNode?: number;
    /** Max tokens a single tree node may contain before sub-indexing (default: 20000) */
    maxTokenNumEachNode?: number;
    /** Whether to add a sequential node_id to each node (default: true) */
    ifAddNodeId?: boolean;
    /** Whether to generate an LLM summary for each node (default: true) */
    ifAddNodeSummary?: boolean;
    /** Whether to generate a one-sentence document description (default: false) */
    ifAddDocDescription?: boolean;
    /** Whether to include raw page text in each node (default: false) */
    ifAddNodeText?: boolean;
    /** Custom token counting function (default: ~4 chars/token approximation) */
    tokenCounter?: TokenCounter;
    /** Called at each major processing milestone with step name and 0–100 percent */
    onProgress?: ProgressCallback;
}
/** Options for the Markdown pipeline */
export interface MdPageIndexOptions {
    /** Whether to merge small nodes together (default: false) */
    ifThinning?: boolean;
    /** Minimum token threshold for thinning (default: 5000) */
    minTokenThreshold?: number;
    /** Whether to generate an LLM summary for each node (default: true) */
    ifAddNodeSummary?: boolean;
    /** Token threshold below which raw text is used instead of generating a summary (default: 200) */
    summaryTokenThreshold?: number;
    /** Whether to generate a one-sentence document description (default: false) */
    ifAddDocDescription?: boolean;
    /** Whether to include raw text in each node (default: false) */
    ifAddNodeText?: boolean;
    /** Whether to add a sequential node_id to each node (default: true) */
    ifAddNodeId?: boolean;
    /** Custom token counting function */
    tokenCounter?: TokenCounter;
    /** Called at each major processing milestone with step name and 0–100 percent */
    onProgress?: ProgressCallback;
}
//# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAEA,8CAA8C;AAC9C,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,MAAM,GAAG,WAAW,CAAC;IAC3B,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,4CAA4C;AAC5C,MAAM,MAAM,eAAe,GAAG,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;AAEzD,gCAAgC;AAChC,MAAM,WAAW,SAAS;IACxB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,eAAe,CAAC;CAC/B;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,MAAM,WAAW,GAAG,CACxB,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE;IAAE,WAAW,CAAC,EAAE,UAAU,EAAE,CAAA;CAAE,KACrC,OAAO,CAAC,SAAS,CAAC,CAAC;AAIxB,yDAAyD;AACzD,MAAM,WAAW,YAAY;IAC3B,qDAAqD;IACrD,IAAI,EAAE,MAAM,CAAC;IACb,6BAA6B;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,gEAAgE;IAChE,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,MAAM,gBAAgB,GAAG,CAAC,IAAI,EAAE,YAAY,KAAK,IAAI,CAAC;AAI5D;;;;GAIG;AACH,MAAM,MAAM,YAAY,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC;AAIpD,sDAAsD;AACtD,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;CACpB;AAID,4CAA4C;AAC5C,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,QAAQ,EAAE,CAAC;IAEnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CACtB;AAED,uDAAuD;AACvD,MAAM,WAAW,eAAe;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,SAAS,EAAE,QAAQ,EAAE,CAAC;CACvB;AAID,oDAAoD;AACpD,MAAM,MAAM,gBAAgB,GAAG,KAAK,GAAG,MAAM,GAAG,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAEtE,8BAA8B;AAC9B,MAAM,WAAW,eAAe;IAC9B,kEAAkE;IAClE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,yCAAyC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAED,+BAA+B;AAC/B,MAAM,WAAW,gBAAgB;IAC/B,mDAAmD;IACnD,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAClB,8CAA8C;IAC9C,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAID,uFAAuF;AACvF,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,u
CAAuC;IACvC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qCAAqC;IACrC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;;OAIG;IACH,KAAK,EAAE,MAAM,CAAC;CACf;AAED,6DAA6D;AAC7D,MAAM,WAAW,YAAY;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,iBAAiB,EAAE,CAAC,CAAC;IAC3C,KAAK,EAAE;QACL,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,SAAS,EAAE,SAAS,GAAG,KAAK,CAAC;QAC7B,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;CACH;AAED,8DAA8D;AAC9D,MAAM,WAAW,YAAa,SAAQ,iBAAiB;IACrD,sCAAsC;IACtC,WAAW,EAAE,MAAM,CAAC;IACpB,+CAA+C;IAC/C,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,6CAA6C;AAC7C,MAAM,WAAW,mBAAmB;IAClC;;;;OAIG;IACH,IAAI,CAAC,EAAE,SAAS,GAAG,KAAK,CAAC;IACzB,gDAAgD;IAChD,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,wEAAwE;IACxE,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,qCAAqC;IACrC,UAAU,CAAC,EAAE,gBAAgB,CAAC;CAC/B;AAID,mCAAmC;AACnC,MAAM,WAAW,gBAAgB;IAC/B,8EAA8E;IAC9E,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,8EAA8E;IAC9E,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,qFAAqF;IACrF,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,uEAAuE;IACvE,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,uEAAuE;IACvE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,+EAA+E;IAC/E,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,qEAAqE;IACrE,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,6EAA6E;IAC7E,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,iFAAiF;IACjF,UAAU,CAAC,EAAE,gBAAgB,CAAC;CAC/B;AAED,wCAAwC;AACxC,MAAM,WAAW,kBAAkB;IACjC,6DAA6D;IAC7D,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,2DAA2D;IAC3D,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,kGAAkG;IAClG,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAC/B,+EAA+E;IAC/E,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,gEAAgE;IAChE,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,uEAAuE;IACvE,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,qCAAqC;IACrC,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,iFAAiF;IACjF,UAAU,CAAC,EAAE,gBAAgB,CAAC;CAC/B"}
package/dist/types.js ADDED
@@ -0,0 +1,4 @@
1
"use strict";
// ─── LLM Provider ────────────────────────────────────────────────────────────
// Compiled output of src/types.ts: every declaration there is type-only, so
// this module contributes nothing at runtime beyond the ESM-interop marker.
Object.defineProperty(exports, "__esModule", { value: true });
//# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";AAAA,gFAAgF"}
@@ -0,0 +1,13 @@
1
/**
 * Extracts the text between ```json ... ``` fences, or returns the raw string.
 * Port of `get_json_content()` from pageindex/utils.py
 *
 * @param response - Raw LLM response text, possibly wrapped in a fenced code block.
 * @returns The trimmed fence contents, or the trimmed input when no fence is present.
 */
export declare function getJsonContent(response: string): string;
/**
 * Parses a JSON string from an LLM response, handling common quirks
 * (fenced code blocks, Python `None` → JSON `null`, trailing commas).
 * Returns an empty object `{}` on failure.
 * Port of `extract_json()` from pageindex/utils.py
 *
 * @param content - Raw LLM response expected to contain a JSON object or array.
 * @returns The parsed object or array, or `{}` when every parse attempt fails.
 */
export declare function extractJson(content: string): Record<string, unknown> | unknown[];
//# sourceMappingURL=json.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"json.d.ts","sourceRoot":"","sources":["../../src/utils/json.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAQvD;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,OAAO,EAAE,CA6ChF"}
@@ -0,0 +1,69 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.getJsonContent = getJsonContent;
4
+ exports.extractJson = extractJson;
5
/**
 * Extracts the text between ```json ... ``` fences, or returns the raw string.
 * Port of `get_json_content()` from pageindex/utils.py
 */
function getJsonContent(response) {
    const fenceAt = response.indexOf('```json');
    // No opening fence: the whole (trimmed) response is the payload.
    if (fenceAt === -1) {
        return response.trim();
    }
    // Take everything past the opening ```json marker, then cut at the
    // last closing fence if one exists; otherwise keep the remainder.
    const body = response.slice(fenceAt + '```json'.length);
    const closeAt = body.lastIndexOf('```');
    return (closeAt === -1 ? body : body.slice(0, closeAt)).trim();
}
18
/**
 * Parses a JSON string from an LLM response, handling common quirks
 * (fenced code blocks, Python `None` → JSON `null`, trailing commas).
 * Returns an empty object `{}` on failure.
 * Port of `extract_json()` from pageindex/utils.py
 */
function extractJson(content) {
    // Strip ```json ... ``` fences if present (same fence logic as
    // getJsonContent, inlined so this function is self-contained).
    let jsonContent;
    const fenceStart = content.indexOf('```json');
    if (fenceStart !== -1) {
        const afterFence = content.slice(fenceStart + 7);
        const fenceEnd = afterFence.lastIndexOf('```');
        jsonContent = (fenceEnd !== -1 ? afterFence.slice(0, fenceEnd) : afterFence).trim();
    }
    else {
        jsonContent = content.trim();
    }
    // First attempt: parse as-is. Valid JSON must never go through the
    // lossy regex cleanup below — rewriting None/True/False and collapsing
    // whitespace would corrupt string values that legitimately contain
    // those words or multiple spaces.
    try {
        return JSON.parse(jsonContent);
    }
    catch {
        // fall through to the cleanup attempts
    }
    // Second attempt: translate Python literals and flatten whitespace,
    // to recover almost-JSON such as `{"a": None}`.
    try {
        const cleaned = jsonContent
            .replace(/\bNone\b/g, 'null') // Python None → JSON null
            .replace(/\bTrue\b/g, 'true') // Python True → JSON true
            .replace(/\bFalse\b/g, 'false') // Python False → JSON false
            .replace(/\r?\n/g, ' ') // Remove newlines
            .replace(/\s+/g, ' '); // Normalize whitespace
        return JSON.parse(cleaned);
    }
    catch {
        // Third attempt: additionally drop trailing commas before ] or }.
        try {
            const cleaned = jsonContent
                .replace(/\bNone\b/g, 'null')
                .replace(/\bTrue\b/g, 'true')
                .replace(/\bFalse\b/g, 'false')
                .replace(/,\s*]/g, ']')
                .replace(/,\s*}/g, '}');
            return JSON.parse(cleaned);
        }
        catch {
            console.warn('[PageIndex] Failed to parse JSON from LLM response:', content.slice(0, 200));
            return {};
        }
    }
}
69
+ //# sourceMappingURL=json.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"json.js","sourceRoot":"","sources":["../../src/utils/json.ts"],"names":[],"mappings":";;AAIA,wCAQC;AAQD,kCA6CC;AAjED;;;GAGG;AACH,SAAgB,cAAc,CAAC,QAAgB;IAC7C,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAC1C,IAAI,KAAK,KAAK,CAAC,CAAC,EAAE,CAAC;QACjB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;QACxC,MAAM,GAAG,GAAG,KAAK,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACrC,OAAO,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;IAC3D,CAAC;IACD,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;AACzB,CAAC;AAED;;;;;GAKG;AACH,SAAgB,WAAW,CAAC,OAAe;IACzC,IAAI,CAAC;QACH,0CAA0C;QAC1C,IAAI,WAAmB,CAAC;QACxB,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QAC9C,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;YACtB,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC;YACjD,MAAM,QAAQ,GAAG,UAAU,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;YAC/C,WAAW,GAAG,CAAC,QAAQ,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC;QACtF,CAAC;aAAM,CAAC;YACN,WAAW,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QAC/B,CAAC;QAED,iBAAiB;QACjB,WAAW,GAAG,WAAW;aACtB,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,CAAG,0BAA0B;aACzD,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,CAAG,0BAA0B;aACzD,OAAO,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC,4BAA4B;aAC3D,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAS,kBAAkB;aACjD,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAU,uBAAuB;QAEzD,OAAO,IAAI,CAAC,KAAK,CAAC,WAAW,CAAwC,CAAC;IACxE,CAAC;IAAC,MAAM,CAAC;QACP,uDAAuD;QACvD,IAAI,CAAC;YACH,IAAI,WAAW,GAAG,OAAO,CAAC,IAAI,EAAE;iBAC7B,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC;iBAC5B,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC;iBAC5B,OAAO,CAAC,YAAY,EAAE,OAAO,CAAC;iBAC9B,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;iBACtB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;YAE1B,MAAM,UAAU,GAAG,WAAW,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAClD,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;gBACtB,MAAM,UAAU,GAAG,WAAW,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC;gBACrD,MAAM,QAAQ,GAAG,UAAU,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;gBAC/C,WAAW,GAAG,CAAC,QAAQ,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,
CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC;YACtF,CAAC;YAED,OAAO,IAAI,CAAC,KAAK,CAAC,WAAW,CAAwC,CAAC;QACxE,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,CAAC,IAAI,CAAC,qDAAqD,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;YAC3F,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;AACH,CAAC"}
@@ -0,0 +1,20 @@
1
import type { PageData, TokenCounter } from '../types';
/**
 * Extracts per-page text from a PDF buffer using `pdfjs-dist`.
 *
 * This is an **optional helper** — install `pdfjs-dist` (>=4.0.0) to use it.
 * If you already have page text (e.g., from `react-native-pdf` or a backend),
 * you can pass `PageData[]` directly to `pageIndex()` without calling this.
 *
 * NOTE(review): `pdfjs-dist` is not imported in this declaration file, so it
 * is presumably resolved lazily inside the implementation — confirm against
 * src/utils/pdf.ts.
 *
 * @param data Raw PDF bytes (ArrayBuffer or Uint8Array)
 * @param counter Token counter function (defaults to ~4 chars/token)
 * @returns Array of `{ text, tokenCount }` — one entry per page
 *
 * @example
 * import RNFS from 'react-native-fs';
 * const base64 = await RNFS.readFile(filePath, 'base64');
 * const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0));
 * const pages = await extractPdfPages(bytes.buffer as ArrayBuffer);
 */
export declare function extractPdfPages(data: ArrayBuffer | Uint8Array, counter?: TokenCounter): Promise<PageData[]>;
//# sourceMappingURL=pdf.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../src/utils/pdf.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAGvD;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAsB,eAAe,CACnC,IAAI,EAAE,WAAW,GAAG,UAAU,EAC9B,OAAO,GAAE,YAAkC,GAC1C,OAAO,CAAC,QAAQ,EAAE,CAAC,CA6CrB"}