react-native-pageindex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/LICENSE +21 -0
- package/README.md +405 -0
- package/dist/config.d.ts +4 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +22 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +49 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +75 -0
- package/dist/index.js.map +1 -0
- package/dist/pageIndex.d.ts +48 -0
- package/dist/pageIndex.d.ts.map +1 -0
- package/dist/pageIndex.js +962 -0
- package/dist/pageIndex.js.map +1 -0
- package/dist/pageIndexDocument.d.ts +85 -0
- package/dist/pageIndexDocument.d.ts.map +1 -0
- package/dist/pageIndexDocument.js +145 -0
- package/dist/pageIndexDocument.js.map +1 -0
- package/dist/pageIndexMd.d.ts +31 -0
- package/dist/pageIndexMd.d.ts.map +1 -0
- package/dist/pageIndexMd.js +260 -0
- package/dist/pageIndexMd.js.map +1 -0
- package/dist/parsers/csv.d.ts +17 -0
- package/dist/parsers/csv.d.ts.map +1 -0
- package/dist/parsers/csv.js +147 -0
- package/dist/parsers/csv.js.map +1 -0
- package/dist/parsers/docx.d.ts +20 -0
- package/dist/parsers/docx.d.ts.map +1 -0
- package/dist/parsers/docx.js +134 -0
- package/dist/parsers/docx.js.map +1 -0
- package/dist/parsers/xlsx.d.ts +19 -0
- package/dist/parsers/xlsx.d.ts.map +1 -0
- package/dist/parsers/xlsx.js +121 -0
- package/dist/parsers/xlsx.js.map +1 -0
- package/dist/reverseIndex.d.ts +39 -0
- package/dist/reverseIndex.d.ts.map +1 -0
- package/dist/reverseIndex.js +248 -0
- package/dist/reverseIndex.js.map +1 -0
- package/dist/types.d.ts +190 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/json.d.ts +13 -0
- package/dist/utils/json.d.ts.map +1 -0
- package/dist/utils/json.js +69 -0
- package/dist/utils/json.js.map +1 -0
- package/dist/utils/pdf.d.ts +20 -0
- package/dist/utils/pdf.d.ts.map +1 -0
- package/dist/utils/pdf.js +96 -0
- package/dist/utils/pdf.js.map +1 -0
- package/dist/utils/progress.d.ts +29 -0
- package/dist/utils/progress.d.ts.map +1 -0
- package/dist/utils/progress.js +59 -0
- package/dist/utils/progress.js.map +1 -0
- package/dist/utils/tokens.d.ts +7 -0
- package/dist/utils/tokens.d.ts.map +1 -0
- package/dist/utils/tokens.js +12 -0
- package/dist/utils/tokens.js.map +1 -0
- package/dist/utils/tree.d.ts +88 -0
- package/dist/utils/tree.d.ts.map +1 -0
- package/dist/utils/tree.js +365 -0
- package/dist/utils/tree.js.map +1 -0
- package/package.json +76 -0
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Reverse (inverted) index — maps terms → tree nodes that contain them.
|
|
4
|
+
*
|
|
5
|
+
* Two modes:
|
|
6
|
+
* 'keyword' — fast, no LLM. Extracts stopword-filtered terms with TF scoring.
|
|
7
|
+
* 'llm' — slower, semantic. Uses LLM to extract concept terms per node.
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.buildReverseIndex = buildReverseIndex;
|
|
11
|
+
exports.searchReverseIndex = searchReverseIndex;
|
|
12
|
+
const tree_1 = require("./utils/tree");
|
|
13
|
+
// ─── Stopwords ────────────────────────────────────────────────────────────────
|
|
14
|
+
// Common English function words. They are skipped during keyword extraction
// and dropped from search queries because they carry no topical signal.
const STOPWORDS = new Set(`
    a an the and or but in on at to for
    of with by from as is was are were be
    been being have has had do does did will
    would could should may might shall can not
    this that these those it its we our you
    your he she they their my his her which
    who what when where how if then so than
    more also about into up out no all each
    any some other new one two such only over
    after before between through during including without
    within along following across behind beyond plus
    except however therefore thus hence while although
    because since unless until whether both either
    neither per via etc ie eg
`.trim().split(/\s+/));
|
|
30
|
+
// ─── Keyword Extraction ───────────────────────────────────────────────────────
|
|
31
|
+
/**
 * Splits free text into lowercase tokens.
 *
 * Every character that is not a letter, combining mark, digit, underscore,
 * hyphen or whitespace is replaced by a space; the text is then split on
 * whitespace, and leading/trailing hyphens are stripped from each token
 * (inner hyphens, as in "state-of-the-art", are kept).
 *
 * The character class is Unicode-aware: the ASCII-only `\w` would turn
 * accented or non-Latin letters into separators and cut such words apart.
 *
 * @param text Input text; any non-string value yields no tokens.
 * @returns Tokens in document order, empty tokens removed.
 */
function tokenise(text) {
    if (typeof text !== 'string' || text.length === 0)
        return [];
    return text
        .toLowerCase()
        .replace(/[^\p{L}\p{M}\p{N}_\s-]/gu, ' ')
        .split(/\s+/)
        .map((t) => t.replace(/^-+|-+$/g, ''))
        .filter(Boolean);
}
|
|
39
|
+
/**
 * Counts how often each indexable token occurs in `text`.
 *
 * A token is dropped when it is shorter than `minLength`, is a stopword, or
 * consists only of ASCII digits.
 *
 * @param text      Text to analyse.
 * @param minLength Minimum token length to keep.
 * @returns Map of token → occurrence count, in first-seen order.
 */
function extractKeywords(text, minLength) {
    const isIndexable = (word) => !(word.length < minLength) &&
        !STOPWORDS.has(word) &&
        !/^\d+$/.test(word); // pure numbers
    return tokenise(text)
        .filter(isIndexable)
        .reduce((counts, word) => counts.set(word, (counts.get(word) ?? 0) + 1), new Map());
}
|
|
53
|
+
/**
 * Log-normalises a raw term count: ln(1 + count) / ln(1 + maxCount).
 * The result is 1 when `count` equals `maxCount`, and 0 when `maxCount` is 0
 * (which avoids a division by zero).
 */
function normaliseTf(count, maxCount) {
    return maxCount === 0 ? 0 : Math.log1p(count) / Math.log1p(maxCount);
}
|
|
59
|
+
// ─── LLM Prompt ───────────────────────────────────────────────────────────────
|
|
60
|
+
/**
 * Assembles the concept-extraction prompt for one tree node.
 *
 * @param nodeTitle   Section title interpolated into the prompt.
 * @param nodeSummary Section summary interpolated into the prompt.
 * @param maxTerms    Upper bound on the number of terms requested.
 * @returns The prompt text, lines joined with "\n".
 */
function buildLlmPrompt(nodeTitle, nodeSummary, maxTerms) {
    const promptLines = [
        `Extract up to ${maxTerms} key concepts, named entities, or important terms from the section below.`,
        'Return a JSON array of strings only — short terms or phrases (1–4 words each), no duplicates, no stopwords.',
        '',
        `Section title: ${nodeTitle}`,
        `Section summary: ${nodeSummary}`,
        '',
        'Respond with ONLY a JSON array, e.g.: ["machine learning", "gradient descent", "neural network"]',
    ];
    return promptLines.join('\n');
}
|
|
69
|
+
/**
 * Asks the LLM for the key terms of one tree node.
 *
 * The prompt is built from the node's title plus the first available of
 * `summary`, `prefix_summary` or `text`. The reply is searched for a JSON
 * array; string items are lowercased and trimmed, items shorter than two
 * characters are dropped, and duplicates are removed (keeping the first
 * occurrence) so that a node is never posted twice under the same term.
 *
 * Extraction is best-effort: a failing LLM call, a reply without an array, or
 * unparsable JSON all produce an empty list instead of an exception.
 *
 * @param node     Tree node to describe.
 * @param llm      Async callback taking the prompt and resolving to `{ content }`.
 * @param maxTerms Maximum number of distinct terms to return.
 * @returns Distinct lowercase terms in the order the LLM listed them.
 */
async function extractLlmTerms(node, llm, maxTerms) {
    const title = node.title ?? '';
    const summary = node.summary ?? node.prefix_summary ?? node.text ?? '';
    if (!title && !summary)
        return [];
    const prompt = buildLlmPrompt(title, summary, maxTerms);
    try {
        const result = await llm(prompt);
        // Models often wrap the array in prose or code fences; take the
        // outermost [...] span of the reply.
        const match = result.content.match(/\[[\s\S]*\]/);
        if (!match)
            return [];
        const parsed = JSON.parse(match[0]);
        if (!Array.isArray(parsed))
            return [];
        const cleaned = parsed
            .filter((t) => typeof t === 'string')
            .map((t) => t.toLowerCase().trim())
            .filter((t) => t.length >= 2);
        // De-duplicate before applying the cap so repeats do not use up slots.
        return [...new Set(cleaned)].slice(0, maxTerms);
    }
    catch {
        // Deliberate best-effort: one bad node must not abort the whole index.
        return [];
    }
}
|
|
93
|
+
// ─── Node Text for Keyword Mode ───────────────────────────────────────────────
|
|
94
|
+
/**
 * Concatenates the textual fields of a node for keyword extraction.
 * The title is included twice so its words count double; empty or missing
 * fields are left out. Parts are joined with a single space.
 */
function nodeTextForKeywords(node) {
    const { title, summary, prefix_summary: prefixSummary, text } = node;
    return [title, title, summary, prefixSummary, text] // title weighted 2x
        .filter(Boolean)
        .join(' ');
}
|
|
106
|
+
// ─── Build Reverse Index ──────────────────────────────────────────────────────
|
|
107
|
+
/**
 * Builds an inverted index (term → tree nodes) from a `PageIndexResult`.
 *
 * In **keyword** mode (default), terms come from `extractKeywords()` applied
 * to `nodeTextForKeywords(node)`; the `maxTermsPerNode` most frequent terms of
 * each node are kept and scored with `normaliseTf()` — fast, no LLM calls.
 *
 * In **llm** mode, `extractLlmTerms()` is awaited once per node, one node
 * after another; a term's score decays linearly with its position in the
 * returned list (first term scores 1).
 *
 * @param input          Single argument object.
 * @param input.result   Forward-index output; `result.structure` is flattened with `getNodes`.
 * @param input.llm      LLM provider callback (required when `options.mode` is 'llm').
 * @param input.options  Optional `{ mode, minTermLength, maxTermsPerNode, onProgress }`;
 *                       defaults are 'keyword', 3, 10 and no callback.
 * @returns `{ docName, terms, stats }`; every `terms[term]` list is sorted by
 *          score, highest first.
 * @throws {Error} When mode is 'llm' and no `llm` provider was supplied.
 */
async function buildReverseIndex(input) {
    const { result, llm, options = {} } = input;
    const { mode = 'keyword', minTermLength = 3, maxTermsPerNode = 10, onProgress, } = options;
    if (mode === 'llm' && !llm) {
        throw new Error('[PageIndex] LLM provider is required when mode is "llm"');
    }
    // Flatten tree into leaf + branch nodes (all nodes)
    const nodes = (0, tree_1.getNodes)(result.structure);
    // Postings are collected in a Map rather than a plain object: a term such
    // as "constructor" or "__proto__" would otherwise resolve to an inherited
    // Object.prototype member and make `.push` throw.
    const postings = new Map();
    const addPosting = (term, entry) => {
        const list = postings.get(term);
        if (list)
            list.push(entry);
        else
            postings.set(term, [entry]);
    };
    const total = nodes.length;
    for (let i = 0; i < total; i++) {
        const node = nodes[i];
        onProgress?.({
            step: mode === 'llm' ? 'Extracting concepts via LLM' : 'Extracting keywords',
            percent: Math.round((i / total) * 90),
            detail: `Node ${i + 1} / ${total}: ${node.title ?? ''}`,
        });
        const entry = {
            nodeId: node.node_id,
            nodeTitle: node.title ?? '',
            startIndex: node.start_index,
            endIndex: node.end_index,
            score: 0, // replaced per term below
        };
        if (mode === 'keyword') {
            const tf = extractKeywords(nodeTextForKeywords(node), minTermLength);
            if (tf.size === 0)
                continue;
            // Sorted by count, highest first. The head of the list is the
            // maximum, so no Math.max(...spread) over a potentially huge term
            // set is needed.
            const ranked = [...tf.entries()].sort((a, b) => b[1] - a[1]);
            const maxCount = ranked[0][1];
            for (const [term, count] of ranked.slice(0, maxTermsPerNode)) {
                addPosting(term, { ...entry, score: normaliseTf(count, maxCount) });
            }
        }
        else {
            // LLM mode
            const termList = await extractLlmTerms(node, llm, maxTermsPerNode);
            for (let rank = 0; rank < termList.length; rank++) {
                // Score decays with position (first term most important)
                const score = 1 - rank / termList.length;
                addPosting(termList[rank], { ...entry, score });
            }
        }
    }
    // Sort each term's entries by score descending
    for (const entries of postings.values()) {
        entries.sort((a, b) => b.score - a.score);
    }
    // Object.fromEntries defines every key as an own data property, so even
    // "__proto__" ends up as an ordinary entry of the returned record.
    const terms = Object.fromEntries(postings);
    const totalTerms = postings.size;
    onProgress?.({
        step: 'Reverse index complete',
        percent: 100,
        detail: `${totalTerms} terms across ${total} nodes`,
    });
    return {
        docName: result.doc_name,
        terms,
        stats: {
            totalTerms,
            totalNodes: total,
            indexMode: mode,
            indexedAt: new Date().toISOString(),
        },
    };
}
|
|
197
|
+
// ─── Search ───────────────────────────────────────────────────────────────────
|
|
198
|
+
/**
 * Queries the reverse index for one or more terms.
 *
 * The query is tokenised; tokens shorter than two characters and stopwords
 * are ignored. Each remaining token is looked up separately:
 *   - an exact key match contributes the stored entry scores;
 *   - every other index term that merely contains the token as a substring
 *     contributes its entries at 60% of their score.
 * Hits are merged per node (keyed by `nodeId`, falling back to `nodeTitle`):
 * `totalScore` is the sum of all contributions, while `score` / `matchedTerm`
 * describe the single best contribution.
 *
 * @param index The reverse index (from `buildReverseIndex`)
 * @param query Free-text query string
 * @param topK  Max results to return (default: 10)
 * @returns Merged hits sorted by `totalScore`, highest first.
 */
function searchReverseIndex(index, query, topK = 10) {
    const queryTerms = tokenise(query).filter((t) => t.length >= 2 && !STOPWORDS.has(t));
    if (queryTerms.length === 0)
        return [];
    const hasOwn = Object.prototype.hasOwnProperty;
    // Loop-invariant: the index is not modified while searching.
    const indexedTerms = Object.entries(index.terms);
    // nodeId (or nodeTitle as fallback) → combined result
    const combined = new Map();
    for (const qTerm of queryTerms) {
        // Exact match. Only own keys count: a query such as "constructor" must
        // not pick up members inherited from Object.prototype.
        const exactHits = hasOwn.call(index.terms, qTerm) ? index.terms[qTerm] : [];
        // Substring match at reduced weight. Entries are pushed one by one so
        // no argument-spread of an arbitrarily long list is needed.
        const partialHits = [];
        for (const [term, entries] of indexedTerms) {
            if (term !== qTerm && term.includes(qTerm)) {
                for (const e of entries) {
                    partialHits.push({ ...e, score: e.score * 0.6 });
                }
            }
        }
        for (const hit of [...exactHits, ...partialHits]) {
            const key = hit.nodeId ?? hit.nodeTitle;
            const existing = combined.get(key);
            if (existing) {
                existing.totalScore += hit.score;
                // keep the best individual term score
                if (hit.score > existing.score) {
                    existing.score = hit.score;
                    existing.matchedTerm = qTerm;
                }
            }
            else {
                // Copy the hit so stored index entries are never mutated.
                combined.set(key, {
                    ...hit,
                    matchedTerm: qTerm,
                    totalScore: hit.score,
                });
            }
        }
    }
    return [...combined.values()]
        .sort((a, b) => b.totalScore - a.totalScore)
        .slice(0, topK);
}
|
|
248
|
+
//# sourceMappingURL=reverseIndex.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reverseIndex.js","sourceRoot":"","sources":["../src/reverseIndex.ts"],"names":[],"mappings":";AAAA;;;;;;GAMG;;AAoIH,8CAiGC;AAaD,gDAoDC;AA1RD,uCAAwC;AAExC,iFAAiF;AAEjF,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC;IACxB,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK;IACnE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI;IAClE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAClE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK;IACjE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK;IACjE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,OAAO;IACjE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM;IACjE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM;IACjE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IACnE,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,EAAE,QAAQ,EAAE,WAAW,EAAE,SAAS;IACzE,QAAQ,EAAE,OAAO,EAAE,WAAW,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM;IACpE,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,UAAU;IACtE,SAAS,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,QAAQ;IAClE,SAAS,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI;CAC3C,CAAC,CAAC;AAEH,iFAAiF;AAEjF,SAAS,QAAQ,CAAC,IAAY;IAC5B,OAAO,IAAI;SACR,WAAW,EAAE;SACb,OAAO,CAAC,WAAW,EAAE,GAAG,CAAC;SACzB,KAAK,CAAC,KAAK,CAAC;SACZ,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;SAC5C,MAAM,CAAC,OAAO,CAAC,CAAC;AACrB,CAAC;AAED,SAAS,eAAe,CACtB,IAAY,EACZ,SAAiB;IAEjB,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9B,MAAM,EAAE,GAAG,IAAI,GAAG,EAAkB,CAAC;IAErC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,KAAK,CAAC,MAAM,GAAG,SAAS;YAAE,SAAS;QACvC,IAAI,SAAS,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,SAAS;QACnC,IAAI,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC;YAAE,S
AAS,CAAC,eAAe;QAClD,EAAE,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1C,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,+CAA+C;AAC/C,SAAS,WAAW,CAAC,KAAa,EAAE,QAAgB;IAClD,IAAI,QAAQ,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAC7B,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;AAClD,CAAC;AAED,iFAAiF;AAEjF,SAAS,cAAc,CAAC,SAAiB,EAAE,WAAmB,EAAE,QAAgB;IAC9E,OAAO,iBAAiB,QAAQ;;;iBAGjB,SAAS;mBACP,WAAW;;iGAEmE,CAAC;AAClG,CAAC;AAED,KAAK,UAAU,eAAe,CAC5B,IAAc,EACd,GAAgB,EAChB,QAAgB;IAEhB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;IAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,IAAI,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;IACvE,IAAI,CAAC,KAAK,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,CAAC;IAElC,MAAM,MAAM,GAAG,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC;IACxD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,MAAM,CAAC,CAAC;QACjC,MAAM,KAAK,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAClD,IAAI,CAAC,KAAK;YAAE,OAAO,EAAE,CAAC;QACtB,MAAM,MAAM,GAAY,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC;YAAE,OAAO,EAAE,CAAC;QACtC,OAAQ,MAAoB;aACzB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;aACpC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAE,CAAY,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;aAC9C,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;aAC5B,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,iFAAiF;AAEjF,SAAS,mBAAmB,CAAC,IAAc;IACzC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,IAAI,CAAC,KAAK;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,kBAAkB;IACtE,IAAI,IAAI,CAAC,OAAO;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC3C,IAAI,IAAI,CAAC,cAAc;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IACzD,IAAI,IAAI,CAAC,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACrC,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAED,iFAAiF;AAEjF;;;;;;;;;;;;;GAaG;AACI,KAAK,UAAU,iBAAiB,CAAC,KAKvC;IACC,MAAM,EAAE,MA
AM,EAAE,GAAG,EAAE,OAAO,GAAG,EAAE,EAAE,GAAG,KAAK,CAAC;IAC5C,MAAM,EACJ,IAAI,GAAG,SAAS,EAChB,aAAa,GAAG,CAAC,EACjB,eAAe,GAAG,EAAE,EACpB,UAAU,GACX,GAAG,OAAO,CAAC;IAEZ,IAAI,IAAI,KAAK,KAAK,IAAI,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CAAC,yDAAyD,CAAC,CAAC;IAC7E,CAAC;IAED,oDAAoD;IACpD,MAAM,KAAK,GAAG,IAAA,eAAQ,EAAC,MAAM,CAAC,SAAS,CAAC,CAAC;IAEzC,MAAM,KAAK,GAAwC,EAAE,CAAC;IAEtD,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC;IAE3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAEtB,UAAU,EAAE,CAAC;YACX,IAAI,EAAE,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC,6BAA6B,CAAC,CAAC,CAAC,qBAAqB;YAC5E,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,GAAG,EAAE,CAAC;YACrC,MAAM,EAAE,QAAQ,CAAC,GAAG,CAAC,MAAM,KAAK,KAAK,IAAI,CAAC,KAAK,IAAI,EAAE,EAAE;SACxD,CAAC,CAAC;QAEH,MAAM,KAAK,GAAsB;YAC/B,MAAM,EAAE,IAAI,CAAC,OAAO;YACpB,SAAS,EAAE,IAAI,CAAC,KAAK,IAAI,EAAE;YAC3B,UAAU,EAAE,IAAI,CAAC,WAAW;YAC5B,QAAQ,EAAE,IAAI,CAAC,SAAS;YACxB,KAAK,EAAE,CAAC,EAAE,kBAAkB;SAC7B,CAAC;QAEF,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,MAAM,IAAI,GAAG,mBAAmB,CAAC,IAAI,CAAC,CAAC;YACvC,MAAM,EAAE,GAAG,eAAe,CAAC,IAAI,EAAE,aAAa,CAAC,CAAC;YAEhD,IAAI,EAAE,CAAC,IAAI,KAAK,CAAC;gBAAE,SAAS;YAE5B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;YAE1C,sBAAsB;YACtB,MAAM,MAAM,GAAG,CAAC,GAAG,EAAE,CAAC,OAAO,EAAE,CAAC;iBAC7B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;iBAC3B,KAAK,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;YAE7B,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;gBACnC,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;gBAC3C,MAAM,cAAc,GAAsB,EAAE,GAAG,KAAK,EAAE,KAAK,EAAE,CAAC;gBAC9D,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC;gBACnC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;aAAM,CAAC;YACN,WAAW;YACX,MAAM,QAAQ,GAAG,MAAM,eAAe,CAAC,IAAI,EAAE,GAAI,EAAE,eAAe,CAAC,CAAC;YAEpE,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC;gBAClD,MAAM,IAAI,GAAG
,QAAQ,CAAC,IAAI,CAAC,CAAC;gBAC5B,yDAAyD;gBACzD,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC;gBACzC,MAAM,cAAc,GAAsB,EAAE,GAAG,KAAK,EAAE,KAAK,EAAE,CAAC;gBAC9D,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC;gBACnC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;IACH,CAAC;IAED,+CAA+C;IAC/C,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACtC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IAChD,CAAC;IAED,UAAU,EAAE,CAAC;QACX,IAAI,EAAE,wBAAwB;QAC9B,OAAO,EAAE,GAAG;QACZ,MAAM,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,iBAAiB,KAAK,QAAQ;KACnE,CAAC,CAAC;IAEH,OAAO;QACL,OAAO,EAAE,MAAM,CAAC,QAAQ;QACxB,KAAK;QACL,KAAK,EAAE;YACL,UAAU,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM;YACrC,UAAU,EAAE,KAAK;YACjB,SAAS,EAAE,IAAI;YACf,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;KACF,CAAC;AACJ,CAAC;AAED,iFAAiF;AAEjF;;;;;;;;GAQG;AACH,SAAgB,kBAAkB,CAChC,KAAmB,EACnB,KAAa,EACb,IAAI,GAAG,EAAE;IAET,MAAM,UAAU,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,MAAM,CACvC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAC1C,CAAC;IAEF,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEvC,sDAAsD;IACtD,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAwB,CAAC;IAEjD,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;QAC/B,cAAc;QACd,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;QAC3C,yBAAyB;QACzB,MAAM,WAAW,GAAwB,EAAE,CAAC;QAC5C,KAAK,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1D,IAAI,IAAI,KAAK,KAAK,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;gBAC3C,WAAW,CAAC,IAAI,CACd,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,GAAG,GAAG,EAAE,CAAC,CAAC,CACxD,CAAC;YACJ,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,CAAC,GAAG,SAAS,EAAE,GAAG,WAAW,CAAC,CAAC;QAE/C,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;YAC1B,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,SAAS,CAAC;YACxC,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,G
AAG,CAAC,CAAC;YACnC,IAAI,QAAQ,EAAE,CAAC;gBACb,QAAQ,CAAC,UAAU,IAAI,GAAG,CAAC,KAAK,CAAC;gBACjC,sCAAsC;gBACtC,IAAI,GAAG,CAAC,KAAK,GAAG,QAAQ,CAAC,KAAK,EAAE,CAAC;oBAC/B,QAAQ,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;oBAC3B,QAAQ,CAAC,WAAW,GAAG,KAAK,CAAC;gBAC/B,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE;oBAChB,GAAG,GAAG;oBACN,WAAW,EAAE,KAAK;oBAClB,UAAU,EAAE,GAAG,CAAC,KAAK;iBACtB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC;SAC1B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC;SAC3C,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;AACpB,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
/**
 * One turn of a chat transcript, as supplied to an `LLMProvider` through
 * `options.chatHistory`.
 */
export interface LLMMessage {
    /** Author of the turn. */
    role: 'assistant' | 'user';
    /** Text of the turn. */
    content: string;
}
|
|
6
|
+
/**
 * Why the LLM stopped generating. `'stop'` and `'length'` are the named
 * values; any other provider-specific string is accepted as well.
 */
export type LLMFinishReason = 'length' | 'stop' | string;
|
|
8
|
+
/** What an `LLMProvider` resolves to after a single call. */
export interface LLMResult {
    /** Generated text. */
    content: string;
    /** Reason the generation ended. */
    finishReason: LLMFinishReason;
}
|
|
13
|
+
/**
 * Provider-agnostic LLM callback: receives a prompt (plus optional prior chat
 * turns) and resolves to the model's reply.
 * Back it with OpenAI, Anthropic, Ollama, or any other provider.
 *
 * @example
 * const llm: LLMProvider = async (prompt, opts) => {
 *   const response = await openai.chat.completions.create({
 *     model: 'gpt-4o',
 *     messages: [
 *       ...(opts?.chatHistory ?? []),
 *       { role: 'user', content: prompt },
 *     ],
 *   });
 *   return {
 *     content: response.choices[0].message.content ?? '',
 *     finishReason: response.choices[0].finish_reason ?? 'stop',
 *   };
 * };
 */
export type LLMProvider = (
    prompt: string,
    options?: { chatHistory?: LLMMessage[] }
) => Promise<LLMResult>;
|
|
35
|
+
/** Payload handed to a `ProgressCallback` at each major processing milestone. */
export interface ProgressInfo {
    /** Human-readable name of the step currently running. */
    step: string;
    /** Overall completion, from 0 to 100. */
    percent: number;
    /** Optional finer-grained detail, e.g. "Page 3 / 45" or "Node 2 / 8". */
    detail?: string;
}
|
|
44
|
+
/** Listener that receives `ProgressInfo` updates; its return value is ignored. */
export type ProgressCallback = (info: ProgressInfo) => void;
|
|
45
|
+
/**
 * Function that estimates how many tokens `text` contains.
 * The default implementation is `Math.ceil(text.length / 4)`; supply
 * `js-tiktoken` or a similar tokenizer when exact counts are needed.
 */
export type TokenCounter = (text: string) => number;
|
|
51
|
+
/** Extracted content of one document page together with its token count. */
export interface PageData {
    /** Text of the page. */
    text: string;
    /** Number of tokens in `text`. */
    tokenCount: number;
}
|
|
56
|
+
/** One entry of the hierarchical tree index; children live in `nodes`. */
export interface TreeNode {
    /** Section title (the only required field). */
    title: string;
    /** Identifier of the node; copied into `ReverseIndexEntry.nodeId`. */
    node_id?: string;
    /** First index covered by the node; copied into `ReverseIndexEntry.startIndex`. */
    start_index?: number;
    /** Last index covered by the node; copied into `ReverseIndexEntry.endIndex`. */
    end_index?: number;
    /** Summary text of the node. */
    summary?: string;
    /** Alternative summary text, used when `summary` is absent. */
    prefix_summary?: string;
    /** Raw text of the node. */
    text?: string;
    /** Child nodes. */
    nodes?: TreeNode[];
    // NOTE(review): the remaining fields are not read by the reverse-index
    // code; presumably bookkeeping of the tree-building pipeline — confirm
    // against pageIndex.js before relying on them.
    structure?: string;
    physical_index?: number | null;
    appear_start?: string;
    list_index?: number;
    page?: number | null;
}
|
|
72
|
+
/** Forward index returned by `pageIndex()` and `pageIndexMd()`. */
export interface PageIndexResult {
    /** Name of the indexed document. */
    doc_name: string;
    /** Optional description of the document. */
    doc_description?: string;
    /** Top-level nodes of the tree index. */
    structure: TreeNode[];
}
|
|
78
|
+
/** Input formats accepted by `pageIndexDocument()`. */
export type DocumentFileType =
    | 'pdf'
    | 'docx'
    | 'csv'
    | 'xlsx'
    | 'md';
|
|
80
|
+
/** Settings that control how CSV input is parsed. */
export interface CsvParseOptions {
    /** Column separator (default: auto-detect from ',', ';', '\t'). */
    delimiter?: string;
    /** Number of rows grouped into one page-chunk (default: 100). */
    rowsPerPage?: number;
    /** Whether the first row is a header row (default: true). */
    hasHeader?: boolean;
}
|
|
89
|
+
/** Settings that control how XLSX workbooks are parsed. */
export interface XlsxParseOptions {
    /** Names of the sheets to include (default: all sheets). */
    sheets?: string[];
    /** Maximum number of rows per sheet-chunk (default: 200). */
    rowsPerChunk?: number;
}
|
|
96
|
+
/** A single reverse-index hit — points back to the tree node that contains the term. */
export interface ReverseIndexEntry {
    /** `node_id` of the tree node, when the node has one. */
    nodeId?: string;
    /** Title of the tree node (empty string when the node has no title). */
    nodeTitle: string;
    /** Start page of the node (1-based) */
    startIndex?: number;
    /** End page of the node (1-based) */
    endIndex?: number;
    /**
     * Relevance score 0–1.
     * Keyword mode: log-normalised term frequency relative to the node's most
     * frequent term, `log1p(count) / log1p(maxCount)`.
     * LLM mode: derived from the term's position in the list returned by the
     * LLM, `1 - rank / listLength` (first term scores 1). The LLM does not
     * assign the value itself.
     */
    score: number;
}
|
|
111
|
+
/** Inverted index of a single document, as produced by `buildReverseIndex()`. */
export interface ReverseIndex {
    /** Name of the indexed document. */
    docName: string;
    /**
     * Term → nodes containing that term, each list ordered by score, highest
     * first. Keys are lowercase, normalised terms.
     */
    terms: Record<string, ReverseIndexEntry[]>;
    /** Summary figures recorded when the index was built. */
    stats: {
        /** Number of distinct terms (keys of `terms`). */
        totalTerms: number;
        /** Number of tree nodes that were processed. */
        totalNodes: number;
        /** Extraction mode the index was built with. */
        indexMode: 'keyword' | 'llm';
        /** Build timestamp as an ISO 8601 string. */
        indexedAt: string;
    };
}
|
|
126
|
+
/**
 * A ranked search result returned by searchReverseIndex().
 *
 * All hits for the same node are merged into one result. The inherited
 * `score` holds the best single contribution for that node (substring matches
 * contribute at 60% of their stored score).
 */
export interface SearchResult extends ReverseIndexEntry {
    /**
     * The query token that produced the best-scoring hit for this node. For a
     * substring match this is the query token, not the longer index term it
     * was found in.
     */
    matchedTerm: string;
    /** Sum of the scores of every hit on this node; results are ranked by it. */
    totalScore: number;
}
|
|
133
|
+
/** Options for building the reverse index */
export interface ReverseIndexOptions {
    /**
     * 'keyword' — fast, no LLM calls, extracts terms via stopword-filtered TF
     * 'llm'     — slower, semantic; uses LLM to extract concepts per node
     * default: 'keyword'
     */
    mode?: 'keyword' | 'llm';
    /**
     * Minimum term length to index (default: 3).
     * Applies to keyword mode only; LLM-extracted terms are kept from two
     * characters upward regardless of this setting.
     */
    minTermLength?: number;
    /**
     * Max number of terms kept per node (default: 10).
     * Keyword mode keeps the most frequent terms; LLM mode asks the LLM for at
     * most this many and truncates its answer to this length.
     */
    maxTermsPerNode?: number;
    /** Called at each major milestone */
    onProgress?: ProgressCallback;
}
|
|
148
|
+
/** Settings for the PDF indexing pipeline; every field is optional. */
export interface PageIndexOptions {
    /** How many pages are scanned for an existing Table of Contents (default: 20). */
    tocCheckPageNum?: number;
    /** Page span above which a single tree node is sub-indexed (default: 10). */
    maxPageNumEachNode?: number;
    /** Token count above which a single tree node is sub-indexed (default: 20000). */
    maxTokenNumEachNode?: number;
    /** Add a sequential node_id to each node (default: true). */
    ifAddNodeId?: boolean;
    /** Generate an LLM summary for each node (default: true). */
    ifAddNodeSummary?: boolean;
    /** Generate a one-sentence document description (default: false). */
    ifAddDocDescription?: boolean;
    /** Include raw page text in each node (default: false). */
    ifAddNodeText?: boolean;
    /** Custom token counting function (default: ~4 chars/token approximation). */
    tokenCounter?: TokenCounter;
    /** Invoked at each major processing milestone with a step name and a 0–100 percent. */
    onProgress?: ProgressCallback;
}
|
|
169
|
+
/** Settings for the Markdown indexing pipeline; every field is optional. */
export interface MdPageIndexOptions {
    /** Merge small nodes together (default: false). */
    ifThinning?: boolean;
    /** Minimum token threshold used for thinning (default: 5000). */
    minTokenThreshold?: number;
    /** Generate an LLM summary for each node (default: true). */
    ifAddNodeSummary?: boolean;
    /** Below this token count the raw text is used instead of a generated summary (default: 200). */
    summaryTokenThreshold?: number;
    /** Generate a one-sentence document description (default: false). */
    ifAddDocDescription?: boolean;
    /** Include raw text in each node (default: false). */
    ifAddNodeText?: boolean;
    /** Add a sequential node_id to each node (default: true). */
    ifAddNodeId?: boolean;
    /** Custom token counting function. */
    tokenCounter?: TokenCounter;
    /** Invoked at each major processing milestone with a step name and a 0–100 percent. */
    onProgress?: ProgressCallback;
}
|
|
190
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAEA,8CAA8C;AAC9C,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,MAAM,GAAG,WAAW,CAAC;IAC3B,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,4CAA4C;AAC5C,MAAM,MAAM,eAAe,GAAG,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;AAEzD,gCAAgC;AAChC,MAAM,WAAW,SAAS;IACxB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,eAAe,CAAC;CAC/B;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,MAAM,WAAW,GAAG,CACxB,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE;IAAE,WAAW,CAAC,EAAE,UAAU,EAAE,CAAA;CAAE,KACrC,OAAO,CAAC,SAAS,CAAC,CAAC;AAIxB,yDAAyD;AACzD,MAAM,WAAW,YAAY;IAC3B,qDAAqD;IACrD,IAAI,EAAE,MAAM,CAAC;IACb,6BAA6B;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,gEAAgE;IAChE,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,MAAM,gBAAgB,GAAG,CAAC,IAAI,EAAE,YAAY,KAAK,IAAI,CAAC;AAI5D;;;;GAIG;AACH,MAAM,MAAM,YAAY,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC;AAIpD,sDAAsD;AACtD,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;CACpB;AAID,4CAA4C;AAC5C,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,QAAQ,EAAE,CAAC;IAEnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CACtB;AAED,uDAAuD;AACvD,MAAM,WAAW,eAAe;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,SAAS,EAAE,QAAQ,EAAE,CAAC;CACvB;AAID,oDAAoD;AACpD,MAAM,MAAM,gBAAgB,GAAG,KAAK,GAAG,MAAM,GAAG,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC;AAEtE,8BAA8B;AAC9B,MAAM,WAAW,eAAe;IAC9B,kEAAkE;IAClE,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,yCAAyC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAED,+BAA+B;AAC/B,MAAM,WAAW,gBAAgB;IAC/B,mDAAmD;IACnD,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAClB,8CAA8C;IAC9C,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAID,uFAAuF;AACvF,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,uCA
AuC;IACvC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qCAAqC;IACrC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;;OAIG;IACH,KAAK,EAAE,MAAM,CAAC;CACf;AAED,6DAA6D;AAC7D,MAAM,WAAW,YAAY;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,iBAAiB,EAAE,CAAC,CAAC;IAC3C,KAAK,EAAE;QACL,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,SAAS,EAAE,SAAS,GAAG,KAAK,CAAC;QAC7B,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;CACH;AAED,8DAA8D;AAC9D,MAAM,WAAW,YAAa,SAAQ,iBAAiB;IACrD,sCAAsC;IACtC,WAAW,EAAE,MAAM,CAAC;IACpB,+CAA+C;IAC/C,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,6CAA6C;AAC7C,MAAM,WAAW,mBAAmB;IAClC;;;;OAIG;IACH,IAAI,CAAC,EAAE,SAAS,GAAG,KAAK,CAAC;IACzB,gDAAgD;IAChD,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,wEAAwE;IACxE,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,qCAAqC;IACrC,UAAU,CAAC,EAAE,gBAAgB,CAAC;CAC/B;AAID,mCAAmC;AACnC,MAAM,WAAW,gBAAgB;IAC/B,8EAA8E;IAC9E,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,8EAA8E;IAC9E,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,qFAAqF;IACrF,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,uEAAuE;IACvE,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,uEAAuE;IACvE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,+EAA+E;IAC/E,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,qEAAqE;IACrE,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,6EAA6E;IAC7E,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,iFAAiF;IACjF,UAAU,CAAC,EAAE,gBAAgB,CAAC;CAC/B;AAED,wCAAwC;AACxC,MAAM,WAAW,kBAAkB;IACjC,6DAA6D;IAC7D,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,2DAA2D;IAC3D,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,kGAAkG;IAClG,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAC/B,+EAA+E;IAC/E,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,gEAAgE;IAChE,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,uEAAuE;IACvE,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,qCAAqC;IACrC,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,iFAAiF;IACjF,UAAU,CAAC,EAAE,gBAAgB,CAAC;CAC/B"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";AAAA,gFAAgF"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extracts the text between ```json ... ``` fences, or returns the raw string.
|
|
3
|
+
* Port of `get_json_content()` from pageindex/utils.py
|
|
4
|
+
*/
|
|
5
|
+
// When the response contains no ```json marker, the whole response is returned with surrounding whitespace trimmed.
export declare function getJsonContent(response: string): string;
|
|
6
|
+
/**
|
|
7
|
+
* Parses a JSON string from an LLM response, handling common quirks
|
|
8
|
+
* (fenced code blocks, Python `None` → JSON `null`, trailing commas).
|
|
9
|
+
* Returns an empty object `{}` on failure.
|
|
10
|
+
* Port of `extract_json()` from pageindex/utils.py
|
|
11
|
+
*/
|
|
12
|
+
// The result is either a parsed object or a parsed array; narrow with Array.isArray() before indexing into it.
export declare function extractJson(content: string): Record<string, unknown> | unknown[];
|
|
13
|
+
//# sourceMappingURL=json.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json.d.ts","sourceRoot":"","sources":["../../src/utils/json.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAQvD;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,OAAO,EAAE,CA6ChF"}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.getJsonContent = getJsonContent;
|
|
4
|
+
exports.extractJson = extractJson;
|
|
5
|
+
/**
 * Extracts the text between ```json ... ``` fences, or returns the raw string.
 * Port of `get_json_content()` from pageindex/utils.py
 *
 * The opening tag is matched case-insensitively, so ```JSON and ```Json are
 * treated the same as ```json. The payload runs from just after the opening
 * tag to the LAST ``` in the response; if the fence was never closed (for
 * example a truncated completion) everything after the opening tag is used.
 *
 * @param response Raw LLM response text.
 * @returns The fenced payload, or the whole response when no ```json tag is
 *          present; surrounding whitespace is trimmed in both cases.
 */
function getJsonContent(response) {
    const fenceTag = '```json';
    // search() with the `i` flag instead of indexOf() so upper/mixed-case tags are found.
    const start = response.search(/```json/i);
    if (start === -1) {
        return response.trim();
    }
    const after = response.slice(start + fenceTag.length);
    // lastIndexOf keeps any ``` that appears inside the payload itself.
    const end = after.lastIndexOf('```');
    return (end === -1 ? after : after.slice(0, end)).trim();
}
|
|
18
|
+
/**
 * Returns the trimmed text that follows offset `from`, up to the last ``` in
 * `text`, or up to the end of `text` when no closing fence exists.
 */
function sliceInsideFence(text, from) {
    const rest = text.slice(from);
    const close = rest.lastIndexOf('```');
    return (close === -1 ? rest : rest.slice(0, close)).trim();
}
/**
 * Lists the substrings of `content` worth handing to the JSON parser, in
 * priority order: the payload of a ```json fence when one exists; otherwise
 * the trimmed raw text, followed by the payload of an untagged ``` fence.
 */
function jsonCandidates(content) {
    const tagged = content.search(/```json/i);
    if (tagged !== -1) {
        return [sliceInsideFence(content, tagged + '```json'.length)];
    }
    // Raw text goes first so valid JSON that merely contains ``` inside a
    // string value is never truncated by the untagged-fence fallback.
    const candidates = [content.trim()];
    const bare = content.indexOf('```');
    if (bare !== -1) {
        candidates.push(sliceInsideFence(content, bare + 3));
    }
    return candidates;
}
/**
 * Applies the LLM-quirk repairs to `text` without touching the contents of
 * double-quoted string literals, except for raw control characters, which
 * JSON forbids inside strings and which are replaced by a single space.
 */
function repairJsonText(text) {
    const pieces = [];
    let pos = 0;
    while (pos < text.length) {
        if (text[pos] === '"') {
            // Walk to the closing quote, skipping escaped characters.
            let end = pos + 1;
            while (end < text.length && text[end] !== '"') {
                end += text[end] === '\\' ? 2 : 1;
            }
            end = Math.min(end + 1, text.length);
            pieces.push(text.slice(pos, end).replace(/[\u0000-\u001f]+/g, ' '));
            pos = end;
        }
        else {
            let next = text.indexOf('"', pos);
            if (next === -1) {
                next = text.length;
            }
            pieces.push(text.slice(pos, next)
                .replace(/\bNone\b/g, 'null') // Python None → JSON null
                .replace(/\bTrue\b/g, 'true') // Python True → JSON true
                .replace(/\bFalse\b/g, 'false') // Python False → JSON false
                .replace(/,\s*([\]}])/g, '$1') // Trailing comma before ] or }
                .replace(/\s+/g, ' ')); // Normalize structural whitespace
            pos = next;
        }
    }
    return pieces.join('');
}
/**
 * Parses `text` and returns the value only when it is an object or an array;
 * returns `undefined` for syntax errors, `null` and primitives.
 */
function parseJsonContainer(text) {
    try {
        const value = JSON.parse(text);
        return value !== null && typeof value === 'object' ? value : undefined;
    }
    catch {
        return undefined;
    }
}
/**
 * Parses a JSON string from an LLM response, handling common quirks
 * (fenced code blocks, Python `None`/`True`/`False` literals, trailing
 * commas, raw line breaks inside string values).
 * Returns an empty object `{}` on failure.
 * Port of `extract_json()` from pageindex/utils.py
 *
 * Each candidate payload is first parsed exactly as written, so valid JSON
 * is returned untouched. Only when that fails is a repaired copy parsed, and
 * the repairs skip string literals so values such as "None of the above"
 * are preserved.
 *
 * @param content Raw LLM response text.
 * @returns The parsed object or array. `{}` is returned, and a warning is
 *          logged, when nothing parses, when the payload is `null` or a
 *          primitive, or when `content` is not a string.
 */
function extractJson(content) {
    if (typeof content === 'string') {
        for (const candidate of jsonCandidates(content)) {
            let parsed = parseJsonContainer(candidate);
            if (parsed === undefined) {
                parsed = parseJsonContainer(repairJsonText(candidate));
            }
            if (parsed !== undefined) {
                return parsed;
            }
        }
    }
    // String() keeps the failure path itself from throwing on non-string input.
    console.warn('[PageIndex] Failed to parse JSON from LLM response:', String(content).slice(0, 200));
    return {};
}
|
|
69
|
+
//# sourceMappingURL=json.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json.js","sourceRoot":"","sources":["../../src/utils/json.ts"],"names":[],"mappings":";;AAIA,wCAQC;AAQD,kCA6CC;AAjED;;;GAGG;AACH,SAAgB,cAAc,CAAC,QAAgB;IAC7C,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAC1C,IAAI,KAAK,KAAK,CAAC,CAAC,EAAE,CAAC;QACjB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;QACxC,MAAM,GAAG,GAAG,KAAK,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACrC,OAAO,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;IAC3D,CAAC;IACD,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;AACzB,CAAC;AAED;;;;;GAKG;AACH,SAAgB,WAAW,CAAC,OAAe;IACzC,IAAI,CAAC;QACH,0CAA0C;QAC1C,IAAI,WAAmB,CAAC;QACxB,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QAC9C,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;YACtB,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC;YACjD,MAAM,QAAQ,GAAG,UAAU,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;YAC/C,WAAW,GAAG,CAAC,QAAQ,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC;QACtF,CAAC;aAAM,CAAC;YACN,WAAW,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QAC/B,CAAC;QAED,iBAAiB;QACjB,WAAW,GAAG,WAAW;aACtB,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,CAAG,0BAA0B;aACzD,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,CAAG,0BAA0B;aACzD,OAAO,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC,4BAA4B;aAC3D,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAS,kBAAkB;aACjD,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAU,uBAAuB;QAEzD,OAAO,IAAI,CAAC,KAAK,CAAC,WAAW,CAAwC,CAAC;IACxE,CAAC;IAAC,MAAM,CAAC;QACP,uDAAuD;QACvD,IAAI,CAAC;YACH,IAAI,WAAW,GAAG,OAAO,CAAC,IAAI,EAAE;iBAC7B,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC;iBAC5B,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC;iBAC5B,OAAO,CAAC,YAAY,EAAE,OAAO,CAAC;iBAC9B,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;iBACtB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;YAE1B,MAAM,UAAU,GAAG,WAAW,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAClD,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;gBACtB,MAAM,UAAU,GAAG,WAAW,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC;gBACrD,MAAM,QAAQ,GAAG,UAAU,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;gBAC/C,WAAW,GAAG,CAAC,QAAQ,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CA
AC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC;YACtF,CAAC;YAED,OAAO,IAAI,CAAC,KAAK,CAAC,WAAW,CAAwC,CAAC;QACxE,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,CAAC,IAAI,CAAC,qDAAqD,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;YAC3F,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { PageData, TokenCounter } from '../types';
|
|
2
|
+
/**
|
|
3
|
+
* Extracts per-page text from a PDF buffer using `pdfjs-dist`.
|
|
4
|
+
*
|
|
5
|
+
* This is an **optional helper** — install `pdfjs-dist` (>=4.0.0) to use it.
|
|
6
|
+
* If you already have page text (e.g., from `react-native-pdf` or a backend),
|
|
7
|
+
* you can pass `PageData[]` directly to `pageIndex()` without calling this.
|
|
8
|
+
*
|
|
9
|
+
* @param data Raw PDF bytes (ArrayBuffer or Uint8Array)
|
|
10
|
+
* @param counter Token counter function (defaults to ~4 chars/token)
|
|
11
|
+
* @returns Array of `{ text, tokenCount }` — one entry per page
|
|
12
|
+
*
|
|
13
|
+
* @example
|
|
14
|
+
* import RNFS from 'react-native-fs';
|
|
15
|
+
* const base64 = await RNFS.readFile(filePath, 'base64');
|
|
16
|
+
* const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0));
|
|
17
|
+
* const pages = await extractPdfPages(bytes.buffer as ArrayBuffer);
|
|
18
|
+
*/
|
|
19
|
+
// Asynchronous: resolves to a PageData[]; the `counter` argument may be omitted.
export declare function extractPdfPages(data: ArrayBuffer | Uint8Array, counter?: TokenCounter): Promise<PageData[]>;
|
|
20
|
+
//# sourceMappingURL=pdf.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../src/utils/pdf.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAGvD;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAsB,eAAe,CACnC,IAAI,EAAE,WAAW,GAAG,UAAU,EAC9B,OAAO,GAAE,YAAkC,GAC1C,OAAO,CAAC,QAAQ,EAAE,CAAC,CA6CrB"}
|