react-native-pageindex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +25 -0
  2. package/LICENSE +21 -0
  3. package/README.md +405 -0
  4. package/dist/config.d.ts +4 -0
  5. package/dist/config.d.ts.map +1 -0
  6. package/dist/config.js +22 -0
  7. package/dist/config.js.map +1 -0
  8. package/dist/index.d.ts +49 -0
  9. package/dist/index.d.ts.map +1 -0
  10. package/dist/index.js +75 -0
  11. package/dist/index.js.map +1 -0
  12. package/dist/pageIndex.d.ts +48 -0
  13. package/dist/pageIndex.d.ts.map +1 -0
  14. package/dist/pageIndex.js +962 -0
  15. package/dist/pageIndex.js.map +1 -0
  16. package/dist/pageIndexDocument.d.ts +85 -0
  17. package/dist/pageIndexDocument.d.ts.map +1 -0
  18. package/dist/pageIndexDocument.js +145 -0
  19. package/dist/pageIndexDocument.js.map +1 -0
  20. package/dist/pageIndexMd.d.ts +31 -0
  21. package/dist/pageIndexMd.d.ts.map +1 -0
  22. package/dist/pageIndexMd.js +260 -0
  23. package/dist/pageIndexMd.js.map +1 -0
  24. package/dist/parsers/csv.d.ts +17 -0
  25. package/dist/parsers/csv.d.ts.map +1 -0
  26. package/dist/parsers/csv.js +147 -0
  27. package/dist/parsers/csv.js.map +1 -0
  28. package/dist/parsers/docx.d.ts +20 -0
  29. package/dist/parsers/docx.d.ts.map +1 -0
  30. package/dist/parsers/docx.js +134 -0
  31. package/dist/parsers/docx.js.map +1 -0
  32. package/dist/parsers/xlsx.d.ts +19 -0
  33. package/dist/parsers/xlsx.d.ts.map +1 -0
  34. package/dist/parsers/xlsx.js +121 -0
  35. package/dist/parsers/xlsx.js.map +1 -0
  36. package/dist/reverseIndex.d.ts +39 -0
  37. package/dist/reverseIndex.d.ts.map +1 -0
  38. package/dist/reverseIndex.js +248 -0
  39. package/dist/reverseIndex.js.map +1 -0
  40. package/dist/types.d.ts +190 -0
  41. package/dist/types.d.ts.map +1 -0
  42. package/dist/types.js +4 -0
  43. package/dist/types.js.map +1 -0
  44. package/dist/utils/json.d.ts +13 -0
  45. package/dist/utils/json.d.ts.map +1 -0
  46. package/dist/utils/json.js +69 -0
  47. package/dist/utils/json.js.map +1 -0
  48. package/dist/utils/pdf.d.ts +20 -0
  49. package/dist/utils/pdf.d.ts.map +1 -0
  50. package/dist/utils/pdf.js +96 -0
  51. package/dist/utils/pdf.js.map +1 -0
  52. package/dist/utils/progress.d.ts +29 -0
  53. package/dist/utils/progress.d.ts.map +1 -0
  54. package/dist/utils/progress.js +59 -0
  55. package/dist/utils/progress.js.map +1 -0
  56. package/dist/utils/tokens.d.ts +7 -0
  57. package/dist/utils/tokens.d.ts.map +1 -0
  58. package/dist/utils/tokens.js +12 -0
  59. package/dist/utils/tokens.js.map +1 -0
  60. package/dist/utils/tree.d.ts +88 -0
  61. package/dist/utils/tree.d.ts.map +1 -0
  62. package/dist/utils/tree.js +365 -0
  63. package/dist/utils/tree.js.map +1 -0
  64. package/package.json +76 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,25 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ---
8
+
9
+ ## [0.1.0] — 2026-03-07
10
+
11
+ ### Added
12
+ - Initial release — TypeScript port of the Python PageIndex project
13
+ - **`pageIndex()`** — PDF hierarchical tree index pipeline (13-step with progress)
14
+ - **`pageIndexMd()`** — Markdown hierarchical tree index pipeline (8-step with progress)
15
+ - **`pageIndexDocument()`** — Unified multi-format entrypoint; auto-detects format from filename
16
+ - **`buildReverseIndex()`** — Inverted index from a forward-index result; `'keyword'` and `'llm'` modes
17
+ - **`searchReverseIndex()`** — Multi-term query with partial-match scoring
18
+ - **Format parsers:**
19
+ - `extractPdfPages()` — PDF via pdfjs-dist (optional dep)
20
+ - `extractDocxPages()` — DOCX via mammoth (optional dep)
21
+ - `extractCsvPages()` — CSV, pure JS, zero dependencies
22
+ - `extractXlsxPages()` — XLSX / XLS via SheetJS (optional dep)
23
+ - **Progress tracking** — `onProgress` callback on all pipelines and `buildReverseIndex`
24
+ - **Provider-agnostic LLM** — pass any `LLMProvider` callback (OpenAI, Anthropic, Ollama, Gemini…)
25
+ - Full TypeScript types and `.d.ts` declarations
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 subham11 (https://github.com/subham11)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,405 @@
1
+ # react-native-pageindex
2
+
3
+ [![npm version](https://img.shields.io/npm/v/react-native-pageindex.svg)](https://www.npmjs.com/package/react-native-pageindex)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
5
+
6
+ **Vectorless, reasoning-based RAG** — builds a hierarchical tree index from any document using any LLM provider. Works in React Native, Node.js, and the browser.
7
+
8
+ No vector database required. Instead of embeddings, the library uses the LLM to *reason* about document structure, producing a navigable tree that lets your AI answer questions with precise source attribution.
9
+
10
+ ---
11
+
12
+ ## Features
13
+
14
+ | Feature | Detail |
15
+ |---|---|
16
+ | **Multi-format** | PDF, Word (.docx), CSV, Spreadsheet (.xlsx/.xls), Markdown |
17
+ | **Forward index** | Hierarchical tree: chapters → sections → subsections |
18
+ | **Reverse index** | Inverted index: term → node locations for fast lookup |
19
+ | **Provider-agnostic** | Pass any LLM (OpenAI, Anthropic, Ollama, Gemini…) |
20
+ | **Progress tracking** | Fine-grained per-step callbacks (13 PDF steps, 8 MD steps) |
21
+ | **Fully typed** | 100% TypeScript, `.d.ts` declarations included |
22
+ | **Optional deps** | pdfjs-dist / mammoth / xlsx are opt-in; CSV & MD have zero deps |
23
+
24
+ ---
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ npm install react-native-pageindex
30
+ ```
31
+
32
+ ### Optional format dependencies
33
+
34
+ Install only what you need:
35
+
36
+ ```bash
37
+ # PDF support
38
+ npm install pdfjs-dist
39
+
40
+ # Word .docx support
41
+ npm install mammoth
42
+
43
+ # Excel / spreadsheet support
44
+ npm install xlsx
45
+ ```
46
+
47
+ ---
48
+
49
+ ## Quick Start
50
+
51
+ ### 1. Wire up your LLM provider
52
+
53
+ ```ts
54
+ import OpenAI from 'openai';
55
+ import { LLMProvider } from 'react-native-pageindex';
56
+
57
+ const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
58
+
59
+ const llm: LLMProvider = async (prompt, opts) => {
60
+ const res = await openai.chat.completions.create({
61
+ model: 'gpt-4o',
62
+ messages: [
63
+ ...(opts?.chatHistory ?? []),
64
+ { role: 'user', content: prompt },
65
+ ],
66
+ });
67
+ return {
68
+ content: res.choices[0].message.content ?? '',
69
+ finishReason: res.choices[0].finish_reason ?? 'stop',
70
+ };
71
+ };
72
+ ```
73
+
74
+ ### 2. Index any document
75
+
76
+ ```ts
77
+ import { pageIndexDocument } from 'react-native-pageindex';
78
+ import { readFileSync } from 'fs';
79
+
80
+ // Works with PDF, DOCX, XLSX, CSV, or Markdown
81
+ const data = readFileSync('report.pdf');
82
+
83
+ const result = await pageIndexDocument({
84
+ data,
85
+ fileName: 'report.pdf', // used to auto-detect format
86
+ docName: 'Annual Report 2024',
87
+ llm,
88
+ options: {
89
+ onProgress: ({ step, percent, detail }) =>
90
+ console.log(`[${percent}%] ${step}${detail ? ` — ${detail}` : ''}`),
91
+ },
92
+ });
93
+
94
+ console.log(result.structure); // hierarchical tree
95
+ ```
96
+
97
+ ### 3. Build a reverse index for fast search
98
+
99
+ ```ts
100
+ import { buildReverseIndex, searchReverseIndex } from 'react-native-pageindex';
101
+
102
+ const reverseIndex = await buildReverseIndex({
103
+ result, // forward index from pageIndexDocument()
104
+ options: {
105
+ mode: 'keyword', // 'keyword' (fast, no LLM) | 'llm' (semantic)
106
+ },
107
+ });
108
+
109
+ const hits = searchReverseIndex(reverseIndex, 'revenue growth', 5);
110
+ // hits[0] = { nodeTitle, nodeId, score, matchedTerm, totalScore, ... }
111
+ ```
112
+
113
+ ---
114
+
115
+ ## API
116
+
117
+ ### `pageIndexDocument(input)` — Unified entrypoint
118
+
119
+ Accepts any supported file format and returns a hierarchical `PageIndexResult`.
120
+
121
+ ```ts
122
+ interface PageIndexDocumentInput {
123
+ data?: ArrayBuffer | Uint8Array | string; // binary for PDF/DOCX/XLSX; string for CSV/MD
124
+ text?: string; // convenience alias for Markdown / CSV
125
+ fileType?: 'pdf' | 'docx' | 'csv' | 'xlsx' | 'md'; // inferred from fileName if omitted
126
+ fileName?: string;
127
+ docName?: string;
128
+ llm: LLMProvider;
129
+ options?: PageIndexDocumentOptions;
130
+ }
131
+ ```
132
+
133
+ `PageIndexDocumentOptions`:
134
+
135
+ | Option | Type | Default | Description |
136
+ |---|---|---|---|
137
+ | `onProgress` | `ProgressCallback` | — | Per-step progress updates |
138
+ | `pdfOptions` | `PageIndexOptions` | — | Forwarded to the PDF pipeline |
139
+ | `mdOptions` | `MdPageIndexOptions` | — | Forwarded to the Markdown pipeline |
140
+ | `csvOptions` | `CsvParseOptions` | — | CSV row-grouping & delimiter options |
141
+ | `xlsxOptions` | `XlsxParseOptions` | — | XLSX sheet selection & row-grouping |
142
+ | `tokenCounter` | `TokenCounter` | `~4 chars/token` | Custom tokeniser |
143
+
144
+ ---
145
+
146
+ ### `pageIndex(input)` — PDF pipeline (direct)
147
+
148
+ Use when you already have extracted pages or want PDF-specific options.
149
+
150
+ ```ts
151
+ import { pageIndex, extractPdfPages } from 'react-native-pageindex';
152
+
153
+ const pages = await extractPdfPages(pdfBuffer); // requires pdfjs-dist
154
+
155
+ const result = await pageIndex({ pages, llm, docName: 'Report' });
156
+ ```
157
+
158
+ `PageIndexOptions`:
159
+
160
+ | Option | Default | Description |
161
+ |---|---|---|
162
+ | `tocCheckPageNum` | `20` | Pages to scan for table of contents |
163
+ | `maxPageNumEachNode` | `10` | Max pages per tree node |
164
+ | `maxTokenNumEachNode` | `20000` | Max tokens per tree node |
165
+ | `ifAddNodeId` | `true` | Attach unique IDs to each node |
166
+ | `ifAddNodeSummary` | `true` | LLM-generated summary per node |
167
+ | `ifAddDocDescription` | `false` | Generate overall document description |
168
+ | `ifAddNodeText` | `false` | Attach raw page text to nodes |
169
+
170
+ ---
171
+
172
+ ### `pageIndexMd(input)` — Markdown pipeline (direct)
173
+
174
+ ```ts
175
+ import { pageIndexMd } from 'react-native-pageindex';
176
+
177
+ const result = await pageIndexMd({
178
+ content: markdownString,
179
+ docName: 'README',
180
+ llm,
181
+ options: { ifThinning: true, minTokenThreshold: 3000 },
182
+ });
183
+ ```
184
+
185
+ `MdPageIndexOptions`:
186
+
187
+ | Option | Default | Description |
188
+ |---|---|---|
189
+ | `ifThinning` | `false` | Merge small sections below threshold |
190
+ | `minTokenThreshold` | `5000` | Min tokens before thinning kicks in |
191
+ | `ifAddNodeSummary` | `true` | LLM-generated summary per node |
192
+ | `summaryTokenThreshold` | `200` | Only summarise nodes above this size |
193
+ | `ifAddDocDescription` | `false` | Generate overall document description |
194
+ | `ifAddNodeText` | `false` | Attach raw section text to nodes |
195
+
196
+ ---
197
+
198
+ ### `buildReverseIndex(input)` — Inverted index
199
+
200
+ ```ts
201
+ const reverseIndex = await buildReverseIndex({
202
+ result, // PageIndexResult
203
+ pages?, // original PageData[] (optional enrichment)
204
+ llm?, // required only for mode: 'llm'
205
+ options?: {
206
+ mode: 'keyword' | 'llm', // default: 'keyword'
207
+ minTermLength: number, // default: 3
208
+ maxTermsPerNode: number, // default: 10
209
+ onProgress: ProgressCallback,
210
+ },
211
+ });
212
+ ```
213
+
214
+ ---
215
+
216
+ ### `searchReverseIndex(index, query, topK?)` — Query the index
217
+
218
+ ```ts
219
+ const results = searchReverseIndex(reverseIndex, 'machine learning', 10);
220
+
221
+ // SearchResult[]
222
+ results.forEach(r => {
223
+ console.log(r.nodeTitle, r.totalScore, r.matchedTerm);
224
+ });
225
+ ```
226
+
227
+ ---
228
+
229
+ ### Format parsers (lower-level)
230
+
231
+ ```ts
232
+ import {
233
+ extractPdfPages, // requires pdfjs-dist
234
+ extractDocxPages, // requires mammoth
235
+ extractCsvPages, // no deps
236
+ extractXlsxPages, // requires xlsx
237
+ } from 'react-native-pageindex';
238
+
239
+ // All return: Promise<PageData[]>
240
+ // PageData = { text: string; tokenCount: number }
241
+ ```
242
+
243
+ ---
244
+
245
+ ### Key Types
246
+
247
+ ```ts
248
+ // LLM provider — wire up any AI
249
+ type LLMProvider = (
250
+ prompt: string,
251
+ options?: { chatHistory?: LLMMessage[] }
252
+ ) => Promise<{ content: string; finishReason: string }>;
253
+
254
+ // Progress tracking
255
+ type ProgressCallback = (info: {
256
+ step: string;
257
+ percent: number;
258
+ detail?: string;
259
+ }) => void;
260
+
261
+ // Forward index result
262
+ interface PageIndexResult {
263
+ structure: TreeNode; // root of the hierarchy
264
+ doc_name: string;
265
+ description?: string;
266
+ }
267
+
268
+ // Tree node
269
+ interface TreeNode {
270
+ title?: string;
271
+ node_id?: string;
272
+ summary?: string;
273
+ start_index?: number;
274
+ end_index?: number;
275
+ children?: TreeNode[];
276
+ [key: string]: unknown;
277
+ }
278
+
279
+ // Reverse index search result
280
+ interface SearchResult extends ReverseIndexEntry {
281
+ matchedTerm: string;
282
+ totalScore: number;
283
+ }
284
+ ```
285
+
286
+ ---
287
+
288
+ ## Progress Tracking
289
+
290
+ The PDF and Markdown pipelines both emit fine-grained progress events through `onProgress`:
291
+
292
+ ```ts
293
+ options: {
294
+ onProgress: ({ step, percent, detail }) => {
295
+ // PDF pipeline steps (0–100%):
296
+ // Initializing → Extracting PDF pages → Scanning for table of contents
297
+ // → Transforming TOC → Mapping page numbers → Building tree
298
+ // → Verifying TOC → Fixing inaccuracies → Resolving large sections
299
+ // → Attaching page text → Generating node summaries
300
+ // → Generating document description → Done
301
+
302
+ // Markdown pipeline steps:
303
+ // Initializing → Parsing headings → Extracting section text
304
+ // → Optimizing tree → Building tree → Generating summaries
305
+ // → Generating description → Done
306
+
307
+ updateProgressBar(percent);
308
+ setStatusText(`${step}${detail ? ': ' + detail : ''}`);
309
+ },
310
+ }
311
+ ```
312
+
313
+ ---
314
+
315
+ ## LLM Provider Examples
316
+
317
+ ### Anthropic Claude
318
+
319
+ ```ts
320
+ import Anthropic from '@anthropic-ai/sdk';
321
+
322
+ const client = new Anthropic();
323
+ const llm: LLMProvider = async (prompt) => {
324
+ const msg = await client.messages.create({
325
+ model: 'claude-opus-4-5',
326
+ max_tokens: 4096,
327
+ messages: [{ role: 'user', content: prompt }],
328
+ });
329
+ const block = msg.content[0];
330
+ return {
331
+ content: block.type === 'text' ? block.text : '',
332
+ finishReason: msg.stop_reason ?? 'stop',
333
+ };
334
+ };
335
+ ```
336
+
337
+ ### Ollama (local)
338
+
339
+ ```ts
340
+ const llm: LLMProvider = async (prompt) => {
341
+ const res = await fetch('http://localhost:11434/api/generate', {
342
+ method: 'POST',
343
+ headers: { 'Content-Type': 'application/json' },
344
+ body: JSON.stringify({ model: 'llama3', prompt, stream: false }),
345
+ });
346
+ const data = await res.json();
347
+ return { content: data.response, finishReason: 'stop' };
348
+ };
349
+ ```
350
+
351
+ ### Google Gemini
352
+
353
+ ```ts
354
+ import { GoogleGenerativeAI } from '@google/generative-ai';
355
+
356
+ const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
357
+ const model = genAI.getGenerativeModel({ model: 'gemini-1.5-pro' });
358
+
359
+ const llm: LLMProvider = async (prompt) => {
360
+ const result = await model.generateContent(prompt);
361
+ return {
362
+ content: result.response.text(),
363
+ finishReason: 'stop',
364
+ };
365
+ };
366
+ ```
367
+
368
+ ---
369
+
370
+ ## React Native Usage
371
+
372
+ ```ts
373
+ // Use RNFS or fetch to get file bytes
374
+ import RNFS from 'react-native-fs';
375
+ import { pageIndexDocument } from 'react-native-pageindex';
376
+
377
+ const base64 = await RNFS.readFile(filePath, 'base64');
378
+ const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0));
379
+
380
+ const result = await pageIndexDocument({
381
+ data: bytes,
382
+ fileName: 'document.pdf',
383
+ llm,
384
+ options: { onProgress: setProgress },
385
+ });
386
+ ```
387
+
388
+ > **Note:** pdfjs-dist has a web worker that may need special Metro configuration.
389
+ > Alternatively, pass pre-extracted `pages: PageData[]` directly to `pageIndex()` to skip pdfjs entirely.
390
+
391
+ ---
392
+
393
+ ## Versioning
394
+
395
+ This package follows [Semantic Versioning](https://semver.org/):
396
+
397
+ - **Patch** (`0.1.x`) — bug fixes, no API changes
398
+ - **Minor** (`0.x.0`) — new features, backward compatible
399
+ - **Major** (`x.0.0`) — breaking changes to the public API
400
+
401
+ ---
402
+
403
+ ## License
404
+
405
+ MIT © [subham11](https://github.com/subham11)
@@ -0,0 +1,4 @@
1
+ import type { PageIndexOptions, MdPageIndexOptions } from './types';
2
+ export declare const DEFAULT_PDF_OPTIONS: Required<Omit<PageIndexOptions, 'tokenCounter' | 'onProgress'>>;
3
+ export declare const DEFAULT_MD_OPTIONS: Required<Omit<MdPageIndexOptions, 'tokenCounter' | 'onProgress'>>;
4
+ //# sourceMappingURL=config.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,SAAS,CAAC;AAEpE,eAAO,MAAM,mBAAmB,EAAE,QAAQ,CAAC,IAAI,CAAC,gBAAgB,EAAE,cAAc,GAAG,YAAY,CAAC,CAQ/F,CAAC;AAEF,eAAO,MAAM,kBAAkB,EAAE,QAAQ,CAAC,IAAI,CAAC,kBAAkB,EAAE,cAAc,GAAG,YAAY,CAAC,CAQhG,CAAC"}
package/dist/config.js ADDED
@@ -0,0 +1,22 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.DEFAULT_MD_OPTIONS = exports.DEFAULT_PDF_OPTIONS = void 0;
4
+ exports.DEFAULT_PDF_OPTIONS = {
5
+ tocCheckPageNum: 20,
6
+ maxPageNumEachNode: 10,
7
+ maxTokenNumEachNode: 20000,
8
+ ifAddNodeId: true,
9
+ ifAddNodeSummary: true,
10
+ ifAddDocDescription: false,
11
+ ifAddNodeText: false,
12
+ };
13
+ exports.DEFAULT_MD_OPTIONS = {
14
+ ifThinning: false,
15
+ minTokenThreshold: 5000,
16
+ ifAddNodeSummary: true,
17
+ summaryTokenThreshold: 200,
18
+ ifAddDocDescription: false,
19
+ ifAddNodeText: false,
20
+ ifAddNodeId: true,
21
+ };
22
+ //# sourceMappingURL=config.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;;AAEa,QAAA,mBAAmB,GAAoE;IAClG,eAAe,EAAE,EAAE;IACnB,kBAAkB,EAAE,EAAE;IACtB,mBAAmB,EAAE,KAAK;IAC1B,WAAW,EAAE,IAAI;IACjB,gBAAgB,EAAE,IAAI;IACtB,mBAAmB,EAAE,KAAK;IAC1B,aAAa,EAAE,KAAK;CACrB,CAAC;AAEW,QAAA,kBAAkB,GAAsE;IACnG,UAAU,EAAE,KAAK;IACjB,iBAAiB,EAAE,IAAI;IACvB,gBAAgB,EAAE,IAAI;IACtB,qBAAqB,EAAE,GAAG;IAC1B,mBAAmB,EAAE,KAAK;IAC1B,aAAa,EAAE,KAAK;IACpB,WAAW,EAAE,IAAI;CAClB,CAAC"}
@@ -0,0 +1,49 @@
1
+ /**
2
+ * react-native-pageindex
3
+ *
4
+ * Vectorless, reasoning-based RAG — builds a hierarchical tree index from
5
+ * PDF, Word (.docx), CSV, spreadsheet (.xlsx/.xls), or Markdown documents using any LLM provider.
6
+ *
7
+ * @example — Quick start with OpenAI
8
+ * ```ts
9
+ * import { pageIndex, pageIndexMd } from 'react-native-pageindex';
10
+ * import OpenAI from 'openai';
11
+ *
12
+ * const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
13
+ *
14
+ * // LLM provider callback (works with any AI provider)
15
+ * const llm = async (prompt, opts) => {
16
+ * const res = await openai.chat.completions.create({
17
+ * model: 'gpt-4o',
18
+ * messages: [
19
+ * ...(opts?.chatHistory ?? []),
20
+ * { role: 'user', content: prompt },
21
+ * ],
22
+ * });
23
+ * return {
24
+ * content: res.choices[0].message.content ?? '',
25
+ * finishReason: res.choices[0].finish_reason ?? 'stop',
26
+ * };
27
+ * };
28
+ *
29
+ * // PDF (pre-extracted pages)
30
+ * const result = await pageIndex({ pages: myPages, llm, docName: 'report' });
31
+ *
32
+ * // Markdown
33
+ * const result = await pageIndexMd({ content: markdownString, llm });
34
+ * ```
35
+ */
36
+ export { pageIndex } from './pageIndex';
37
+ export { pageIndexMd } from './pageIndexMd';
38
+ export { pageIndexDocument } from './pageIndexDocument';
39
+ export type { PageIndexDocumentInput, PageIndexDocumentOptions } from './pageIndexDocument';
40
+ export { buildReverseIndex, searchReverseIndex } from './reverseIndex';
41
+ export { extractPdfPages } from './utils/pdf';
42
+ export { extractDocxPages } from './parsers/docx';
43
+ export { extractCsvPages } from './parsers/csv';
44
+ export { extractXlsxPages } from './parsers/xlsx';
45
+ export type { PageData, LLMMessage, LLMResult, LLMFinishReason, LLMProvider, TokenCounter, ProgressInfo, ProgressCallback, TreeNode, PageIndexResult, PageIndexOptions, MdPageIndexOptions, DocumentFileType, CsvParseOptions, XlsxParseOptions, ReverseIndex, ReverseIndexEntry, SearchResult, ReverseIndexOptions, } from './types';
46
+ export { defaultTokenCounter } from './utils/tokens';
47
+ export { extractJson, getJsonContent } from './utils/json';
48
+ export { writeNodeId, structureToList, getNodes, getLeafNodes, addNodeText, removeStructureText, removeFields, deepClone, } from './utils/tree';
49
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAGH,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAG5C,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,YAAY,EAAE,sBAAsB,EAAE,wBAAwB,EAAE,MAAM,qBAAqB,CAAC;AAG5F,OAAO,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAC;AAGvE,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAGlD,YAAY,EACV,QAAQ,EACR,UAAU,EACV,SAAS,EACT,eAAe,EACf,WAAW,EACX,YAAY,EACZ,YAAY,EACZ,gBAAgB,EAChB,QAAQ,EACR,eAAe,EACf,gBAAgB,EAChB,kBAAkB,EAClB,gBAAgB,EAChB,eAAe,EACf,gBAAgB,EAChB,YAAY,EACZ,iBAAiB,EACjB,YAAY,EACZ,mBAAmB,GACpB,MAAM,SAAS,CAAC;AAGjB,OAAO,EAAE,mBAAmB,EAAE,MAAM,gBAAgB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC3D,OAAO,EACL,WAAW,EACX,eAAe,EACf,QAAQ,EACR,YAAY,EACZ,WAAW,EACX,mBAAmB,EACnB,YAAY,EACZ,SAAS,GACV,MAAM,cAAc,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,75 @@
1
+ "use strict";
2
+ /**
3
+ * react-native-pageindex
4
+ *
5
+ * Vectorless, reasoning-based RAG — builds a hierarchical tree index from
6
+ * PDF, Word (.docx), CSV, spreadsheet (.xlsx/.xls), or Markdown documents using any LLM provider.
7
+ *
8
+ * @example — Quick start with OpenAI
9
+ * ```ts
10
+ * import { pageIndex, pageIndexMd } from 'react-native-pageindex';
11
+ * import OpenAI from 'openai';
12
+ *
13
+ * const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
14
+ *
15
+ * // LLM provider callback (works with any AI provider)
16
+ * const llm = async (prompt, opts) => {
17
+ * const res = await openai.chat.completions.create({
18
+ * model: 'gpt-4o',
19
+ * messages: [
20
+ * ...(opts?.chatHistory ?? []),
21
+ * { role: 'user', content: prompt },
22
+ * ],
23
+ * });
24
+ * return {
25
+ * content: res.choices[0].message.content ?? '',
26
+ * finishReason: res.choices[0].finish_reason ?? 'stop',
27
+ * };
28
+ * };
29
+ *
30
+ * // PDF (pre-extracted pages)
31
+ * const result = await pageIndex({ pages: myPages, llm, docName: 'report' });
32
+ *
33
+ * // Markdown
34
+ * const result = await pageIndexMd({ content: markdownString, llm });
35
+ * ```
36
+ */
37
+ Object.defineProperty(exports, "__esModule", { value: true });
38
+ exports.deepClone = exports.removeFields = exports.removeStructureText = exports.addNodeText = exports.getLeafNodes = exports.getNodes = exports.structureToList = exports.writeNodeId = exports.getJsonContent = exports.extractJson = exports.defaultTokenCounter = exports.extractXlsxPages = exports.extractCsvPages = exports.extractDocxPages = exports.extractPdfPages = exports.searchReverseIndex = exports.buildReverseIndex = exports.pageIndexDocument = exports.pageIndexMd = exports.pageIndex = void 0;
39
+ // Main APIs
40
+ var pageIndex_1 = require("./pageIndex");
41
+ Object.defineProperty(exports, "pageIndex", { enumerable: true, get: function () { return pageIndex_1.pageIndex; } });
42
+ var pageIndexMd_1 = require("./pageIndexMd");
43
+ Object.defineProperty(exports, "pageIndexMd", { enumerable: true, get: function () { return pageIndexMd_1.pageIndexMd; } });
44
+ // Unified multi-format entrypoint
45
+ var pageIndexDocument_1 = require("./pageIndexDocument");
46
+ Object.defineProperty(exports, "pageIndexDocument", { enumerable: true, get: function () { return pageIndexDocument_1.pageIndexDocument; } });
47
+ // Reverse / inverted index
48
+ var reverseIndex_1 = require("./reverseIndex");
49
+ Object.defineProperty(exports, "buildReverseIndex", { enumerable: true, get: function () { return reverseIndex_1.buildReverseIndex; } });
50
+ Object.defineProperty(exports, "searchReverseIndex", { enumerable: true, get: function () { return reverseIndex_1.searchReverseIndex; } });
51
+ // Format-specific parsers (PDF, DOCX and XLSX each require an optional dep; CSV has none — see README)
52
+ var pdf_1 = require("./utils/pdf");
53
+ Object.defineProperty(exports, "extractPdfPages", { enumerable: true, get: function () { return pdf_1.extractPdfPages; } });
54
+ var docx_1 = require("./parsers/docx");
55
+ Object.defineProperty(exports, "extractDocxPages", { enumerable: true, get: function () { return docx_1.extractDocxPages; } });
56
+ var csv_1 = require("./parsers/csv");
57
+ Object.defineProperty(exports, "extractCsvPages", { enumerable: true, get: function () { return csv_1.extractCsvPages; } });
58
+ var xlsx_1 = require("./parsers/xlsx");
59
+ Object.defineProperty(exports, "extractXlsxPages", { enumerable: true, get: function () { return xlsx_1.extractXlsxPages; } });
60
+ // Utilities (useful for downstream tree-search / RAG pipelines)
61
+ var tokens_1 = require("./utils/tokens");
62
+ Object.defineProperty(exports, "defaultTokenCounter", { enumerable: true, get: function () { return tokens_1.defaultTokenCounter; } });
63
+ var json_1 = require("./utils/json");
64
+ Object.defineProperty(exports, "extractJson", { enumerable: true, get: function () { return json_1.extractJson; } });
65
+ Object.defineProperty(exports, "getJsonContent", { enumerable: true, get: function () { return json_1.getJsonContent; } });
66
+ var tree_1 = require("./utils/tree");
67
+ Object.defineProperty(exports, "writeNodeId", { enumerable: true, get: function () { return tree_1.writeNodeId; } });
68
+ Object.defineProperty(exports, "structureToList", { enumerable: true, get: function () { return tree_1.structureToList; } });
69
+ Object.defineProperty(exports, "getNodes", { enumerable: true, get: function () { return tree_1.getNodes; } });
70
+ Object.defineProperty(exports, "getLeafNodes", { enumerable: true, get: function () { return tree_1.getLeafNodes; } });
71
+ Object.defineProperty(exports, "addNodeText", { enumerable: true, get: function () { return tree_1.addNodeText; } });
72
+ Object.defineProperty(exports, "removeStructureText", { enumerable: true, get: function () { return tree_1.removeStructureText; } });
73
+ Object.defineProperty(exports, "removeFields", { enumerable: true, get: function () { return tree_1.removeFields; } });
74
+ Object.defineProperty(exports, "deepClone", { enumerable: true, get: function () { return tree_1.deepClone; } });
75
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;;;AAEH,YAAY;AACZ,yCAAwC;AAA/B,sGAAA,SAAS,OAAA;AAClB,6CAA4C;AAAnC,0GAAA,WAAW,OAAA;AAEpB,kCAAkC;AAClC,yDAAwD;AAA/C,sHAAA,iBAAiB,OAAA;AAG1B,2BAA2B;AAC3B,+CAAuE;AAA9D,iHAAA,iBAAiB,OAAA;AAAE,kHAAA,kBAAkB,OAAA;AAE9C,uEAAuE;AACvE,mCAA8C;AAArC,sGAAA,eAAe,OAAA;AACxB,uCAAkD;AAAzC,wGAAA,gBAAgB,OAAA;AACzB,qCAAgD;AAAvC,sGAAA,eAAe,OAAA;AACxB,uCAAkD;AAAzC,wGAAA,gBAAgB,OAAA;AAyBzB,gEAAgE;AAChE,yCAAqD;AAA5C,6GAAA,mBAAmB,OAAA;AAC5B,qCAA2D;AAAlD,mGAAA,WAAW,OAAA;AAAE,sGAAA,cAAc,OAAA;AACpC,qCASsB;AARpB,mGAAA,WAAW,OAAA;AACX,uGAAA,eAAe,OAAA;AACf,gGAAA,QAAQ,OAAA;AACR,oGAAA,YAAY,OAAA;AACZ,mGAAA,WAAW,OAAA;AACX,2GAAA,mBAAmB,OAAA;AACnB,oGAAA,YAAY,OAAA;AACZ,iGAAA,SAAS,OAAA"}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * PDF pipeline — port of pageindex/page_index.py
3
+ *
4
+ * Processes PDF pages (as pre-extracted text + token counts) and builds a
5
+ * hierarchical tree index using LLM reasoning. No PDF parser is included
6
+ * here — pass `PageData[]` directly, or use the `extractPdfPages()` helper
7
+ * from `./utils/pdf` (requires pdfjs-dist to be installed).
8
+ */
9
+ import type { LLMProvider, PageData, PageIndexOptions, PageIndexResult } from './types';
10
+ /**
11
+ * Builds a hierarchical tree index from a PDF document.
12
+ *
13
+ * Supply either `pdf` (raw PDF bytes, requires pdfjs-dist) or pre-extracted
14
+ * `pages` (array of `{text, tokenCount}` — one entry per page).
15
+ *
16
+ * @example — with OpenAI + progress bar
17
+ * ```ts
18
+ * import { pageIndex } from 'react-native-pageindex';
19
+ * import OpenAI from 'openai';
20
+ *
21
+ * const openai = new OpenAI({ apiKey: '...' });
22
+ *
23
+ * const result = await pageIndex({
24
+ * pages: myExtractedPages,
25
+ * docName: 'annual-report',
26
+ * llm: async (prompt, opts) => {
27
+ * const res = await openai.chat.completions.create({
28
+ * model: 'gpt-4o',
29
+ * messages: [...(opts?.chatHistory ?? []), { role: 'user', content: prompt }],
30
+ * });
31
+ * return { content: res.choices[0].message.content ?? '', finishReason: res.choices[0].finish_reason ?? 'stop' };
32
+ * },
33
+ * options: {
34
+ * onProgress: ({ step, percent, detail }) => {
35
+ * console.log(`[${percent}%] ${step}${detail ? ` — ${detail}` : ''}`);
36
+ * },
37
+ * },
38
+ * });
39
+ * ```
40
+ */
41
+ export declare function pageIndex(input: {
42
+ pdf?: ArrayBuffer | Uint8Array;
43
+ pages?: PageData[];
44
+ llm: LLMProvider;
45
+ docName?: string;
46
+ options?: PageIndexOptions;
47
+ }): Promise<PageIndexResult>;
48
+ //# sourceMappingURL=pageIndex.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pageIndex.d.ts","sourceRoot":"","sources":["../src/pageIndex.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EACV,WAAW,EAEX,QAAQ,EACR,gBAAgB,EAChB,eAAe,EAEhB,MAAM,SAAS,CAAC;AAulCjB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,wBAAsB,SAAS,CAAC,KAAK,EAAE;IACrC,GAAG,CAAC,EAAE,WAAW,GAAG,UAAU,CAAC;IAC/B,KAAK,CAAC,EAAE,QAAQ,EAAE,CAAC;IACnB,GAAG,EAAE,WAAW,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,gBAAgB,CAAC;CAC5B,GAAG,OAAO,CAAC,eAAe,CAAC,CAwD3B"}