react-native-pageindex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +25 -0
  2. package/LICENSE +21 -0
  3. package/README.md +405 -0
  4. package/dist/config.d.ts +4 -0
  5. package/dist/config.d.ts.map +1 -0
  6. package/dist/config.js +22 -0
  7. package/dist/config.js.map +1 -0
  8. package/dist/index.d.ts +49 -0
  9. package/dist/index.d.ts.map +1 -0
  10. package/dist/index.js +75 -0
  11. package/dist/index.js.map +1 -0
  12. package/dist/pageIndex.d.ts +48 -0
  13. package/dist/pageIndex.d.ts.map +1 -0
  14. package/dist/pageIndex.js +962 -0
  15. package/dist/pageIndex.js.map +1 -0
  16. package/dist/pageIndexDocument.d.ts +85 -0
  17. package/dist/pageIndexDocument.d.ts.map +1 -0
  18. package/dist/pageIndexDocument.js +145 -0
  19. package/dist/pageIndexDocument.js.map +1 -0
  20. package/dist/pageIndexMd.d.ts +31 -0
  21. package/dist/pageIndexMd.d.ts.map +1 -0
  22. package/dist/pageIndexMd.js +260 -0
  23. package/dist/pageIndexMd.js.map +1 -0
  24. package/dist/parsers/csv.d.ts +17 -0
  25. package/dist/parsers/csv.d.ts.map +1 -0
  26. package/dist/parsers/csv.js +147 -0
  27. package/dist/parsers/csv.js.map +1 -0
  28. package/dist/parsers/docx.d.ts +20 -0
  29. package/dist/parsers/docx.d.ts.map +1 -0
  30. package/dist/parsers/docx.js +134 -0
  31. package/dist/parsers/docx.js.map +1 -0
  32. package/dist/parsers/xlsx.d.ts +19 -0
  33. package/dist/parsers/xlsx.d.ts.map +1 -0
  34. package/dist/parsers/xlsx.js +121 -0
  35. package/dist/parsers/xlsx.js.map +1 -0
  36. package/dist/reverseIndex.d.ts +39 -0
  37. package/dist/reverseIndex.d.ts.map +1 -0
  38. package/dist/reverseIndex.js +248 -0
  39. package/dist/reverseIndex.js.map +1 -0
  40. package/dist/types.d.ts +190 -0
  41. package/dist/types.d.ts.map +1 -0
  42. package/dist/types.js +4 -0
  43. package/dist/types.js.map +1 -0
  44. package/dist/utils/json.d.ts +13 -0
  45. package/dist/utils/json.d.ts.map +1 -0
  46. package/dist/utils/json.js +69 -0
  47. package/dist/utils/json.js.map +1 -0
  48. package/dist/utils/pdf.d.ts +20 -0
  49. package/dist/utils/pdf.d.ts.map +1 -0
  50. package/dist/utils/pdf.js +96 -0
  51. package/dist/utils/pdf.js.map +1 -0
  52. package/dist/utils/progress.d.ts +29 -0
  53. package/dist/utils/progress.d.ts.map +1 -0
  54. package/dist/utils/progress.js +59 -0
  55. package/dist/utils/progress.js.map +1 -0
  56. package/dist/utils/tokens.d.ts +7 -0
  57. package/dist/utils/tokens.d.ts.map +1 -0
  58. package/dist/utils/tokens.js +12 -0
  59. package/dist/utils/tokens.js.map +1 -0
  60. package/dist/utils/tree.d.ts +88 -0
  61. package/dist/utils/tree.d.ts.map +1 -0
  62. package/dist/utils/tree.js +365 -0
  63. package/dist/utils/tree.js.map +1 -0
  64. package/package.json +76 -0
@@ -0,0 +1,147 @@
1
+ "use strict";
2
+ /**
3
+ * CSV parser — no external dependencies.
4
+ *
5
+ * Converts a CSV string or buffer into page-like chunks so it can be fed
6
+ * into pageIndex(). Each "page" is a fixed number of rows (rowsPerPage)
7
+ * formatted as a plain-text table.
8
+ */
9
+ Object.defineProperty(exports, "__esModule", { value: true });
10
+ exports.extractCsvPages = extractCsvPages;
11
+ const tokens_1 = require("../utils/tokens");
12
+ // ─── Public API ───────────────────────────────────────────────────────────────
13
+ /**
14
+ * Parses a CSV file into `PageData[]` chunks.
15
+ *
16
+ * @param data CSV as a UTF-8 string or raw bytes (ArrayBuffer / Uint8Array)
17
+ * @param options Parsing options (delimiter, rowsPerPage, hasHeader)
18
+ * @param counter Token counter function
19
+ */
20
+ async function extractCsvPages(data, options = {}, counter = tokens_1.defaultTokenCounter) {
21
+ const text = dataToString(data);
22
+ const { rowsPerPage = 100, hasHeader = true } = options;
23
+ const delimiter = options.delimiter ?? detectDelimiter(text);
24
+ const rows = parseCsv(text, delimiter);
25
+ if (rows.length === 0)
26
+ return [];
27
+ const header = hasHeader ? rows[0] : null;
28
+ const dataRows = hasHeader ? rows.slice(1) : rows;
29
+ if (dataRows.length === 0) {
30
+ const page = formatChunk(header, [], header);
31
+ return [{ text: page, tokenCount: counter(page) }];
32
+ }
33
+ // Split dataRows into page-sized chunks
34
+ const pages = [];
35
+ for (let i = 0; i < dataRows.length; i += rowsPerPage) {
36
+ const chunk = dataRows.slice(i, i + rowsPerPage);
37
+ const pageNum = Math.floor(i / rowsPerPage) + 1;
38
+ const totalPages = Math.ceil(dataRows.length / rowsPerPage);
39
+ const text = formatChunk(header, chunk, header, pageNum, totalPages);
40
+ pages.push({ text, tokenCount: counter(text) });
41
+ }
42
+ return pages;
43
+ }
44
+ // ─── Internal helpers ─────────────────────────────────────────────────────────
45
+ function dataToString(data) {
46
+ if (typeof data === 'string')
47
+ return data;
48
+ const bytes = data instanceof Uint8Array ? data : new Uint8Array(data);
49
+ return new TextDecoder('utf-8').decode(bytes);
50
+ }
51
+ /** Detects the most likely delimiter by scoring candidates on consistency */
52
+ function detectDelimiter(text) {
53
+ const candidates = [',', ';', '\t', '|'];
54
+ const firstLine = text.split('\n')[0] ?? '';
55
+ let best = ',';
56
+ let bestCount = 0;
57
+ for (const d of candidates) {
58
+ const count = firstLine.split(d).length - 1;
59
+ if (count > bestCount) {
60
+ bestCount = count;
61
+ best = d;
62
+ }
63
+ }
64
+ return best;
65
+ }
66
+ /** RFC 4180-compatible CSV parser */
67
+ function parseCsv(text, delimiter) {
68
+ const rows = [];
69
+ const lines = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n').split('\n');
70
+ let currentRow = [];
71
+ let currentField = '';
72
+ let inQuotes = false;
73
+ for (const line of lines) {
74
+ if (!inQuotes && line.trim() === '' && currentRow.length === 0)
75
+ continue;
76
+ for (let i = 0; i <= line.length; i++) {
77
+ const ch = line[i];
78
+ if (i === line.length) {
79
+ // End of line
80
+ if (inQuotes) {
81
+ currentField += '\n';
82
+ }
83
+ else {
84
+ currentRow.push(currentField.trim());
85
+ currentField = '';
86
+ }
87
+ break;
88
+ }
89
+ if (inQuotes) {
90
+ if (ch === '"') {
91
+ if (line[i + 1] === '"') {
92
+ currentField += '"';
93
+ i++;
94
+ }
95
+ else
96
+ inQuotes = false;
97
+ }
98
+ else {
99
+ currentField += ch;
100
+ }
101
+ }
102
+ else {
103
+ if (ch === '"') {
104
+ inQuotes = true;
105
+ }
106
+ else if (ch === delimiter) {
107
+ currentRow.push(currentField.trim());
108
+ currentField = '';
109
+ }
110
+ else {
111
+ currentField += ch;
112
+ }
113
+ }
114
+ }
115
+ if (!inQuotes) {
116
+ if (currentRow.length > 0) {
117
+ rows.push(currentRow);
118
+ currentRow = [];
119
+ }
120
+ }
121
+ }
122
+ // Flush last row
123
+ if (currentRow.length > 0 || currentField.trim()) {
124
+ currentRow.push(currentField.trim());
125
+ rows.push(currentRow);
126
+ }
127
+ return rows;
128
+ }
129
+ /** Formats a chunk of rows as a plain-text table */
130
+ function formatChunk(header, rows, allHeaders, pageNum, totalPages) {
131
+ const lines = [];
132
+ if (pageNum !== undefined && totalPages !== undefined) {
133
+ lines.push(`[CSV Data — Rows ${((pageNum - 1) * (rows.length || 1)) + 1}–${(pageNum - 1) * (rows.length || 1) + rows.length} of total, Page ${pageNum}/${totalPages}]`);
134
+ lines.push('');
135
+ }
136
+ const effectiveHeader = header ?? allHeaders;
137
+ if (effectiveHeader) {
138
+ // Column header row
139
+ lines.push(effectiveHeader.join(' | '));
140
+ lines.push(effectiveHeader.map((h) => '-'.repeat(Math.max(h.length, 3))).join('-|-'));
141
+ }
142
+ for (const row of rows) {
143
+ lines.push(row.join(' | '));
144
+ }
145
+ return lines.join('\n');
146
+ }
147
+ //# sourceMappingURL=csv.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"csv.js","sourceRoot":"","sources":["../../src/parsers/csv.ts"],"names":[],"mappings":";AAAA;;;;;;GAMG;;AAcH,0CAgCC;AA3CD,4CAAsD;AAEtD,iFAAiF;AAEjF;;;;;;GAMG;AACI,KAAK,UAAU,eAAe,CACnC,IAAuC,EACvC,UAA2B,EAAE,EAC7B,UAAwB,4BAAmB;IAE3C,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IAChC,MAAM,EAAE,WAAW,GAAG,GAAG,EAAE,SAAS,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC;IAExD,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,eAAe,CAAC,IAAI,CAAC,CAAC;IAC7D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;IAEvC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAElD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;QAC7C,OAAO,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACrD,CAAC;IAED,wCAAwC;IACxC,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,IAAI,WAAW,EAAE,CAAC;QACtD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC;QACjD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;QAChD,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,WAAW,CAAC,CAAC;QAC5D,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;QACrE,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAClD,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,iFAAiF;AAEjF,SAAS,YAAY,CAAC,IAAuC;IAC3D,IAAI,OAAO,IAAI,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAC;IAC1C,MAAM,KAAK,GAAG,IAAI,YAAY,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,CAAC;IACvE,OAAO,IAAI,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAChD,CAAC;AAED,6EAA6E;AAC7E,SAAS,eAAe,CAAC,IAAY;IACnC,MAAM,UAAU,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;IACzC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC5C,IAAI,IAAI,GAAG,GAAG,CAAC;IA
Cf,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAC5C,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;YAAC,SAAS,GAAG,KAAK,CAAC;YAAC,IAAI,GAAG,CAAC,CAAC;QAAC,CAAC;IACzD,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,qCAAqC;AACrC,SAAS,QAAQ,CAAC,IAAY,EAAE,SAAiB;IAC/C,MAAM,IAAI,GAAe,EAAE,CAAC;IAC5B,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC3E,IAAI,UAAU,GAAa,EAAE,CAAC;IAC9B,IAAI,YAAY,GAAG,EAAE,CAAC;IACtB,IAAI,QAAQ,GAAG,KAAK,CAAC;IAErB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAEzE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YAEnB,IAAI,CAAC,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;gBACtB,cAAc;gBACd,IAAI,QAAQ,EAAE,CAAC;oBACb,YAAY,IAAI,IAAI,CAAC;gBACvB,CAAC;qBAAM,CAAC;oBACN,UAAU,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;oBACrC,YAAY,GAAG,EAAE,CAAC;gBACpB,CAAC;gBACD,MAAM;YACR,CAAC;YAED,IAAI,QAAQ,EAAE,CAAC;gBACb,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;oBACf,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC;wBAAC,YAAY,IAAI,GAAG,CAAC;wBAAC,CAAC,EAAE,CAAC;oBAAC,CAAC;;wBACjD,QAAQ,GAAG,KAAK,CAAC;gBACxB,CAAC;qBAAM,CAAC;oBACN,YAAY,IAAI,EAAE,CAAC;gBACrB,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;oBACf,QAAQ,GAAG,IAAI,CAAC;gBAClB,CAAC;qBAAM,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;oBAC5B,UAAU,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;oBACrC,YAAY,GAAG,EAAE,CAAC;gBACpB,CAAC;qBAAM,CAAC;oBACN,YAAY,IAAI,EAAE,CAAC;gBACrB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBACtB,UAAU,GAAG,EAAE,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAED,iBAAiB;IACjB,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,CAAC,IAAI,EAAE,EAAE,CAAC;QACjD,UAAU,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;QACrC,IAAI,CAAC,IAAI,C
AAC,UAAU,CAAC,CAAC;IACxB,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,oDAAoD;AACpD,SAAS,WAAW,CAClB,MAAuB,EACvB,IAAgB,EAChB,UAA2B,EAC3B,OAAgB,EAChB,UAAmB;IAEnB,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,OAAO,KAAK,SAAS,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;QACtD,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,mBAAmB,OAAO,IAAI,UAAU,GAAG,CAAC,CAAC;QACxK,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,MAAM,eAAe,GAAG,MAAM,IAAI,UAAU,CAAC;IAC7C,IAAI,eAAe,EAAE,CAAC;QACpB,oBAAoB;QACpB,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;QACxC,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IACxF,CAAC;IAED,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IAC9B,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * DOCX parser — extracts per-section PageData from a Word document.
3
+ *
4
+ * Requires `mammoth` to be installed:
5
+ * npm install mammoth
6
+ *
7
+ * Sections are determined by heading styles (Heading 1/2/3…).
8
+ * If no headings are found, the document is returned as a single page.
9
+ */
10
+ import type { PageData, TokenCounter } from '../types';
11
+ /**
12
+ * Extracts text from a DOCX file and segments it into page-like chunks
13
+ * using heading boundaries.
14
+ *
15
+ * @param data Raw DOCX bytes (ArrayBuffer or Uint8Array)
16
+ * @param counter Token counter function (optional)
17
+ * @returns Promise of an array of `{ text, tokenCount }` — one per heading section, or a single entry for the whole document when no headings are found
18
+ */
19
+ export declare function extractDocxPages(data: ArrayBuffer | Uint8Array, counter?: TokenCounter): Promise<PageData[]>;
20
+ //# sourceMappingURL=docx.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docx.d.ts","sourceRoot":"","sources":["../../src/parsers/docx.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAGvD;;;;;;;GAOG;AACH,wBAAsB,gBAAgB,CACpC,IAAI,EAAE,WAAW,GAAG,UAAU,EAC9B,OAAO,GAAE,YAAkC,GAC1C,OAAO,CAAC,QAAQ,EAAE,CAAC,CAsCrB"}
@@ -0,0 +1,134 @@
1
+ "use strict";
2
+ /**
3
+ * DOCX parser — extracts per-section PageData from a Word document.
4
+ *
5
+ * Requires `mammoth` to be installed:
6
+ * npm install mammoth
7
+ *
8
+ * Sections are determined by heading styles (Heading 1/2/3…).
9
+ * If no headings are found, the document is returned as a single page.
10
+ */
11
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
12
+ if (k2 === undefined) k2 = k;
13
+ var desc = Object.getOwnPropertyDescriptor(m, k);
14
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
15
+ desc = { enumerable: true, get: function() { return m[k]; } };
16
+ }
17
+ Object.defineProperty(o, k2, desc);
18
+ }) : (function(o, m, k, k2) {
19
+ if (k2 === undefined) k2 = k;
20
+ o[k2] = m[k];
21
+ }));
22
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
23
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
24
+ }) : function(o, v) {
25
+ o["default"] = v;
26
+ });
27
+ var __importStar = (this && this.__importStar) || (function () {
28
+ var ownKeys = function(o) {
29
+ ownKeys = Object.getOwnPropertyNames || function (o) {
30
+ var ar = [];
31
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
32
+ return ar;
33
+ };
34
+ return ownKeys(o);
35
+ };
36
+ return function (mod) {
37
+ if (mod && mod.__esModule) return mod;
38
+ var result = {};
39
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
40
+ __setModuleDefault(result, mod);
41
+ return result;
42
+ };
43
+ })();
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.extractDocxPages = extractDocxPages;
46
+ const tokens_1 = require("../utils/tokens");
47
/**
 * Extracts text from a DOCX file and segments it into page-like chunks
 * using heading boundaries.
 *
 * The document is converted to HTML with mammoth and split at `<h1>`–`<h6>`
 * tags. Only when no heading is present is the raw text extracted, and the
 * whole document is then returned as a single page.
 *
 * @param data    Raw DOCX bytes (ArrayBuffer or Uint8Array)
 * @param counter Token counter function
 * @returns Array of `{ text, tokenCount }` — one per heading section
 * @throws Error when the optional `mammoth` dependency cannot be loaded
 */
async function extractDocxPages(data, counter = tokens_1.defaultTokenCounter) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let mammoth;
    try {
        mammoth = await Promise.resolve(`${'mammoth'}`).then(s => __importStar(require(s)));
    }
    catch {
        throw new Error('[PageIndex] mammoth is not installed. Run: npm install mammoth (or yarn add mammoth)');
    }
    // Normalise to a standalone ArrayBuffer. A Uint8Array may be a view into a
    // larger buffer, so copy out exactly the bytes it covers.
    const buffer = data instanceof Uint8Array
        ? data.buffer.slice(data.byteOffset, data.byteOffset + data.byteLength)
        : data;
    // Convert to HTML first: heading tags are what drive the sectioning.
    // eslint-disable-next-line @typescript-eslint/no-unsafe-call, @typescript-eslint/no-unsafe-member-access
    const htmlResult = await mammoth.convertToHtml({ arrayBuffer: buffer });
    // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
    const html = htmlResult.value;
    // Split HTML into sections by heading tags (h1–h6). The second argument is
    // not read by splitHtmlBySections, so an empty placeholder is enough.
    const sections = splitHtmlBySections(html, '');
    if (sections.length > 0) {
        return sections.map(({ text }) => ({ text, tokenCount: counter(text) }));
    }
    // No headings found — only now pay for the raw-text pass and return the
    // full document as one page.
    // eslint-disable-next-line @typescript-eslint/no-unsafe-call, @typescript-eslint/no-unsafe-member-access
    const rawResult = await mammoth.extractRawText({ arrayBuffer: buffer });
    // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
    const fullText = rawResult.value;
    return [{ text: fullText, tokenCount: counter(fullText) }];
}
86
/**
 * Converts an HTML fragment to plain text.
 *
 * `<br>` and the closing tags of paragraphs, list items and headings become
 * line breaks; all remaining tags are removed; a small set of entities is
 * decoded; runs of three or more newlines collapse to two; the result is trimmed.
 *
 * @param html HTML fragment
 * @returns Plain-text rendering of the fragment
 */
function stripHtmlTags(html) {
    return html
        .replace(/<br\s*\/?>/gi, '\n')
        .replace(/<\/p>/gi, '\n')
        .replace(/<\/li>/gi, '\n')
        // Without a break here a heading's text fuses with the paragraph that follows it.
        .replace(/<\/h[1-6]>/gi, '\n')
        .replace(/<[^>]+>/g, '')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&nbsp;/g, ' ')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        // Decode &amp; last so an escaped entity such as "&amp;lt;" yields the
        // literal text "&lt;" instead of being decoded twice.
        .replace(/&amp;/g, '&')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
101
/**
 * Cuts an HTML document into sections, one per `<h1>`–`<h6>` element.
 *
 * Each section runs from its heading up to the next heading (or the end of
 * the document) and is reduced to plain text. Non-blank content that precedes
 * the first heading is returned first, under the title "Preface".
 *
 * @param html          HTML to split
 * @param _fallbackText Unused
 * @returns Array of `{ title, text }`; empty when the HTML has no headings
 */
function splitHtmlBySections(html, _fallbackText) {
    const headingPattern = /<(h[1-6])[^>]*>([\s\S]*?)<\/\1>/gi;
    // Record where each heading starts together with its plain-text title.
    const headings = Array.from(html.matchAll(headingPattern), (hit) => ({
        start: hit.index,
        title: stripHtmlTags(hit[2]).trim(),
    }));
    if (headings.length === 0) {
        return [];
    }
    const sections = headings.map((heading, position) => {
        const following = headings[position + 1];
        const end = following ? following.start : html.length;
        return {
            title: heading.title,
            text: stripHtmlTags(html.slice(heading.start, end)),
        };
    });
    // Anything ahead of the first heading becomes a leading "Preface" section.
    const prefaceEnd = headings[0].start;
    if (prefaceEnd > 0) {
        const prefaceText = stripHtmlTags(html.slice(0, prefaceEnd));
        if (prefaceText.trim()) {
            sections.unshift({ title: 'Preface', text: prefaceText });
        }
    }
    return sections;
}
134
+ //# sourceMappingURL=docx.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docx.js","sourceRoot":"","sources":["../../src/parsers/docx.ts"],"names":[],"mappings":";AAAA;;;;;;;;GAQG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAaH,4CAyCC;AAnDD,4CAAsD;AAEtD;;;;;;;GAOG;AACI,KAAK,UAAU,gBAAgB,CACpC,IAA8B,EAC9B,UAAwB,4BAAmB;IAE3C,8DAA8D;IAC9D,IAAI,OAAY,CAAC;IACjB,IAAI,CAAC;QACH,OAAO,GAAG,yBAAuC,SAAmB,uCAAC,CAAC;IACxE,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,uFAAuF,CACxF,CAAC;IACJ,CAAC;IAED,kCAAkC;IAClC,MAAM,MAAM,GACV,IAAI,YAAY,UAAU;QACxB,CAAC,CAAE,IAAI,CAAC,MAAsB,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC;QACxF,CAAC,CAAC,IAAI,CAAC;IAEX,kDAAkD;IAClD,yGAAyG;IACzG,MAAM,SAAS,GAAG,MAAM,OAAO,CAAC,cAAc,CAAC,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC;IACxE,sEAAsE;IACtE,MAAM,QAAQ,GAAW,SAAS,CAAC,KAAe,CAAC;IAEnD,uCAAuC;IACvC,yGAAyG;IACzG,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,aAAa,CAAC,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC;IACxE,sEAAsE;IACtE,MAAM,IAAI,GAAW,UAAU,CAAC,KAAe,CAAC;IAEhD,mDAAmD;IACnD,MAAM,QAAQ,GAAG,mBAAmB,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IAErD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,2DAA2D;QAC3D,OAAO,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,UAAU,EAAE,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;AAC3E,CAAC;AASD,SAAS,aAAa,CAAC,IAAY;IACjC,OAAO,IAAI;SACR,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC;SAC7B,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC;SACxB,OAAO,CAAC,UAAU,EAAE,IAAI,CAAC;SACzB,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAY,EAAE,aAAqB;IAC9D,2BAA2B;IAC3B,MAAM,YAAY,GAAG,mCAAmC,CAAC;IACzD,MAAM,cAAc,GAA+D,EAAE,CAAC;IACtF,IAAI,CAAyB,CAAC;IAE9B,OAAO,CAAC,CAAC,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC9C,cAAc,CA
AC,IAAI,CAAC;YAClB,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,KAAK,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;YACjC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC;SAChB,CAAC,CAAC;IACL,CAAC;IAED,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE3C,MAAM,QAAQ,GAAkB,EAAE,CAAC;IAEnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/C,MAAM,OAAO,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,SAAS,GACb,CAAC,GAAG,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;QAC5E,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;QACzD,MAAM,IAAI,GAAG,aAAa,CAAC,WAAW,CAAC,CAAC;QACxC,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAChD,CAAC;IAED,sEAAsE;IACtE,MAAM,iBAAiB,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;IAClD,IAAI,iBAAiB,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC;QACrD,MAAM,WAAW,GAAG,aAAa,CAAC,WAAW,CAAC,CAAC;QAC/C,IAAI,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;YACvB,QAAQ,CAAC,OAAO,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;QAC5D,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC"}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * XLSX / XLS / ODS parser — extracts per-sheet PageData.
3
+ *
4
+ * Requires `xlsx` (SheetJS) to be installed:
5
+ * npm install xlsx
6
+ *
7
+ * Each worksheet becomes one or more "pages" (chunked by rowsPerChunk).
8
+ * Cells are rendered as a plain-text table.
9
+ */
10
+ import type { PageData, TokenCounter, XlsxParseOptions } from '../types';
11
+ /**
12
+ * Reads an XLSX/XLS/ODS/CSV spreadsheet and returns `PageData[]` (wrapped in a Promise).
13
+ *
14
+ * @param data Raw spreadsheet bytes (ArrayBuffer or Uint8Array)
15
+ * @param options Parsing options (sheets, rowsPerChunk); optional
16
+ * @param counter Token counter function; optional
17
+ */
18
+ export declare function extractXlsxPages(data: ArrayBuffer | Uint8Array, options?: XlsxParseOptions, counter?: TokenCounter): Promise<PageData[]>;
19
+ //# sourceMappingURL=xlsx.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"xlsx.d.ts","sourceRoot":"","sources":["../../src/parsers/xlsx.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE;;;;;;GAMG;AACH,wBAAsB,gBAAgB,CACpC,IAAI,EAAE,WAAW,GAAG,UAAU,EAC9B,OAAO,GAAE,gBAAqB,EAC9B,OAAO,GAAE,YAAkC,GAC1C,OAAO,CAAC,QAAQ,EAAE,CAAC,CA+DrB"}
@@ -0,0 +1,121 @@
1
+ "use strict";
2
+ /**
3
+ * XLSX / XLS / ODS parser — extracts per-sheet PageData.
4
+ *
5
+ * Requires `xlsx` (SheetJS) to be installed:
6
+ * npm install xlsx
7
+ *
8
+ * Each worksheet becomes one or more "pages" (chunked by rowsPerChunk).
9
+ * Cells are rendered as a plain-text table.
10
+ */
11
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
12
+ if (k2 === undefined) k2 = k;
13
+ var desc = Object.getOwnPropertyDescriptor(m, k);
14
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
15
+ desc = { enumerable: true, get: function() { return m[k]; } };
16
+ }
17
+ Object.defineProperty(o, k2, desc);
18
+ }) : (function(o, m, k, k2) {
19
+ if (k2 === undefined) k2 = k;
20
+ o[k2] = m[k];
21
+ }));
22
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
23
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
24
+ }) : function(o, v) {
25
+ o["default"] = v;
26
+ });
27
+ var __importStar = (this && this.__importStar) || (function () {
28
+ var ownKeys = function(o) {
29
+ ownKeys = Object.getOwnPropertyNames || function (o) {
30
+ var ar = [];
31
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
32
+ return ar;
33
+ };
34
+ return ownKeys(o);
35
+ };
36
+ return function (mod) {
37
+ if (mod && mod.__esModule) return mod;
38
+ var result = {};
39
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
40
+ __setModuleDefault(result, mod);
41
+ return result;
42
+ };
43
+ })();
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.extractXlsxPages = extractXlsxPages;
46
+ const tokens_1 = require("../utils/tokens");
47
/**
 * Reads an XLSX/XLS/ODS/CSV spreadsheet and returns `PageData[]`.
 *
 * Each selected worksheet is converted to an array of rows (blank rows are
 * dropped). The first row is used as the header and the remaining rows are
 * split into chunks of `rowsPerChunk`, each rendered as one page. Empty
 * sheets are skipped; a sheet with only a header row yields one page.
 *
 * @param data    Raw spreadsheet bytes (ArrayBuffer or Uint8Array)
 * @param options Parsing options (sheets, rowsPerChunk). `sheets` restricts
 *                output to the named sheets, kept in workbook order.
 *                `rowsPerChunk` defaults to 200; values that are not a finite
 *                number >= 1 fall back to that default, fractions are floored.
 * @param counter Token counter function, applied to each page's text
 * @returns Array of `{ text, tokenCount }`, one entry per sheet chunk
 * @throws Error when the optional `xlsx` dependency cannot be loaded
 */
async function extractXlsxPages(data, options = {}, counter = tokens_1.defaultTokenCounter) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let XLSX;
    try {
        XLSX = await Promise.resolve(`${'xlsx'}`).then(s => __importStar(require(s)));
    }
    catch {
        throw new Error('[PageIndex] xlsx is not installed. Run: npm install xlsx (or yarn add xlsx)');
    }
    const { sheets: targetSheets, rowsPerChunk: requestedRows = 200 } = options;
    // A chunk size of 0 would make the chunk count Infinity (unbounded loop) and
    // a negative one would silently drop every data row, so sanitise it here.
    const rowsPerChunk = Number.isFinite(requestedRows) && requestedRows >= 1
        ? Math.floor(requestedRows)
        : 200;
    // Normalise to Uint8Array
    const bytes = data instanceof Uint8Array
        ? data
        : new Uint8Array(data);
    // eslint-disable-next-line @typescript-eslint/no-unsafe-call, @typescript-eslint/no-unsafe-member-access
    const workbook = XLSX.read(bytes, { type: 'array' });
    // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
    const sheetNames = workbook.SheetNames;
    const selectedSheets = targetSheets
        ? sheetNames.filter((n) => targetSheets.includes(n))
        : sheetNames;
    const pages = [];
    for (const sheetName of selectedSheets) {
        // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
        const worksheet = workbook.Sheets[sheetName];
        // Convert sheet to array-of-arrays (rows × cols)
        // eslint-disable-next-line @typescript-eslint/no-unsafe-call, @typescript-eslint/no-unsafe-member-access
        const aoa = XLSX.utils.sheet_to_json(worksheet, {
            header: 1,
            defval: '',
            blankrows: false,
        });
        if (aoa.length === 0)
            continue;
        // The first non-blank row is taken as the header row
        const header = aoa[0].map(String);
        const dataRows = aoa.slice(1);
        if (dataRows.length === 0) {
            const t = formatSheetChunk(sheetName, header, [], 1, 1);
            pages.push({ text: t, tokenCount: counter(t) });
            continue;
        }
        const totalChunks = Math.ceil(dataRows.length / rowsPerChunk);
        for (let c = 0; c < totalChunks; c++) {
            const chunk = dataRows.slice(c * rowsPerChunk, (c + 1) * rowsPerChunk);
            const t = formatSheetChunk(sheetName, header, chunk, c + 1, totalChunks);
            pages.push({ text: t, tokenCount: counter(t) });
        }
    }
    return pages;
}
105
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
106
/**
 * Renders one chunk of a worksheet as an aligned plain-text table.
 *
 * Output layout: a `=== Sheet: … (Part x/y) ===` banner, a blank line, the
 * header row, a dashed separator, then one line per data row. Each column is
 * padded to its widest cell; padding is capped at 40 characters, and longer
 * cells are written in full rather than truncated.
 *
 * @param sheetName   Worksheet name shown in the banner
 * @param header      Column titles
 * @param rows        Data rows (arrays of cell values) for this chunk
 * @param chunkNum    1-based chunk number
 * @param totalChunks Total number of chunks for the sheet
 * @returns The chunk text, lines joined with `\n`
 */
function formatSheetChunk(sheetName, header, rows, chunkNum, totalChunks) {
    const lines = [];
    lines.push(`=== Sheet: ${sheetName} (Part ${chunkNum}/${totalChunks}) ===`);
    lines.push('');
    // Use the widest row, not just the header, so cells that sit beyond the
    // last header column are still rendered (under an empty title).
    let columnCount = header.length;
    for (const row of rows) {
        if (row.length > columnCount)
            columnCount = row.length;
    }
    // Column widths for alignment. Plain loops avoid spreading one argument
    // per row into Math.max, which can overflow the call stack on huge chunks.
    const colWidths = [];
    for (let ci = 0; ci < columnCount; ci++) {
        let widest = (header[ci] ?? '').length;
        for (const row of rows) {
            const cellLength = String(row[ci] ?? '').length;
            if (cellLength > widest)
                widest = cellLength;
        }
        colWidths.push(Math.min(40, widest));
    }
    // Header row
    lines.push(colWidths.map((w, i) => (header[i] ?? '').padEnd(w)).join(' | '));
    lines.push(colWidths.map((w) => '-'.repeat(w)).join('-|-'));
    // Data rows
    for (const row of rows) {
        lines.push(colWidths.map((w, i) => String(row[i] ?? '').padEnd(w)).join(' | '));
    }
    return lines.join('\n');
}
121
+ //# sourceMappingURL=xlsx.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"xlsx.js","sourceRoot":"","sources":["../../src/parsers/xlsx.ts"],"names":[],"mappings":";AAAA;;;;;;;;GAQG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAYH,4CAmEC;AA5ED,4CAAsD;AAEtD;;;;;;GAMG;AACI,KAAK,UAAU,gBAAgB,CACpC,IAA8B,EAC9B,UAA4B,EAAE,EAC9B,UAAwB,4BAAmB;IAE3C,8DAA8D;IAC9D,IAAI,IAAS,CAAC;IACd,IAAI,CAAC;QACH,IAAI,GAAG,yBAAuC,MAAgB,uCAAC,CAAC;IAClE,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,8EAA8E,CAC/E,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,YAAY,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC;IAE7D,0BAA0B;IAC1B,MAAM,KAAK,GACT,IAAI,YAAY,UAAU;QACxB,CAAC,CAAC,IAAI;QACN,CAAC,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,CAAC;IAE3B,yGAAyG;IACzG,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;IACrD,sEAAsE;IACtE,MAAM,UAAU,GAAa,QAAQ,CAAC,UAAsB,CAAC;IAE7D,MAAM,cAAc,GAAG,YAAY;QACjC,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;QACpD,CAAC,CAAC,UAAU,CAAC;IAEf,MAAM,KAAK,GAAe,EAAE,CAAC;IAE7B,KAAK,MAAM,SAAS,IAAI,cAAc,EAAE,CAAC;QACvC,sEAAsE;QACtE,MAAM,SAAS,GAAG,QAAQ,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7C,iDAAiD;QACjD,yGAAyG;QACzG,MAAM,GAAG,GAAgB,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,SAAS,EAAE;YAC3D,MAAM,EAAE,CAAC;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,KAAK;SACjB,CAAgB,CAAC;QAElB,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAE/B,uBAAuB;QACvB,MAAM,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAClC,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAE9B,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,CAAC,GAAG,gBAAgB,CAAC,SAAS,EAAE,MAAM,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;YACxD,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAChD,SAAS;QACX,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,YAAY,CAAC,CAAC;QAC9D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,GAAG,YAAY,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC;YACvE,MAAM,CAAC,GAAG,gBAAgB,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,GAAG
,CAAC,EAAE,WAAW,CAAC,CAAC;YACzE,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,iFAAiF;AAEjF,SAAS,gBAAgB,CACvB,SAAiB,EACjB,MAAgB,EAChB,IAAiB,EACjB,QAAgB,EAChB,WAAmB;IAEnB,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,CAAC,IAAI,CAAC,cAAc,SAAS,UAAU,QAAQ,IAAI,WAAW,OAAO,CAAC,CAAC;IAC5E,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,8BAA8B;IAC9B,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CACrC,IAAI,CAAC,GAAG,CACN,EAAE,EACF,IAAI,CAAC,GAAG,CACN,CAAC,CAAC,MAAM,EACR,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAC/C,CACF,CACF,CAAC;IAEF,aAAa;IACb,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IAE5D,YAAY;IACZ,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,KAAK,CAAC,IAAI,CACR,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAC5E,CAAC;IACJ,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Reverse (inverted) index — maps terms → tree nodes that contain them.
3
+ *
4
+ * Two modes:
5
+ * 'keyword' — fast, no LLM. Extracts stopword-filtered terms with TF scoring.
6
+ * 'llm' — slower, semantic. Uses LLM to extract concept terms per node.
7
+ */
8
+ import type { PageIndexResult, PageData, LLMProvider, ReverseIndex, ReverseIndexOptions, SearchResult } from './types';
9
+ /**
10
+ * Builds an inverted index from a `PageIndexResult`.
11
+ *
12
+ * In **keyword** mode (default), terms are extracted via stopword-filtered TF
13
+ * scoring — fast, no LLM calls needed.
14
+ *
15
+ * In **llm** mode, the LLM extracts semantic concept terms from each node's
16
+ * title + summary — slower but catches synonyms/concepts.
17
+ *
+ * All arguments are passed as properties of a single `input` object.
+ *
18
+ * @param input.result The forward-index output from `pageIndex()` / `pageIndexMd()`
19
+ * @param input.pages Original page data (optional; used for extra keyword signal)
20
+ * @param input.llm LLM provider (required for mode 'llm')
21
+ * @param input.options Index options (optional)
+ * @returns Promise resolving to the built `ReverseIndex`
22
+ */
23
+ export declare function buildReverseIndex(input: {
24
+ result: PageIndexResult;
25
+ pages?: PageData[];
26
+ llm?: LLMProvider;
27
+ options?: ReverseIndexOptions;
28
+ }): Promise<ReverseIndex>;
29
+ /**
30
+ * Queries the reverse index for one or more terms.
31
+ * Multi-word queries are split and each term is looked up separately;
32
+ * nodes matching multiple terms get a combined score boost.
33
+ *
34
+ * @param index The reverse index (from `buildReverseIndex`)
35
+ * @param query Free-text query string
36
+ * @param topK Max results to return (optional; default: 10)
+ * @returns Array of `SearchResult` entries for the matching nodes
37
+ */
38
+ export declare function searchReverseIndex(index: ReverseIndex, query: string, topK?: number): SearchResult[];
39
+ //# sourceMappingURL=reverseIndex.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reverseIndex.d.ts","sourceRoot":"","sources":["../src/reverseIndex.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EACV,eAAe,EACf,QAAQ,EACR,WAAW,EACX,YAAY,EAEZ,mBAAmB,EACnB,YAAY,EAEb,MAAM,SAAS,CAAC;AA2GjB;;;;;;;;;;;;;GAaG;AACH,wBAAsB,iBAAiB,CAAC,KAAK,EAAE;IAC7C,MAAM,EAAE,eAAe,CAAC;IACxB,KAAK,CAAC,EAAE,QAAQ,EAAE,CAAC;IACnB,GAAG,CAAC,EAAE,WAAW,CAAC;IAClB,OAAO,CAAC,EAAE,mBAAmB,CAAC;CAC/B,GAAG,OAAO,CAAC,YAAY,CAAC,CA4FxB;AAID;;;;;;;;GAQG;AACH,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,YAAY,EACnB,KAAK,EAAE,MAAM,EACb,IAAI,SAAK,GACR,YAAY,EAAE,CAgDhB"}