react-native-pageindex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/LICENSE +21 -0
- package/README.md +405 -0
- package/dist/config.d.ts +4 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +22 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +49 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +75 -0
- package/dist/index.js.map +1 -0
- package/dist/pageIndex.d.ts +48 -0
- package/dist/pageIndex.d.ts.map +1 -0
- package/dist/pageIndex.js +962 -0
- package/dist/pageIndex.js.map +1 -0
- package/dist/pageIndexDocument.d.ts +85 -0
- package/dist/pageIndexDocument.d.ts.map +1 -0
- package/dist/pageIndexDocument.js +145 -0
- package/dist/pageIndexDocument.js.map +1 -0
- package/dist/pageIndexMd.d.ts +31 -0
- package/dist/pageIndexMd.d.ts.map +1 -0
- package/dist/pageIndexMd.js +260 -0
- package/dist/pageIndexMd.js.map +1 -0
- package/dist/parsers/csv.d.ts +17 -0
- package/dist/parsers/csv.d.ts.map +1 -0
- package/dist/parsers/csv.js +147 -0
- package/dist/parsers/csv.js.map +1 -0
- package/dist/parsers/docx.d.ts +20 -0
- package/dist/parsers/docx.d.ts.map +1 -0
- package/dist/parsers/docx.js +134 -0
- package/dist/parsers/docx.js.map +1 -0
- package/dist/parsers/xlsx.d.ts +19 -0
- package/dist/parsers/xlsx.d.ts.map +1 -0
- package/dist/parsers/xlsx.js +121 -0
- package/dist/parsers/xlsx.js.map +1 -0
- package/dist/reverseIndex.d.ts +39 -0
- package/dist/reverseIndex.d.ts.map +1 -0
- package/dist/reverseIndex.js +248 -0
- package/dist/reverseIndex.js.map +1 -0
- package/dist/types.d.ts +190 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/json.d.ts +13 -0
- package/dist/utils/json.d.ts.map +1 -0
- package/dist/utils/json.js +69 -0
- package/dist/utils/json.js.map +1 -0
- package/dist/utils/pdf.d.ts +20 -0
- package/dist/utils/pdf.d.ts.map +1 -0
- package/dist/utils/pdf.js +96 -0
- package/dist/utils/pdf.js.map +1 -0
- package/dist/utils/progress.d.ts +29 -0
- package/dist/utils/progress.d.ts.map +1 -0
- package/dist/utils/progress.js +59 -0
- package/dist/utils/progress.js.map +1 -0
- package/dist/utils/tokens.d.ts +7 -0
- package/dist/utils/tokens.d.ts.map +1 -0
- package/dist/utils/tokens.js +12 -0
- package/dist/utils/tokens.js.map +1 -0
- package/dist/utils/tree.d.ts +88 -0
- package/dist/utils/tree.d.ts.map +1 -0
- package/dist/utils/tree.js +365 -0
- package/dist/utils/tree.js.map +1 -0
- package/package.json +76 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* CSV parser — no external dependencies.
|
|
4
|
+
*
|
|
5
|
+
* Converts a CSV string or buffer into page-like chunks so it can be fed
|
|
6
|
+
* into pageIndex(). Each "page" is a fixed number of rows (rowsPerPage)
|
|
7
|
+
* formatted as a plain-text table.
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.extractCsvPages = extractCsvPages;
|
|
11
|
+
const tokens_1 = require("../utils/tokens");
|
|
12
|
+
// ─── Public API ───────────────────────────────────────────────────────────────
|
|
13
|
+
/**
 * Parses CSV content into page-sized `PageData` chunks.
 *
 * When `hasHeader` is true the first parsed row is used as the column header
 * and is repeated at the top of every page.
 *
 * @param data    CSV as a UTF-8 string or raw bytes (ArrayBuffer / Uint8Array)
 * @param options Parsing options:
 *                - `delimiter`   field separator; auto-detected when omitted
 *                - `rowsPerPage` data rows per page (default 100; fractional
 *                                values are rounded down, must end up >= 1)
 *                - `hasHeader`   whether the first row holds column names (default true)
 * @param counter Token counter applied to each page's text
 * @returns One `{ text, tokenCount }` entry per page; `[]` when no rows were parsed
 * @throws {RangeError} If there are data rows to page and `rowsPerPage` is not a number >= 1
 */
async function extractCsvPages(data, options = {}, counter = tokens_1.defaultTokenCounter) {
    const text = dataToString(data);
    const { rowsPerPage = 100, hasHeader = true } = options;
    const delimiter = options.delimiter ?? detectDelimiter(text);
    const rows = parseCsv(text, delimiter);
    if (rows.length === 0)
        return [];
    const header = hasHeader ? rows[0] : null;
    const dataRows = hasHeader ? rows.slice(1) : rows;
    if (dataRows.length === 0) {
        // Header-only input: emit a single page without the paging banner.
        const page = formatChunk(header, [], header);
        return [{ text: page, tokenCount: counter(page) }];
    }
    // A zero, negative or NaN page size would keep the loop below from ever
    // advancing, so reject it instead of looping forever.
    const pageSize = Math.floor(rowsPerPage);
    if (Number.isNaN(pageSize) || pageSize < 1) {
        throw new RangeError(`[PageIndex] rowsPerPage must be a number >= 1 (received ${String(rowsPerPage)})`);
    }
    // Loop-invariant; the lower bound of 1 keeps the banner sane when pageSize is Infinity.
    const totalPages = Math.max(1, Math.ceil(dataRows.length / pageSize));
    // Split dataRows into page-sized chunks
    const pages = [];
    for (let start = 0, pageNum = 1; start < dataRows.length; start += pageSize, pageNum++) {
        const chunk = dataRows.slice(start, start + pageSize);
        // The trailing argument is the 1-based number of the chunk's first data
        // row. A formatChunk that declares only five parameters ignores it.
        const pageText = formatChunk(header, chunk, header, pageNum, totalPages, start + 1);
        pages.push({ text: pageText, tokenCount: counter(pageText) });
    }
    return pages;
}
|
|
44
|
+
// ─── Internal helpers ─────────────────────────────────────────────────────────
|
|
45
|
+
/**
 * Normalises CSV input to a string.
 *
 * Strings are returned untouched. Anything else is viewed as bytes (a
 * Uint8Array is used as-is, other input is wrapped in a new Uint8Array)
 * and decoded as UTF-8.
 */
function dataToString(data) {
    if (typeof data !== 'string') {
        const view = data instanceof Uint8Array ? data : new Uint8Array(data);
        const decoder = new TextDecoder('utf-8');
        return decoder.decode(view);
    }
    return data;
}
|
|
51
|
+
/**
 * Guesses the field delimiter by counting each candidate (`,` `;` tab `|`)
 * in the first line of the text and returning the most frequent one.
 *
 * Only characters outside double-quoted fields are counted, so a quoted
 * header such as `"Smith, John";age` is not mistaken for comma-separated
 * data. The first line ends at the first `\n`, `\r\n` or bare `\r`.
 *
 * Ties keep the earlier candidate; when no candidate occurs at all the
 * result is `,`.
 */
function detectDelimiter(text) {
    const candidates = [',', ';', '\t', '|'];
    const firstLine = text.split(/\r\n|\r|\n/, 1)[0] ?? '';
    // One counter per candidate, in the same order as `candidates`.
    const counts = candidates.map(() => 0);
    let inQuotes = false;
    for (const ch of firstLine) {
        if (ch === '"') {
            // An escaped quote ("") toggles twice, leaving the state unchanged.
            inQuotes = !inQuotes;
            continue;
        }
        if (inQuotes)
            continue;
        const idx = candidates.indexOf(ch);
        if (idx !== -1)
            counts[idx] += 1;
    }
    let best = ',';
    let bestCount = 0;
    candidates.forEach((d, idx) => {
        // Strictly greater: earlier candidates win ties.
        if (counts[idx] > bestCount) {
            bestCount = counts[idx];
            best = d;
        }
    });
    return best;
}
|
|
66
|
+
/**
 * RFC 4180-style CSV parser.
 *
 * Works line by line after normalising `\r\n` and `\r` to `\n`:
 * - double quotes open and close a quoted section; `""` inside one is a literal quote
 * - a line break inside a quoted section is kept as `\n` in the field
 * - blank (whitespace-only) lines outside quotes are skipped
 * - every field is trimmed before it is stored
 *
 * @param text      Full CSV text
 * @param delimiter Field separator (compared one character at a time)
 * @returns Array of rows, each an array of field strings
 */
function parseCsv(text, delimiter) {
    const QUOTE = '"';
    const records = [];
    let fields = [];
    let buffer = '';
    let quoted = false;
    // Stores the trimmed buffer as the next field and resets the buffer.
    const closeField = () => {
        fields.push(buffer.trim());
        buffer = '';
    };
    const physicalLines = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n').split('\n');
    for (const line of physicalLines) {
        if (!quoted && line.trim() === '' && fields.length === 0) {
            continue;
        }
        let pos = 0;
        while (pos < line.length) {
            const ch = line[pos];
            if (quoted) {
                if (ch !== QUOTE) {
                    buffer += ch;
                }
                else if (line[pos + 1] === QUOTE) {
                    // Escaped quote: keep one and skip its partner.
                    buffer += QUOTE;
                    pos += 1;
                }
                else {
                    quoted = false;
                }
            }
            else if (ch === QUOTE) {
                quoted = true;
            }
            else if (ch === delimiter) {
                closeField();
            }
            else {
                buffer += ch;
            }
            pos += 1;
        }
        // End of the physical line.
        if (quoted) {
            // The record continues on the next line; keep the line break.
            buffer += '\n';
            continue;
        }
        closeField();
        if (fields.length > 0) {
            records.push(fields);
            fields = [];
        }
    }
    // Flush whatever is left (only possible after an unterminated quote).
    if (fields.length > 0 || buffer.trim()) {
        fields.push(buffer.trim());
        records.push(fields);
    }
    return records;
}
|
|
129
|
+
/**
 * Formats a chunk of rows as a plain-text table.
 *
 * Output, joined with `\n`:
 *   1. a `[CSV Data — Rows a–b of total, Page p/t]` banner plus a blank line
 *      (only when both `pageNum` and `totalPages` are given)
 *   2. the header row and a dashed separator (only when a header is available)
 *   3. one line per row, cells joined with ` | `
 *
 * @param header     Column names, or null/undefined when there is no header
 * @param rows       Rows to render; each row is an array of cell strings
 * @param allHeaders Fallback column names used when `header` is null/undefined
 * @param pageNum    1-based page number (optional)
 * @param totalPages Total number of pages (optional)
 * @param startRow   1-based number of the first row in `rows` within the whole
 *                   data set (optional). When omitted, the start is inferred
 *                   from `pageNum` and the chunk's own length, which is only
 *                   correct if this chunk is as long as every page before it.
 *                   A short final page is mislabelled unless this is passed.
 * @returns The formatted page text
 */
function formatChunk(header, rows, allHeaders, pageNum, totalPages, startRow) {
    const lines = [];
    if (pageNum !== undefined && totalPages !== undefined) {
        const firstRow = startRow ?? ((pageNum - 1) * (rows.length || 1)) + 1;
        const lastRow = firstRow - 1 + rows.length;
        lines.push(`[CSV Data — Rows ${firstRow}–${lastRow} of total, Page ${pageNum}/${totalPages}]`);
        lines.push('');
    }
    const effectiveHeader = header ?? allHeaders;
    if (effectiveHeader) {
        // Column header row
        lines.push(effectiveHeader.join(' | '));
        // Separator: at least three dashes per column.
        lines.push(effectiveHeader.map((h) => '-'.repeat(Math.max(h.length, 3))).join('-|-'));
    }
    for (const row of rows) {
        lines.push(row.join(' | '));
    }
    return lines.join('\n');
}
|
|
147
|
+
//# sourceMappingURL=csv.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"csv.js","sourceRoot":"","sources":["../../src/parsers/csv.ts"],"names":[],"mappings":";AAAA;;;;;;GAMG;;AAcH,0CAgCC;AA3CD,4CAAsD;AAEtD,iFAAiF;AAEjF;;;;;;GAMG;AACI,KAAK,UAAU,eAAe,CACnC,IAAuC,EACvC,UAA2B,EAAE,EAC7B,UAAwB,4BAAmB;IAE3C,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IAChC,MAAM,EAAE,WAAW,GAAG,GAAG,EAAE,SAAS,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC;IAExD,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,eAAe,CAAC,IAAI,CAAC,CAAC;IAC7D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;IAEvC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAElD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;QAC7C,OAAO,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACrD,CAAC;IAED,wCAAwC;IACxC,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,IAAI,WAAW,EAAE,CAAC;QACtD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC;QACjD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;QAChD,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,WAAW,CAAC,CAAC;QAC5D,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;QACrE,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAClD,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,iFAAiF;AAEjF,SAAS,YAAY,CAAC,IAAuC;IAC3D,IAAI,OAAO,IAAI,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAC;IAC1C,MAAM,KAAK,GAAG,IAAI,YAAY,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,CAAC;IACvE,OAAO,IAAI,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAChD,CAAC;AAED,6EAA6E;AAC7E,SAAS,eAAe,CAAC,IAAY;IACnC,MAAM,UAAU,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;IACzC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC5C,IAAI,IAAI,GAAG,GAAG,CAAC;IACf
,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAC5C,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;YAAC,SAAS,GAAG,KAAK,CAAC;YAAC,IAAI,GAAG,CAAC,CAAC;QAAC,CAAC;IACzD,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,qCAAqC;AACrC,SAAS,QAAQ,CAAC,IAAY,EAAE,SAAiB;IAC/C,MAAM,IAAI,GAAe,EAAE,CAAC;IAC5B,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC3E,IAAI,UAAU,GAAa,EAAE,CAAC;IAC9B,IAAI,YAAY,GAAG,EAAE,CAAC;IACtB,IAAI,QAAQ,GAAG,KAAK,CAAC;IAErB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAEzE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YAEnB,IAAI,CAAC,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;gBACtB,cAAc;gBACd,IAAI,QAAQ,EAAE,CAAC;oBACb,YAAY,IAAI,IAAI,CAAC;gBACvB,CAAC;qBAAM,CAAC;oBACN,UAAU,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;oBACrC,YAAY,GAAG,EAAE,CAAC;gBACpB,CAAC;gBACD,MAAM;YACR,CAAC;YAED,IAAI,QAAQ,EAAE,CAAC;gBACb,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;oBACf,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC;wBAAC,YAAY,IAAI,GAAG,CAAC;wBAAC,CAAC,EAAE,CAAC;oBAAC,CAAC;;wBACjD,QAAQ,GAAG,KAAK,CAAC;gBACxB,CAAC;qBAAM,CAAC;oBACN,YAAY,IAAI,EAAE,CAAC;gBACrB,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;oBACf,QAAQ,GAAG,IAAI,CAAC;gBAClB,CAAC;qBAAM,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;oBAC5B,UAAU,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;oBACrC,YAAY,GAAG,EAAE,CAAC;gBACpB,CAAC;qBAAM,CAAC;oBACN,YAAY,IAAI,EAAE,CAAC;gBACrB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBACtB,UAAU,GAAG,EAAE,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAED,iBAAiB;IACjB,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,CAAC,IAAI,EAAE,EAAE,CAAC;QACjD,UAAU,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;QACrC,IAAI,CAAC,IAAI,CAA
C,UAAU,CAAC,CAAC;IACxB,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,oDAAoD;AACpD,SAAS,WAAW,CAClB,MAAuB,EACvB,IAAgB,EAChB,UAA2B,EAC3B,OAAgB,EAChB,UAAmB;IAEnB,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,OAAO,KAAK,SAAS,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;QACtD,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,mBAAmB,OAAO,IAAI,UAAU,GAAG,CAAC,CAAC;QACxK,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,MAAM,eAAe,GAAG,MAAM,IAAI,UAAU,CAAC;IAC7C,IAAI,eAAe,EAAE,CAAC;QACpB,oBAAoB;QACpB,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;QACxC,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IACxF,CAAC;IAED,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IAC9B,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOCX parser — extracts per-section PageData from a Word document.
|
|
3
|
+
*
|
|
4
|
+
* Requires `mammoth` to be installed:
|
|
5
|
+
* npm install mammoth
|
|
6
|
+
*
|
|
7
|
+
* Sections are determined by heading styles (Heading 1/2/3…).
|
|
8
|
+
* If no headings are found, the document is returned as a single page.
|
|
9
|
+
*/
|
|
10
|
+
import type { PageData, TokenCounter } from '../types';
|
|
11
|
+
/**
|
|
12
|
+
* Extracts text from a DOCX file and segments it into page-like chunks
|
|
13
|
+
* using heading boundaries.
|
|
14
|
+
*
|
|
15
|
+
* @param data Raw DOCX bytes (ArrayBuffer or Uint8Array)
|
|
16
|
+
* @param counter Token counter function
|
|
17
|
+
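* @returns Array of `{ text, tokenCount }` — one per heading section (content before the first heading becomes its own leading entry), or a single entry holding the whole document when no headings are found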
|
|
18
|
+
*/
|
|
19
|
+
export declare function extractDocxPages(data: ArrayBuffer | Uint8Array, counter?: TokenCounter): Promise<PageData[]>;
|
|
20
|
+
//# sourceMappingURL=docx.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docx.d.ts","sourceRoot":"","sources":["../../src/parsers/docx.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAGvD;;;;;;;GAOG;AACH,wBAAsB,gBAAgB,CACpC,IAAI,EAAE,WAAW,GAAG,UAAU,EAC9B,OAAO,GAAE,YAAkC,GAC1C,OAAO,CAAC,QAAQ,EAAE,CAAC,CAsCrB"}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* DOCX parser — extracts per-section PageData from a Word document.
|
|
4
|
+
*
|
|
5
|
+
* Requires `mammoth` to be installed:
|
|
6
|
+
* npm install mammoth
|
|
7
|
+
*
|
|
8
|
+
* Sections are determined by heading styles (Heading 1/2/3…).
|
|
9
|
+
* If no headings are found, the document is returned as a single page.
|
|
10
|
+
*/
|
|
11
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
12
|
+
if (k2 === undefined) k2 = k;
|
|
13
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
14
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
15
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
16
|
+
}
|
|
17
|
+
Object.defineProperty(o, k2, desc);
|
|
18
|
+
}) : (function(o, m, k, k2) {
|
|
19
|
+
if (k2 === undefined) k2 = k;
|
|
20
|
+
o[k2] = m[k];
|
|
21
|
+
}));
|
|
22
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
23
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
24
|
+
}) : function(o, v) {
|
|
25
|
+
o["default"] = v;
|
|
26
|
+
});
|
|
27
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
28
|
+
var ownKeys = function(o) {
|
|
29
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
30
|
+
var ar = [];
|
|
31
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
32
|
+
return ar;
|
|
33
|
+
};
|
|
34
|
+
return ownKeys(o);
|
|
35
|
+
};
|
|
36
|
+
return function (mod) {
|
|
37
|
+
if (mod && mod.__esModule) return mod;
|
|
38
|
+
var result = {};
|
|
39
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
40
|
+
__setModuleDefault(result, mod);
|
|
41
|
+
return result;
|
|
42
|
+
};
|
|
43
|
+
})();
|
|
44
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
|
+
exports.extractDocxPages = extractDocxPages;
|
|
46
|
+
const tokens_1 = require("../utils/tokens");
|
|
47
|
+
/**
 * Extracts text from a DOCX file and segments it into page-like chunks
 * using heading boundaries.
 *
 * The document is converted to HTML with mammoth and split at h1–h6 tags.
 * Raw text is extracted only when no heading is found, in which case the
 * whole document is returned as a single page.
 *
 * @param data    Raw DOCX bytes (ArrayBuffer or Uint8Array)
 * @param counter Token counter function
 * @returns Array of `{ text, tokenCount }` — one per section, or one entry
 *          for the whole document when it has no headings
 * @throws {Error} If loading `mammoth` fails (reported as "not installed")
 */
async function extractDocxPages(data, counter = tokens_1.defaultTokenCounter) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let mammoth;
    try {
        // Compiled form of a dynamic import with a computed specifier.
        // NOTE(review): presumably written this way so bundlers do not resolve
        // the optional dependency statically — confirm before simplifying.
        mammoth = await Promise.resolve(`${'mammoth'}`).then(s => __importStar(require(s)));
    }
    catch {
        throw new Error('[PageIndex] mammoth is not installed. Run: npm install mammoth (or yarn add mammoth)');
    }
    // Normalise to an ArrayBuffer covering exactly the bytes of the view
    // (a Uint8Array may be a window into a larger buffer).
    const buffer = data instanceof Uint8Array
        ? data.buffer.slice(data.byteOffset, data.byteOffset + data.byteLength)
        : data;
    // Convert to HTML to detect headings
    const htmlResult = await mammoth.convertToHtml({ arrayBuffer: buffer });
    const html = htmlResult.value;
    // Split HTML into sections by heading tags (h1–h6). The splitter's second
    // (fallback text) parameter is unused, so nothing is passed for it.
    const sections = splitHtmlBySections(html);
    if (sections.length === 0) {
        // No headings found — return the full document as one page. Raw text is
        // extracted only here, so documents with headings are converted once.
        const rawResult = await mammoth.extractRawText({ arrayBuffer: buffer });
        const fullText = rawResult.value;
        return [{ text: fullText, tokenCount: counter(fullText) }];
    }
    return sections.map(({ text }) => ({ text, tokenCount: counter(text) }));
}
|
|
86
|
+
/**
 * Converts an HTML fragment to plain text.
 *
 * Steps, in order:
 *   1. `<br>`, `</p>` and `</li>` become line breaks
 *   2. every remaining tag is removed
 *   3. the entities `&amp;` `&lt;` `&gt;` `&nbsp;` `&quot;` `&#39;` are decoded
 *   4. runs of three or more line breaks collapse to two, and the result is trimmed
 *
 * Entities are decoded in a single pass so that already-escaped text such as
 * `&amp;lt;` yields the literal `&lt;` rather than being decoded twice to `<`.
 * Decoding happens after tag removal, so a decoded `<` is never taken for a tag.
 *
 * @param html HTML fragment
 * @returns Plain text
 */
function stripHtmlTags(html) {
    const entityText = { amp: '&', lt: '<', gt: '>', nbsp: ' ', quot: '"', '#39': "'" };
    return html
        .replace(/<br\s*\/?>/gi, '\n')
        .replace(/<\/p>/gi, '\n')
        .replace(/<\/li>/gi, '\n')
        .replace(/<[^>]+>/g, '')
        .replace(/&(amp|lt|gt|nbsp|quot|#39);/g, (_, name) => entityText[name])
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
|
|
101
|
+
/**
 * Cuts an HTML string into one section per h1–h6 heading.
 *
 * Each section runs from the start of its heading tag to the start of the
 * next heading (or the end of the string) and is reduced to plain text with
 * `stripHtmlTags`. Non-blank content before the first heading is prepended
 * as a section titled "Preface".
 *
 * @param html          HTML to split
 * @param _fallbackText Unused
 * @returns Array of `{ title, text }`; empty when the HTML has no headings
 */
function splitHtmlBySections(html, _fallbackText) {
    const headingPattern = /<(h[1-6])[^>]*>([\s\S]*?)<\/\1>/gi;
    // Collect every heading together with its offset in the HTML.
    const headings = [];
    for (let hit = headingPattern.exec(html); hit !== null; hit = headingPattern.exec(html)) {
        headings.push({
            index: hit.index,
            title: stripHtmlTags(hit[2]).trim(),
            fullMatch: hit[0],
        });
    }
    if (headings.length === 0) {
        return [];
    }
    const sections = headings.map((heading, position) => {
        const following = headings[position + 1];
        const end = following !== undefined ? following.index : html.length;
        return { title: heading.title, text: stripHtmlTags(html.slice(heading.index, end)) };
    });
    // Anything ahead of the first heading becomes a leading "Preface" section.
    const prefaceEnd = headings[0].index;
    if (prefaceEnd > 0) {
        const prefaceText = stripHtmlTags(html.slice(0, prefaceEnd));
        if (prefaceText.trim()) {
            sections.unshift({ title: 'Preface', text: prefaceText });
        }
    }
    return sections;
}
|
|
134
|
+
//# sourceMappingURL=docx.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docx.js","sourceRoot":"","sources":["../../src/parsers/docx.ts"],"names":[],"mappings":";AAAA;;;;;;;;GAQG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAaH,4CAyCC;AAnDD,4CAAsD;AAEtD;;;;;;;GAOG;AACI,KAAK,UAAU,gBAAgB,CACpC,IAA8B,EAC9B,UAAwB,4BAAmB;IAE3C,8DAA8D;IAC9D,IAAI,OAAY,CAAC;IACjB,IAAI,CAAC;QACH,OAAO,GAAG,yBAAuC,SAAmB,uCAAC,CAAC;IACxE,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,uFAAuF,CACxF,CAAC;IACJ,CAAC;IAED,kCAAkC;IAClC,MAAM,MAAM,GACV,IAAI,YAAY,UAAU;QACxB,CAAC,CAAE,IAAI,CAAC,MAAsB,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC;QACxF,CAAC,CAAC,IAAI,CAAC;IAEX,kDAAkD;IAClD,yGAAyG;IACzG,MAAM,SAAS,GAAG,MAAM,OAAO,CAAC,cAAc,CAAC,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC;IACxE,sEAAsE;IACtE,MAAM,QAAQ,GAAW,SAAS,CAAC,KAAe,CAAC;IAEnD,uCAAuC;IACvC,yGAAyG;IACzG,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,aAAa,CAAC,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC;IACxE,sEAAsE;IACtE,MAAM,IAAI,GAAW,UAAU,CAAC,KAAe,CAAC;IAEhD,mDAAmD;IACnD,MAAM,QAAQ,GAAG,mBAAmB,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IAErD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,2DAA2D;QAC3D,OAAO,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,UAAU,EAAE,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;AAC3E,CAAC;AASD,SAAS,aAAa,CAAC,IAAY;IACjC,OAAO,IAAI;SACR,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC;SAC7B,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC;SACxB,OAAO,CAAC,UAAU,EAAE,IAAI,CAAC;SACzB,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAY,EAAE,aAAqB;IAC9D,2BAA2B;IAC3B,MAAM,YAAY,GAAG,mCAAmC,CAAC;IACzD,MAAM,cAAc,GAA+D,EAAE,CAAC;IACtF,IAAI,CAAyB,CAAC;IAE9B,OAAO,CAAC,CAAC,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC9C,cAAc,CAAC
,IAAI,CAAC;YAClB,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,KAAK,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;YACjC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC;SAChB,CAAC,CAAC;IACL,CAAC;IAED,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE3C,MAAM,QAAQ,GAAkB,EAAE,CAAC;IAEnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/C,MAAM,OAAO,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,SAAS,GACb,CAAC,GAAG,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;QAC5E,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;QACzD,MAAM,IAAI,GAAG,aAAa,CAAC,WAAW,CAAC,CAAC;QACxC,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAChD,CAAC;IAED,sEAAsE;IACtE,MAAM,iBAAiB,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;IAClD,IAAI,iBAAiB,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC;QACrD,MAAM,WAAW,GAAG,aAAa,CAAC,WAAW,CAAC,CAAC;QAC/C,IAAI,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;YACvB,QAAQ,CAAC,OAAO,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;QAC5D,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* XLSX / XLS / ODS parser — extracts per-sheet PageData.
|
|
3
|
+
*
|
|
4
|
+
* Requires `xlsx` (SheetJS) to be installed:
|
|
5
|
+
* npm install xlsx
|
|
6
|
+
*
|
|
7
|
+
* Each worksheet becomes one or more "pages" (chunked by rowsPerChunk).
|
|
8
|
+
* Cells are rendered as a plain-text table.
|
|
9
|
+
*/
|
|
10
|
+
import type { PageData, TokenCounter, XlsxParseOptions } from '../types';
|
|
11
|
+
/**
|
|
12
|
+
* Reads an XLSX/XLS/ODS/CSV spreadsheet and returns `PageData[]`.
|
|
13
|
+
*
|
|
14
|
+
* @param data Raw spreadsheet bytes (ArrayBuffer or Uint8Array)
|
|
15
|
+
* @param options Parsing options (sheets, rowsPerChunk)
|
|
16
|
+
* @param counter Token counter function
|
|
17
|
+
*/
|
|
18
|
+
export declare function extractXlsxPages(data: ArrayBuffer | Uint8Array, options?: XlsxParseOptions, counter?: TokenCounter): Promise<PageData[]>;
|
|
19
|
+
//# sourceMappingURL=xlsx.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"xlsx.d.ts","sourceRoot":"","sources":["../../src/parsers/xlsx.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE;;;;;;GAMG;AACH,wBAAsB,gBAAgB,CACpC,IAAI,EAAE,WAAW,GAAG,UAAU,EAC9B,OAAO,GAAE,gBAAqB,EAC9B,OAAO,GAAE,YAAkC,GAC1C,OAAO,CAAC,QAAQ,EAAE,CAAC,CA+DrB"}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* XLSX / XLS / ODS parser — extracts per-sheet PageData.
|
|
4
|
+
*
|
|
5
|
+
* Requires `xlsx` (SheetJS) to be installed:
|
|
6
|
+
* npm install xlsx
|
|
7
|
+
*
|
|
8
|
+
* Each worksheet becomes one or more "pages" (chunked by rowsPerChunk).
|
|
9
|
+
* Cells are rendered as a plain-text table.
|
|
10
|
+
*/
|
|
11
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
12
|
+
if (k2 === undefined) k2 = k;
|
|
13
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
14
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
15
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
16
|
+
}
|
|
17
|
+
Object.defineProperty(o, k2, desc);
|
|
18
|
+
}) : (function(o, m, k, k2) {
|
|
19
|
+
if (k2 === undefined) k2 = k;
|
|
20
|
+
o[k2] = m[k];
|
|
21
|
+
}));
|
|
22
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
23
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
24
|
+
}) : function(o, v) {
|
|
25
|
+
o["default"] = v;
|
|
26
|
+
});
|
|
27
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
28
|
+
var ownKeys = function(o) {
|
|
29
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
30
|
+
var ar = [];
|
|
31
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
32
|
+
return ar;
|
|
33
|
+
};
|
|
34
|
+
return ownKeys(o);
|
|
35
|
+
};
|
|
36
|
+
return function (mod) {
|
|
37
|
+
if (mod && mod.__esModule) return mod;
|
|
38
|
+
var result = {};
|
|
39
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
40
|
+
__setModuleDefault(result, mod);
|
|
41
|
+
return result;
|
|
42
|
+
};
|
|
43
|
+
})();
|
|
44
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
|
+
exports.extractXlsxPages = extractXlsxPages;
|
|
46
|
+
const tokens_1 = require("../utils/tokens");
|
|
47
|
+
/**
 * Reads an XLSX/XLS/ODS/CSV spreadsheet and returns `PageData[]`.
 *
 * Each selected worksheet is converted to rows. The first row is used as the
 * header and the remaining rows are emitted in chunks of `rowsPerChunk`, one
 * page per chunk. Sheets with no rows are skipped; a sheet with only a header
 * row yields a single page.
 *
 * @param data    Raw spreadsheet bytes (ArrayBuffer or Uint8Array)
 * @param options Parsing options:
 *                - `sheets`       names of the sheets to include (default: all;
 *                                 workbook order is kept, unknown names are ignored)
 *                - `rowsPerChunk` data rows per page (default 200; fractional
 *                                 values are rounded down, must end up >= 1)
 * @param counter Token counter function
 * @returns One `{ text, tokenCount }` entry per emitted page
 * @throws {Error}      If loading `xlsx` fails (reported as "not installed")
 * @throws {RangeError} If a sheet has data rows and `rowsPerChunk` is not a number >= 1
 */
async function extractXlsxPages(data, options = {}, counter = tokens_1.defaultTokenCounter) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let XLSX;
    try {
        // Compiled form of a dynamic import with a computed specifier.
        // NOTE(review): presumably written this way so bundlers do not resolve
        // the optional dependency statically — confirm before simplifying.
        XLSX = await Promise.resolve(`${'xlsx'}`).then(s => __importStar(require(s)));
    }
    catch {
        throw new Error('[PageIndex] xlsx is not installed. Run: npm install xlsx (or yarn add xlsx)');
    }
    const { sheets: targetSheets, rowsPerChunk = 200 } = options;
    const chunkSize = Math.floor(rowsPerChunk);
    // Normalise to Uint8Array
    const bytes = data instanceof Uint8Array
        ? data
        : new Uint8Array(data);
    const workbook = XLSX.read(bytes, { type: 'array' });
    const sheetNames = workbook.SheetNames;
    const selectedSheets = targetSheets
        ? sheetNames.filter((n) => targetSheets.includes(n))
        : sheetNames;
    const pages = [];
    for (const sheetName of selectedSheets) {
        const worksheet = workbook.Sheets[sheetName];
        // Convert sheet to array-of-arrays (rows × cols)
        const aoa = XLSX.utils.sheet_to_json(worksheet, {
            header: 1,
            defval: '',
            blankrows: false,
        });
        if (aoa.length === 0)
            continue;
        // First row is the header; the rest is data.
        const header = aoa[0].map(String);
        const dataRows = aoa.slice(1);
        if (dataRows.length === 0) {
            const t = formatSheetChunk(sheetName, header, [], 1, 1);
            pages.push({ text: t, tokenCount: counter(t) });
            continue;
        }
        // A zero or NaN chunk size would loop forever and a negative one would
        // silently drop every data row, so reject them before chunking.
        if (Number.isNaN(chunkSize) || chunkSize < 1) {
            throw new RangeError(`[PageIndex] rowsPerChunk must be a number >= 1 (received ${String(rowsPerChunk)})`);
        }
        // The lower bound of 1 keeps the part counter sane when chunkSize is Infinity.
        const totalChunks = Math.max(1, Math.ceil(dataRows.length / chunkSize));
        for (let start = 0, part = 1; start < dataRows.length; start += chunkSize, part++) {
            const chunk = dataRows.slice(start, start + chunkSize);
            const t = formatSheetChunk(sheetName, header, chunk, part, totalChunks);
            pages.push({ text: t, tokenCount: counter(t) });
        }
    }
    return pages;
}
|
|
105
|
+
// ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
106
|
+
/**
 * Renders one chunk of a worksheet as an aligned plain-text table.
 *
 * Output, joined with `\n`: a `=== Sheet: <name> (Part c/t) ===` banner, a
 * blank line, the padded header row, a dashed separator, then one padded
 * line per data row. Only as many cells as there are header columns are
 * rendered per row; null/undefined cells render as empty strings.
 *
 * @param sheetName   Worksheet name shown in the banner
 * @param header      Column titles (strings)
 * @param rows        Data rows (arrays of cell values)
 * @param chunkNum    1-based part number
 * @param totalChunks Total number of parts for this sheet
 * @returns The formatted page text
 */
function formatSheetChunk(sheetName, header, rows, chunkNum, totalChunks) {
    const output = [`=== Sheet: ${sheetName} (Part ${chunkNum}/${totalChunks}) ===`, ''];
    const cellText = (row, col) => String(row[col] ?? '');
    // Column widths for alignment: the widest of title and cells, capped at 40.
    // Longer values are not truncated; padding simply has no effect on them.
    const colWidths = header.map((title, col) => {
        const cellLengths = rows.map((row) => cellText(row, col).length);
        return Math.min(40, Math.max(title.length, ...cellLengths));
    });
    const renderLine = (cells) => cells.map((cell, col) => cell.padEnd(colWidths[col])).join(' | ');
    // Header row and separator
    output.push(renderLine(header));
    output.push(colWidths.map((width) => '-'.repeat(width)).join('-|-'));
    // Data rows
    for (const row of rows) {
        output.push(renderLine(header.map((_, col) => cellText(row, col))));
    }
    return output.join('\n');
}
|
|
121
|
+
//# sourceMappingURL=xlsx.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"xlsx.js","sourceRoot":"","sources":["../../src/parsers/xlsx.ts"],"names":[],"mappings":";AAAA;;;;;;;;GAQG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAYH,4CAmEC;AA5ED,4CAAsD;AAEtD;;;;;;GAMG;AACI,KAAK,UAAU,gBAAgB,CACpC,IAA8B,EAC9B,UAA4B,EAAE,EAC9B,UAAwB,4BAAmB;IAE3C,8DAA8D;IAC9D,IAAI,IAAS,CAAC;IACd,IAAI,CAAC;QACH,IAAI,GAAG,yBAAuC,MAAgB,uCAAC,CAAC;IAClE,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,8EAA8E,CAC/E,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,YAAY,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC;IAE7D,0BAA0B;IAC1B,MAAM,KAAK,GACT,IAAI,YAAY,UAAU;QACxB,CAAC,CAAC,IAAI;QACN,CAAC,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,CAAC;IAE3B,yGAAyG;IACzG,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;IACrD,sEAAsE;IACtE,MAAM,UAAU,GAAa,QAAQ,CAAC,UAAsB,CAAC;IAE7D,MAAM,cAAc,GAAG,YAAY;QACjC,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;QACpD,CAAC,CAAC,UAAU,CAAC;IAEf,MAAM,KAAK,GAAe,EAAE,CAAC;IAE7B,KAAK,MAAM,SAAS,IAAI,cAAc,EAAE,CAAC;QACvC,sEAAsE;QACtE,MAAM,SAAS,GAAG,QAAQ,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7C,iDAAiD;QACjD,yGAAyG;QACzG,MAAM,GAAG,GAAgB,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,SAAS,EAAE;YAC3D,MAAM,EAAE,CAAC;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,KAAK;SACjB,CAAgB,CAAC;QAElB,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAE/B,uBAAuB;QACvB,MAAM,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAClC,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAE9B,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,CAAC,GAAG,gBAAgB,CAAC,SAAS,EAAE,MAAM,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;YACxD,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAChD,SAAS;QACX,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,YAAY,CAAC,CAAC;QAC9D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,GAAG,YAAY,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC;YACvE,MAAM,CAAC,GAAG,gBAAgB,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,GAAG,C
AAC,EAAE,WAAW,CAAC,CAAC;YACzE,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,iFAAiF;AAEjF,SAAS,gBAAgB,CACvB,SAAiB,EACjB,MAAgB,EAChB,IAAiB,EACjB,QAAgB,EAChB,WAAmB;IAEnB,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,CAAC,IAAI,CAAC,cAAc,SAAS,UAAU,QAAQ,IAAI,WAAW,OAAO,CAAC,CAAC;IAC5E,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,8BAA8B;IAC9B,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CACrC,IAAI,CAAC,GAAG,CACN,EAAE,EACF,IAAI,CAAC,GAAG,CACN,CAAC,CAAC,MAAM,EACR,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAC/C,CACF,CACF,CAAC;IAEF,aAAa;IACb,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IAE5D,YAAY;IACZ,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,KAAK,CAAC,IAAI,CACR,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAC5E,CAAC;IACJ,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reverse (inverted) index — maps terms → tree nodes that contain them.
|
|
3
|
+
*
|
|
4
|
+
* Two modes:
|
|
5
|
+
* 'keyword' — fast, no LLM. Extracts stopword-filtered terms with TF scoring.
|
|
6
|
+
* 'llm' — slower, semantic. Uses LLM to extract concept terms per node.
|
|
7
|
+
*/
|
|
8
|
+
import type { PageIndexResult, PageData, LLMProvider, ReverseIndex, ReverseIndexOptions, SearchResult } from './types';
|
|
9
|
+
/**
|
|
10
|
+
* Builds an inverted index from a `PageIndexResult`.
|
|
11
|
+
*
|
|
12
|
+
* In **keyword** mode (default), terms are extracted via stopword-filtered TF
|
|
13
|
+
* scoring — fast, no LLM calls needed.
|
|
14
|
+
*
|
|
15
|
+
* In **llm** mode, the LLM extracts semantic concept terms from each node's
|
|
16
|
+
* title + summary — slower but catches synonyms/concepts.
|
|
17
|
+
*
|
|
18
|
+
* @param input Named arguments, passed as a single object.
* @param input.result The forward-index output from `pageIndex()` / `pageIndexMd()`
|
|
19
|
+
* @param input.pages Original page data (optional; used for extra keyword signal)
|
|
20
|
+
* @param input.llm LLM provider (optional in the type; required for mode 'llm')
|
|
21
|
+
* @param input.options Index options (optional)
* @returns A promise that resolves to the built `ReverseIndex`.
|
|
22
|
+
*/
|
|
23
|
+
export declare function buildReverseIndex(input: {
|
|
24
|
+
result: PageIndexResult;
|
|
25
|
+
pages?: PageData[];
|
|
26
|
+
llm?: LLMProvider;
|
|
27
|
+
options?: ReverseIndexOptions;
|
|
28
|
+
}): Promise<ReverseIndex>;
|
|
29
|
+
/**
|
|
30
|
+
* Queries the reverse index for one or more terms.
|
|
31
|
+
* Multi-word queries are split and each term is looked up separately;
|
|
32
|
+
* nodes matching multiple terms get a combined score boost.
|
|
33
|
+
*
|
|
34
|
+
* @param index The reverse index (from `buildReverseIndex`)
|
|
35
|
+
* @param query Free-text query string
|
|
36
|
+
* @param topK Max results to return (default: 10)
* @returns The matching results as a plain array (not a promise), at most `topK` entries.
|
|
37
|
+
*/
|
|
38
|
+
export declare function searchReverseIndex(index: ReverseIndex, query: string, topK?: number): SearchResult[];
|
|
39
|
+
//# sourceMappingURL=reverseIndex.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reverseIndex.d.ts","sourceRoot":"","sources":["../src/reverseIndex.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EACV,eAAe,EACf,QAAQ,EACR,WAAW,EACX,YAAY,EAEZ,mBAAmB,EACnB,YAAY,EAEb,MAAM,SAAS,CAAC;AA2GjB;;;;;;;;;;;;;GAaG;AACH,wBAAsB,iBAAiB,CAAC,KAAK,EAAE;IAC7C,MAAM,EAAE,eAAe,CAAC;IACxB,KAAK,CAAC,EAAE,QAAQ,EAAE,CAAC;IACnB,GAAG,CAAC,EAAE,WAAW,CAAC;IAClB,OAAO,CAAC,EAAE,mBAAmB,CAAC;CAC/B,GAAG,OAAO,CAAC,YAAY,CAAC,CA4FxB;AAID;;;;;;;;GAQG;AACH,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,YAAY,EACnB,KAAK,EAAE,MAAM,EACb,IAAI,SAAK,GACR,YAAY,EAAE,CAgDhB"}
|