any-extractor 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1040 @@
1
"use strict";
// ---- esbuild-generated CommonJS interop helpers (machine output; do not hand-edit) ----
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Defines a lazy, enumerable getter on `target` for every key in `all`.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copies own properties of `from` onto `to` as getters, skipping keys that
// already exist on `to` and the `except` key; enumerability is preserved.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wraps a required CommonJS module so it can be consumed like an ES module
// namespace object (adds a `default` binding when needed).
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
// Marks the exports object as an ES module and returns a CJS-compatible copy.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
// Public package surface: only `getAnyExtractor` is exported.
// NOTE(review): `getAnyExtractor` is defined later in this bundle (not visible
// in this chunk); the lazy getter in __export tolerates the forward reference.
var index_exports = {};
__export(index_exports, {
  getAnyExtractor: () => getAnyExtractor
});
module.exports = __toCommonJS(index_exports);
36
+
37
+ // src/extractors/any-extractor.ts
38
+ var import_file_type_mime = require("file-type-mime");
39
+
40
+ // src/util.ts
41
+ var import_undici = require("undici");
42
+ var import_yauzl = __toESM(require("yauzl"));
43
+
44
+ // src/constant.ts
45
+ var ERRORMSG = {
46
+ extensionUnsupported: (ext) => `Sorry, AnyExtractor currently support docx, pptx, xlsx, odt, odp, ods, pdf files only. Create a ticket in Issues on github to add support for ${ext} files. Stay tuned for further updates.`,
47
+ fileCorrupted: (filepath) => `Your file ${filepath} seems to be corrupted. If you are sure it is fine, please create a ticket in Issues on github with the file to reproduce error.`,
48
+ fileDoesNotExist: (filepath) => `File ${filepath} could not be found! Check if the file exists or verify if the relative path to the file is correct from your terminal's location.`,
49
+ locationNotFound: (location) => `Entered location ${location} is not reachable! Please make sure that the entered directory location exists. Check relative paths and reenter.`,
50
+ improperArguments: `Improper arguments`,
51
+ improperBuffers: `Error occured while reading the file buffers`,
52
+ invalidInput: `Invalid input type: Expected a Buffer or a valid file path`
53
+ };
54
+
55
+ // src/util.ts
56
+ var import_concat_stream = __toESM(require("concat-stream"));
57
+ var import_xmldom = require("@xmldom/xmldom");
58
+ var import_fs = require("fs");
59
// Reads a local file from disk and resolves with its raw contents (a Buffer).
async function readFile(filePath) {
  const { promises: fsPromises } = import_fs;
  const contents = await fsPromises.readFile(filePath);
  return contents;
}
62
// Downloads a remote file over HTTP(S) via undici's fetch, optionally sending
// a Basic Authorization header, and resolves with the body as a Buffer.
// Throws on any non-2xx response.
var readFileUrl = async (url, basicAuth) => {
  const headers = {};
  if (basicAuth) {
    headers.Authorization = basicAuth;
  }
  const res = await (0, import_undici.fetch)(url, { headers });
  if (!res.ok) {
    throw new Error(`Failed to fetch: ${res.statusText}`);
  }
  const body = await res.arrayBuffer();
  return Buffer.from(body);
};
71
// Opens a zip archive — from an in-memory Buffer or a file path — and resolves
// with the entries whose names pass `filterFn`, each as
// { path: entry name, content: Buffer }. Rejects on any zip or stream error.
// Fixes vs. previous version: rejects with an Error (was a bare string) for
// invalid input, and propagates read-stream errors (previously an unhandled
// stream failure could leave the promise pending forever).
var extractFiles = (zipInput, filterFn) => {
  return new Promise((res, rej) => {
    const processZipfile = (zipfile) => {
      const extractedFiles = [];
      // lazyEntries mode: we must explicitly ask for each entry.
      zipfile.readEntry();
      function processEntry(entry) {
        if (filterFn(entry.fileName)) {
          zipfile.openReadStream(entry, (err, readStream) => {
            if (err) return rej(err);
            // Surface stream-level failures instead of hanging the promise.
            readStream.on("error", rej);
            readStream.pipe(
              (0, import_concat_stream.default)((data) => {
                extractedFiles.push({
                  path: entry.fileName,
                  content: data
                });
                zipfile.readEntry();
              })
            );
          });
        } else zipfile.readEntry();
      }
      zipfile.on("entry", processEntry);
      zipfile.on("end", () => res(extractedFiles));
      zipfile.on("error", rej);
    };
    if (Buffer.isBuffer(zipInput)) {
      import_yauzl.default.fromBuffer(zipInput, { lazyEntries: true }, (err, zipfile) => {
        if (err) return rej(err);
        processZipfile(zipfile);
      });
    } else if (typeof zipInput === "string") {
      import_yauzl.default.open(zipInput, { lazyEntries: true }, (err, zipfile) => {
        if (err) return rej(err);
        processZipfile(zipfile);
      });
    } else rej(new Error(ERRORMSG.invalidInput)); // reject with an Error, not a bare string
  });
};
109
// Parses an XML string into a DOM Document using @xmldom/xmldom.
var parseString = (xml) => new import_xmldom.DOMParser().parseFromString(xml, "text/xml");
113
// Returns true when `str` is a non-empty value that the WHATWG URL parser
// accepts as an absolute URL; false for empty/nullish or unparseable input.
function isValidUrl(str) {
  if (!str) return false;
  try {
    new URL(str);
  } catch {
    return false;
  }
  return true;
}
122
+
123
+ // src/crawler/confluence-crawler.ts
124
+ var import_undici2 = require("undici");
125
+ var cheerio = __toESM(require("cheerio"));
126
// Fetches a Confluence page via the REST API and flattens its storage-format
// XML body into an ordered list of { type, content } items (text, links,
// tables, images, macros, …) for downstream text assembly.
var ConfluenceCrawler = class {
  // baseUrl: site root (trailing slashes stripped); email + apiKey feed the
  // Basic auth header used by fetchPageContent.
  constructor(baseUrl, email, apiKey) {
    this.baseUrl = baseUrl.replace(/\/+$/, "");
    this.email = email;
    this.apiKey = apiKey;
    // Cloud sites ("atlassian.net") nest the REST API under /wiki.
    this.apiEndpoint = this.baseUrl.includes("atlassian.net") ? `${this.baseUrl}/wiki/rest/api` : `${this.baseUrl}/rest/api`;
  }
  // Fetches page `pageId` and returns its ordered content items.
  async extractPageContent(pageId) {
    const xmlContent = await this.fetchPageContent(pageId);
    return this.extractOrderedContentFromXml(xmlContent, pageId);
  }
  // GETs the page's storage-format body; throws with status + response body on
  // any non-2xx response.
  async fetchPageContent(pageId) {
    const url = `${this.apiEndpoint}/content/${pageId}?expand=body.storage`;
    const response = await (0, import_undici2.fetch)(url, {
      headers: {
        Authorization: `Basic ${Buffer.from(`${this.email}:${this.apiKey}`).toString("base64")}`,
        "Content-Type": "application/json"
      }
    });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(
        `Request failed: ${response.status} ${response.statusText}. Response body: ${errorText}`
      );
    }
    const data = await response.json();
    return data.body.storage.value;
  }
  // Loads the XML with cheerio and walks the first <ac:layout> element.
  // NOTE(review): pages without an ac:layout wrapper would yield an empty
  // selection here — confirm against non-layout pages.
  extractOrderedContentFromXml(xml, pageId) {
    const $ = cheerio.load(xml, { xmlMode: true });
    const node = $("ac\\:layout").first();
    const orderedContent = this.parseNodeContents(node, $, pageId);
    return orderedContent;
  }
  // Walks a node's direct children in document order, dispatching text nodes
  // and element nodes; other node types (comments, CDATA) are skipped.
  parseNodeContents(node, $, pageId, ignoreTags = []) {
    let result = [];
    const contents = node.contents();
    contents.each((_, ele) => {
      switch (ele.type) {
        case "text":
          result.push(this.parseTextNode(ele));
          break;
        case "tag":
          const tagResult = this.parseTagNode(ele, $, pageId, ignoreTags);
          result = result.concat(tagResult);
          break;
        default:
          break;
      }
    });
    return result;
  }
  // Wraps a raw text node; whitespace-only nodes become empty-content items.
  parseTextNode(ele) {
    const text = ele.data.trim();
    return { type: "text", content: text };
  }
  // Dispatches an element by tag name to a specialised parser; unknown tags
  // recurse into their children. Tags listed in `ignoreTags` are dropped.
  parseTagNode(ele, $, pageId, ignoreTags) {
    let result = [];
    const tagName = ele.tagName;
    if (ignoreTags.includes(tagName)) {
      return result;
    }
    switch (tagName) {
      case "ac:structured-macro":
        result = result.concat(this.parseStructuredMacro(ele, $, pageId));
        break;
      case "ac:adf-extension":
        result = result.concat(this.parseAdfExtension(ele, $));
        break;
      case "ac:task-list":
        result = result.concat(this.parseTaskList(ele, $, pageId));
        break;
      case "ac:image":
        result = result.concat(this.parseImage(ele, $, pageId));
        break;
      case "table":
        result = result.concat(this.parseTable(ele, $));
        break;
      case "a":
        result = result.concat(this.parseLink(ele, $, pageId));
        break;
      default:
        result = result.concat(this.parseNodeContents($(ele), $, pageId));
        break;
    }
    return result;
  }
  // Routes <ac:structured-macro> by its ac:name attribute; unrecognised
  // macros produce no items.
  parseStructuredMacro(ele, $, pageId) {
    const macroName = ele.attribs["ac:name"];
    const result = [];
    switch (macroName) {
      case "code":
        result.push(this.parseCodeMacro(ele, $));
        break;
      case "info":
        result.push(this.parseInfoMacro(ele, $, pageId));
        break;
      case "warning":
        result.push(this.parseWarningMacro(ele, $, pageId));
        break;
      case "note":
        result.push(this.parseNoteMacro(ele, $, pageId));
        break;
      case "tip":
        result.push(this.parseTipMacro(ele, $, pageId));
        break;
      case "panel":
        result.push(this.parsePanelMacro(ele, $, pageId));
        break;
      case "expand":
        result.push(this.parseExpandMacro(ele, $, pageId));
        break;
      case "status":
        result.push(this.parseStatusMacro(ele, $, pageId));
        break;
      case "view-file":
        result.push(this.parseViewFileMacro(ele, $, pageId));
        break;
    }
    return result;
  }
  // Renders an anchor as "link text (href)".
  parseLink(ele, $, pageId) {
    const href = $(ele).attr("href");
    const text = this.parseNodeContents($(ele), $, pageId);
    return [{ type: "link", content: `${text.map((t) => t.content).join("")} (${href})` }];
  }
  // Renders each <ac:task> as "body [Status: status]".
  parseTaskList(ele, $, pageId) {
    const result = [];
    const tasks = $(ele).find("ac\\:task");
    tasks.each((_, task) => {
      const taskStatus = $(task).find("ac\\:task-status").text().trim();
      const taskBody = this.parseNodeContents($(task).find("ac\\:task-body"), $, pageId);
      result.push({
        type: "task",
        content: `${taskBody.map((t) => t.content).join("")} [Status: ${taskStatus}]`
      });
    });
    return result;
  }
  // Converts an <ac:image> attachment reference into a downloadable URL
  // (Cloud vs. Server paths differ); images without a filename are dropped.
  parseImage(ele, $, pageId) {
    const attachment = $(ele).find("ri\\:attachment");
    const filename = attachment.attr("ri:filename")?.trim();
    if (!filename) {
      return [];
    }
    const imageUrl = this.baseUrl.includes("atlassian.net") ? `${this.baseUrl}/wiki/download/attachments/${pageId}/${encodeURIComponent(filename)}` : `${this.baseUrl}/download/attachments/${pageId}/${encodeURIComponent(filename)}`;
    return [{ type: "image", content: imageUrl }];
  }
  // Flattens a table to plain text: cells joined by " | ", rows by newlines.
  parseTable(ele, $) {
    const result = [];
    const rows = $(ele).find("tr");
    const tableData = [];
    rows.each((_, row) => {
      const cells = $(row).find("th, td");
      const rowData = [];
      cells.each((_2, cell) => {
        rowData.push($(cell).text().trim());
      });
      tableData.push(rowData.join(" | "));
    });
    result.push({ type: "table", content: tableData.join("\n") });
    return result;
  }
  // Extracts the raw source from a code macro's plain-text body.
  parseCodeMacro(ele, $) {
    const code = $(ele).find("ac\\:plain-text-body").text().trim();
    return { type: "code", content: code };
  }
  // The admonition macros below all flatten their children to a single string.
  parseInfoMacro(ele, $, pageId) {
    const info = this.parseNodeContents($(ele), $, pageId);
    return { type: "info", content: info.map((t) => t.content).join("") };
  }
  parseWarningMacro(ele, $, pageId) {
    const warning = this.parseNodeContents($(ele), $, pageId);
    return { type: "warning", content: warning.map((t) => t.content).join("") };
  }
  parseNoteMacro(ele, $, pageId) {
    const note = this.parseNodeContents($(ele), $, pageId);
    return { type: "note", content: note.map((t) => t.content).join("") };
  }
  parseTipMacro(ele, $, pageId) {
    const tip = this.parseNodeContents($(ele), $, pageId);
    return { type: "tip", content: tip.map((t) => t.content).join("") };
  }
  // Panels skip their ac:parameter children (titles/settings, not content).
  parsePanelMacro(ele, $, pageId) {
    const panel = this.parseNodeContents($(ele), $, pageId, ["ac:parameter"]);
    return { type: "panel", content: panel.map((t) => t.content).join("") };
  }
  parseExpandMacro(ele, $, pageId) {
    const expand = this.parseNodeContents($(ele), $, pageId);
    return { type: "expand", content: expand.map((t) => t.content).join("") };
  }
  parseStatusMacro(ele, $, pageId) {
    const status = this.parseNodeContents($(ele), $, pageId);
    return { type: "status", content: status.map((t) => t.content).join("") };
  }
  // Pulls the text content out of an ADF extension node.
  parseAdfExtension(ele, $) {
    const adfNode = $(ele).find("ac\\:adf-node");
    const content = adfNode.find("ac\\:adf-content").text().trim();
    return [{ type: "adf-extension", content }];
  }
  // Converts a view-file macro's attachment into a downloadable URL; empty
  // content when no filename is present.
  parseViewFileMacro(ele, $, pageId) {
    const attachment = $(ele).find('ac\\:parameter[ac\\:name="name"] ri\\:attachment');
    const filename = attachment.attr("ri:filename")?.trim();
    if (!filename) {
      return { type: "view-file", content: "" };
    }
    const fileUrl = this.baseUrl.includes("atlassian.net") ? `${this.baseUrl}/wiki/download/attachments/${pageId}/${encodeURIComponent(filename)}` : `${this.baseUrl}/download/attachments/${pageId}/${encodeURIComponent(filename)}`;
    return { type: "view-file", content: fileUrl };
  }
};
336
+
337
+ // src/extractors/any-extractor.ts
338
// src/extractors/any-extractor.ts
// Facade that routes an input (Buffer, file path, or URL) to a registered
// parser by sniffed MIME type, and can crawl Confluence pages.
var AnyExtractor = class {
  // extractorConfig: optional { llm, confluence } settings; when omitted, the
  // defaults below are used (empty credentials, "openai" provider).
  constructor(extractorConfig) {
    this.extractorConfig = {
      llm: {
        llmProvider: "openai",
        visionModel: "",
        apikey: ""
      },
      confluence: {
        baseUrl: "",
        email: "",
        apiKey: ""
      }
    };
    // MIME type -> parser; populated by addParser.
    this.mimeParserMap = /* @__PURE__ */ new Map();
    this.parsers = [];
    // Registers a parser for every MIME type it declares; returns `this`
    // so registrations can be chained.
    this.addParser = (method) => {
      this.parsers.push(method);
      method.mimes.forEach((mime) => {
        this.mimeParserMap.set(mime, method);
      });
      return this;
    };
    // Extracts text from `input` (Buffer | path | URL). `basicAuth` is only
    // used when `input` is a URL. When the MIME type cannot be sniffed, the
    // raw bytes are returned decoded as UTF-8; when it is sniffed but no
    // parser is registered for it, an Error is thrown.
    this.parseFile = async (input, basicAuth = null, extractingOptions = {
      extractImages: false,
      imageExtractionMethod: "ocr",
      language: "eng"
    }) => {
      let preparedInput;
      if (typeof input === "string") {
        if (isValidUrl(input)) {
          preparedInput = await readFileUrl(input, basicAuth);
        } else {
          preparedInput = await readFile(input);
        }
      } else {
        preparedInput = input;
      }
      if (!preparedInput) {
        throw new Error("AnyExtractor: No input provided");
      }
      // Slice the backing ArrayBuffer to the Buffer's own window before
      // MIME sniffing (a Buffer may view a larger shared pool).
      const mimeDetails = (0, import_file_type_mime.parse)(
        preparedInput.buffer.slice(
          preparedInput.byteOffset,
          preparedInput.byteOffset + preparedInput.byteLength
        )
      );
      if (!mimeDetails) {
        return preparedInput.toString("utf-8");
      }
      const extractor = this.mimeParserMap.get(mimeDetails.mime);
      if (!extractor?.apply) {
        const message = `AnyExtractor: No extraction method registered for MIME type '${mimeDetails.mime}'`;
        throw new Error(message);
      }
      return extractor.apply(preparedInput, extractingOptions, this.extractorConfig);
    };
    // Crawls one Confluence page and assembles its items into a single text
    // blob; images/attachments are optionally fetched (with Basic auth built
    // from the configured credentials) and run back through parseFile.
    this.parseConfluenceDoc = async (pageId, extractingOptions = {
      extractAttachments: false,
      extractImages: false,
      imageExtractionMethod: "ocr",
      language: "eng"
    }) => {
      const { baseUrl, email, apiKey } = this.extractorConfig.confluence || {};
      if (!baseUrl || !email || !apiKey) {
        throw new Error("AnyExtractor: Confluence base URL, email, and API key are required");
      }
      const confCrawler = new ConfluenceCrawler(baseUrl, email, apiKey);
      const content = await confCrawler.extractPageContent(pageId);
      let textContent = "";
      // Items are processed sequentially to preserve document order.
      for (const item of content) {
        if (item.type === "image" && extractingOptions.extractImages) {
          const parsedFile = await this.parseFile(
            item.content,
            `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`,
            extractingOptions
          );
          textContent += parsedFile ? `
(Image): ${parsedFile}
` : "";
        } else if (item.type === "view-file" && extractingOptions.extractAttachments) {
          const parsedFile = await this.parseFile(
            item.content,
            `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`,
            extractingOptions
          );
          textContent += parsedFile ? `
[Attachment]: ${parsedFile}
` : "";
        } else if ([
          "h1",
          "h2",
          "h3",
          "h4",
          "h5",
          "h6",
          "p",
          "table",
          "li",
          "code",
          "info",
          "warning",
          "tip",
          "note",
          "panel",
          "expand",
          "adf-extension"
        ].includes(item.type)) {
          // Block-level items get their own lines.
          textContent += `
${item.content}
`;
        } else if (item.type === "text") {
          // Inline text is appended with a separating space.
          textContent += ` ${item.content}`;
        }
      }
      return textContent;
    };
    // Caller-supplied config replaces the defaults wholesale (no deep merge).
    if (extractorConfig) {
      this.extractorConfig = extractorConfig;
    }
  }
};
460
+
461
+ // src/file-parser/excel-parser.ts
462
// src/file-parser/excel-parser.ts
// Extracts text from .xlsx workbooks: sheet cells (with shared-string
// resolution), drawing text, chart labels, and (optionally) embedded images.
// Fix: the archive-path regexes previously carried the /g flag while being
// used with .test(); a global regex keeps a stateful lastIndex between calls,
// so repeated .test() calls against different paths intermittently returned
// false negatives and misrouted files. The flags are removed (and the `.`
// before "xml" escaped) — .test() without /g is stateless.
var ExcelParser = class {
  // anyExtractor: back-reference used to OCR/describe embedded images.
  constructor(anyExtractor) {
    this.mimes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"];
    this.anyExtractor = anyExtractor;
  }
  // Unzips the workbook, pulls the relevant XML parts, and returns the
  // concatenated text. Throws ERRORMSG.fileCorrupted when no sheet XML exists.
  async apply(file, extractingOptions) {
    // Stateless (no /g) matchers for the zip entry paths we care about.
    const patterns = {
      sheets: /xl\/worksheets\/sheet\d+\.xml/,
      drawings: /xl\/drawings\/drawing\d+\.xml/,
      charts: /xl\/charts\/chart\d+\.xml/,
      sharedStrings: "xl/sharedStrings.xml",
      images: /xl\/media\/image\d+\.(png|jpeg|jpg|webp)/
    };
    try {
      const files = await extractFiles(
        file,
        (path) => [patterns.sheets, patterns.drawings, patterns.charts, patterns.images].some(
          (regex) => regex.test(path)
        ) || path === patterns.sharedStrings
      );
      if (files.length === 0 || !files.some((file2) => patterns.sheets.test(file2.path))) {
        throw ERRORMSG.fileCorrupted("Missing or corrupted sheet files.");
      }
      const xmlContent = {
        sheets: files.filter((file2) => patterns.sheets.test(file2.path)).map((file2) => file2.content.toString()),
        drawings: files.filter((file2) => patterns.drawings.test(file2.path)).map((file2) => file2.content.toString()),
        charts: files.filter((file2) => patterns.charts.test(file2.path)).map((file2) => file2.content.toString()),
        sharedStrings: files.find((file2) => file2.path === patterns.sharedStrings)?.content.toString(),
        images: files.filter((file2) => patterns.images.test(file2.path))
      };
      const sharedStrings = this.parseSharedStrings(xmlContent.sharedStrings);
      // Map each extracted part to its text, preserving archive order.
      // (The previous `.filter(Boolean)` on this array was a no-op: it
      // filtered promises, which are always truthy; nulls are removed after
      // resolution below.)
      const orderedText = files.map(async (file2) => {
        if (patterns.sheets.test(file2.path)) {
          return this.extractSheetText([file2.content.toString()], sharedStrings);
        } else if (patterns.drawings.test(file2.path)) {
          return this.extractDrawingText([file2.content.toString()]);
        } else if (patterns.charts.test(file2.path)) {
          return this.extractChartText([file2.content.toString()]);
        } else if (patterns.images.test(file2.path)) {
          return await this.extractImageText([file2], extractingOptions);
        }
        return null;
      });
      const resolvedText = await Promise.all(orderedText);
      return resolvedText.filter(Boolean).join("\n");
    } catch (error) {
      console.error("AnyExtractor: Error parsing Excel file:", error);
      throw error;
    }
  }
  // Returns the workbook's shared-string table as an array of strings
  // (empty array when the part is absent).
  parseSharedStrings(sharedStringsXml) {
    if (!sharedStringsXml) return [];
    const tNodes = parseString(sharedStringsXml).getElementsByTagName("t");
    return Array.from(tNodes).map((node) => node.childNodes[0]?.nodeValue ?? "");
  }
  // One line of text per populated cell, in document order.
  extractSheetText(sheetFiles, sharedStrings) {
    return sheetFiles.map((content) => {
      const cNodes = parseString(content).getElementsByTagName("c");
      return Array.from(cNodes).filter((node) => this.isValidInlineString(node) || this.hasValidValueNode(node)).map((node) => this.getCellValue(node, sharedStrings)).join("\n");
    }).join("\n");
  }
  // Text from DrawingML paragraphs (<a:p>/<a:t>).
  extractDrawingText(drawingFiles) {
    return drawingFiles.map((content) => {
      const pNodes = parseString(content).getElementsByTagName("a:p");
      return Array.from(pNodes).map((node) => {
        const tNodes = node.getElementsByTagName("a:t");
        return Array.from(tNodes).map((tNode) => tNode.childNodes[0]?.nodeValue ?? "").join("");
      }).join("\n");
    }).join("\n");
  }
  // Chart value/label nodes (<c:v>), one per line.
  extractChartText(chartFiles) {
    return chartFiles.map((content) => {
      const vNodes = parseString(content).getElementsByTagName("c:v");
      return Array.from(vNodes).map((node) => node.childNodes[0]?.nodeValue ?? "").join("\n");
    }).join("\n");
  }
  // Runs each embedded image back through the extractor (OCR/LLM); failures
  // are logged and contribute an empty string rather than aborting the sheet.
  async extractImageText(imageFiles, extractingOptions) {
    const texts = await Promise.all(
      imageFiles.map(async (file) => {
        try {
          return await this.anyExtractor.parseFile(file.content, null, extractingOptions);
        } catch (e) {
          console.log(`AnyExtractor: Error extracting text from image ${file.path}:`, e);
          return "";
        }
      })
    );
    return texts.filter(Boolean).join("\n");
  }
  // True when the cell is an inline string (<c t="inlineStr"><is><t>…).
  isValidInlineString(cNode) {
    if (cNode.tagName.toLowerCase() !== "c" || cNode.getAttribute("t") !== "inlineStr")
      return false;
    const isNodes = cNode.getElementsByTagName("is");
    const tNodes = isNodes[0]?.getElementsByTagName("t");
    return tNodes?.[0]?.childNodes[0]?.nodeValue !== void 0;
  }
  // True when the cell carries a <v> value node with text content.
  hasValidValueNode(cNode) {
    const vNodes = cNode.getElementsByTagName("v");
    return vNodes[0]?.childNodes[0]?.nodeValue !== void 0;
  }
  // Resolves a cell to its display text: inline strings verbatim, shared
  // strings via the table (throwing on out-of-range indices), numbers as-is.
  getCellValue(cNode, sharedStrings) {
    if (this.isValidInlineString(cNode)) {
      return cNode.getElementsByTagName("is")[0].getElementsByTagName("t")[0].childNodes[0].nodeValue ?? "";
    }
    if (this.hasValidValueNode(cNode)) {
      const isSharedString = cNode.getAttribute("t") === "s";
      const valueIndex = parseInt(
        cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue ?? "",
        10
      );
      if (isSharedString) {
        if (valueIndex >= sharedStrings.length) {
          throw ERRORMSG.fileCorrupted("AnyExtractor: Invalid shared string index.");
        }
        return sharedStrings[valueIndex];
      }
      return valueIndex.toString();
    }
    return "";
  }
};
583
+
584
+ // src/file-parser/image-parser.ts
585
+ var import_tesseract = __toESM(require("tesseract.js"));
586
+ var import_undici3 = require("undici");
587
+ var import_file_type_mime2 = require("file-type-mime");
588
// Extracts text from images either locally via Tesseract OCR or by asking a
// vision LLM (OpenAI / Google / Anthropic) for a searchable description.
var ImageParser = class {
  constructor() {
    // MIME types this parser handles; anything else returns "" below.
    this.mimes = ["image/jpeg", "image/png", "image/webp"];
    // Entry point. Returns "" when image extraction is disabled or the bytes
    // are not a supported image type; otherwise OCRs or calls the configured
    // LLM provider. Throws when the MIME type cannot be sniffed, when LLM
    // extraction is requested without full credentials, or on an unknown
    // provider name.
    this.apply = async (file, extractingOptions, extractorConfig) => {
      const { extractImages, imageExtractionMethod, language } = extractingOptions;
      if (!extractImages) {
        return "";
      }
      // Re-sniff the actual bytes rather than trusting the caller's routing
      // (a Buffer may view a larger pool, hence the explicit slice).
      const mimeDetails = (0, import_file_type_mime2.parse)(
        file.buffer.slice(file.byteOffset, file.byteOffset + file.byteLength)
      );
      if (!mimeDetails) {
        throw new Error("AnyExtractor: Unable to parse MIME type");
      }
      const mimeType = mimeDetails.mime;
      if (!this.mimes.includes(mimeType)) {
        return "";
      }
      if (imageExtractionMethod === "ocr") {
        return await this.performOCR(file, language);
      }
      const { llmProvider, visionModel, apikey } = extractorConfig.llm || {};
      if (!llmProvider || !visionModel || !apikey) {
        throw new Error(
          "AnyExtractor: LLM provider, vision model and API key are required for image extraction"
        );
      }
      const base64Image = file.toString("base64");
      switch (llmProvider) {
        case "openai":
          return this.handleOpenAI(base64Image, mimeType, visionModel, apikey);
        case "google":
          return this.handleGoogle(base64Image, mimeType, visionModel, apikey);
        case "anthropic":
          return this.handleAnthropic(base64Image, mimeType, visionModel, apikey);
        default:
          throw new Error(`ImageParser: Unsupported LLM provider '${llmProvider}'`);
      }
    };
    // Runs Tesseract OCR in the requested language; a fresh worker is created
    // and terminated per call.
    this.performOCR = async (file, language) => {
      const worker = await import_tesseract.default.createWorker(language);
      const {
        data: { text }
      } = await worker.recognize(file);
      await worker.terminate();
      return text;
    };
    // Sends the image to OpenAI's chat-completions endpoint as a data URL and
    // returns the first choice's message content.
    this.handleOpenAI = async (base64Image, mimeType, visionModel, apikey) => {
      const response = await (0, import_undici3.fetch)("https://api.openai.com/v1/chat/completions", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${apikey}`
        },
        body: JSON.stringify({
          model: visionModel,
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
                },
                {
                  type: "image_url",
                  image_url: {
                    url: `data:${mimeType};base64,${base64Image}`
                  }
                }
              ]
            }
          ]
        })
      });
      if (!response.ok) {
        throw new Error(`ImageParser: OpenAI API error ${response.status}`);
      }
      const data = await response.json();
      return data.choices[0].message.content;
    };
    // Sends the image to Google's Gemini generateContent endpoint (API key in
    // the query string) and returns the first candidate's text part.
    this.handleGoogle = async (base64Image, mimeType, visionModel, apikey) => {
      const response = await (0, import_undici3.fetch)(
        `https://generativelanguage.googleapis.com/v1beta/models/${visionModel}:generateContent?key=${apikey}`,
        {
          method: "POST",
          headers: {
            "Content-Type": "application/json"
          },
          body: JSON.stringify({
            contents: [
              {
                parts: [
                  {
                    text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
                  },
                  {
                    inlineData: {
                      mimeType,
                      data: base64Image
                    }
                  }
                ]
              }
            ]
          })
        }
      );
      if (!response.ok) {
        throw new Error(`Google Gemini error: ${response.statusText}`);
      }
      const data = await response.json();
      return data.candidates[0].content.parts[0].text;
    };
    // Sends the image to Anthropic's messages endpoint as base64 and returns
    // the first content block's text.
    this.handleAnthropic = async (base64Image, mimeType, visionModel, apikey) => {
      const response = await (0, import_undici3.fetch)("https://api.anthropic.com/v1/messages", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-api-key": apikey,
          "anthropic-version": "2023-06-01"
        },
        body: JSON.stringify({
          model: visionModel,
          max_tokens: 300,
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
                },
                {
                  type: "image",
                  source: {
                    type: "base64",
                    media_type: mimeType,
                    data: base64Image
                  }
                }
              ]
            }
          ]
        })
      });
      if (!response.ok) {
        throw new Error(`Anthropic Claude error: ${response.statusText}`);
      }
      const data = await response.json();
      return data.content[0].text;
    };
  }
};
742
+
743
+ // src/file-parser/openoffice-paser.ts
744
// src/file-parser/openoffice-paser.ts
// Extracts text from OpenDocument files (odt/ods/odp/odg/odf) by unzipping
// content.xml (plus embedded "Object N/content.xml" parts) and walking the
// XML for text:p / text:h content; presentation notes are collected
// separately and appended at the end.
var OpenOfficeParser = class {
  constructor() {
    this.mimes = [
      "application/vnd.oasis.opendocument.text",
      "application/vnd.oasis.opendocument.spreadsheet",
      "application/vnd.oasis.opendocument.presentation",
      "application/vnd.oasis.opendocument.graphics",
      "application/vnd.oasis.opendocument.formula"
    ];
    // file: Buffer or path accepted by extractFiles; resolves with the
    // document text, chunks separated by blank lines.
    this.apply = async (file) => {
      const MAIN_CONTENT_FILE = "content.xml";
      const OBJECT_CONTENT_REGEX = /Object \d+\/content.xml/;
      try {
        const files = await extractFiles(
          file,
          (path) => path === MAIN_CONTENT_FILE || OBJECT_CONTENT_REGEX.test(path)
        );
        // Sort for a stable order of embedded-object parts.
        const contentFiles = files.filter((file2) => file2.path === MAIN_CONTENT_FILE || OBJECT_CONTENT_REGEX.test(file2.path)).sort((a, b) => a.path.localeCompare(b.path));
        // Accumulators shared by the closures below: slide-note text vs.
        // ordinary body text.
        const notesText = [];
        const outputChunks = [];
        const ALLOWED_TEXT_TAGS = ["text:p", "text:h"];
        const NOTES_TAG = "presentation:notes";
        // Joins every text leaf under `root` into one string.
        const extractAllTextsFromNode = (root) => {
          const textArray = [];
          traverseNode(root, textArray, true);
          return textArray.join("");
        };
        // Depth-first walk: leaves whose parent is a text:* element
        // contribute their nodeValue; paragraphs/headings get a trailing
        // newline except at the top of a walk. Note leaves are diverted
        // into notesText.
        // NOTE(review): assumes every leaf's parentNode has a tagName —
        // a leaf directly under the document node would throw; confirm
        // against xmldom's node shapes.
        const traverseNode = (node, textArray, isFirstRecursion) => {
          if (!node.childNodes || node.childNodes.length === 0) {
            if (node.parentNode && node.parentNode.tagName.startsWith("text") && node.nodeValue) {
              const parent = node.parentNode;
              if (isNotesNode(parent)) {
                notesText.push(node.nodeValue);
                if (ALLOWED_TEXT_TAGS.includes(parent.tagName) && !isFirstRecursion) {
                  notesText.push("\n");
                }
              } else {
                textArray.push(node.nodeValue);
                if (ALLOWED_TEXT_TAGS.includes(parent.tagName) && !isFirstRecursion) {
                  textArray.push("\n");
                }
              }
            }
            return;
          }
          for (let i = 0; i < node.childNodes.length; i++) {
            traverseNode(node.childNodes[i], textArray, false);
          }
        };
        // True when `node` sits anywhere inside a <presentation:notes>.
        const isNotesNode = (node) => {
          return node.tagName === NOTES_TAG ? true : node.parentNode ? isNotesNode(node.parentNode) : false;
        };
        // True when `node` is, or is nested inside, a text:p / text:h —
        // used to skip nested paragraphs so each top-level one is walked
        // exactly once. (Name reads inverted; kept for dist fidelity.)
        const isInvalidTextNode = (node) => {
          return ALLOWED_TEXT_TAGS.includes(node.tagName) ? true : node.parentNode ? isInvalidTextNode(node.parentNode) : false;
        };
        for (const contentFile of contentFiles) {
          const xmlDoc = parseString(contentFile.content.toString());
          // Top-level paragraphs/headings only (no paragraph nested in
          // another allowed tag).
          const textNodes = Array.from(xmlDoc.getElementsByTagName("*")).filter(
            (node) => ALLOWED_TEXT_TAGS.includes(node.tagName) && !isInvalidTextNode(node.parentNode)
          );
          const textChunk = textNodes.map((node) => extractAllTextsFromNode(node)).filter((text) => text.trim() !== "").join("\n");
          if (textChunk) {
            outputChunks.push(textChunk);
          }
        }
        return [...outputChunks, ...notesText].join("\n\n");
      } catch (error) {
        console.error("AnyExtractor: Error parsing OpenOffice file:", error);
        throw error;
      }
    };
  }
};
817
+
818
+ // src/file-parser/pdf-parser.ts
819
+ var import_pdf_parse = __toESM(require("pdf-parse"));
820
// src/file-parser/pdf-parser.ts
// Extracts the text layer of a PDF document using pdf-parse.
var PDFParser = class {
  constructor() {
    // MIME types this parser is registered for.
    this.mimes = ["application/pdf"];
    // file: PDF bytes accepted by pdf-parse; resolves with the extracted
    // text. Errors are logged and rethrown for the caller to handle.
    this.apply = async (file) => {
      try {
        const { text } = await (0, import_pdf_parse.default)(file);
        return text;
      } catch (error) {
        console.error("AnyExtractor: Error parsing PDF file:", error);
        throw error;
      }
    };
  }
};
835
+
836
// src/file-parser/powerpoint-parser.ts
var PowerPointParser = class {
  /**
   * @param {AnyExtractor} anyExtractor - Delegate used to turn embedded
   *   slide images into text descriptions.
   */
  constructor(anyExtractor) {
    this.mimes = ["application/vnd.openxmlformats-officedocument.presentationml.presentation"];
    this.anyExtractor = anyExtractor;
  }
  /**
   * Extract slide text and embedded-image descriptions from a .pptx buffer.
   * Slides are processed in ascending slide number; each slide's text is
   * emitted first, then one "[Image]: ..." line per resolvable image.
   * @param {Buffer} file - Raw .pptx (ZIP) archive.
   * @param {object} extractingOptions - Forwarded to the image extractor.
   * @returns {Promise<string>} Newline-joined extracted content.
   * @throws Re-throws any archive/XML parsing failure after logging it.
   */
  async apply(file, extractingOptions) {
    // Fix: the trailing "xml.rels" dots are now escaped; the unescaped "."
    // also matched paths such as "slide1.xmlXrels".
    // NOTE(review): notesSlide*.xml entries match this pattern but are never
    // consumed below — confirm whether notes text should be extracted.
    const fileMatchRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+\.xml|ppt\/media\/image\d+\..+|ppt\/slides\/_rels\/slide\d+\.xml\.rels/i;
    const slideNumberRegex = /slide(\d+)\.xml/;
    const imageRegex = /^ppt\/media\/image\d+\..+$/i;
    try {
      const files = await extractFiles(file, (x) => fileMatchRegex.test(x));
      const imageBuffers = {};
      const slideXmls = {}; // slide number -> slide XML string
      const relsFiles = {}; // slide number -> relationships XML string
      for (const entry of files) {
        if (imageRegex.test(entry.path)) {
          imageBuffers[entry.path] = entry.content;
        } else if (/ppt\/slides\/slide\d+\.xml/.test(entry.path)) {
          const match = entry.path.match(slideNumberRegex);
          if (match) slideXmls[+match[1]] = entry.content.toString();
        } else if (/ppt\/slides\/_rels\/slide\d+\.xml\.rels/.test(entry.path)) {
          const match = entry.path.match(slideNumberRegex);
          if (match) relsFiles[+match[1]] = entry.content.toString();
        }
      }
      const results = [];
      const sortedSlideNumbers = Object.keys(slideXmls).map(Number).sort((a, b) => a - b);
      for (const slideNumber of sortedSlideNumbers) {
        const slideText = this.extractTextFromXml(slideXmls[slideNumber]);
        if (slideText) results.push(slideText);
        // Resolve each image referenced by this slide's .rels part and
        // describe it via the shared extractor pipeline.
        const imagePaths = this.extractImagePathsFromRels(relsFiles[slideNumber]);
        for (const imagePath of imagePaths) {
          // Rels targets are relative (e.g. "../media/image1.png"); map them
          // to the archive path "ppt/media/image1.png".
          const imageFullPath = `ppt/${imagePath.replace(/^(\.\.\/)+/, "")}`;
          const imageBuffer = imageBuffers[imageFullPath];
          if (imageBuffer) {
            const imageDescription = await this.convertImageToText(imageBuffer, extractingOptions);
            if (imageDescription) {
              results.push(`[Image]: ${imageDescription}`);
            }
          }
        }
      }
      return results.join("\n");
    } catch (error) {
      console.error("AnyExtractor: Error parsing PowerPoint file:", error);
      throw error;
    }
  }
  /**
   * Join the text runs (<a:t>) of every paragraph (<a:p>) in a slide XML.
   * @param {string} xml
   * @returns {string} One line per paragraph that contains text runs.
   */
  extractTextFromXml(xml) {
    const xmlParagraphNodesList = parseString(xml).getElementsByTagName("a:p");
    return Array.from(xmlParagraphNodesList).filter((paragraphNode) => paragraphNode.getElementsByTagName("a:t").length > 0).map((paragraphNode) => {
      const xmlTextNodeList = paragraphNode.getElementsByTagName("a:t");
      return Array.from(xmlTextNodeList).map((textNode) => textNode.childNodes[0]?.nodeValue || "").join("");
    }).join("\n");
  }
  /**
   * List image Targets from a slide's relationships XML.
   * @param {string|undefined} relsXml
   * @returns {string[]} Relative image paths (empty when relsXml is absent).
   */
  extractImagePathsFromRels(relsXml) {
    if (!relsXml) return [];
    const rels = parseString(relsXml).getElementsByTagName("Relationship");
    return Array.from(rels).filter((rel) => rel.getAttribute("Type")?.includes("/image") && rel.getAttribute("Target")).map((rel) => rel.getAttribute("Target"));
  }
  /**
   * Describe an embedded image by re-entering the extractor pipeline.
   * @param {Buffer} imageBuffer
   * @param {object} extractingOptions
   * @returns {Promise<string>}
   */
  async convertImageToText(imageBuffer, extractingOptions) {
    return this.anyExtractor.parseFile(imageBuffer, null, extractingOptions);
  }
};
902
+
903
// src/file-parser/simple-parser.ts
var SimpleParser = class {
  constructor() {
    // Plain-text formats that need no structural parsing.
    this.mimes = ["text/plain", "application/json"];
    /**
     * Decode the buffer as UTF-8 text.
     * @param {Buffer} buffer - Raw file contents.
     * @returns {Promise<string>} The decoded string.
     */
    this.apply = async (buffer) => buffer.toString("utf-8");
  }
};
912
+
913
// src/file-parser/word-parser.ts
var WordParser = class {
  /**
   * @param {AnyExtractor} anyExtractor - Delegate used to turn embedded
   *   document images into text descriptions.
   */
  constructor(anyExtractor) {
    this.mimes = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];
    this.anyExtractor = anyExtractor;
  }
  /**
   * Extract body text, footnotes, endnotes, and embedded-image descriptions
   * from a .docx buffer.
   * @param {Buffer} file - Raw .docx (ZIP) archive.
   * @param {object} extractingOptions - Forwarded to the image extractor.
   * @returns {Promise<string>} Main text followed by optional
   *   "--- Footnotes ---" and "--- Endnotes ---" sections.
   * @throws Throws ERRORMSG.fileCorrupted when document.xml or its
   *   relationships part is missing; re-throws other failures after logging.
   */
  async apply(file, extractingOptions) {
    // Fix: the originals used "[\d+]?" — a character class matching ONE digit
    // or a literal "+", optionally — so "document12.xml" never matched, and
    // unescaped dots matched any character. "\d*" accepts any numeric suffix
    // and "\." matches only a real dot.
    const mainRegex = /word\/document\d*\.xml/;
    const footnotesRegex = /word\/footnotes\d*\.xml/;
    const endnotesRegex = /word\/endnotes\d*\.xml/;
    const mediaRegex = /^word\/media\//;
    const relsRegex = /^word\/_rels\/document\.xml\.rels$/;
    try {
      const files = await extractFiles(
        file,
        (filePath) => [mainRegex, footnotesRegex, endnotesRegex, relsRegex].some((r) => r.test(filePath)) || mediaRegex.test(filePath)
      );
      const getFile = (regex) => files.find((f) => regex.test(f.path));
      const mainDoc = getFile(mainRegex);
      const footnotesDoc = getFile(footnotesRegex);
      const endnotesDoc = getFile(endnotesRegex);
      const relsFile = getFile(relsRegex);
      if (!mainDoc || !relsFile) {
        throw ERRORMSG.fileCorrupted("Main content or relationships file is missing.");
      }
      // Index media entries by bare file name; relationship Targets reference
      // them as "media/<name>".
      const mediaFiles = {};
      for (const entry of files) {
        if (mediaRegex.test(entry.path)) {
          const fileName = entry.path.split("/").pop();
          mediaFiles[fileName] = entry;
        }
      }
      const embedMap = this.parseRelationships(relsFile.content.toString());
      const mainText = await this.extractTextAndImages(
        mainDoc.content.toString(),
        embedMap,
        mediaFiles,
        extractingOptions
      );
      const footnotesText = footnotesDoc ? await this.extractTextAndImages(
        footnotesDoc.content.toString(),
        embedMap,
        mediaFiles,
        extractingOptions
      ) : "";
      const endnotesText = endnotesDoc ? await this.extractTextAndImages(
        endnotesDoc.content.toString(),
        embedMap,
        mediaFiles,
        extractingOptions
      ) : "";
      return [
        mainText,
        footnotesText ? "\n--- Footnotes ---\n" + footnotesText : "",
        endnotesText ? "\n--- Endnotes ---\n" + endnotesText : ""
      ].join("\n");
    } catch (error) {
      console.error("AnyExtractor: Error parsing Word file:", error);
      throw error;
    }
  }
  /**
   * Map relationship Ids to media file names from document.xml.rels.
   * Only Targets under "media/" are kept.
   * @param {string} xmlContent
   * @returns {Object<string, string>} rId -> media file name.
   */
  parseRelationships(xmlContent) {
    const doc = parseString(xmlContent);
    const rels = doc.getElementsByTagName("Relationship");
    const map = {};
    for (const rel of Array.from(rels)) {
      const id = rel.getAttribute("Id");
      const target = rel.getAttribute("Target");
      if (id && target?.startsWith("media/")) {
        const filename = target.split("/").pop();
        map[id] = filename;
      }
    }
    return map;
  }
  /**
   * Walk each paragraph (<w:p>), collecting its text runs (<w:t>) and
   * appending an "[Image: ...]" line for every drawing whose r:embed id
   * resolves to a known media file.
   * @param {string} xmlContent - Document-part XML.
   * @param {Object<string, string>} embedMap - rId -> media file name.
   * @param {Object<string, {content: Buffer}>} mediaFiles - name -> archive entry.
   * @param {object} extractingOptions - Forwarded to the image extractor.
   * @returns {Promise<string>} Trimmed, newline-joined non-empty paragraphs.
   */
  async extractTextAndImages(xmlContent, embedMap, mediaFiles, extractingOptions) {
    const doc = parseString(xmlContent);
    const paragraphs = Array.from(doc.getElementsByTagName("w:p"));
    const parts = [];
    for (const paragraph of paragraphs) {
      let paragraphText = "";
      const texts = Array.from(paragraph.getElementsByTagName("w:t"));
      paragraphText += texts.map((t) => t.childNodes[0]?.nodeValue || "").join("");
      const drawings = Array.from(paragraph.getElementsByTagName("w:drawing"));
      for (const drawing of drawings) {
        const blip = drawing.getElementsByTagName("a:blip")[0];
        const embedId = blip?.getAttribute("r:embed");
        if (embedId && embedMap[embedId]) {
          const imageFile = mediaFiles[embedMap[embedId]];
          if (imageFile) {
            const imageDescription = await this.convertImageToText(imageFile.content, extractingOptions);
            paragraphText += `\n[Image: ${imageDescription}]`;
          }
        }
      }
      if (paragraphText.trim()) {
        parts.push(paragraphText.trim());
      }
    }
    return parts.join("\n");
  }
  /**
   * Describe an embedded image by re-entering the extractor pipeline.
   * @param {Buffer} imageBuffer
   * @param {object} extractingOptions
   * @returns {Promise<string>}
   */
  async convertImageToText(imageBuffer, extractingOptions) {
    return await this.anyExtractor.parseFile(imageBuffer, null, extractingOptions);
  }
};
1020
+
1021
// src/index.ts
/**
 * Build an AnyExtractor wired with every bundled file parser.
 * @param {object} config - Passed through to the AnyExtractor constructor.
 * @returns {AnyExtractor} The configured extractor instance.
 */
var getAnyExtractor = (config) => {
  const anyExtractor = new AnyExtractor(config);
  // Parsers that recurse into embedded content receive the extractor itself.
  const allParsers = [
    new ExcelParser(anyExtractor),
    new ImageParser(),
    new OpenOfficeParser(),
    new PDFParser(),
    new PowerPointParser(anyExtractor),
    new SimpleParser(),
    new WordParser(anyExtractor)
  ];
  for (const parser of allParsers) {
    anyExtractor.addParser(parser);
  }
  return anyExtractor;
};
1036
+ // Annotate the CommonJS export names for ESM import in node:
1037
+ 0 && (module.exports = {
1038
+ getAnyExtractor
1039
+ });
1040
+ //# sourceMappingURL=index.js.map