any-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
/**
 * Tells `extractText` how to interpret a string `input`: `'file'` reads it as a
 * local path, `'fileurl'` fetches it as a URL, and any other value wraps the
 * string itself in a Buffer.
 */
+ type InputType = 'buffer' | 'file' | 'fileurl';
2
/** Argument object accepted by `AnyExtractor.extractText`. */
+ type ExtractionPayload = {
3
+ type: InputType;
4
// A path, URL or raw text (string), or in-memory file contents; non-string
// values are used as-is and `type` is not consulted for them.
+ input: string | Buffer;
5
+ };
6
/** A pluggable parser that can be registered through `AnyExtractor.addParser`. */
+ type AnyParserMethod = {
7
// MIME types this parser is registered for.
+ mimes: string[];
8
// Receives the file contents and resolves to the extracted text.
+ apply: (_: Buffer) => Promise<string>;
9
+ };
10
/** One entry read out of a zip archive: its entry name and its content as a string. */
+ type ExtractedFile = {
11
+ path: string;
12
+ content: string;
13
+ };
14
+
15
/**
 * Registry of parsers keyed by MIME type. Note that the class itself is not in
 * the export list below; instances are obtained through `getAnyExtractor`.
 */
+ declare class AnyExtractor {
16
// MIME type -> parser lookup filled by `addParser`.
+ private parserMap;
17
// Every parser passed to `addParser`, in registration order.
+ private parsers;
18
// Registers `method` for each of its MIME types and returns the extractor for chaining.
+ addParser: (method: AnyParserMethod) => this;
19
// Lists the MIME types that currently have a parser registered.
+ getRegisteredParsers: () => string[];
20
// Detects the MIME type of the input and delegates to the matching parser.
// Input whose type cannot be detected is returned decoded as UTF-8; a detected
// type without a registered parser makes the promise reject.
+ extractText: ({ input, type }: ExtractionPayload) => Promise<string>;
21
+ }
22
+
23
/** Creates an extractor with the bundled Excel, OpenOffice, PDF, PowerPoint and Word parsers registered. */
+ declare const getAnyExtractor: () => AnyExtractor;
24
+
25
+ export { type AnyParserMethod, type ExtractedFile, type ExtractionPayload, type InputType, getAnyExtractor };
@@ -0,0 +1,25 @@
1
/**
 * Tells `extractText` how to interpret a string `input`: `'file'` reads it as a
 * local path, `'fileurl'` fetches it as a URL, and any other value wraps the
 * string itself in a Buffer.
 */
+ type InputType = 'buffer' | 'file' | 'fileurl';
2
/** Argument object accepted by `AnyExtractor.extractText`. */
+ type ExtractionPayload = {
3
+ type: InputType;
4
// A path, URL or raw text (string), or in-memory file contents; non-string
// values are used as-is and `type` is not consulted for them.
+ input: string | Buffer;
5
+ };
6
/** A pluggable parser that can be registered through `AnyExtractor.addParser`. */
+ type AnyParserMethod = {
7
// MIME types this parser is registered for.
+ mimes: string[];
8
// Receives the file contents and resolves to the extracted text.
+ apply: (_: Buffer) => Promise<string>;
9
+ };
10
/** One entry read out of a zip archive: its entry name and its content as a string. */
+ type ExtractedFile = {
11
+ path: string;
12
+ content: string;
13
+ };
14
+
15
/**
 * Registry of parsers keyed by MIME type. Note that the class itself is not in
 * the export list below; instances are obtained through `getAnyExtractor`.
 */
+ declare class AnyExtractor {
16
// MIME type -> parser lookup filled by `addParser`.
+ private parserMap;
17
// Every parser passed to `addParser`, in registration order.
+ private parsers;
18
// Registers `method` for each of its MIME types and returns the extractor for chaining.
+ addParser: (method: AnyParserMethod) => this;
19
// Lists the MIME types that currently have a parser registered.
+ getRegisteredParsers: () => string[];
20
// Detects the MIME type of the input and delegates to the matching parser.
// Input whose type cannot be detected is returned decoded as UTF-8; a detected
// type without a registered parser makes the promise reject.
+ extractText: ({ input, type }: ExtractionPayload) => Promise<string>;
21
+ }
22
+
23
/** Creates an extractor with the bundled Excel, OpenOffice, PDF, PowerPoint and Word parsers registered. */
+ declare const getAnyExtractor: () => AnyExtractor;
24
+
25
+ export { type AnyParserMethod, type ExtractedFile, type ExtractionPayload, type InputType, getAnyExtractor };
package/dist/index.js ADDED
@@ -0,0 +1,428 @@
1
+ "use strict";
2
// Module-interop helpers (they look bundler-generated). Short aliases for the
// Object reflection primitives used by the helpers below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines every entry of `all` on `target` as an enumerable getter, so the
 * exported value is looked up lazily each time it is read.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Mirrors the own properties of `from` onto `to` as live getters, skipping keys
 * `to` already owns and the single key named by `except`. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (isCopyable) {
    for (const key of __getOwnPropNames(from)) {
      if (__hasOwnProp.call(to, key) || key === except) {
        continue;
      }
      // Keep the source property's enumerability; treat a missing descriptor as enumerable.
      desc = __getOwnPropDesc(from, key);
      __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
    }
  }
  return to;
};

/**
 * Wraps a CommonJS module so it can be consumed like an ES module namespace.
 * "default" is set to the module itself when the importer is in node
 * compatibility mode or the module was not flagged with "__esModule".
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  const base = needsDefault
    ? __defProp(target, "default", { value: mod, enumerable: true })
    : target;
  return __copyProps(base, mod);
};

/** Produces a CommonJS exports object flagged with "__esModule" from `mod`. */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
29
+
30
+ // src/index.ts
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ getAnyExtractor: () => getAnyExtractor
34
+ });
35
+ module.exports = __toCommonJS(index_exports);
36
+
37
+ // src/util.ts
38
+ var import_promises = require("fs/promises");
39
+ var import_undici = require("undici");
40
+ var import_yauzl = __toESM(require("yauzl"));
41
+
42
+ // src/constant.ts
43
/**
 * User-facing error messages. Entries that need context (an extension, a file
 * path, a directory) are functions that interpolate it; the rest are plain strings.
 */
var ERRORMSG = {
  extensionUnsupported: (ext) => `Sorry, AnyExtractor currently supports docx, pptx, xlsx, odt, odp, ods, pdf files only. Create a ticket in Issues on github to add support for ${ext} files. Stay tuned for further updates.`,
  fileCorrupted: (filepath) => `Your file ${filepath} seems to be corrupted. If you are sure it is fine, please create a ticket in Issues on github with the file to reproduce error.`,
  fileDoesNotExist: (filepath) => `File ${filepath} could not be found! Check if the file exists or verify if the relative path to the file is correct from your terminal's location.`,
  locationNotFound: (location) => `Entered location ${location} is not reachable! Please make sure that the entered directory location exists. Check relative paths and reenter.`,
  improperArguments: `Improper arguments`,
  improperBuffers: `Error occurred while reading the file buffers`,
  invalidInput: `Invalid input type: Expected a Buffer or a valid file path`
};
52
+
53
+ // src/util.ts
54
+ var import_concat_stream = __toESM(require("concat-stream"));
55
+ var import_xmldom = require("@xmldom/xmldom");
56
/**
 * Reads the file at `filePath` through `fs/promises` and resolves with whatever
 * that call yields (no encoding is passed).
 */
var readFile = async (filePath) => {
  const contents = await (0, import_promises.readFile)(filePath);
  return contents;
};
57
/**
 * Downloads `url` and resolves with the response body as a Buffer.
 *
 * @param url Address to fetch.
 * @returns The full response body.
 * @throws Error when the response status is not OK; the message keeps the
 *   "Failed to fetch: " prefix and now carries the numeric status as well,
 *   because the status text alone can be empty.
 */
var readFileUrl = async (url) => {
  const res = await (0, import_undici.fetch)(url);
  if (!res.ok) {
    // Release the unread body so the underlying connection is not held open.
    // Best effort only: a failing cancel must not mask the HTTP error below.
    await res.body?.cancel().catch(() => {
    });
    throw new Error(`Failed to fetch: ${`${res.status} ${res.statusText}`.trim()}`);
  }
  return Buffer.from(await res.arrayBuffer());
};
62
/**
 * Reads a zip archive and collects the entries selected by `filterFn`.
 *
 * Entries are pulled one at a time (`lazyEntries`): the next entry is requested
 * only after the current one has been skipped or fully read, so the result
 * follows the archive's entry order.
 *
 * @param zipInput Archive contents as a Buffer, or a path to the archive.
 * @param filterFn Receives each entry name; return a truthy value to extract it.
 * @returns Promise of `{ path, content }` objects, with `content` decoded via `toString()`.
 * @throws Rejects with the underlying error when the archive or an entry cannot
 *   be read or `filterFn` throws, and with a TypeError when `zipInput` is
 *   neither a Buffer nor a string.
 */
var extractFiles = (zipInput, filterFn) => {
  return new Promise((res, rej) => {
    const processZipfile = (zipfile) => {
      const extractedFiles = [];
      // Single failure path: release the archive, then settle the promise.
      const fail = (error) => {
        zipfile.close();
        rej(error);
      };
      const processEntry = (entry) => {
        let wanted;
        try {
          wanted = filterFn(entry.fileName);
        } catch (error) {
          // A throwing filter would otherwise surface as an uncaught exception
          // inside the event handler and leave the promise pending forever.
          fail(error);
          return;
        }
        if (!wanted) {
          zipfile.readEntry();
          return;
        }
        zipfile.openReadStream(entry, (err, readStream) => {
          if (err) {
            fail(err);
            return;
          }
          // Without this handler a corrupt entry would never settle the promise.
          readStream.on("error", fail);
          readStream.pipe((0, import_concat_stream.default)((data) => {
            extractedFiles.push({
              path: entry.fileName,
              content: data.toString()
            });
            zipfile.readEntry();
          }));
        });
      };
      zipfile.on("entry", processEntry);
      zipfile.on("end", () => res(extractedFiles));
      zipfile.on("error", fail);
      // Start reading only once all listeners are attached.
      zipfile.readEntry();
    };
    const onOpened = (err, zipfile) => {
      if (err) {
        rej(err);
        return;
      }
      processZipfile(zipfile);
    };
    if (Buffer.isBuffer(zipInput)) {
      import_yauzl.default.fromBuffer(zipInput, { lazyEntries: true }, onOpened);
    } else if (typeof zipInput === "string") {
      import_yauzl.default.open(zipInput, { lazyEntries: true }, onOpened);
    } else {
      // Reject with a real Error (not a bare string) so callers get a stack trace.
      rej(new TypeError(ERRORMSG.invalidInput));
    }
  });
};
101
/** Parses `xml` as "text/xml" with a fresh xmldom `DOMParser` and returns the resulting document. */
var parseString = (xml) => new import_xmldom.DOMParser().parseFromString(xml, "text/xml");
105
+
106
+ // src/extractors/any-extractor.ts
107
+ var import_file_type = require("file-type");
108
/**
 * Registry of text-extraction parsers keyed by MIME type.
 *
 * The three operations are instance properties (arrow functions) rather than
 * prototype methods, so they stay bound when detached from the instance.
 */
var AnyExtractor = class {
  constructor() {
    // MIME type -> parser; a later registration for the same MIME replaces the earlier one.
    this.parserMap = /* @__PURE__ */ new Map();
    // Every parser ever registered, in registration order.
    this.parsers = [];
    /**
     * Registers `method` for each MIME type in `method.mimes`.
     * @returns This extractor, to allow chaining.
     */
    this.addParser = (method) => {
      this.parsers.push(method);
      for (const mime of method.mimes) {
        this.parserMap.set(mime, method);
      }
      return this;
    };
    /** @returns The MIME types that currently have a parser, in first-registration order. */
    this.getRegisteredParsers = () => Array.from(this.parserMap.keys());
    /**
     * Extracts text from a file path, URL, raw string or in-memory bytes.
     *
     * @param payload `input` plus `type`; `type` is only consulted when `input`
     *   is a string ("file" reads a path, "fileurl" fetches a URL, anything else
     *   treats the string itself as the content).
     * @returns The parser's output, or the input decoded as UTF-8 when no MIME
     *   type can be detected.
     * @throws TypeError when `input` is neither a string, a Buffer nor a
     *   Uint8Array; Error when the detected MIME type has no registered parser.
     */
    this.extractText = async ({ input, type }) => {
      let preparedInput;
      if (typeof input === "string") {
        switch (type) {
          case "file":
            preparedInput = await readFile(input);
            break;
          case "fileurl":
            preparedInput = await readFileUrl(input);
            break;
          default:
            preparedInput = Buffer.from(input);
        }
      } else if (Buffer.isBuffer(input)) {
        preparedInput = input;
      } else if (input instanceof Uint8Array) {
        // A plain Uint8Array ignores the encoding argument of toString() and
        // would yield comma-separated numbers; view the same bytes as a Buffer.
        preparedInput = Buffer.from(input.buffer, input.byteOffset, input.byteLength);
      } else {
        throw new TypeError(ERRORMSG.invalidInput);
      }
      const mimeDetails = await (0, import_file_type.fileTypeFromBuffer)(preparedInput);
      if (!mimeDetails) return preparedInput.toString("utf-8");
      const extractor = this.parserMap.get(mimeDetails.mime);
      if (!extractor?.apply) {
        const message = `AnyExtractor: No extraction method registered for MIME type '${mimeDetails.mime}'`;
        throw new Error(message);
      }
      return extractor.apply(preparedInput);
    };
  }
};
149
+
150
+ // src/parser/excel-parser.ts
151
/**
 * Parser for Office Open XML spreadsheets (.xlsx).
 *
 * Text is gathered from worksheet cells, then drawings, then charts, each
 * joined with newlines.
 */
var ExcelParser = class {
  constructor() {
    this.mimes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"];
  }
  /**
   * Extracts the text of an .xlsx archive.
   *
   * @param file Archive contents (or anything `extractFiles` accepts).
   * @returns Cell, drawing and chart text joined with "\n".
   * @throws Error when the archive has no worksheet part or a cell points at a
   *   shared-string index that does not exist; errors from unzipping or XML
   *   parsing are logged and rethrown unchanged.
   */
  async apply(file) {
    // Non-global regexes so `.test()` is stateless; the dot before "xml" is literal.
    const sheetsRegex = /xl\/worksheets\/sheet\d+\.xml/;
    const drawingsRegex = /xl\/drawings\/drawing\d+\.xml/;
    const chartsRegex = /xl\/charts\/chart\d+\.xml/;
    const stringsFilePath = "xl/sharedStrings.xml";
    // The parser only sees bytes, not a path, so the message names the input generically.
    const corrupted = () => new Error(ERRORMSG.fileCorrupted("(XLSX input)"));
    const firstChildText = (node) => node.childNodes[0]?.nodeValue ?? "";
    // True when `tNode` sits inside an <rPh> (phonetic run) below `siNode`.
    const isInsidePhoneticRun = (tNode, siNode) => {
      for (let current = tNode.parentNode; current && current !== siNode; current = current.parentNode) {
        if (current.tagName === "rPh") return true;
      }
      return false;
    };
    // A <c t="inlineStr"> cell holding exactly one <is> with exactly one non-empty <t>.
    const isValidInlineStringCNode = (cNode) => {
      if (cNode.tagName.toLowerCase() != "c") return false;
      if (cNode.getAttribute("t") != "inlineStr") return false;
      const childNodesNamedIs = cNode.getElementsByTagName("is");
      if (childNodesNamedIs.length != 1) return false;
      const childNodesNamedT = childNodesNamedIs[0].getElementsByTagName("t");
      if (childNodesNamedT.length != 1) return false;
      return childNodesNamedT[0].childNodes[0] && childNodesNamedT[0].childNodes[0].nodeValue != "";
    };
    // A cell whose first <v> has a non-empty text child.
    const hasValidVNodeInCNode = (cNode) => {
      const vNodes = cNode.getElementsByTagName("v");
      return vNodes[0] && vNodes[0].childNodes[0] && vNodes[0].childNodes[0].nodeValue != "";
    };
    // Joins the <a:t> runs of each paragraph that has any, one paragraph per line.
    const paragraphsText = (xmlContent) => Array.from(parseString(xmlContent).getElementsByTagName("a:p")).filter((paragraphNode) => paragraphNode.getElementsByTagName("a:t").length != 0).map((paragraphNode) => Array.from(paragraphNode.getElementsByTagName("a:t")).map(firstChildText).join("")).join("\n");
    try {
      const files = await extractFiles(
        file,
        (x) => [sheetsRegex, drawingsRegex, chartsRegex].some((fileRegex) => fileRegex.test(x)) || x == stringsFilePath
      );
      if (!files.some((entry) => sheetsRegex.test(entry.path))) {
        throw corrupted();
      }
      const contentsMatching = (fileRegex) => files.filter((entry) => fileRegex.test(entry.path)).map((entry) => entry.content);
      const sharedStringsFile = files.find((entry) => entry.path == stringsFilePath)?.content;
      // One shared string per <si>. A rich-text <si> carries several <r><t> runs
      // that together form ONE string, so indexing by <t> would shift every
      // later index. Phonetic runs (<rPh>) are hints, not cell text, and are skipped.
      const sharedStrings = sharedStringsFile === void 0 ? [] : Array.from(parseString(sharedStringsFile).getElementsByTagName("si")).map(
        (siNode) => Array.from(siNode.getElementsByTagName("t")).filter((tNode) => !isInsidePhoneticRun(tNode, siNode)).map(firstChildText).join("")
      );
      const responseText = [];
      for (const sheetXmlContent of contentsMatching(sheetsRegex)) {
        const cellTexts = [];
        for (const cNode of Array.from(parseString(sheetXmlContent).getElementsByTagName("c"))) {
          if (isValidInlineStringCNode(cNode)) {
            cellTexts.push(cNode.getElementsByTagName("is")[0].getElementsByTagName("t")[0].childNodes[0].nodeValue);
            continue;
          }
          if (!hasValidVNodeInCNode(cNode)) continue;
          const rawValue = cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue;
          if (cNode.getAttribute("t") != "s") {
            // Keep the stored text verbatim: parsing it as an integer would
            // truncate decimals and turn formula strings into NaN.
            cellTexts.push(rawValue);
            continue;
          }
          const index = Number.parseInt(rawValue, 10);
          if (!(index >= 0 && index < sharedStrings.length)) {
            throw corrupted();
          }
          cellTexts.push(sharedStrings[index]);
        }
        responseText.push(cellTexts.join("\n"));
      }
      for (const drawingXmlContent of contentsMatching(drawingsRegex)) {
        responseText.push(paragraphsText(drawingXmlContent));
      }
      for (const chartXmlContent of contentsMatching(chartsRegex)) {
        responseText.push(
          Array.from(parseString(chartXmlContent).getElementsByTagName("c:v")).map(firstChildText).filter((text) => text).join("\n")
        );
      }
      return responseText.join("\n");
    } catch (error) {
      console.error("Error parsing Excel file:", error);
      throw error;
    }
  }
};
229
+
230
+ // src/parser/openoffice-paser.ts
231
/**
 * Parser for OpenDocument files (text, spreadsheet, presentation, graphics, formula).
 *
 * Reads `content.xml` plus any embedded `Object N/content.xml`, collects the
 * text of top-level `text:p` / `text:h` elements, and appends text found under
 * `presentation:notes` after the main text.
 */
var OpenOfficeParser = class {
  constructor() {
    this.mimes = [
      "application/vnd.oasis.opendocument.text",
      "application/vnd.oasis.opendocument.spreadsheet",
      "application/vnd.oasis.opendocument.presentation",
      "application/vnd.oasis.opendocument.graphics",
      "application/vnd.oasis.opendocument.formula"
    ];
    /**
     * Extracts the text of an OpenDocument archive.
     *
     * @param file Archive contents (or anything `extractFiles` accepts).
     * @returns Main text followed by notes text, joined with "\n".
     * @throws Error when the archive has no `content.xml`; errors from unzipping
     *   or XML parsing are logged and rethrown unchanged.
     */
    this.apply = async (file) => {
      const mainContentFilePath = "content.xml";
      // Non-global so `.test()` is stateless; the dot before "xml" is literal.
      const objectContentFilesRegex = /Object \d+\/content\.xml/;
      const allowedTextTags = ["text:p", "text:h"];
      const notesTag = "presentation:notes";
      // Notes text is collected separately so it can be appended after the main text.
      const notesText = [];
      // True when `node` or any ancestor is the notes container.
      const isNotesNode = (node) => {
        for (let current = node; current; current = current.parentNode) {
          if (current.tagName == notesTag) return true;
        }
        return false;
      };
      // True when `node` or any ancestor is itself a text:p / text:h, i.e. the
      // element being examined is nested and is handled via its outer block.
      const hasTextBlockAncestorOrSelf = (node) => {
        for (let current = node; current; current = current.parentNode) {
          if (allowedTextTags.includes(current.tagName)) return true;
        }
        return false;
      };
      // Depth-first walk pushing leaf text into `sink` (or `notesText` for notes).
      // A newline follows text whose parent is a text block unless the leaf is a
      // direct child of the block the walk started from.
      const collectText = (node, sink, isDirectChild) => {
        if (!node.childNodes || node.childNodes.length == 0) {
          const parent = node.parentNode;
          if (parent && parent.tagName.indexOf("text") == 0 && node.nodeValue) {
            const target = isNotesNode(parent) ? notesText : sink;
            target.push(node.nodeValue);
            if (allowedTextTags.includes(parent.tagName) && !isDirectChild) {
              target.push("\n");
            }
          }
          return;
        }
        for (let i = 0; i < node.childNodes.length; i++) {
          collectText(node.childNodes[i], sink, false);
        }
      };
      const extractAllTextsFromNode = (root) => {
        const xmlTextArray = [];
        for (let i = 0; i < root.childNodes.length; i++) {
          collectText(root.childNodes[i], xmlTextArray, true);
        }
        return xmlTextArray.join("");
      };
      try {
        const files = await extractFiles(file, (x) => x == mainContentFilePath || objectContentFilesRegex.test(x));
        const mainContentFile = files.find((entry) => entry.path == mainContentFilePath);
        if (!mainContentFile) {
          // The parser only sees bytes, not a path, so the message names the input generically.
          throw new Error(ERRORMSG.fileCorrupted("(OpenDocument input)"));
        }
        const objectContentFiles = files.filter((entry) => objectContentFilesRegex.test(entry.path)).map((entry) => entry.content);
        const responseText = [];
        for (const xmlContent of [mainContentFile.content, ...objectContentFiles]) {
          const xmlTextNodesList = Array.from(parseString(xmlContent).getElementsByTagName("*")).filter(
            (node) => allowedTextTags.includes(node.tagName) && !hasTextBlockAncestorOrSelf(node.parentNode)
          );
          responseText.push(
            xmlTextNodesList.map((textNode) => extractAllTextsFromNode(textNode)).filter((text) => text != "").join("\n")
          );
        }
        return [...responseText, ...notesText].join("\n");
      } catch (error) {
        console.error("Error parsing OpenOffice file:", error);
        throw error;
      }
    };
  }
};
316
+
317
+ // src/parser/pdf-parser.ts
318
+ var import_pdf_parse = __toESM(require("pdf-parse"));
319
/** Parser for PDF files; delegates to `pdf-parse` and returns its `text` field. */
var PDFParser = class {
  constructor() {
    this.mimes = ["application/pdf"];
    // Failures are logged and then rethrown unchanged to the caller.
    this.apply = async (file) => {
      try {
        const parsed = await (0, import_pdf_parse.default)(file);
        return parsed.text;
      } catch (error) {
        console.error("Error parsing PDF file:", error);
        throw error;
      }
    };
  }
};
334
+
335
+ // src/parser/powerpoint-parser.ts
336
/**
 * Parser for Office Open XML presentations (.pptx).
 *
 * Output order: all slides by slide number, then all notes slides by number.
 */
var PowerPointParser = class {
  constructor() {
    this.mimes = ["application/vnd.openxmlformats-officedocument.presentationml.presentation"];
  }
  /**
   * Extracts the text of a .pptx archive.
   *
   * @param file Archive contents (or anything `extractFiles` accepts).
   * @returns Paragraph text of slides followed by notes, joined with "\n".
   * @throws Error when the archive contains no slide part; errors from
   *   unzipping or XML parsing are logged and rethrown unchanged.
   */
  async apply(file) {
    // Non-global so `.test()` is stateless; the dot before "xml" is literal.
    const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+\.xml/;
    const slidesRegex = /ppt\/slides\/slide\d+\.xml/;
    const slideNumberRegex = /lide(\d+)\.xml/;
    // Paths without a recognisable number sort last within their group.
    const slideNumberOf = (path) => {
      const parsed = Number.parseInt(path.match(slideNumberRegex)?.[1] ?? "", 10);
      return Number.isNaN(parsed) ? Infinity : parsed;
    };
    try {
      const files = await extractFiles(file, (x) => allFilesRegex.test(x));
      // Validate before doing any ordering work.
      if (!files.some((entry) => slidesRegex.test(entry.path))) {
        // The parser only sees bytes, not a path, so the message names the input generically.
        throw new Error(ERRORMSG.fileCorrupted("(PPTX input)"));
      }
      // Slides first, notes second, each group in ascending slide number. This
      // is the order the previous two consecutive sorts ended up producing.
      files.sort((a, b) => {
        const byKind = Number(a.path.includes("notes")) - Number(b.path.includes("notes"));
        if (byKind !== 0) return byKind;
        const aNumber = slideNumberOf(a.path);
        const bNumber = slideNumberOf(b.path);
        // Compare instead of subtracting: Infinity - Infinity would be NaN.
        return aNumber === bNumber ? 0 : aNumber < bNumber ? -1 : 1;
      });
      const responseText = files.map(
        (entry) => Array.from(parseString(entry.content).getElementsByTagName("a:p")).filter((paragraphNode) => paragraphNode.getElementsByTagName("a:t").length != 0).map(
          (paragraphNode) => Array.from(paragraphNode.getElementsByTagName("a:t")).map((textNode) => textNode.childNodes[0]?.nodeValue ?? "").join("")
        ).join("\n")
      );
      return responseText.join("\n");
    } catch (error) {
      console.error("Error parsing PowerPoint file:", error);
      throw error;
    }
  }
};
376
+
377
+ // src/parser/word-parser.ts
378
/**
 * Parser for Office Open XML word-processing documents (.docx).
 *
 * Collects paragraph text from the main document part and from the footnotes
 * and endnotes parts, in the order those parts appear in the archive.
 */
var WordParser = class {
  constructor() {
    this.mimes = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];
  }
  /**
   * Extracts the text of a .docx archive.
   *
   * @param file Archive contents (or anything `extractFiles` accepts).
   * @returns Paragraph text joined with "\n".
   * @throws Error when the archive has no main document part; errors from
   *   unzipping or XML parsing are logged and rethrown unchanged.
   */
  async apply(file) {
    // `\d*` accepts any numeric suffix (document.xml, document2.xml, document10.xml).
    // The former `[\d+]?` was a one-character class, so multi-digit suffixes
    // never matched. Patterns are anchored and non-global so `.test()` is exact
    // and stateless.
    const mainContentFileRegex = /^word\/document\d*\.xml$/;
    const footnotesFileRegex = /^word\/footnotes\d*\.xml$/;
    const endnotesFileRegex = /^word\/endnotes\d*\.xml$/;
    const wantedParts = [mainContentFileRegex, footnotesFileRegex, endnotesFileRegex];
    try {
      const files = await extractFiles(
        file,
        (x) => wantedParts.some((fileRegex) => fileRegex.test(x))
      );
      if (!files.some((entry) => mainContentFileRegex.test(entry.path))) {
        // The parser only sees bytes, not a path, so the message names the input generically.
        throw new Error(ERRORMSG.fileCorrupted("(DOCX input)"));
      }
      // `files` already contains only the wanted parts, so no second filter is needed.
      const responseText = files.map(
        (entry) => Array.from(parseString(entry.content).getElementsByTagName("w:p")).filter((paragraphNode) => paragraphNode.getElementsByTagName("w:t").length != 0).map(
          (paragraphNode) => Array.from(paragraphNode.getElementsByTagName("w:t")).map((textNode) => textNode.childNodes[0]?.nodeValue ?? "").join("")
        ).join("\n")
      );
      return responseText.join("\n");
    } catch (error) {
      console.error("Error parsing Word file:", error);
      throw error;
    }
  }
};
413
+
414
+ // src/index.ts
415
/**
 * Builds a fresh extractor with the bundled parsers registered in this order:
 * Excel, OpenOffice, PDF, PowerPoint, Word.
 */
var getAnyExtractor = () => new AnyExtractor()
  .addParser(new ExcelParser())
  .addParser(new OpenOfficeParser())
  .addParser(new PDFParser())
  .addParser(new PowerPointParser())
  .addParser(new WordParser());
424
+ // Annotate the CommonJS export names for ESM import in node:
425
+ 0 && (module.exports = {
426
+ getAnyExtractor
427
+ });
428
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts","../src/util.ts","../src/constant.ts","../src/extractors/any-extractor.ts","../src/parser/excel-parser.ts","../src/parser/openoffice-paser.ts","../src/parser/pdf-parser.ts","../src/parser/powerpoint-parser.ts","../src/parser/word-parser.ts"],"sourcesContent":["import { AnyExtractor } from \"./extractors/any-extractor\";\nimport { ExcelParser } from \"./parser/excel-parser\";\nimport { OpenOfficeParser } from \"./parser/openoffice-paser\";\nimport { PDFParser } from \"./parser/pdf-parser\";\nimport { PowerPointParser } from \"./parser/powerpoint-parser\";\nimport { WordParser } from \"./parser/word-parser\";\n\nexport const getAnyExtractor = (): AnyExtractor => {\n const anyExtractor = new AnyExtractor();\n \n anyExtractor.addParser(new ExcelParser());\n anyExtractor.addParser(new OpenOfficeParser());\n anyExtractor.addParser(new PDFParser());\n anyExtractor.addParser(new PowerPointParser());\n anyExtractor.addParser(new WordParser());\n\n return anyExtractor;\n}\n\nexport * from \"./types\";","import { readFile as read } from 'node:fs/promises';\nimport { fetch } from 'undici';\nimport yauzl from 'yauzl';\nimport { ERRORMSG } from './constant';\nimport { ExtractedFile } from './types';\nimport concat from 'concat-stream';\nimport { DOMParser } from '@xmldom/xmldom';\n\nexport const readFile = async (filePath: string): Promise<Buffer> =>\n (await read(filePath)) as unknown as Buffer;\n\nexport const readFileUrl = async (url: string): Promise<Buffer> => {\n const res = await fetch(url);\n if (!res.ok) throw new Error(`Failed to fetch: ${res.statusText}`);\n return Buffer.from(await res.arrayBuffer());\n}\n\nexport const extractFiles = (zipInput: Buffer | string, filterFn: (x: string) => boolean): Promise<ExtractedFile[]> => {\n return new Promise((res, rej) => {\n const processZipfile = (zipfile: yauzl.ZipFile) => {\n const extractedFiles: ExtractedFile[] = [];\n zipfile.readEntry();\n\n function processEntry(entry: 
yauzl.Entry) {\n if (filterFn(entry.fileName)) {\n zipfile.openReadStream(entry, (err, readStream) => {\n if (err)\n return rej(err);\n\n readStream.pipe(concat((data: Buffer) => {\n extractedFiles.push({\n path: entry.fileName,\n content: data.toString()\n });\n zipfile.readEntry();\n }));\n });\n }\n else\n zipfile.readEntry();\n }\n\n zipfile.on('entry', processEntry);\n zipfile.on('end', () => res(extractedFiles));\n zipfile.on('error', rej);\n };\n\n if (Buffer.isBuffer(zipInput)) {\n yauzl.fromBuffer(zipInput, { lazyEntries: true }, (err, zipfile) => {\n if (err) return rej(err);\n processZipfile(zipfile);\n });\n }\n else if (typeof zipInput === 'string') {\n yauzl.open(zipInput, { lazyEntries: true }, (err, zipfile) => {\n if (err) return rej(err);\n processZipfile(zipfile);\n });\n }\n else\n rej(ERRORMSG.invalidInput);\n });\n}\n\nexport const parseString = (xml: string) => {\n let parser = new DOMParser();\n return parser.parseFromString(xml, \"text/xml\");\n};","/** Header for error messages */\nexport const ERRORHEADER = \"[AnyExtractor]: \";\n\n/** Error messages */\nexport const ERRORMSG = {\n extensionUnsupported: (ext: string) => `Sorry, AnyExtractor currently support docx, pptx, xlsx, odt, odp, ods, pdf files only. Create a ticket in Issues on github to add support for ${ext} files. Stay tuned for further updates.`,\n fileCorrupted: (filepath: string) => `Your file ${filepath} seems to be corrupted. If you are sure it is fine, please create a ticket in Issues on github with the file to reproduce error.`,\n fileDoesNotExist: (filepath: string) => `File ${filepath} could not be found! Check if the file exists or verify if the relative path to the file is correct from your terminal's location.`,\n locationNotFound: (location: string) => `Entered location ${location} is not reachable! Please make sure that the entered directory location exists. 
Check relative paths and reenter.`,\n improperArguments: `Improper arguments`,\n improperBuffers: `Error occured while reading the file buffers`,\n invalidInput: `Invalid input type: Expected a Buffer or a valid file path`\n}","import { AnyParserMethod, ExtractionPayload } from \"../types\"\nimport { readFileUrl, readFile } from \"../util\"\nimport { fileTypeFromBuffer as getFileType } from 'file-type'\n\nexport class AnyExtractor {\n\tprivate parserMap: Map<string, AnyParserMethod> = new Map();\n\tprivate parsers: AnyParserMethod[] = [];\n\n\tpublic addParser = (method: AnyParserMethod): this => {\n\t\tthis.parsers.push(method);\n\t\tmethod.mimes.forEach((mime) => {\n\t\t\tthis.parserMap.set(mime, method);\n\t\t});\n\t\treturn this;\n\t}\n\n\tpublic getRegisteredParsers = (): string[] => {\n\t\treturn Array.from(this.parserMap.keys());\n\t}\n\n\tpublic extractText = async ({ input, type }: ExtractionPayload): Promise<string> => {\n\t\tlet preparedInput: Buffer;\n\t\tif (typeof input === 'string') {\n\t\t\tswitch (type) {\n\t\t\t\tcase 'file':\n\t\t\t\t\tpreparedInput = await readFile(input);\n\t\t\t\t\tbreak;\n\t\t\t\tcase 'fileurl':\n\t\t\t\t\tpreparedInput = await readFileUrl(input);\n\t\t\t\t\tbreak;\n\t\t\t\tdefault:\n\t\t\t\t\tpreparedInput = Buffer.from(input);\n\t\t\t}\n\t\t} else {\n\t\t\tpreparedInput = input;\n\t\t}\n\n\t\tconst mimeDetails = await getFileType(preparedInput);\n\t\tif (!mimeDetails) return preparedInput.toString('utf-8');\n\n\t\tconst extractor = this.parserMap.get(mimeDetails.mime);\n\n\t\tif (!extractor?.apply) {\n\t\t\tconst message = `AnyExtractor: No extraction method registered for MIME type '${mimeDetails.mime}'`;\n\t\t\tthrow new Error(message);\n\t\t}\n\n\t\treturn extractor.apply(preparedInput)\n\t}\n}","import { Element, LiveNodeList } from \"@xmldom/xmldom\";\nimport { ERRORMSG } from \"../constant\";\nimport { AnyParserMethod } from \"../types\";\nimport { extractFiles, parseString } from \"../util\";\n\nexport class 
ExcelParser implements AnyParserMethod {\n mimes = [\"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\"];\n\n async apply(file: Buffer): Promise<string> {\n const sheetsRegex = /xl\\/worksheets\\/sheet\\d+.xml/g;\n const drawingsRegex = /xl\\/drawings\\/drawing\\d+.xml/g;\n const chartsRegex = /xl\\/charts\\/chart\\d+.xml/g;\n const stringsFilePath = 'xl/sharedStrings.xml';\n\n try {\n const files = await extractFiles(file, x =>\n [sheetsRegex, drawingsRegex, chartsRegex].some(fileRegex => x.match(fileRegex)) || x == stringsFilePath\n );\n\n if (files.length == 0 || !files.map(file => file.path).some(filename => filename.match(sheetsRegex))) {\n throw ERRORMSG.fileCorrupted(\"TODO: figure this out\");\n }\n\n const xmlContentFilesObject = {\n sheetFiles: files.filter(file => file.path.match(sheetsRegex)).map(file => file.content),\n drawingFiles: files.filter(file => file.path.match(drawingsRegex)).map(file => file.content),\n chartFiles: files.filter(file => file.path.match(chartsRegex)).map(file => file.content),\n sharedStringsFile: files.filter(file => file.path == stringsFilePath).map(file => file.content)[0],\n };\n\n let responseText: string[] = [];\n\n function isValidInlineStringCNode(cNode: Element): boolean {\n if (cNode.tagName.toLowerCase() != 'c') return false;\n if (cNode.getAttribute(\"t\") != 'inlineStr') return false;\n const childNodesNamedIs: LiveNodeList<Element> = cNode.getElementsByTagName('is');\n if (childNodesNamedIs.length != 1) return false;\n const childNodesNamedT: LiveNodeList<Element> = childNodesNamedIs[0].getElementsByTagName('t');\n if (childNodesNamedT.length != 1) return false;\n return childNodesNamedT[0].childNodes[0] && childNodesNamedT[0].childNodes[0].nodeValue != '';\n }\n\n function hasValidVNodeInCNode(cNode: Element): boolean {\n const vNodes = cNode.getElementsByTagName(\"v\");\n return vNodes[0] && vNodes[0].childNodes[0] && vNodes[0].childNodes[0].nodeValue != '';\n }\n\n const 
sharedStringsXmlTNodesList = xmlContentFilesObject.sharedStringsFile != undefined\n ? parseString(xmlContentFilesObject.sharedStringsFile).getElementsByTagName(\"t\")\n : [];\n\n const sharedStrings = Array.from(sharedStringsXmlTNodesList)\n .map(tNode => tNode.childNodes[0]?.nodeValue ?? '');\n\n for (const sheetXmlContent of xmlContentFilesObject.sheetFiles) {\n const sheetsXmlCNodesList = parseString(sheetXmlContent).getElementsByTagName(\"c\");\n responseText.push(\n Array.from(sheetsXmlCNodesList)\n .filter(cNode => isValidInlineStringCNode(cNode) || hasValidVNodeInCNode(cNode))\n .map(cNode => {\n if (isValidInlineStringCNode(cNode))\n return cNode.getElementsByTagName('is')[0].getElementsByTagName('t')[0].childNodes[0].nodeValue;\n if (hasValidVNodeInCNode(cNode)) {\n const isIndexInSharedStrings = cNode.getAttribute(\"t\") == \"s\";\n const value = parseInt(cNode.getElementsByTagName(\"v\")[0].childNodes[0].nodeValue ?? \"\", 10);\n if (isIndexInSharedStrings && value >= sharedStrings.length)\n throw ERRORMSG.fileCorrupted(\"TODO: figure this out\");\n\n return isIndexInSharedStrings\n ? 
sharedStrings[value]\n : value;\n }\n return '';\n })\n .join(\"\\n\")\n );\n }\n\n for (const drawingXmlContent of xmlContentFilesObject.drawingFiles) {\n const drawingsXmlParagraphNodesList = parseString(drawingXmlContent).getElementsByTagName(\"a:p\");\n responseText.push(\n Array.from(drawingsXmlParagraphNodesList)\n .filter(paragraphNode => paragraphNode.getElementsByTagName(\"a:t\").length != 0)\n .map(paragraphNode => {\n const xmlTextNodeList = paragraphNode.getElementsByTagName(\"a:t\");\n return Array.from(xmlTextNodeList)\n .filter(textNode => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)\n .map(textNode => textNode.childNodes[0].nodeValue)\n .join(\"\");\n })\n .join(\"\\n\")\n );\n }\n\n for (const chartXmlContent of xmlContentFilesObject.chartFiles) {\n const chartsXmlCVNodesList = parseString(chartXmlContent).getElementsByTagName(\"c:v\");\n responseText.push(\n Array.from(chartsXmlCVNodesList)\n .filter(cVNode => cVNode.childNodes[0] && cVNode.childNodes[0].nodeValue)\n .map(cVNode => cVNode.childNodes[0].nodeValue)\n .join(\"\\n\")\n );\n }\n\n return responseText.join(\"\\n\");\n } catch (error) {\n console.error(\"Error parsing Excel file:\", error);\n throw error;\n }\n }\n}\n","import { ERRORMSG } from \"../constant\";\nimport { AnyParserMethod } from \"../types\";\nimport { extractFiles, parseString } from \"../util\";\nimport { Element, Node } from \"@xmldom/xmldom\";\n\nexport class OpenOfficeParser implements AnyParserMethod {\n mimes = [\"application/vnd.oasis.opendocument.text\",\n \"application/vnd.oasis.opendocument.spreadsheet\",\n \"application/vnd.oasis.opendocument.presentation\",\n \"application/vnd.oasis.opendocument.graphics\",\n \"application/vnd.oasis.opendocument.formula\"];\n\n apply = async (file: Buffer): Promise<string> => {\n const mainContentFilePath = 'content.xml';\n const objectContentFilesRegex = /Object \\d+\\/content.xml/g;\n\n try {\n const files = await extractFiles(file, x => x == 
mainContentFilePath || !!x.match(objectContentFilesRegex));\n\n if (!files.map(file => file.path).includes(mainContentFilePath)) {\n throw ERRORMSG.fileCorrupted(\"TODO: figure this out\");\n }\n\n const xmlContentFilesObject = {\n mainContentFile: files.filter(file => file.path == mainContentFilePath).map(file => file.content)[0],\n objectContentFiles: files.filter(file => file.path.match(objectContentFilesRegex)).map(file => file.content),\n };\n\n let notesText: string[] = [];\n let responseText: string[] = [];\n\n const allowedTextTags = [\"text:p\", \"text:h\"];\n const notesTag = \"presentation:notes\";\n\n function extractAllTextsFromNode(root: Element): string {\n let xmlTextArray: string[] = [];\n for (let i = 0; i < root.childNodes.length; i++) {\n traversal(root.childNodes[i], xmlTextArray, true);\n }\n return xmlTextArray.join(\"\");\n }\n\n function traversal(node: Node, xmlTextArray: string[], isFirstRecursion: boolean): void {\n if (!node.childNodes || node.childNodes.length == 0) {\n if (node.parentNode && (node.parentNode as Element).tagName.indexOf('text') == 0 && node.nodeValue) {\n if (isNotesNode(node.parentNode as Element)) {\n notesText.push(node.nodeValue);\n if (allowedTextTags.includes((node.parentNode as Element).tagName) && !isFirstRecursion) {\n notesText.push(\"\\n\");\n }\n } else {\n xmlTextArray.push(node.nodeValue);\n if (allowedTextTags.includes((node.parentNode as Element).tagName) && !isFirstRecursion) {\n xmlTextArray.push(\"\\n\");\n }\n }\n }\n return;\n }\n\n for (let i = 0; i < node.childNodes.length; i++) {\n traversal(node.childNodes[i] as Element, xmlTextArray, false);\n }\n }\n\n function isNotesNode(node: Element): boolean {\n if (node.tagName == notesTag) {\n return true;\n }\n if (node.parentNode) {\n return isNotesNode(node.parentNode as Element);\n }\n return false;\n }\n\n function isInvalidTextNode(node: Element) {\n if (allowedTextTags.includes(node.tagName)) {\n return true;\n }\n if (node.parentNode) {\n 
return isInvalidTextNode(node.parentNode as Element);\n }\n return false;\n }\n\n const xmlContentArray = [xmlContentFilesObject.mainContentFile, ...xmlContentFilesObject.objectContentFiles].map(xmlContent => parseString(xmlContent));\n xmlContentArray.forEach(xmlContent => {\n const xmlTextNodesList = [...Array.from(xmlContent\n .getElementsByTagName(\"*\"))\n .filter(node => allowedTextTags.includes(node.tagName)\n && !isInvalidTextNode(node.parentNode as Element))];\n responseText.push(\n xmlTextNodesList\n .map(textNode => extractAllTextsFromNode(textNode))\n .filter(text => text != \"\")\n .join(\"\\n\")\n );\n });\n\n responseText = [...responseText, ...notesText];\n return responseText.join(\"\\n\");\n\n } catch (error) {\n console.error(\"Error parsing OpenOffice file:\", error);\n throw error;\n }\n }\n}","import pdf from 'pdf-parse';\nimport { AnyParserMethod } from \"../types\";\n\nexport class PDFParser implements AnyParserMethod {\n mimes = [\"application/pdf\"];\n\n apply = async (file: Buffer): Promise<string> => {\n try {\n const data = await pdf(file);\n const textContent = data.text;\n return textContent;\n } catch (error) {\n console.error(\"Error parsing PDF file:\", error);\n throw error;\n }\n };\n}","import { ERRORMSG } from \"../constant\";\nimport { AnyParserMethod } from \"../types\";\nimport { extractFiles, parseString } from \"../util\";\n\nexport class PowerPointParser implements AnyParserMethod {\n mimes = [\"application/vnd.openxmlformats-officedocument.presentationml.presentation\"];\n\n async apply(file: Buffer): Promise<string> {\n const allFilesRegex = /ppt\\/(notesSlides|slides)\\/(notesSlide|slide)\\d+.xml/g;\n const slidesRegex = /ppt\\/slides\\/slide\\d+.xml/g;\n const slideNumberRegex = /lide(\\d+)\\.xml/;\n\n try {\n const files = await extractFiles(file, x => !!x.match(allFilesRegex));\n\n files.sort((a, b) => {\n const matchedANumber = parseInt(a.path.match(slideNumberRegex)?.at(1) ?? 
\"\", 10);\n const matchedBNumber = parseInt(b.path.match(slideNumberRegex)?.at(1) ?? \"\", 10);\n\n const aNumber = isNaN(matchedANumber) ? Infinity : matchedANumber;\n const bNumber = isNaN(matchedBNumber) ? Infinity : matchedBNumber;\n\n return aNumber - bNumber || Number(a.path.includes('notes')) - Number(b.path.includes('notes'));\n });\n\n if (files.length == 0 || !files.map(file => file.path).some(filename => filename.match(slidesRegex))) {\n throw ERRORMSG.fileCorrupted(\"TODO: figure this out\");\n }\n\n files.sort((a, b) => a.path.indexOf(\"notes\") - b.path.indexOf(\"notes\"));\n\n const xmlContentArray = files.map(file => file.content);\n\n let responseText: string[] = [];\n\n for (const xmlContent of xmlContentArray) {\n const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName(\"a:p\");\n responseText.push(\n Array.from(xmlParagraphNodesList)\n .filter(paragraphNode => paragraphNode.getElementsByTagName(\"a:t\").length != 0)\n .map(paragraphNode => {\n const xmlTextNodeList = paragraphNode.getElementsByTagName(\"a:t\");\n return Array.from(xmlTextNodeList)\n .filter(textNode => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)\n .map(textNode => textNode.childNodes[0].nodeValue)\n .join(\"\");\n })\n .join(\"\\n\")\n );\n }\n const responseTextString = responseText.join(\"\\n\");\n return responseTextString;\n } catch (error) {\n console.error(\"Error parsing PowerPoint file:\", error);\n throw error;\n }\n }\n}","import { ERRORMSG } from \"../constant\";\nimport { AnyParserMethod } from \"../types\";\nimport { extractFiles, parseString } from \"../util\";\n\nexport class WordParser implements AnyParserMethod {\n mimes = [\"application/vnd.openxmlformats-officedocument.wordprocessingml.document\"];\n\n async apply(file: Buffer): Promise<string> {\n const mainContentFileRegex = /word\\/document[\\d+]?.xml/g;\n const footnotesFileRegex = /word\\/footnotes[\\d+]?.xml/g;\n const endnotesFileRegex = 
/word\\/endnotes[\\d+]?.xml/g;\n\n try {\n const files = await extractFiles(file, x =>\n [mainContentFileRegex, footnotesFileRegex, endnotesFileRegex].some(fileRegex => x.match(fileRegex))\n );\n\n if (!files.some(file => file.path.match(mainContentFileRegex))) {\n throw ERRORMSG.fileCorrupted(\"TODO: figure this out\");\n }\n\n const xmlContentArray = files\n .filter(file => file.path.match(mainContentFileRegex) || file.path.match(footnotesFileRegex) || file.path.match(endnotesFileRegex))\n .map(file => file.content);\n\n let responseText: string[] = [];\n\n xmlContentArray.forEach(xmlContent => {\n const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName(\"w:p\");\n responseText.push(\n Array.from(xmlParagraphNodesList)\n .filter(paragraphNode => paragraphNode.getElementsByTagName(\"w:t\").length != 0)\n .map(paragraphNode => {\n const xmlTextNodeList = paragraphNode.getElementsByTagName(\"w:t\");\n return Array.from(xmlTextNodeList)\n .filter(textNode => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)\n .map(textNode => textNode.childNodes[0].nodeValue)\n .join(\"\");\n })\n .join(\"\\n\")\n );\n });\n const responseTextString = responseText.join(\"\\n\");\n return responseTextString;\n } catch (error) {\n console.error(\"Error parsing Word file:\", error);\n throw error;\n }\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,sBAAiC;AACjC,oBAAsB;AACtB,mBAAkB;;;ACEX,IAAM,WAAW;AAAA,EACtB,sBAAsB,CAAC,QAAgB,iJAAiJ,GAAG;AAAA,EAC3L,eAAe,CAAC,aAAqB,aAAa,QAAQ;AAAA,EAC1D,kBAAkB,CAAC,aAAqB,QAAQ,QAAQ;AAAA,EACxD,kBAAkB,CAAC,aAAqB,oBAAoB,QAAQ;AAAA,EACpE,mBAAmB;AAAA,EACnB,iBAAiB;AAAA,EACjB,cAAc;AAChB;;;ADPA,2BAAmB;AACnB,oBAA0B;AAEnB,IAAM,WAAW,OAAO,aAC5B,UAAM,gBAAAA,UAAK,QAAQ;AAEf,IAAM,cAAc,OAAO,QAAiC;AACjE,QAAM,MAAM,UAAM,qBAAM,GAAG;AAC3B,MAAI,CAAC,IAAI,GAAI,OAAM,IAAI,MAAM,oBAAoB,IAAI,UAAU,EAAE;AACjE,SAAO,OAAO,KAAK,MAAM,IAAI,YAAY,CAAC;AAC5C;AAEO,IAAM,eAAe,CAAC,UAA2B,aAA+D;AACrH,SAAO,IAAI,QAAQ,CAAC,KAAK,QAAQ;AAC/B,UAAM,iBAAiB,CAAC,YAA2B;AACjD,YAAM,iBAAkC,CAAC;AACzC,cAAQ,UAAU;AAElB,eAAS,aAAa,OAAoB;AACxC,YAAI,SAAS,MAAM,QAAQ,GAAG;AAC5B,kBAAQ,eAAe,OAAO,CAAC,KAAK,eAAe;AACjD,gBAAI;AACF,qBAAO,IAAI,GAAG;AAEhB,uBAAW,SAAK,qBAAAC,SAAO,CAAC,SAAiB;AACvC,6BAAe,KAAK;AAAA,gBACpB,MAAM,MAAM;AAAA,gBACZ,SAAS,KAAK,SAAS;AAAA,cACvB,CAAC;AACD,sBAAQ,UAAU;AAAA,YACpB,CAAC,CAAC;AAAA,UACJ,CAAC;AAAA,QACH;AAEE,kBAAQ,UAAU;AAAA,MACtB;AAEA,cAAQ,GAAG,SAAS,YAAY;AAChC,cAAQ,GAAG,OAAO,MAAM,IAAI,cAAc,CAAC;AAC3C,cAAQ,GAAG,SAAS,GAAG;AAAA,IACzB;AAEA,QAAI,OAAO,SAAS,QAAQ,GAAG;AAC7B,mBAAAC,QAAM,WAAW,UAAU,EAAE,aAAa,KAAK,GAAG,CAAC,KAAK,YAAY;AAClE,YAAI,IAAK,QAAO,IAAI,GAAG;AACvB,uBAAe,OAAO;AAAA,MACxB,CAAC;AAAA,IACH,WACS,OAAO,aAAa,UAAU;AACrC,mBAAAA,QAAM,KAAK,UAAU,EAAE,aAAa,KAAK,GAAG,CAAC,KAAK,YAAY;AAC5D,YAAI,IAAK,QAAO,IAAI,GAAG;AACvB,uBAAe,OAAO;AAAA,MACxB,CAAC;AAAA,IACH;AAEE,UAAI,SAAS,YAAY;AAAA,EAC7B,CAAC;AACH;AAEO,IAAM,cAAc,CAAC,QAAgB;AAC1C,MAAI,SAAS,IAAI,wBAAU;AAC3B,SAAO,OAAO,gBAAgB,KAAK,UAAU;AAC/C;;;AEjEA,uBAAkD;AAE3C,IAAM,eAAN,MAAmB;AAAA,EAAnB;AACN,SAAQ,YAA8C,oBAAI,IAAI;AAC9D,SAAQ,UAAiC,CAAC;AAE1C,SAAO,YAAY,CAAC,WAAsC;AACzD,WAAK,QAAQ,KAAK,MAAM;AACxB,aAAO,MAAM,QAAQ,CAAC,SAAS;AAC9B,aAAK,UAAU,IAAI,MAAM,MAAM;AAAA,MAChC,CAAC;AACD,aAAO;AAAA,IACR;AAEA,SAAO,uBAAuB,MAAgB;AAC7C,aAAO,MAAM,KAAK,KAAK,UAAU,KAAK,CAAC;AAAA,IACxC;AAEA,SAAO,cAAc,OAAO,EAAE,OAAO,KAAK,MAA0C;AACnF,UAAI;AAC
J,UAAI,OAAO,UAAU,UAAU;AAC9B,gBAAQ,MAAM;AAAA,UACb,KAAK;AACJ,4BAAgB,MAAM,SAAS,KAAK;AACpC;AAAA,UACD,KAAK;AACJ,4BAAgB,MAAM,YAAY,KAAK;AACvC;AAAA,UACD;AACC,4BAAgB,OAAO,KAAK,KAAK;AAAA,QACnC;AAAA,MACD,OAAO;AACN,wBAAgB;AAAA,MACjB;AAEA,YAAM,cAAc,UAAM,iBAAAC,oBAAY,aAAa;AACnD,UAAI,CAAC,YAAa,QAAO,cAAc,SAAS,OAAO;AAEvD,YAAM,YAAY,KAAK,UAAU,IAAI,YAAY,IAAI;AAErD,UAAI,CAAC,WAAW,OAAO;AACtB,cAAM,UAAU,4DAA4D,YAAY,IAAI;AAC5F,cAAM,IAAI,MAAM,OAAO;AAAA,MACxB;AAEA,aAAO,UAAU,MAAM,aAAa;AAAA,IACrC;AAAA;AACD;;;AC5CO,IAAM,cAAN,MAAiD;AAAA,EAAjD;AACL,iBAAQ,CAAC,mEAAmE;AAAA;AAAA,EAE5E,MAAM,MAAM,MAA+B;AACzC,UAAM,cAAc;AACpB,UAAM,gBAAgB;AACtB,UAAM,cAAc;AACpB,UAAM,kBAAkB;AAExB,QAAI;AAkBF,UAASC,4BAAT,SAAkC,OAAyB;AACzD,YAAI,MAAM,QAAQ,YAAY,KAAK,IAAK,QAAO;AAC/C,YAAI,MAAM,aAAa,GAAG,KAAK,YAAa,QAAO;AACnD,cAAM,oBAA2C,MAAM,qBAAqB,IAAI;AAChF,YAAI,kBAAkB,UAAU,EAAG,QAAO;AAC1C,cAAM,mBAA0C,kBAAkB,CAAC,EAAE,qBAAqB,GAAG;AAC7F,YAAI,iBAAiB,UAAU,EAAG,QAAO;AACzC,eAAO,iBAAiB,CAAC,EAAE,WAAW,CAAC,KAAK,iBAAiB,CAAC,EAAE,WAAW,CAAC,EAAE,aAAa;AAAA,MAC7F,GAESC,wBAAT,SAA8B,OAAyB;AACrD,cAAM,SAAS,MAAM,qBAAqB,GAAG;AAC7C,eAAO,OAAO,CAAC,KAAK,OAAO,CAAC,EAAE,WAAW,CAAC,KAAK,OAAO,CAAC,EAAE,WAAW,CAAC,EAAE,aAAa;AAAA,MACtF;AAbS,qCAAAD,2BAUA,uBAAAC;AA3BT,YAAM,QAAQ,MAAM;AAAA,QAAa;AAAA,QAAM,OACrC,CAAC,aAAa,eAAe,WAAW,EAAE,KAAK,eAAa,EAAE,MAAM,SAAS,CAAC,KAAK,KAAK;AAAA,MAC1F;AAEA,UAAI,MAAM,UAAU,KAAK,CAAC,MAAM,IAAI,CAAAC,UAAQA,MAAK,IAAI,EAAE,KAAK,cAAY,SAAS,MAAM,WAAW,CAAC,GAAG;AACpG,cAAM,SAAS,cAAc,uBAAuB;AAAA,MACtD;AAEA,YAAM,wBAAwB;AAAA,QAC5B,YAAY,MAAM,OAAO,CAAAA,UAAQA,MAAK,KAAK,MAAM,WAAW,CAAC,EAAE,IAAI,CAAAA,UAAQA,MAAK,OAAO;AAAA,QACvF,cAAc,MAAM,OAAO,CAAAA,UAAQA,MAAK,KAAK,MAAM,aAAa,CAAC,EAAE,IAAI,CAAAA,UAAQA,MAAK,OAAO;AAAA,QAC3F,YAAY,MAAM,OAAO,CAAAA,UAAQA,MAAK,KAAK,MAAM,WAAW,CAAC,EAAE,IAAI,CAAAA,UAAQA,MAAK,OAAO;AAAA,QACvF,mBAAmB,MAAM,OAAO,CAAAA,UAAQA,MAAK,QAAQ,eAAe,EAAE,IAAI,CAAAA,UAAQA,MAAK,OAAO,EAAE,CAAC;AAAA,MACnG;AAEA,UAAI,eAAyB,CAAC;AAiB9B,YAAM,6BAA6B,sBAAsB,qBAAqB,SAC1E,YAAY,sBAAsB,iBAAiB,EAAE,qBAAqB,GAAG,IAC7E,CAAC;AAEL,YAAM,gBAAgB,MAAM,KAAK,0BAA0
B,EACxD,IAAI,WAAS,MAAM,WAAW,CAAC,GAAG,aAAa,EAAE;AAEpD,iBAAW,mBAAmB,sBAAsB,YAAY;AAC9D,cAAM,sBAAsB,YAAY,eAAe,EAAE,qBAAqB,GAAG;AACjF,qBAAa;AAAA,UACX,MAAM,KAAK,mBAAmB,EAC3B,OAAO,WAASF,0BAAyB,KAAK,KAAKC,sBAAqB,KAAK,CAAC,EAC9E,IAAI,WAAS;AACZ,gBAAID,0BAAyB,KAAK;AAChC,qBAAO,MAAM,qBAAqB,IAAI,EAAE,CAAC,EAAE,qBAAqB,GAAG,EAAE,CAAC,EAAE,WAAW,CAAC,EAAE;AACxF,gBAAIC,sBAAqB,KAAK,GAAG;AAC/B,oBAAM,yBAAyB,MAAM,aAAa,GAAG,KAAK;AAC1D,oBAAM,QAAQ,SAAS,MAAM,qBAAqB,GAAG,EAAE,CAAC,EAAE,WAAW,CAAC,EAAE,aAAa,IAAI,EAAE;AAC3F,kBAAI,0BAA0B,SAAS,cAAc;AACnD,sBAAM,SAAS,cAAc,uBAAuB;AAEtD,qBAAO,yBACH,cAAc,KAAK,IACnB;AAAA,YACN;AACA,mBAAO;AAAA,UACT,CAAC,EACA,KAAK,IAAI;AAAA,QACd;AAAA,MACF;AAEA,iBAAW,qBAAqB,sBAAsB,cAAc;AAClE,cAAM,gCAAgC,YAAY,iBAAiB,EAAE,qBAAqB,KAAK;AAC/F,qBAAa;AAAA,UACX,MAAM,KAAK,6BAA6B,EACrC,OAAO,mBAAiB,cAAc,qBAAqB,KAAK,EAAE,UAAU,CAAC,EAC7E,IAAI,mBAAiB;AACpB,kBAAM,kBAAkB,cAAc,qBAAqB,KAAK;AAChE,mBAAO,MAAM,KAAK,eAAe,EAC9B,OAAO,cAAY,SAAS,WAAW,CAAC,KAAK,SAAS,WAAW,CAAC,EAAE,SAAS,EAC7E,IAAI,cAAY,SAAS,WAAW,CAAC,EAAE,SAAS,EAChD,KAAK,EAAE;AAAA,UACZ,CAAC,EACA,KAAK,IAAI;AAAA,QACd;AAAA,MACF;AAEA,iBAAW,mBAAmB,sBAAsB,YAAY;AAC9D,cAAM,uBAAuB,YAAY,eAAe,EAAE,qBAAqB,KAAK;AACpF,qBAAa;AAAA,UACX,MAAM,KAAK,oBAAoB,EAC5B,OAAO,YAAU,OAAO,WAAW,CAAC,KAAK,OAAO,WAAW,CAAC,EAAE,SAAS,EACvE,IAAI,YAAU,OAAO,WAAW,CAAC,EAAE,SAAS,EAC5C,KAAK,IAAI;AAAA,QACd;AAAA,MACF;AAEA,aAAO,aAAa,KAAK,IAAI;AAAA,IAC/B,SAAS,OAAO;AACd,cAAQ,MAAM,6BAA6B,KAAK;AAChD,YAAM;AAAA,IACR;AAAA,EACF;AACF;;;ACzGO,IAAM,mBAAN,MAAsD;AAAA,EAAtD;AACL,iBAAQ;AAAA,MAAC;AAAA,MACP;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IAA4C;AAE9C,iBAAQ,OAAO,SAAkC;AAC/C,YAAM,sBAAsB;AAC5B,YAAM,0BAA0B;AAEhC,UAAI;AAkBF,YAASE,2BAAT,SAAiC,MAAuB;AACtD,cAAI,eAAyB,CAAC;AAC9B,mBAAS,IAAI,GAAG,IAAI,KAAK,WAAW,QAAQ,KAAK;AAC/C,YAAAC,WAAU,KAAK,WAAW,CAAC,GAAG,cAAc,IAAI;AAAA,UAClD;AACA,iBAAO,aAAa,KAAK,EAAE;AAAA,QAC7B,GAESA,aAAT,SAAmB,MAAY,cAAwB,kBAAiC;AACtF,cAAI,CAAC,KAAK,cAAc,KAAK,WAAW,UAAU,GAAG;AACnD,gBAAI,KAAK,cAAe,KAAK,WAAuB,QAAQ,QAAQ,MAAM,KAAK,KAAK,KAAK,WAAW;AAClG,kBAAIC,aAAY,KAAK,UAAqB,GAAG;AAC3C,0BAAU,
KAAK,KAAK,SAAS;AAC7B,oBAAI,gBAAgB,SAAU,KAAK,WAAuB,OAAO,KAAK,CAAC,kBAAkB;AACvF,4BAAU,KAAK,IAAI;AAAA,gBACrB;AAAA,cACF,OAAO;AACL,6BAAa,KAAK,KAAK,SAAS;AAChC,oBAAI,gBAAgB,SAAU,KAAK,WAAuB,OAAO,KAAK,CAAC,kBAAkB;AACvF,+BAAa,KAAK,IAAI;AAAA,gBACxB;AAAA,cACF;AAAA,YACF;AACA;AAAA,UACF;AAEA,mBAAS,IAAI,GAAG,IAAI,KAAK,WAAW,QAAQ,KAAK;AAC/C,YAAAD,WAAU,KAAK,WAAW,CAAC,GAAc,cAAc,KAAK;AAAA,UAC9D;AAAA,QACF,GAESC,eAAT,SAAqB,MAAwB;AAC3C,cAAI,KAAK,WAAW,UAAU;AAC5B,mBAAO;AAAA,UACT;AACA,cAAI,KAAK,YAAY;AACnB,mBAAOA,aAAY,KAAK,UAAqB;AAAA,UAC/C;AACA,iBAAO;AAAA,QACT,GAESC,qBAAT,SAA2B,MAAe;AACxC,cAAI,gBAAgB,SAAS,KAAK,OAAO,GAAG;AAC1C,mBAAO;AAAA,UACT;AACA,cAAI,KAAK,YAAY;AACnB,mBAAOA,mBAAkB,KAAK,UAAqB;AAAA,UACrD;AACA,iBAAO;AAAA,QACT;AAjDS,sCAAAH,0BAQA,YAAAC,YAuBA,cAAAC,cAUA,oBAAAC;AA1DT,cAAM,QAAQ,MAAM,aAAa,MAAM,OAAK,KAAK,uBAAuB,CAAC,CAAC,EAAE,MAAM,uBAAuB,CAAC;AAE1G,YAAI,CAAC,MAAM,IAAI,CAAAC,UAAQA,MAAK,IAAI,EAAE,SAAS,mBAAmB,GAAG;AAC/D,gBAAM,SAAS,cAAc,uBAAuB;AAAA,QACtD;AAEA,cAAM,wBAAwB;AAAA,UAC5B,iBAAiB,MAAM,OAAO,CAAAA,UAAQA,MAAK,QAAQ,mBAAmB,EAAE,IAAI,CAAAA,UAAQA,MAAK,OAAO,EAAE,CAAC;AAAA,UACnG,oBAAoB,MAAM,OAAO,CAAAA,UAAQA,MAAK,KAAK,MAAM,uBAAuB,CAAC,EAAE,IAAI,CAAAA,UAAQA,MAAK,OAAO;AAAA,QAC7G;AAEA,YAAI,YAAsB,CAAC;AAC3B,YAAI,eAAyB,CAAC;AAE9B,cAAM,kBAAkB,CAAC,UAAU,QAAQ;AAC3C,cAAM,WAAW;AAqDjB,cAAM,kBAAkB,CAAC,sBAAsB,iBAAiB,GAAG,sBAAsB,kBAAkB,EAAE,IAAI,gBAAc,YAAY,UAAU,CAAC;AACtJ,wBAAgB,QAAQ,gBAAc;AACpC,gBAAM,mBAAmB,CAAC,GAAG,MAAM,KAAK,WACrC,qBAAqB,GAAG,CAAC,EACzB,OAAO,UAAQ,gBAAgB,SAAS,KAAK,OAAO,KAChD,CAACD,mBAAkB,KAAK,UAAqB,CAAC,CAAC;AACtD,uBAAa;AAAA,YACX,iBACG,IAAI,cAAYH,yBAAwB,QAAQ,CAAC,EACjD,OAAO,UAAQ,QAAQ,EAAE,EACzB,KAAK,IAAI;AAAA,UACd;AAAA,QACF,CAAC;AAED,uBAAe,CAAC,GAAG,cAAc,GAAG,SAAS;AAC7C,eAAO,aAAa,KAAK,IAAI;AAAA,MAE/B,SAAS,OAAO;AACd,gBAAQ,MAAM,kCAAkC,KAAK;AACrD,cAAM;AAAA,MACR;AAAA,IACF;AAAA;AACF;;;AC3GA,uBAAgB;AAGT,IAAM,YAAN,MAA+C;AAAA,EAA/C;AACL,iBAAQ,CAAC,iBAAiB;AAE1B,iBAAQ,OAAO,SAAkC;AAC/C,UAAI;AACF,cAAM,OAAO,UAAM,iBAAAK,SAAI,IAAI;AAC3B,cAAM,cAAc,KAAK;AACzB,eAAO;AAAA,MACT,SAAS,OAAO;AACd,gBAAQ,MAAM,2B
AA2B,KAAK;AAC9C,cAAM;AAAA,MACR;AAAA,IACF;AAAA;AACF;;;ACZO,IAAM,mBAAN,MAAsD;AAAA,EAAtD;AACL,iBAAQ,CAAC,2EAA2E;AAAA;AAAA,EAEpF,MAAM,MAAM,MAA+B;AACzC,UAAM,gBAAgB;AACtB,UAAM,cAAc;AACpB,UAAM,mBAAmB;AAEzB,QAAI;AACF,YAAM,QAAQ,MAAM,aAAa,MAAM,OAAK,CAAC,CAAC,EAAE,MAAM,aAAa,CAAC;AAEpE,YAAM,KAAK,CAAC,GAAG,MAAM;AACnB,cAAM,iBAAiB,SAAS,EAAE,KAAK,MAAM,gBAAgB,GAAG,GAAG,CAAC,KAAK,IAAI,EAAE;AAC/E,cAAM,iBAAiB,SAAS,EAAE,KAAK,MAAM,gBAAgB,GAAG,GAAG,CAAC,KAAK,IAAI,EAAE;AAE/E,cAAM,UAAU,MAAM,cAAc,IAAI,WAAW;AACnD,cAAM,UAAU,MAAM,cAAc,IAAI,WAAW;AAEnD,eAAO,UAAU,WAAW,OAAO,EAAE,KAAK,SAAS,OAAO,CAAC,IAAI,OAAO,EAAE,KAAK,SAAS,OAAO,CAAC;AAAA,MAChG,CAAC;AAED,UAAI,MAAM,UAAU,KAAK,CAAC,MAAM,IAAI,CAAAC,UAAQA,MAAK,IAAI,EAAE,KAAK,cAAY,SAAS,MAAM,WAAW,CAAC,GAAG;AACpG,cAAM,SAAS,cAAc,uBAAuB;AAAA,MACtD;AAEA,YAAM,KAAK,CAAC,GAAG,MAAM,EAAE,KAAK,QAAQ,OAAO,IAAI,EAAE,KAAK,QAAQ,OAAO,CAAC;AAEtE,YAAM,kBAAkB,MAAM,IAAI,CAAAA,UAAQA,MAAK,OAAO;AAEtD,UAAI,eAAyB,CAAC;AAE9B,iBAAW,cAAc,iBAAiB;AACxC,cAAM,wBAAwB,YAAY,UAAU,EAAE,qBAAqB,KAAK;AAChF,qBAAa;AAAA,UACX,MAAM,KAAK,qBAAqB,EAC7B,OAAO,mBAAiB,cAAc,qBAAqB,KAAK,EAAE,UAAU,CAAC,EAC7E,IAAI,mBAAiB;AACpB,kBAAM,kBAAkB,cAAc,qBAAqB,KAAK;AAChE,mBAAO,MAAM,KAAK,eAAe,EAC9B,OAAO,cAAY,SAAS,WAAW,CAAC,KAAK,SAAS,WAAW,CAAC,EAAE,SAAS,EAC7E,IAAI,cAAY,SAAS,WAAW,CAAC,EAAE,SAAS,EAChD,KAAK,EAAE;AAAA,UACZ,CAAC,EACA,KAAK,IAAI;AAAA,QACd;AAAA,MACF;AACA,YAAM,qBAAqB,aAAa,KAAK,IAAI;AACjD,aAAO;AAAA,IACT,SAAS,OAAO;AACd,cAAQ,MAAM,kCAAkC,KAAK;AACrD,YAAM;AAAA,IACR;AAAA,EACF;AACF;;;ACrDO,IAAM,aAAN,MAAgD;AAAA,EAAhD;AACL,iBAAQ,CAAC,yEAAyE;AAAA;AAAA,EAElF,MAAM,MAAM,MAA+B;AACzC,UAAM,uBAAuB;AAC7B,UAAM,qBAAqB;AAC3B,UAAM,oBAAoB;AAE1B,QAAI;AACF,YAAM,QAAQ,MAAM;AAAA,QAAa;AAAA,QAAM,OACrC,CAAC,sBAAsB,oBAAoB,iBAAiB,EAAE,KAAK,eAAa,EAAE,MAAM,SAAS,CAAC;AAAA,MACpG;AAEA,UAAI,CAAC,MAAM,KAAK,CAAAC,UAAQA,MAAK,KAAK,MAAM,oBAAoB,CAAC,GAAG;AAC9D,cAAM,SAAS,cAAc,uBAAuB;AAAA,MACtD;AAEA,YAAM,kBAAkB,MACrB,OAAO,CAAAA,UAAQA,MAAK,KAAK,MAAM,oBAAoB,KAAKA,MAAK,KAAK,MAAM,kBAAkB,KAAKA,MAAK,KAAK,MAAM,iBAAiB,CAAC,EACjI,IAAI,CAAAA,UAAQA,MAAK,OAAO;AAE3B,UAAI,eA
AyB,CAAC;AAE9B,sBAAgB,QAAQ,gBAAc;AACpC,cAAM,wBAAwB,YAAY,UAAU,EAAE,qBAAqB,KAAK;AAChF,qBAAa;AAAA,UACX,MAAM,KAAK,qBAAqB,EAC7B,OAAO,mBAAiB,cAAc,qBAAqB,KAAK,EAAE,UAAU,CAAC,EAC7E,IAAI,mBAAiB;AACpB,kBAAM,kBAAkB,cAAc,qBAAqB,KAAK;AAChE,mBAAO,MAAM,KAAK,eAAe,EAC9B,OAAO,cAAY,SAAS,WAAW,CAAC,KAAK,SAAS,WAAW,CAAC,EAAE,SAAS,EAC7E,IAAI,cAAY,SAAS,WAAW,CAAC,EAAE,SAAS,EAChD,KAAK,EAAE;AAAA,UACZ,CAAC,EACA,KAAK,IAAI;AAAA,QACd;AAAA,MACF,CAAC;AACD,YAAM,qBAAqB,aAAa,KAAK,IAAI;AACjD,aAAO;AAAA,IACT,SAAS,OAAO;AACd,cAAQ,MAAM,4BAA4B,KAAK;AAC/C,YAAM;AAAA,IACR;AAAA,EACF;AACF;;;AR1CO,IAAM,kBAAkB,MAAoB;AACjD,QAAM,eAAe,IAAI,aAAa;AAEtC,eAAa,UAAU,IAAI,YAAY,CAAC;AACxC,eAAa,UAAU,IAAI,iBAAiB,CAAC;AAC7C,eAAa,UAAU,IAAI,UAAU,CAAC;AACtC,eAAa,UAAU,IAAI,iBAAiB,CAAC;AAC7C,eAAa,UAAU,IAAI,WAAW,CAAC;AAEvC,SAAO;AACT;","names":["read","concat","yauzl","getFileType","isValidInlineStringCNode","hasValidVNodeInCNode","file","extractAllTextsFromNode","traversal","isNotesNode","isInvalidTextNode","file","pdf","file","file"]}