npm - any-extractor - Versions diffs - 1.0.0 - Mend

any-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/dist/index.d.mts +25 -0
package/dist/index.d.ts +25 -0
package/dist/index.js +428 -0
package/dist/index.js.map +1 -0
package/dist/index.mjs +391 -0
package/dist/index.mjs.map +1 -0
package/package.json +43 -0
package/src/constant.ts +13 -0
package/src/extractors/any-extractor.ts +50 -0
package/src/index.ts +20 -0
package/src/parser/excel-parser.ts +111 -0
package/src/parser/openoffice-paser.ts +108 -0
package/src/parser/pdf-parser.ts +17 -0
package/src/parser/powerpoint-parser.ts +58 -0
package/src/parser/word-parser.ts +50 -0
package/src/types.ts +12 -0
package/src/util.ts +68 -0
package/tsconfig.json +17 -0
package/tsup.config.ts +9 -0

package/src/parser/excel-parser.ts ADDED Viewed

@@ -0,0 +1,111 @@
+import { Element, LiveNodeList } from "@xmldom/xmldom";
+import { ERRORMSG } from "../constant";
+import { AnyParserMethod } from "../types";
+import { extractFiles, parseString } from "../util";
+/**
+ * Extracts plain text from Excel workbooks (`.xlsx`).
+ *
+ * Worksheet cells, drawing text and chart values are read out of the archive
+ * and returned as newline-separated text.
+ */
+export class ExcelParser implements AnyParserMethod {
+  mimes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"];
+  /**
+   * @param file Raw bytes of the `.xlsx` archive.
+   * @returns Text of every worksheet, then every drawing, then every chart, joined with "\n".
+   * @throws The value produced by `ERRORMSG.fileCorrupted` when the archive contains no
+   *   worksheet part or a cell points at a shared string that does not exist. Any error
+   *   raised while extracting or parsing is logged and rethrown unchanged.
+   */
+  async apply(file: Buffer): Promise<string> {
+    // No `g` flag: the patterns are only used with `test()`, which must stay stateless.
+    const sheetsRegex = /xl\/worksheets\/sheet\d+\.xml/;
+    const drawingsRegex = /xl\/drawings\/drawing\d+\.xml/;
+    const chartsRegex = /xl\/charts\/chart\d+\.xml/;
+    const stringsFilePath = 'xl/sharedStrings.xml';
+    // Text held by a node's first child, or '' when the node is empty.
+    const firstChildText = (node: Element): string => node.childNodes[0]?.nodeValue ?? '';
+    // Concatenates the <t> runs found below `container`, skipping phonetic (<rPh>) hints
+    // because they annotate the string rather than being part of it.
+    const richText = (container: Element): string =>
+      Array.from(container.getElementsByTagName("t"))
+        .filter(tNode => (tNode.parentNode as Element | null)?.tagName != 'rPh')
+        .map(tNode => firstChildText(tNode))
+        .join("");
+    // Joins the <a:t> runs of each <a:p> paragraph; paragraphs without runs are dropped.
+    const paragraphsText = (xml: string): string =>
+      Array.from(parseString(xml).getElementsByTagName("a:p"))
+        .filter(paragraphNode => paragraphNode.getElementsByTagName("a:t").length != 0)
+        .map(paragraphNode =>
+          Array.from(paragraphNode.getElementsByTagName("a:t"))
+            .map(textNode => firstChildText(textNode))
+            .join("")
+        )
+        .join("\n");
+    try {
+      const files = await extractFiles(file, x =>
+        x === stringsFilePath || [sheetsRegex, drawingsRegex, chartsRegex].some(fileRegex => fileRegex.test(x))
+      );
+      if (!files.some(entry => sheetsRegex.test(entry.path))) {
+        throw ERRORMSG.fileCorrupted("TODO: figure this out");
+      }
+      const contentsMatching = (fileRegex: RegExp): string[] =>
+        files.filter(entry => fileRegex.test(entry.path)).map(entry => entry.content);
+      const sharedStringsFile = files.find(entry => entry.path === stringsFilePath)?.content;
+      // A cell with t="s" stores an index into the list of <si> items, so the table must hold
+      // exactly one entry per <si>. Indexing per <t> run would shift every entry that follows
+      // a rich-text string made of several runs.
+      const sharedStrings: string[] = sharedStringsFile != undefined
+        ? Array.from(parseString(sharedStringsFile).getElementsByTagName("si")).map(siNode => richText(siNode))
+        : [];
+      // Resolves the text of one <c> cell; `undefined` means the cell carries no value.
+      const cellText = (cNode: Element): string | undefined => {
+        const cellType = cNode.getAttribute("t");
+        if (cellType == 'inlineStr') {
+          const inlineNode = cNode.getElementsByTagName('is')[0];
+          const inlineText = inlineNode ? richText(inlineNode) : '';
+          if (inlineText != '') return inlineText;
+        }
+        const vNode = cNode.getElementsByTagName("v")[0];
+        const rawValue = vNode ? firstChildText(vNode) : '';
+        if (rawValue == '') return undefined;
+        // Numbers, booleans, errors and formula strings are emitted verbatim. Parsing them
+        // as integers would truncate decimals and turn text results into NaN.
+        if (cellType != "s") return rawValue;
+        const sharedIndex = Number(rawValue);
+        if (!Number.isInteger(sharedIndex) || sharedIndex < 0 || sharedIndex >= sharedStrings.length)
+          throw ERRORMSG.fileCorrupted("TODO: figure this out");
+        return sharedStrings[sharedIndex];
+      };
+      const responseText: string[] = [];
+      for (const sheetXmlContent of contentsMatching(sheetsRegex)) {
+        responseText.push(
+          Array.from(parseString(sheetXmlContent).getElementsByTagName("c"))
+            .map(cNode => cellText(cNode))
+            .filter((text): text is string => text !== undefined)
+            .join("\n")
+        );
+      }
+      for (const drawingXmlContent of contentsMatching(drawingsRegex)) {
+        responseText.push(paragraphsText(drawingXmlContent));
+      }
+      for (const chartXmlContent of contentsMatching(chartsRegex)) {
+        responseText.push(
+          Array.from(parseString(chartXmlContent).getElementsByTagName("c:v"))
+            .map(cVNode => firstChildText(cVNode))
+            .filter(text => text != '')
+            .join("\n")
+        );
+      }
+      return responseText.join("\n");
+    } catch (error) {
+      console.error("Error parsing Excel file:", error);
+      throw error;
+    }
+  }
+}

package/src/parser/openoffice-paser.ts ADDED Viewed

@@ -0,0 +1,108 @@
+import { ERRORMSG } from "../constant";
+import { AnyParserMethod } from "../types";
+import { extractFiles, parseString } from "../util";
+import { Element, Node } from "@xmldom/xmldom";
+/**
+ * Extracts plain text from OpenDocument files (text, spreadsheet, presentation,
+ * graphics and formula MIME types).
+ *
+ * Text is collected from `content.xml` and from every embedded
+ * `Object N/content.xml`; text found below a `presentation:notes` element is
+ * appended after the rest of the output.
+ */
+export class OpenOfficeParser implements AnyParserMethod {
+  mimes = ["application/vnd.oasis.opendocument.text",
+    "application/vnd.oasis.opendocument.spreadsheet",
+    "application/vnd.oasis.opendocument.presentation",
+    "application/vnd.oasis.opendocument.graphics",
+    "application/vnd.oasis.opendocument.formula"];
+  /**
+   * @param file Raw bytes of the OpenDocument archive.
+   * @returns Extracted text, one block per content file followed by the notes, joined with "\n".
+   * @throws The value produced by `ERRORMSG.fileCorrupted` when `content.xml` is missing.
+   *   Any error is logged and rethrown unchanged.
+   */
+  apply = async (file: Buffer): Promise<string> => {
+    const mainContentFilePath = 'content.xml';
+    const objectContentFilesRegex = /Object \d+\/content.xml/g;
+    const allowedTextTags = ["text:p", "text:h"];
+    const notesTag = "presentation:notes";
+    try {
+      const files = await extractFiles(file, x => x == mainContentFilePath || !!x.match(objectContentFilesRegex));
+      const mainEntry = files.find(entry => entry.path == mainContentFilePath);
+      if (!mainEntry) {
+        throw ERRORMSG.fileCorrupted("TODO: figure this out");
+      }
+      const objectContents = files
+        .filter(entry => entry.path.match(objectContentFilesRegex))
+        .map(entry => entry.content);
+      // Notes are gathered separately so they can be emitted after all regular text.
+      const notesText: string[] = [];
+      const responseText: string[] = [];
+      // True when `start` or one of its ancestors is a `presentation:notes` element.
+      const isNotesNode = (start: Element): boolean => {
+        let current = start;
+        while (true) {
+          if (current.tagName == notesTag) return true;
+          if (!current.parentNode) return false;
+          current = current.parentNode as Element;
+        }
+      };
+      // True when `start` or one of its ancestors is a paragraph or heading element.
+      const hasTextBlockInChain = (start: Element): boolean => {
+        let current = start;
+        while (true) {
+          if (allowedTextTags.includes(current.tagName)) return true;
+          if (!current.parentNode) return false;
+          current = current.parentNode as Element;
+        }
+      };
+      // Depth-first walk that records the value of every leaf whose parent tag starts with "text".
+      const collectText = (node: Node, sink: string[], isDirectChild: boolean): void => {
+        const isLeaf = !node.childNodes || node.childNodes.length == 0;
+        if (!isLeaf) {
+          for (let index = 0; index < node.childNodes.length; index++) {
+            collectText(node.childNodes[index] as Element, sink, false);
+          }
+          return;
+        }
+        const parent = node.parentNode as Element | null;
+        if (!parent || !parent.tagName.startsWith('text') || !node.nodeValue) return;
+        const target = isNotesNode(parent) ? notesText : sink;
+        target.push(node.nodeValue);
+        // Leaves of a nested paragraph/heading end with a line break; direct children do not.
+        if (allowedTextTags.includes(parent.tagName) && !isDirectChild) {
+          target.push("\n");
+        }
+      };
+      // Flattens everything below `root` into a single string.
+      const textOf = (root: Element): string => {
+        const pieces: string[] = [];
+        for (let index = 0; index < root.childNodes.length; index++) {
+          collectText(root.childNodes[index], pieces, true);
+        }
+        return pieces.join("");
+      };
+      // Every document is parsed before any text is collected.
+      const documents = [mainEntry.content, ...objectContents].map(xmlContent => parseString(xmlContent));
+      for (const document of documents) {
+        // Keep only outermost paragraphs/headings; nested ones are reached by the walk itself.
+        const topLevelTextNodes = Array.from(document.getElementsByTagName("*"))
+          .filter(node => allowedTextTags.includes(node.tagName)
+            && !hasTextBlockInChain(node.parentNode as Element));
+        const documentText = topLevelTextNodes
+          .map(textNode => textOf(textNode))
+          .filter(text => text != "")
+          .join("\n");
+        responseText.push(documentText);
+      }
+      return [...responseText, ...notesText].join("\n");
+    } catch (error) {
+      console.error("Error parsing OpenOffice file:", error);
+      throw error;
+    }
+  }
+}

package/src/parser/pdf-parser.ts ADDED Viewed

@@ -0,0 +1,17 @@
+import pdf from 'pdf-parse';
+import { AnyParserMethod } from "../types";
+/** Extracts the text of PDF documents by delegating to `pdf-parse`. */
+export class PDFParser implements AnyParserMethod {
+  mimes = ["application/pdf"];
+  /**
+   * @param file Raw bytes of the PDF document.
+   * @returns The `text` field of the `pdf-parse` result.
+   * @throws Whatever `pdf-parse` rejects with; the error is logged first, then rethrown.
+   */
+  apply = async (file: Buffer): Promise<string> => {
+    try {
+      const { text } = await pdf(file);
+      return text;
+    } catch (error) {
+      console.error("Error parsing PDF file:", error);
+      throw error;
+    }
+  };
+}

package/src/parser/powerpoint-parser.ts ADDED Viewed

@@ -0,0 +1,58 @@
+import { ERRORMSG } from "../constant";
+import { AnyParserMethod } from "../types";
+import { extractFiles, parseString } from "../util";
+/**
+ * Extracts plain text from PowerPoint presentations (`.pptx`).
+ *
+ * Slide text comes first, ordered by slide number, followed by the speaker
+ * notes in the same numeric order.
+ */
+export class PowerPointParser implements AnyParserMethod {
+  mimes = ["application/vnd.openxmlformats-officedocument.presentationml.presentation"];
+  /**
+   * @param file Raw bytes of the `.pptx` archive.
+   * @returns Paragraph text of every slide, then of every notes slide, joined with "\n".
+   * @throws The value produced by `ERRORMSG.fileCorrupted` when the archive contains no
+   *   slide part. Any error is logged and rethrown unchanged.
+   */
+  async apply(file: Buffer): Promise<string> {
+    // No `g` flag: the patterns are only used with `test()`, which must stay stateless.
+    const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+\.xml/;
+    const slidesRegex = /ppt\/slides\/slide\d+\.xml/;
+    // "lide" matches both "slide12.xml" and "notesSlide12.xml".
+    const slideNumberRegex = /lide(\d+)\.xml/;
+    const isNotesPath = (path: string): boolean => path.includes("notes");
+    // Slide number encoded in the file name; paths without one sort last.
+    const slideNumberOf = (path: string): number => {
+      const parsed = parseInt(path.match(slideNumberRegex)?.[1] ?? "", 10);
+      return isNaN(parsed) ? Infinity : parsed;
+    };
+    try {
+      const files = await extractFiles(file, x => allFilesRegex.test(x));
+      if (!files.some(entry => slidesRegex.test(entry.path))) {
+        throw ERRORMSG.fileCorrupted("TODO: figure this out");
+      }
+      // One comparator replaces the former pair of sorts: slides before notes, then by number.
+      const orderedFiles = [...files].sort((a, b) => {
+        const notesOrder = Number(isNotesPath(a.path)) - Number(isNotesPath(b.path));
+        if (notesOrder != 0) return notesOrder;
+        const aNumber = slideNumberOf(a.path);
+        const bNumber = slideNumberOf(b.path);
+        // Guard the equal case so Infinity - Infinity never yields NaN.
+        return aNumber == bNumber ? 0 : aNumber - bNumber;
+      });
+      const responseText: string[] = [];
+      for (const { content: xmlContent } of orderedFiles) {
+        const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName("a:p");
+        responseText.push(
+          Array.from(xmlParagraphNodesList)
+            .filter(paragraphNode => paragraphNode.getElementsByTagName("a:t").length != 0)
+            .map(paragraphNode =>
+              // A paragraph's text is the concatenation of its <a:t> runs.
+              Array.from(paragraphNode.getElementsByTagName("a:t"))
+                .map(textNode => textNode.childNodes[0]?.nodeValue ?? "")
+                .join("")
+            )
+            .join("\n")
+        );
+      }
+      return responseText.join("\n");
+    } catch (error) {
+      console.error("Error parsing PowerPoint file:", error);
+      throw error;
+    }
+  }
+}

package/src/parser/word-parser.ts ADDED Viewed

@@ -0,0 +1,50 @@
+import { ERRORMSG } from "../constant";
+import { AnyParserMethod } from "../types";
+import { extractFiles, parseString } from "../util";
+/**
+ * Extracts plain text from Word documents (`.docx`).
+ *
+ * Paragraph text is read from the main document part and from the footnote
+ * and endnote parts, in the order the archive lists them.
+ */
+export class WordParser implements AnyParserMethod {
+  mimes = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];
+  /**
+   * @param file Raw bytes of the `.docx` archive.
+   * @returns Paragraph text of every matched part, joined with "\n".
+   * @throws The value produced by `ERRORMSG.fileCorrupted` when no main document part is
+   *   found. Any error is logged and rethrown unchanged.
+   */
+  async apply(file: Buffer): Promise<string> {
+    // `\d*` accepts an optional numeric suffix of any length. The earlier `[\d+]?` was a
+    // character class (one digit or a literal "+") and the unescaped "." matched any character.
+    // No `g` flag: the patterns are only used with `test()`, which must stay stateless.
+    const mainContentFileRegex = /word\/document\d*\.xml/;
+    const footnotesFileRegex = /word\/footnotes\d*\.xml/;
+    const endnotesFileRegex = /word\/endnotes\d*\.xml/;
+    try {
+      // The filter already restricts the result to the three kinds of part handled below.
+      const files = await extractFiles(file, x =>
+        [mainContentFileRegex, footnotesFileRegex, endnotesFileRegex].some(fileRegex => fileRegex.test(x))
+      );
+      if (!files.some(entry => mainContentFileRegex.test(entry.path))) {
+        throw ERRORMSG.fileCorrupted("TODO: figure this out");
+      }
+      const responseText: string[] = [];
+      for (const { content: xmlContent } of files) {
+        const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName("w:p");
+        responseText.push(
+          Array.from(xmlParagraphNodesList)
+            .filter(paragraphNode => paragraphNode.getElementsByTagName("w:t").length != 0)
+            .map(paragraphNode =>
+              // A paragraph's text is the concatenation of its <w:t> runs.
+              Array.from(paragraphNode.getElementsByTagName("w:t"))
+                .map(textNode => textNode.childNodes[0]?.nodeValue ?? "")
+                .join("")
+            )
+            .join("\n")
+        );
+      }
+      return responseText.join("\n");
+    } catch (error) {
+      console.error("Error parsing Word file:", error);
+      throw error;
+    }
+  }
+}

package/src/types.ts ADDED Viewed

@@ -0,0 +1,12 @@
+/**
+ * Tag naming the form in which an extraction input is supplied.
+ * NOTE(review): the literals presumably stand for an in-memory buffer, a local
+ * file path and a remote file URL; the code that interprets them is not shown here.
+ */
+export type InputType =
+  | 'buffer'
+  | 'file'
+  | 'fileurl';
+/** An extraction request: the input value together with the tag describing its form. */
+export type ExtractionPayload = {
+  type: InputType;
+  input: string | Buffer;
+};
+/** Contract implemented by every format-specific parser. */
+export type AnyParserMethod = {
+  /** MIME types listed by the parser. */
+  mimes: string[];
+  /** Takes the raw file bytes and resolves to a string (the extracted text in the parsers of this package). */
+  apply: (file: Buffer) => Promise<string>;
+};
+/** One archive entry as a path plus its content held as a string. */
+export type ExtractedFile = {
+  /** Path of the entry. */
+  path: string;
+  /** Content of the entry as a string. */
+  content: string;
+};

package/src/util.ts ADDED Viewed

@@ -0,0 +1,68 @@
+import { readFile as read } from 'node:fs/promises';
+import { fetch } from 'undici';
+import yauzl from 'yauzl';
+import { ERRORMSG } from './constant';
+import { ExtractedFile } from './types';
+import concat from 'concat-stream';
+import { DOMParser } from '@xmldom/xmldom';
+/**
+ * Reads a file from disk.
+ *
+ * @param filePath Path of the file to read.
+ * @returns The file's bytes.
+ * @throws Whatever `node:fs/promises` `readFile` rejects with (for example a missing file).
+ */
+export const readFile = async (filePath: string): Promise<Buffer> =>
+  // Without an encoding argument the promise already resolves to a Buffer, so no cast is needed.
+  read(filePath);
+/**
+ * Downloads a resource and returns its body as a Buffer.
+ *
+ * @param url Address passed to `fetch`.
+ * @returns The response body.
+ * @throws Error with the response's status text when `res.ok` is false; rejections from
+ *   `fetch` itself propagate unchanged.
+ */
+export const readFileUrl = async (url: string): Promise<Buffer> => {
+  const response = await fetch(url);
+  if (response.ok) {
+    const body = await response.arrayBuffer();
+    return Buffer.from(body);
+  }
+  throw new Error(`Failed to fetch: ${response.statusText}`);
+};
+/**
+ * Reads the entries of a zip archive whose names are accepted by `filterFn`.
+ *
+ * Entries are processed one at a time: the next entry is requested only after
+ * the current one has been skipped or fully read.
+ *
+ * @param zipInput The archive, either as a Buffer or as a path handed to `yauzl.open`.
+ * @param filterFn Receives each entry name; return `true` to extract that entry.
+ * @returns The accepted entries in archive order, each content decoded with `Buffer.toString()`.
+ * @throws Rejects with the yauzl or stream error, with an error thrown by `filterFn`, or with
+ *   `ERRORMSG.invalidInput` when `zipInput` is neither a Buffer nor a string.
+ */
+export const extractFiles = (zipInput: Buffer | string, filterFn: (x: string) => boolean): Promise<ExtractedFile[]> => {
+  return new Promise((res, rej) => {
+    const processZipfile = (zipfile: yauzl.ZipFile) => {
+      const extractedFiles: ExtractedFile[] = [];
+      // Close the archive before rejecting; `close()` is a no-op when it is already closed.
+      const fail = (err: unknown) => {
+        zipfile.close();
+        rej(err);
+      };
+      const processEntry = (entry: yauzl.Entry) => {
+        let isWanted: boolean;
+        try {
+          isWanted = filterFn(entry.fileName);
+        } catch (err) {
+          // Thrown inside an event handler this would otherwise be an uncaught exception.
+          return fail(err);
+        }
+        if (!isWanted) {
+          zipfile.readEntry();
+          return;
+        }
+        zipfile.openReadStream(entry, (err, readStream) => {
+          if (err)
+            return fail(err);
+          // A failed read would otherwise leave the promise pending forever.
+          readStream.on('error', fail);
+          // Ask for a Buffer explicitly so an empty entry still yields a Buffer.
+          readStream.pipe(concat({ encoding: 'buffer' }, (data: Buffer) => {
+            extractedFiles.push({
+              path: entry.fileName,
+              content: data.toString()
+            });
+            zipfile.readEntry();
+          }));
+        });
+      };
+      // Attach every listener before requesting the first entry.
+      zipfile.on('entry', processEntry);
+      zipfile.on('end', () => res(extractedFiles));
+      zipfile.on('error', fail);
+      zipfile.readEntry();
+    };
+    if (Buffer.isBuffer(zipInput)) {
+      yauzl.fromBuffer(zipInput, { lazyEntries: true }, (err, zipfile) => {
+        if (err) return rej(err);
+        processZipfile(zipfile);
+      });
+    }
+    else if (typeof zipInput === 'string') {
+      yauzl.open(zipInput, { lazyEntries: true }, (err, zipfile) => {
+        if (err) return rej(err);
+        processZipfile(zipfile);
+      });
+    }
+    else
+      rej(ERRORMSG.invalidInput);
+  });
+};
+/**
+ * Parses XML source into a DOM document.
+ *
+ * @param xml XML markup to parse.
+ * @returns The result of `DOMParser.parseFromString` called with the "text/xml" MIME type.
+ */
+export const parseString = (xml: string) =>
+  new DOMParser().parseFromString(xml, "text/xml");

package/tsconfig.json ADDED Viewed

@@ -0,0 +1,17 @@
+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "module": "ESNext",
+    "moduleResolution": "Node",
+    "outDir": "dist",
+    "declaration": true,
+    "declarationDir": "dist",
+    "esModuleInterop": true,
+    "forceConsistentCasingInFileNames": true,
+    "skipLibCheck": true,
+    "strict": true,
+    "resolveJsonModule": true,
+    "sourceMap": true
+  },
+  "include": ["src"]
+}

package/tsup.config.ts ADDED Viewed

@@ -0,0 +1,9 @@
+import { defineConfig } from 'tsup'
+// Build settings handed to tsup.
+export default defineConfig({
+  // Single entry point of the package.
+  entry: ["src/index.ts"],
+  // Build both module formats.
+  format: ["esm", "cjs"],
+  // Type declarations and source maps are switched on.
+  dts: true,
+  sourcemap: true,
+  // The `clean` option is enabled.
+  clean: true,
+});