npm - any-extractor - Versions diffs - 2.0.2 → 2.0.4 - Mend

any-extractor 2.0.2 → 2.0.4

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/dist/index.js CHANGED Viewed

@@ -343,11 +343,6 @@ var ConfluenceCrawler = class {
 var AnyExtractor = class {
   constructor(extractorConfig) {
     this.extractorConfig = {
-      llm: {
-        llmProvider: "openai",
-        visionModel: "",
-        apikey: ""
-      },
       confluence: {
         baseUrl: "",
         email: "",
@@ -361,11 +356,7 @@ var AnyExtractor = class {
       });
       return this;
     };
-    this.parseFile = async (input, basicAuth = null, extractingOptions = {
-      extractImages: false,
-      imageExtractionMethod: "ocr",
-      language: "eng"
-    }) => {
+    this.parseFile = async (input, basicAuth = null) => {
       let preparedInput;
       if (typeof input === "string") {
         if (isValidUrl(input)) {
@@ -390,17 +381,11 @@ var AnyExtractor = class {
       }
       const extractor = this.mimeParserMap.get(mimeDetails.mime);
       if (!extractor?.apply) {
-        const message = `AnyExtractor: No extraction method registered for MIME type '${mimeDetails.mime}'`;
-        throw new Error(message);
+        return "";
       }
-      return await extractor.apply(preparedInput, extractingOptions, this.extractorConfig);
+      return await extractor.apply(preparedInput, this.extractorConfig);
     };
-    this.parseConfluenceDoc = async (pageId, extractingOptions = {
-      extractAttachments: false,
-      extractImages: false,
-      imageExtractionMethod: "ocr",
-      language: "eng"
-    }) => {
+    this.parseConfluenceDoc = async (pageId) => {
       const { baseUrl, email, apiKey } = this.extractorConfig.confluence || {};
       if (!baseUrl || !email || !apiKey) {
         throw new Error("AnyExtractor: Confluence base URL, email, and API key are required");
@@ -409,20 +394,18 @@ var AnyExtractor = class {
       const content = await confCrawler.extractPageContent(pageId);
       let textContent = "";
       for (const item of content) {
-        if (item.type === "image" && extractingOptions.extractImages) {
+        if (item.type === "image") {
           const parsedFile = await this.parseFile(
             item.content,
-            `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`,
-            extractingOptions
+            `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`
           );
           textContent += parsedFile ? `
 (Image): ${parsedFile}
 ` : "";
-        } else if (item.type === "view-file" && extractingOptions.extractAttachments) {
+        } else if (item.type === "view-file") {
           const parsedFile = await this.parseFile(
             item.content,
-            `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`,
-            extractingOptions
+            `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`
           );
           textContent += parsedFile ? `
 [Attachment]: ${parsedFile}
@@ -467,7 +450,7 @@ var ExcelParser = class {
     this.mimes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"];
     this.anyExtractor = anyExtractor;
   }
-  async apply(file, extractingOptions) {
+  async apply(file) {
     const patterns = {
       sheets: /xl\/worksheets\/sheet\d+.xml/g,
       drawings: /xl\/drawings\/drawing\d+.xml/g,
@@ -501,7 +484,7 @@ var ExcelParser = class {
         } else if (patterns.charts.test(file2.path)) {
           return this.extractChartText([file2.content.toString()]);
         } else if (patterns.images.test(file2.path)) {
-          return await this.extractImageText([file2], extractingOptions);
+          return await this.extractImageText([file2]);
         }
         return null;
       }).filter(Boolean);
@@ -538,11 +521,11 @@ var ExcelParser = class {
       return Array.from(vNodes).map((node) => node.childNodes[0]?.nodeValue ?? "").join("\n");
     }).join("\n");
   }
-  async extractImageText(imageFiles, extractingOptions) {
+  async extractImageText(imageFiles) {
     const texts = await Promise.all(
       imageFiles.map(async (file) => {
         try {
-          return await this.anyExtractor.parseFile(file.content, null, extractingOptions);
+          return await this.anyExtractor.parseFile(file.content, null);
         } catch (e) {
           console.log(`AnyExtractor: Error extracting text from image ${file.path}:`, e);
           return "";
@@ -584,165 +567,6 @@ var ExcelParser = class {
   }
 };
-// src/file-parser/image-parser.ts
-var import_tesseract = __toESM(require("tesseract.js"));
-var import_undici3 = require("undici");
-var import_file_type_mime2 = require("file-type-mime");
-var ImageParser = class {
-  constructor() {
-    this.mimes = ["image/jpeg", "image/png", "image/webp"];
-    this.apply = async (file, extractingOptions, extractorConfig) => {
-      const { extractImages, imageExtractionMethod, language } = extractingOptions;
-      if (!extractImages) {
-        return "";
-      }
-      const mimeDetails = (0, import_file_type_mime2.parse)(
-        file.buffer.slice(file.byteOffset, file.byteOffset + file.byteLength)
-      );
-      if (!mimeDetails) {
-        throw new Error("AnyExtractor: Unable to parse MIME type");
-      }
-      const mimeType = mimeDetails.mime;
-      if (!this.mimes.includes(mimeType)) {
-        return "";
-      }
-      if (imageExtractionMethod === "ocr") {
-        return await this.performOCR(file, language);
-      }
-      const { llmProvider, visionModel, apikey } = extractorConfig.llm || {};
-      if (!llmProvider || !visionModel || !apikey) {
-        throw new Error(
-          "AnyExtractor: LLM provider, vision model and API key are required for image extraction"
-        );
-      }
-      const base64Image = file.toString("base64");
-      switch (llmProvider) {
-        case "openai":
-          return this.handleOpenAI(base64Image, mimeType, visionModel, apikey);
-        case "google":
-          return this.handleGoogle(base64Image, mimeType, visionModel, apikey);
-        case "anthropic":
-          return this.handleAnthropic(base64Image, mimeType, visionModel, apikey);
-        default:
-          throw new Error(`ImageParser: Unsupported LLM provider '${llmProvider}'`);
-      }
-    };
-    this.performOCR = async (file, language) => {
-      const worker = await import_tesseract.default.createWorker(language);
-      const {
-        data: { text }
-      } = await worker.recognize(file);
-      await worker.terminate();
-      return text;
-    };
-    this.handleOpenAI = async (base64Image, mimeType, visionModel, apikey) => {
-      const response = await (0, import_undici3.fetch)("https://api.openai.com/v1/chat/completions", {
-        method: "POST",
-        headers: {
-          "Content-Type": "application/json",
-          Authorization: `Bearer ${apikey}`
-        },
-        body: JSON.stringify({
-          model: visionModel,
-          messages: [
-            {
-              role: "user",
-              content: [
-                {
-                  type: "text",
-                  text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
-                },
-                {
-                  type: "image_url",
-                  image_url: {
-                    url: `data:${mimeType};base64,${base64Image}`
-                  }
-                }
-              ]
-            }
-          ]
-        })
-      });
-      if (!response.ok) {
-        throw new Error(`ImageParser: OpenAI API error ${response.status}`);
-      }
-      const data = await response.json();
-      return data.choices[0].message.content;
-    };
-    this.handleGoogle = async (base64Image, mimeType, visionModel, apikey) => {
-      const response = await (0, import_undici3.fetch)(
-        `https://generativelanguage.googleapis.com/v1beta/models/${visionModel}:generateContent?key=${apikey}`,
-        {
-          method: "POST",
-          headers: {
-            "Content-Type": "application/json"
-          },
-          body: JSON.stringify({
-            contents: [
-              {
-                parts: [
-                  {
-                    text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
-                  },
-                  {
-                    inlineData: {
-                      mimeType,
-                      data: base64Image
-                    }
-                  }
-                ]
-              }
-            ]
-          })
-        }
-      );
-      if (!response.ok) {
-        throw new Error(`Google Gemini error: ${response.statusText}`);
-      }
-      const data = await response.json();
-      return data.candidates[0].content.parts[0].text;
-    };
-    this.handleAnthropic = async (base64Image, mimeType, visionModel, apikey) => {
-      const response = await (0, import_undici3.fetch)("https://api.anthropic.com/v1/messages", {
-        method: "POST",
-        headers: {
-          "Content-Type": "application/json",
-          "x-api-key": apikey,
-          "anthropic-version": "2023-06-01"
-        },
-        body: JSON.stringify({
-          model: visionModel,
-          max_tokens: 300,
-          messages: [
-            {
-              role: "user",
-              content: [
-                {
-                  type: "text",
-                  text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
-                },
-                {
-                  type: "image",
-                  source: {
-                    type: "base64",
-                    media_type: mimeType,
-                    data: base64Image
-                  }
-                }
-              ]
-            }
-          ]
-        })
-      });
-      if (!response.ok) {
-        throw new Error(`Anthropic Claude error: ${response.statusText}`);
-      }
-      const data = await response.json();
-      return data.content[0].text;
-    };
-  }
-};
 // src/file-parser/openoffice-paser.ts
 var OpenOfficeParser = class {
   constructor() {
@@ -842,7 +666,7 @@ var PowerPointParser = class {
     this.mimes = ["application/vnd.openxmlformats-officedocument.presentationml.presentation"];
     this.anyExtractor = anyExtractor;
   }
-  async apply(file, extractingOptions) {
+  async apply(file) {
     const fileMatchRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+\.xml|ppt\/media\/image\d+\..+|ppt\/slides\/_rels\/slide\d+\.xml.rels/i;
     const slideNumberRegex = /slide(\d+)\.xml/;
     const imageRegex = /^ppt\/media\/image\d+\..+$/i;
@@ -873,7 +697,7 @@ var PowerPointParser = class {
           const imageFullPath = `ppt/${imagePath.replace(/^(\.\.\/)+/, "")}`;
           const imageBuffer = imageBuffers[imageFullPath];
           if (imageBuffer) {
-            const imageDescription = await this.convertImageToText(imageBuffer, extractingOptions);
+            const imageDescription = await this.convertImageToText(imageBuffer);
             if (imageDescription) {
               results.push(`[Image]: ${imageDescription}`);
             }
@@ -898,8 +722,8 @@ var PowerPointParser = class {
     const rels = parseString(relsXml).getElementsByTagName("Relationship");
     return Array.from(rels).filter((rel) => rel.getAttribute("Type")?.includes("/image") && rel.getAttribute("Target")).map((rel) => rel.getAttribute("Target"));
   }
-  async convertImageToText(imageBuffer, extractingOptions) {
-    return this.anyExtractor.parseFile(imageBuffer, null, extractingOptions);
+  async convertImageToText(imageBuffer) {
+    return this.anyExtractor.parseFile(imageBuffer, null);
   }
 };
@@ -919,7 +743,7 @@ var WordParser = class {
     this.mimes = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];
     this.anyExtractor = anyExtractor;
   }
-  async apply(file, extractingOptions) {
+  async apply(file) {
     const mainRegex = /word\/document[\d+]?.xml/;
     const footnotesRegex = /word\/footnotes[\d+]?.xml/;
     const endnotesRegex = /word\/endnotes[\d+]?.xml/;
@@ -949,21 +773,10 @@ var WordParser = class {
       const mainText = await this.extractTextAndImages(
         mainDoc.content.toString(),
         embedMap,
-        mediaFiles,
-        extractingOptions
+        mediaFiles
       );
-      const footnotesText = footnotesDoc ? await this.extractTextAndImages(
-        footnotesDoc.content.toString(),
-        embedMap,
-        mediaFiles,
-        extractingOptions
-      ) : "";
-      const endnotesText = endnotesDoc ? await this.extractTextAndImages(
-        endnotesDoc.content.toString(),
-        embedMap,
-        mediaFiles,
-        extractingOptions
-      ) : "";
+      const footnotesText = footnotesDoc ? await this.extractTextAndImages(footnotesDoc.content.toString(), embedMap, mediaFiles) : "";
+      const endnotesText = endnotesDoc ? await this.extractTextAndImages(endnotesDoc.content.toString(), embedMap, mediaFiles) : "";
       return [
         mainText,
         footnotesText ? "\n--- Footnotes ---\n" + footnotesText : "",
@@ -988,7 +801,7 @@ var WordParser = class {
     }
     return map;
   }
-  async extractTextAndImages(xmlContent, embedMap, mediaFiles, extractingOptions) {
+  async extractTextAndImages(xmlContent, embedMap, mediaFiles) {
     const doc = parseString(xmlContent);
     const paragraphs = Array.from(doc.getElementsByTagName("w:p"));
     const parts = [];
@@ -1004,7 +817,7 @@ var WordParser = class {
           const imageFile = mediaFiles[embedMap[embedId]];
           if (imageFile) {
             const imageBuffer = imageFile.content;
-            const imageDescription = await this.convertImageToText(imageBuffer, extractingOptions);
+            const imageDescription = await this.convertImageToText(imageBuffer);
             paragraphText += `
 [Image: ${imageDescription}]`;
           }
@@ -1016,8 +829,8 @@ var WordParser = class {
     }
     return parts.join("\n");
   }
-  async convertImageToText(imageBuffer, extractingOptions) {
-    return await this.anyExtractor.parseFile(imageBuffer, null, extractingOptions);
+  async convertImageToText(imageBuffer) {
+    return await this.anyExtractor.parseFile(imageBuffer, null);
   }
 };
@@ -1026,7 +839,6 @@ var getAnyExtractor = (config) => {
   const anyExtractor = new AnyExtractor(config);
   const parsers = [
     new ExcelParser(anyExtractor),
-    new ImageParser(),
     new OpenOfficeParser(),
     new PDFParser(),
     new PowerPointParser(anyExtractor),