npm - any-extractor - Versions diffs - 2.0.2 → 2.0.4 - Mend

any-extractor 2.0.2 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/README.md CHANGED Viewed

@@ -4,31 +4,10 @@
 [![License](https://img.shields.io/npm/l/any-extractor)](https://www.npmjs.com/package/any-extractor)
 [![Downloads](https://img.shields.io/npm/dm/any-extractor)](https://www.npmjs.com/package/any-extractor)
-A Node.js package to extract text from any file.
-> This package is designed for **Node.js only** and does not work in browser environments.
-## Table of Contents
-- [Features](#features)
-- [Supported Files](#supported-files)
-- [Installation](#installation)
-- [Getting Started](#getting-started)
-- [Advanced Usage](#advanced-usage)
-- [Custom Parsers](#custom-parsers)
-- [Confluence Crawling](#confluence-crawling)
-- [Needs Work](#needs-work)
-- [Contributing](#contributing)
-- [Credits](#credits)
-- [License](#license)
-- [Support](#support)
+A Node.js package to extract text from files.
 ## Features
-- **Multi-format file support:** Extracts text from a wide range of file types. (See below for list of supported files)
-- **OCR for images:** Uses Optical Character Recognition to extract text from images within documents and standalone image files.
-- **LLM for image description:** Leverages AI to extract images description, providing richer information.
-- **ES6 and CommonJS support:** Supports both modern ES6 and traditional CommonJS JavaScript environments.
 - **Flexible input options:** Supports local file path, buffers, and file URLs.
 - **Auto type detection:** Automatically detects file type and extracts text using MIME type.
 - **Customizable parsers:** Allows creating new or modifying existing document parsers for any MIME types.
@@ -38,43 +17,29 @@ A Node.js package to extract text from any file.
 Here's a breakdown of the text extraction capabilities for each file type:
-| File Type                                        | Text Extraction | Image Extraction |
-| ------------------------------------------------ | --------------- | ---------------- |
-| `.docx`                                          | ✅              | ✅               |
-| `.pptx`                                          | ✅              | ✅               |
-| `.xlsx`                                          | ✅              | ✅               |
-| `.pdf`                                           | ✅              | ❌               |
-| `.png`                                           | N/A             | ✅               |
-| `.jpg`, `.jpeg`                                  | N/A             | ✅               |
-| `.webp`                                          | N/A             | ✅               |
-| `.odt`                                           | ✅              | ❌               |
-| `.odp`                                           | ✅              | ❌               |
-| `.ods`                                           | ✅              | ❌               |
-| `.csv`                                           | ✅              | N/A              |
-| `.txt`                                           | ✅              | N/A              |
-| `.json`                                          | ✅              | N/A              |
-| Plain text (e.g., `.py`,<br> `.ts`, `.md`, etc.) | ✅              | N/A              |
-| `confluence`                                     | ✅              | ✅               |
+| File Type                                        | Text Extraction |
+| ------------------------------------------------ | --------------- |
+| `.docx`                                          | ✅              |
+| `.pptx`                                          | ✅              |
+| `.xlsx`                                          | ✅              |
+| `.pdf`                                           | ✅              |
+| `.odt`                                           | ✅              |
+| `.odp`                                           | ✅              |
+| `.ods`                                           | ✅              |
+| `.csv`                                           | ✅              |
+| `.txt`                                           | ✅              |
+| `.json`                                          | ✅              |
+| Plain text (e.g., `.py`,<br> `.ts`, `.md`, etc.) | ✅              |
+| `confluence`                                     | ✅              |
 ## Installation
-This is a Node.js module available through the npm registry.<br>
-To work with this package, Node.js 20 or higher is required.
-#### Package Manager
-Using npm:
 ```bash
 npm install any-extractor
 ```
 ## Getting Started
-Here's a basic example of how to use AnyExtractor in both ES6 and CommonJS environments:
-#### ES6 (using `import`):
 ```ts
 import { getAnyExtractor } from 'any-extractor';
@@ -87,64 +52,8 @@ async function extractFromFile() {
 extractFromFile();
 ```
-#### CommonJS (using `require`):
-```ts
-const { getAnyExtractor } = require('any-extractor');
-async function extractFromFile() {
-  const textExt = getAnyExtractor();
-  const result = await textExt.parseFile('./filename.docx');
-  console.log(result);
-}
-extractFromFile();
-```
 ## Advanced Usage
-#### Parsing Images:
-AnyExtractor provides two primary methods for extracting text from images.
-1. Optical Character Recognition (OCR):<br>
-   ```ts
-   const anyExt = getAnyExtractor();
-   const text = await anyExt.parseFile('./imgfile.png', null, {
-     extractImages: true,
-     imageExtractionMethod: 'ocr',
-     language: 'eng',
-   });
-   console.log('Extracted Text:', text);
-   ```
-2. Using LLM:<br>
-   ```ts
-   const anyExt = getAnyExtractor({
-     llmProvider: 'google',
-     visionModel: 'gemini-2.0-flash',
-     apikey: '<your-api-key>',
-   });
-   const text = await anyExt.parseFile('./imgfile.png', null, {
-     extractImages: true,
-     imageExtractionMethod: 'llm',
-     language: 'eng',
-   });
-   console.log('Extracted Text:', text);
-   ```
-> Llm parsing supports `openai`, `google` and `anthropic` llmProvider for now. But you can always overwrite the image parser implementation with your code.
-> Optional argument of methods `getAnyExtractor` and `parseFile` are required for the extractor to parse images. Otherwise it will return empty string.
-> Image parsing also works other files, e.g., .docx, .pptx etc (see the table above).
 #### Authorization Parameter
 The optional second argument of `parseFile` (`basicAuth`; pass `null` or omit it when no authentication is needed) is for Basic Authentication when accessing file URLs. Format: `Basic <base64-encoded-credentials>`
@@ -177,12 +86,7 @@ import { AnyParserMethod } from 'any-extractor';
 export class CustomParser implements AnyParserMethod {
   public mimes = ['application/hdb', 'application/sql'];
-  public apply = async (
-    file: Buffer,
-    mimeType: string,
-    extractingOptions: ExtractingOptions,
-    extractorConfig: ExtractorConfig,
-  ): Promise<string> => {
+  public apply = async (file: Buffer, extractorConfig: ExtractorConfig): Promise<string> => {
     // your text extraction logic
   };
 }
@@ -208,11 +112,6 @@ const { getAnyExtractor } = require('any-extractor');
 async function crawlConfluence() {
   const textExt = getAnyExtractor({
-    llm: {
-      llmProvider: 'google',
-      visionModel: 'gemini-2.0-flash',
-      apikey: '<your-api-key>',
-    },
     confluence: {
       baseUrl: '<baseurl>',
       email: '<username>',
@@ -220,39 +119,12 @@ async function crawlConfluence() {
     },
   });
-  const result = await textExt.parseConfluenceDoc('<pageId>', {
-    extractAttachments: true,
-    extractImages: false,
-    imageExtractionMethod: 'ocr',
-    language: 'eng',
-  });
+  const result = await textExt.parseConfluenceDoc('<pageId>');
 }
 crawlConfluence();
 ```
-## Needs Work
-1. `.pdf` and `OpenOffice` files doesn't support image extraction.
-2. `.xlsx` parsing isn't well structured and ordered.
-3. Doesn't support text extraction from web and compressed files.
-## Changelog
-This project uses [semantic-release](https://github.com/semantic-release/semantic-release) for automated versioning and changelog generation. See the [Releases](https://github.com/pranit-sh/any-extractor/releases) section for details.
-## Contributing
-Contributions are welcome! Please follow the [Conventional Commits](https://www.conventionalcommits.org/) style when committing changes.
-1. Fork the repository
-2. Create your feature branch (`git checkout -b feat/my-feature`)
-3. Commit your changes
-4. Push to the branch
-5. Open a Pull Request
-> Pre-commit hooks will run linting and formatting checks automatically.
 ## Credits
 **any-extractor** is inspired by [officeparser](https://www.npmjs.com/package/officeparser) and uses [tesseract.js](https://www.npmjs.com/package/tesseract.js)<br>

package/dist/index.d.mts CHANGED Viewed

@@ -1,35 +1,18 @@
 type AnyParserMethod = {
     mimes: string[];
-    apply: (_: Buffer, ___: ExtractingOptions, ____: ExtractorConfig) => Promise<string>;
+    apply: (_: Buffer, ____: ExtractorConfig) => Promise<string>;
 };
 type ExtractedFile = {
     path: string;
     content: Buffer;
 };
 type ExtractorConfig = {
-    llm?: {
-        llmProvider: 'openai' | 'google' | 'anthropic';
-        visionModel: string;
-        apikey: string;
-    };
     confluence?: {
         baseUrl: string;
         email: string;
         apiKey: string;
     };
 };
-type ExtractingOptions = {
-    extractImages: boolean;
-    imageExtractionMethod: 'llm' | 'ocr';
-    language: SupportedOCRLanguage;
-};
-type ConfluenceOptions = {
-    extractAttachments: boolean;
-    extractImages: boolean;
-    imageExtractionMethod: 'llm' | 'ocr';
-    language: SupportedOCRLanguage;
-};
-type SupportedOCRLanguage = 'afr' | 'amh' | 'ara' | 'asm' | 'aze' | 'aze_cyrl' | 'bel' | 'ben' | 'bod' | 'bos' | 'bul' | 'cat' | 'ceb' | 'ces' | 'chi_sim' | 'chi_tra' | 'chr' | 'cym' | 'dan' | 'deu' | 'dzo' | 'ell' | 'eng' | 'enm' | 'epo' | 'est' | 'eus' | 'fas' | 'fin' | 'fra' | 'frk' | 'frm' | 'gle' | 'glg' | 'grc' | 'guj' | 'hat' | 'heb' | 'hin' | 'hrv' | 'hun' | 'iku' | 'ind' | 'isl' | 'ita' | 'ita_old' | 'jav' | 'jpn' | 'kan' | 'kat' | 'kat_old' | 'kaz' | 'khm' | 'kir' | 'kor' | 'kur' | 'lao' | 'lat' | 'lav' | 'lit' | 'mal' | 'mar' | 'mkd' | 'mlt' | 'msa' | 'mya' | 'nep' | 'nld' | 'nor' | 'ori' | 'pan' | 'pol' | 'por' | 'pus' | 'ron' | 'rus' | 'san' | 'sin' | 'slk' | 'slv' | 'spa' | 'spa_old' | 'sqi' | 'srp' | 'srp_latn' | 'swa' | 'swe' | 'syr' | 'tam' | 'tel' | 'tgk' | 'tgl' | 'tha' | 'tir' | 'tur' | 'uig' | 'ukr' | 'urd' | 'uzb' | 'uzb_cyrl' | 'vie' | 'yid';
 type ExtractedXmlItem = {
     type: string;
     content: string;
@@ -40,8 +23,8 @@ declare class AnyExtractor {
     constructor(extractorConfig?: ExtractorConfig);
     private mimeParserMap;
     addParser: (method: AnyParserMethod) => this;
-    parseFile: (input: string | Buffer, basicAuth?: string | null, extractingOptions?: ExtractingOptions) => Promise<string>;
-    parseConfluenceDoc: (pageId: string, extractingOptions?: ConfluenceOptions) => Promise<string>;
+    parseFile: (input: string | Buffer, basicAuth?: string | null) => Promise<string>;
+    parseConfluenceDoc: (pageId: string) => Promise<string>;
 }
 /**
@@ -52,4 +35,4 @@ declare class AnyExtractor {
  */
 declare const getAnyExtractor: (config?: ExtractorConfig) => AnyExtractor;
-export { type AnyParserMethod, type ConfluenceOptions, type ExtractedFile, type ExtractedXmlItem, type ExtractingOptions, type ExtractorConfig, type SupportedOCRLanguage, getAnyExtractor };
+export { type AnyParserMethod, type ExtractedFile, type ExtractedXmlItem, type ExtractorConfig, getAnyExtractor };

package/dist/index.d.ts CHANGED Viewed

@@ -1,35 +1,18 @@
 type AnyParserMethod = {
     mimes: string[];
-    apply: (_: Buffer, ___: ExtractingOptions, ____: ExtractorConfig) => Promise<string>;
+    apply: (_: Buffer, ____: ExtractorConfig) => Promise<string>;
 };
 type ExtractedFile = {
     path: string;
     content: Buffer;
 };
 type ExtractorConfig = {
-    llm?: {
-        llmProvider: 'openai' | 'google' | 'anthropic';
-        visionModel: string;
-        apikey: string;
-    };
     confluence?: {
         baseUrl: string;
         email: string;
         apiKey: string;
     };
 };
-type ExtractingOptions = {
-    extractImages: boolean;
-    imageExtractionMethod: 'llm' | 'ocr';
-    language: SupportedOCRLanguage;
-};
-type ConfluenceOptions = {
-    extractAttachments: boolean;
-    extractImages: boolean;
-    imageExtractionMethod: 'llm' | 'ocr';
-    language: SupportedOCRLanguage;
-};
-type SupportedOCRLanguage = 'afr' | 'amh' | 'ara' | 'asm' | 'aze' | 'aze_cyrl' | 'bel' | 'ben' | 'bod' | 'bos' | 'bul' | 'cat' | 'ceb' | 'ces' | 'chi_sim' | 'chi_tra' | 'chr' | 'cym' | 'dan' | 'deu' | 'dzo' | 'ell' | 'eng' | 'enm' | 'epo' | 'est' | 'eus' | 'fas' | 'fin' | 'fra' | 'frk' | 'frm' | 'gle' | 'glg' | 'grc' | 'guj' | 'hat' | 'heb' | 'hin' | 'hrv' | 'hun' | 'iku' | 'ind' | 'isl' | 'ita' | 'ita_old' | 'jav' | 'jpn' | 'kan' | 'kat' | 'kat_old' | 'kaz' | 'khm' | 'kir' | 'kor' | 'kur' | 'lao' | 'lat' | 'lav' | 'lit' | 'mal' | 'mar' | 'mkd' | 'mlt' | 'msa' | 'mya' | 'nep' | 'nld' | 'nor' | 'ori' | 'pan' | 'pol' | 'por' | 'pus' | 'ron' | 'rus' | 'san' | 'sin' | 'slk' | 'slv' | 'spa' | 'spa_old' | 'sqi' | 'srp' | 'srp_latn' | 'swa' | 'swe' | 'syr' | 'tam' | 'tel' | 'tgk' | 'tgl' | 'tha' | 'tir' | 'tur' | 'uig' | 'ukr' | 'urd' | 'uzb' | 'uzb_cyrl' | 'vie' | 'yid';
 type ExtractedXmlItem = {
     type: string;
     content: string;
@@ -40,8 +23,8 @@ declare class AnyExtractor {
     constructor(extractorConfig?: ExtractorConfig);
     private mimeParserMap;
     addParser: (method: AnyParserMethod) => this;
-    parseFile: (input: string | Buffer, basicAuth?: string | null, extractingOptions?: ExtractingOptions) => Promise<string>;
-    parseConfluenceDoc: (pageId: string, extractingOptions?: ConfluenceOptions) => Promise<string>;
+    parseFile: (input: string | Buffer, basicAuth?: string | null) => Promise<string>;
+    parseConfluenceDoc: (pageId: string) => Promise<string>;
 }
 /**
@@ -52,4 +35,4 @@ declare class AnyExtractor {
  */
 declare const getAnyExtractor: (config?: ExtractorConfig) => AnyExtractor;
-export { type AnyParserMethod, type ConfluenceOptions, type ExtractedFile, type ExtractedXmlItem, type ExtractingOptions, type ExtractorConfig, type SupportedOCRLanguage, getAnyExtractor };
+export { type AnyParserMethod, type ExtractedFile, type ExtractedXmlItem, type ExtractorConfig, getAnyExtractor };