any-extractor 2.0.2 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -145
- package/dist/index.d.mts +4 -21
- package/dist/index.d.ts +4 -21
- package/dist/index.js +23 -210
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +23 -210
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -3
package/README.md
CHANGED
|
@@ -4,31 +4,10 @@
|
|
|
4
4
|
[](https://www.npmjs.com/package/any-extractor)
|
|
5
5
|
[](https://www.npmjs.com/package/any-extractor)
|
|
6
6
|
|
|
7
|
-
A Node.js package to extract text from any file.
|
|
8
|
-
|
|
9
|
-
> This package is designed for **Node.js only** and does not work in browser environments.
|
|
10
|
-
|
|
11
|
-
## Table of Contents
|
|
12
|
-
|
|
13
|
-
- [Features](#features)
|
|
14
|
-
- [Supported Files](#supported-files)
|
|
15
|
-
- [Installation](#installation)
|
|
16
|
-
- [Getting Started](#getting-started)
|
|
17
|
-
- [Advanced Usage](#advanced-usage)
|
|
18
|
-
- [Custom Parsers](#custom-parsers)
|
|
19
|
-
- [Confluence Crawling](#confluence-crawling)
|
|
20
|
-
- [Needs Work](#needs-work)
|
|
21
|
-
- [Contributing](#contributing)
|
|
22
|
-
- [Credits](#credits)
|
|
23
|
-
- [License](#license)
|
|
24
|
-
- [Support](#support)
|
|
7
|
+
A Node.js package to extract text from files.
|
|
25
8
|
|
|
26
9
|
## Features
|
|
27
10
|
|
|
28
|
-
- **Multi-format file support:** Extracts text from a wide range of file types. (See below for list of supported files)
|
|
29
|
-
- **OCR for images:** Uses Optical Character Recognition to extract text from images within documents and standalone image files.
|
|
30
|
-
- **LLM for image description:** Leverages AI to extract images description, providing richer information.
|
|
31
|
-
- **ES6 and CommonJS support:** Supports both modern ES6 and traditional CommonJS JavaScript environments.
|
|
32
11
|
- **Flexible input options:** Supports local file path, buffers, and file URLs.
|
|
33
12
|
- **Auto type detection:** Automatically detects file type and extracts text using MIME type.
|
|
34
13
|
- **Customizable parsers:** Allows creating new or modifying existing document parsers for any MIME types.
|
|
@@ -38,43 +17,29 @@ A Node.js package to extract text from any file.
|
|
|
38
17
|
|
|
39
18
|
Here's a breakdown of the text extraction capabilities for each file type:
|
|
40
19
|
|
|
41
|
-
| File Type | Text Extraction |
|
|
42
|
-
| ------------------------------------------------ | --------------- |
|
|
43
|
-
| `.docx` | ✅ |
|
|
44
|
-
| `.pptx` | ✅ |
|
|
45
|
-
| `.xlsx` | ✅ |
|
|
46
|
-
| `.pdf` | ✅ |
|
|
47
|
-
| `.
|
|
48
|
-
| `.
|
|
49
|
-
| `.
|
|
50
|
-
| `.
|
|
51
|
-
| `.
|
|
52
|
-
| `.
|
|
53
|
-
| `.
|
|
54
|
-
|
|
|
55
|
-
| `.json` | ✅ | N/A |
|
|
56
|
-
| Plain text (e.g., `.py`,<br> `.ts`, `.md`, etc.) | ✅ | N/A |
|
|
57
|
-
| `confluence` | ✅ | ✅ |
|
|
20
|
+
| File Type | Text Extraction |
|
|
21
|
+
| ------------------------------------------------ | --------------- |
|
|
22
|
+
| `.docx` | ✅ |
|
|
23
|
+
| `.pptx` | ✅ |
|
|
24
|
+
| `.xlsx` | ✅ |
|
|
25
|
+
| `.pdf` | ✅ |
|
|
26
|
+
| `.odt` | ✅ |
|
|
27
|
+
| `.odp` | ✅ |
|
|
28
|
+
| `.ods` | ✅ |
|
|
29
|
+
| `.csv` | ✅ |
|
|
30
|
+
| `.txt` | ✅ |
|
|
31
|
+
| `.json` | ✅ |
|
|
32
|
+
| Plain text (e.g., `.py`,<br> `.ts`, `.md`, etc.) | ✅ |
|
|
33
|
+
| `confluence` | ✅ |
|
|
58
34
|
|
|
59
35
|
## Installation
|
|
60
36
|
|
|
61
|
-
This is a Node.js module available through the npm registry.<br>
|
|
62
|
-
To work with this package, Node.js 20 or higher is required.
|
|
63
|
-
|
|
64
|
-
#### Package Manager
|
|
65
|
-
|
|
66
|
-
Using npm:
|
|
67
|
-
|
|
68
37
|
```bash
|
|
69
38
|
npm install any-extractor
|
|
70
39
|
```
|
|
71
40
|
|
|
72
41
|
## Getting Started
|
|
73
42
|
|
|
74
|
-
Here's a basic example of how to use AnyExtractor in both ES6 and CommonJS environments:
|
|
75
|
-
|
|
76
|
-
#### ES6 (using `import`):
|
|
77
|
-
|
|
78
43
|
```ts
|
|
79
44
|
import { getAnyExtractor } from 'any-extractor';
|
|
80
45
|
|
|
@@ -87,64 +52,8 @@ async function extractFromFile() {
|
|
|
87
52
|
extractFromFile();
|
|
88
53
|
```
|
|
89
54
|
|
|
90
|
-
#### CommonJS (using `require`):
|
|
91
|
-
|
|
92
|
-
```ts
|
|
93
|
-
const { getAnyExtractor } = require('any-extractor');
|
|
94
|
-
|
|
95
|
-
async function extractFromFile() {
|
|
96
|
-
const textExt = getAnyExtractor();
|
|
97
|
-
const result = await textExt.parseFile('./filename.docx');
|
|
98
|
-
console.log(result);
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
extractFromFile();
|
|
102
|
-
```
|
|
103
|
-
|
|
104
55
|
## Advanced Usage
|
|
105
56
|
|
|
106
|
-
#### Parsing Images:
|
|
107
|
-
|
|
108
|
-
AnyExtractor provides two primary methods for extracting text from images.
|
|
109
|
-
|
|
110
|
-
1. Optical Character Recognition (OCR):<br>
|
|
111
|
-
|
|
112
|
-
```ts
|
|
113
|
-
const anyExt = getAnyExtractor();
|
|
114
|
-
|
|
115
|
-
const text = await anyExt.parseFile('./imgfile.png', null, {
|
|
116
|
-
extractImages: true,
|
|
117
|
-
imageExtractionMethod: 'ocr',
|
|
118
|
-
language: 'eng',
|
|
119
|
-
});
|
|
120
|
-
|
|
121
|
-
console.log('Extracted Text:', text);
|
|
122
|
-
```
|
|
123
|
-
|
|
124
|
-
2. Using LLM:<br>
|
|
125
|
-
|
|
126
|
-
```ts
|
|
127
|
-
const anyExt = getAnyExtractor({
|
|
128
|
-
llmProvider: 'google',
|
|
129
|
-
visionModel: 'gemini-2.0-flash',
|
|
130
|
-
apikey: '<your-api-key>',
|
|
131
|
-
});
|
|
132
|
-
|
|
133
|
-
const text = await anyExt.parseFile('./imgfile.png', null, {
|
|
134
|
-
extractImages: true,
|
|
135
|
-
imageExtractionMethod: 'llm',
|
|
136
|
-
language: 'eng',
|
|
137
|
-
});
|
|
138
|
-
|
|
139
|
-
console.log('Extracted Text:', text);
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
> Llm parsing supports `openai`, `google` and `anthropic` llmProvider for now. But you can always overwrite the image parser implementation with your code.
|
|
143
|
-
|
|
144
|
-
> Optional argument of methods `getAnyExtractor` and `parseFile` are required for the extractor to parse images. Otherwise it will return empty string.
|
|
145
|
-
|
|
146
|
-
> Image parsing also works other files, e.g., .docx, .pptx etc (see the table above).
|
|
147
|
-
|
|
148
57
|
#### Authorization Parameter
|
|
149
58
|
|
|
150
59
|
The second argument in `parseFile`, shown as `null`, is for Basic Authentication when accessing file URLs. Format: `Basic <base64-encoded-credentials>`
|
|
@@ -177,12 +86,7 @@ import { AnyParserMethod } from 'any-extractor';
|
|
|
177
86
|
export class CustomParser implements AnyParserMethod {
|
|
178
87
|
public mimes = ['application/hdb', 'application/sql'];
|
|
179
88
|
|
|
180
|
-
public apply = async (
|
|
181
|
-
file: Buffer,
|
|
182
|
-
mimeType: string,
|
|
183
|
-
extractingOptions: ExtractingOptions,
|
|
184
|
-
extractorConfig: ExtractorConfig,
|
|
185
|
-
): Promise<string> => {
|
|
89
|
+
public apply = async (file: Buffer, extractorConfig: ExtractorConfig): Promise<string> => {
|
|
186
90
|
// your text extraction logic
|
|
187
91
|
};
|
|
188
92
|
}
|
|
@@ -208,11 +112,6 @@ const { getAnyExtractor } = require('any-extractor');
|
|
|
208
112
|
|
|
209
113
|
async function crawlConfluence() {
|
|
210
114
|
const textExt = getAnyExtractor({
|
|
211
|
-
llm: {
|
|
212
|
-
llmProvider: 'google',
|
|
213
|
-
visionModel: 'gemini-2.0-flash',
|
|
214
|
-
apikey: '<your-api-key>',
|
|
215
|
-
},
|
|
216
115
|
confluence: {
|
|
217
116
|
baseUrl: '<baseurl>',
|
|
218
117
|
email: '<username>',
|
|
@@ -220,39 +119,12 @@ async function crawlConfluence() {
|
|
|
220
119
|
},
|
|
221
120
|
});
|
|
222
121
|
|
|
223
|
-
const result = await textExt.parseConfluenceDoc('<pageId>', {
|
|
224
|
-
extractAttachments: true,
|
|
225
|
-
extractImages: false,
|
|
226
|
-
imageExtractionMethod: 'ocr',
|
|
227
|
-
language: 'eng',
|
|
228
|
-
});
|
|
122
|
+
const result = await textExt.parseConfluenceDoc('<pageId>');
|
|
229
123
|
}
|
|
230
124
|
|
|
231
125
|
crawlConfluence();
|
|
232
126
|
```
|
|
233
127
|
|
|
234
|
-
## Needs Work
|
|
235
|
-
|
|
236
|
-
1. `.pdf` and `OpenOffice` files doesn't support image extraction.
|
|
237
|
-
2. `.xlsx` parsing isn't well structured and ordered.
|
|
238
|
-
3. Doesn't support text extraction from web and compressed files.
|
|
239
|
-
|
|
240
|
-
## Changelog
|
|
241
|
-
|
|
242
|
-
This project uses [semantic-release](https://github.com/semantic-release/semantic-release) for automated versioning and changelog generation. See the [Releases](https://github.com/pranit-sh/any-extractor/releases) section for details.
|
|
243
|
-
|
|
244
|
-
## Contributing
|
|
245
|
-
|
|
246
|
-
Contributions are welcome! Please follow the [Conventional Commits](https://www.conventionalcommits.org/) style when committing changes.
|
|
247
|
-
|
|
248
|
-
1. Fork the repository
|
|
249
|
-
2. Create your feature branch (`git checkout -b feat/my-feature`)
|
|
250
|
-
3. Commit your changes
|
|
251
|
-
4. Push to the branch
|
|
252
|
-
5. Open a Pull Request
|
|
253
|
-
|
|
254
|
-
> Pre-commit hooks will run linting and formatting checks automatically.
|
|
255
|
-
|
|
256
128
|
## Credits
|
|
257
129
|
|
|
258
130
|
**any-extractor** is inspired from [officeparser](https://www.npmjs.com/package/officeparser) and it uses [tesseract.js](https://www.npmjs.com/package/tesseract.js)<br>
|
package/dist/index.d.mts
CHANGED
|
@@ -1,35 +1,18 @@
|
|
|
1
1
|
type AnyParserMethod = {
|
|
2
2
|
mimes: string[];
|
|
3
|
-
apply: (_: Buffer, __: string, ___: ExtractingOptions, ____: ExtractorConfig) => Promise<string>;
|
|
3
|
+
apply: (_: Buffer, ____: ExtractorConfig) => Promise<string>;
|
|
4
4
|
};
|
|
5
5
|
type ExtractedFile = {
|
|
6
6
|
path: string;
|
|
7
7
|
content: Buffer;
|
|
8
8
|
};
|
|
9
9
|
type ExtractorConfig = {
|
|
10
|
-
llm?: {
|
|
11
|
-
llmProvider: 'openai' | 'google' | 'anthropic';
|
|
12
|
-
visionModel: string;
|
|
13
|
-
apikey: string;
|
|
14
|
-
};
|
|
15
10
|
confluence?: {
|
|
16
11
|
baseUrl: string;
|
|
17
12
|
email: string;
|
|
18
13
|
apiKey: string;
|
|
19
14
|
};
|
|
20
15
|
};
|
|
21
|
-
type ExtractingOptions = {
|
|
22
|
-
extractImages: boolean;
|
|
23
|
-
imageExtractionMethod: 'llm' | 'ocr';
|
|
24
|
-
language: SupportedOCRLanguage;
|
|
25
|
-
};
|
|
26
|
-
type ConfluenceOptions = {
|
|
27
|
-
extractAttachments: boolean;
|
|
28
|
-
extractImages: boolean;
|
|
29
|
-
imageExtractionMethod: 'llm' | 'ocr';
|
|
30
|
-
language: SupportedOCRLanguage;
|
|
31
|
-
};
|
|
32
|
-
type SupportedOCRLanguage = 'afr' | 'amh' | 'ara' | 'asm' | 'aze' | 'aze_cyrl' | 'bel' | 'ben' | 'bod' | 'bos' | 'bul' | 'cat' | 'ceb' | 'ces' | 'chi_sim' | 'chi_tra' | 'chr' | 'cym' | 'dan' | 'deu' | 'dzo' | 'ell' | 'eng' | 'enm' | 'epo' | 'est' | 'eus' | 'fas' | 'fin' | 'fra' | 'frk' | 'frm' | 'gle' | 'glg' | 'grc' | 'guj' | 'hat' | 'heb' | 'hin' | 'hrv' | 'hun' | 'iku' | 'ind' | 'isl' | 'ita' | 'ita_old' | 'jav' | 'jpn' | 'kan' | 'kat' | 'kat_old' | 'kaz' | 'khm' | 'kir' | 'kor' | 'kur' | 'lao' | 'lat' | 'lav' | 'lit' | 'mal' | 'mar' | 'mkd' | 'mlt' | 'msa' | 'mya' | 'nep' | 'nld' | 'nor' | 'ori' | 'pan' | 'pol' | 'por' | 'pus' | 'ron' | 'rus' | 'san' | 'sin' | 'slk' | 'slv' | 'spa' | 'spa_old' | 'sqi' | 'srp' | 'srp_latn' | 'swa' | 'swe' | 'syr' | 'tam' | 'tel' | 'tgk' | 'tgl' | 'tha' | 'tir' | 'tur' | 'uig' | 'ukr' | 'urd' | 'uzb' | 'uzb_cyrl' | 'vie' | 'yid';
|
|
33
16
|
type ExtractedXmlItem = {
|
|
34
17
|
type: string;
|
|
35
18
|
content: string;
|
|
@@ -40,8 +23,8 @@ declare class AnyExtractor {
|
|
|
40
23
|
constructor(extractorConfig?: ExtractorConfig);
|
|
41
24
|
private mimeParserMap;
|
|
42
25
|
addParser: (method: AnyParserMethod) => this;
|
|
43
|
-
parseFile: (input: string | Buffer, basicAuth?: string | null
|
|
44
|
-
parseConfluenceDoc: (pageId: string
|
|
26
|
+
parseFile: (input: string | Buffer, basicAuth?: string | null) => Promise<string>;
|
|
27
|
+
parseConfluenceDoc: (pageId: string) => Promise<string>;
|
|
45
28
|
}
|
|
46
29
|
|
|
47
30
|
/**
|
|
@@ -52,4 +35,4 @@ declare class AnyExtractor {
|
|
|
52
35
|
*/
|
|
53
36
|
declare const getAnyExtractor: (config?: ExtractorConfig) => AnyExtractor;
|
|
54
37
|
|
|
55
|
-
export { type AnyParserMethod, type
|
|
38
|
+
export { type AnyParserMethod, type ExtractedFile, type ExtractedXmlItem, type ExtractorConfig, getAnyExtractor };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,35 +1,18 @@
|
|
|
1
1
|
type AnyParserMethod = {
|
|
2
2
|
mimes: string[];
|
|
3
|
-
apply: (_: Buffer, __: string, ___: ExtractingOptions, ____: ExtractorConfig) => Promise<string>;
|
|
3
|
+
apply: (_: Buffer, ____: ExtractorConfig) => Promise<string>;
|
|
4
4
|
};
|
|
5
5
|
type ExtractedFile = {
|
|
6
6
|
path: string;
|
|
7
7
|
content: Buffer;
|
|
8
8
|
};
|
|
9
9
|
type ExtractorConfig = {
|
|
10
|
-
llm?: {
|
|
11
|
-
llmProvider: 'openai' | 'google' | 'anthropic';
|
|
12
|
-
visionModel: string;
|
|
13
|
-
apikey: string;
|
|
14
|
-
};
|
|
15
10
|
confluence?: {
|
|
16
11
|
baseUrl: string;
|
|
17
12
|
email: string;
|
|
18
13
|
apiKey: string;
|
|
19
14
|
};
|
|
20
15
|
};
|
|
21
|
-
type ExtractingOptions = {
|
|
22
|
-
extractImages: boolean;
|
|
23
|
-
imageExtractionMethod: 'llm' | 'ocr';
|
|
24
|
-
language: SupportedOCRLanguage;
|
|
25
|
-
};
|
|
26
|
-
type ConfluenceOptions = {
|
|
27
|
-
extractAttachments: boolean;
|
|
28
|
-
extractImages: boolean;
|
|
29
|
-
imageExtractionMethod: 'llm' | 'ocr';
|
|
30
|
-
language: SupportedOCRLanguage;
|
|
31
|
-
};
|
|
32
|
-
type SupportedOCRLanguage = 'afr' | 'amh' | 'ara' | 'asm' | 'aze' | 'aze_cyrl' | 'bel' | 'ben' | 'bod' | 'bos' | 'bul' | 'cat' | 'ceb' | 'ces' | 'chi_sim' | 'chi_tra' | 'chr' | 'cym' | 'dan' | 'deu' | 'dzo' | 'ell' | 'eng' | 'enm' | 'epo' | 'est' | 'eus' | 'fas' | 'fin' | 'fra' | 'frk' | 'frm' | 'gle' | 'glg' | 'grc' | 'guj' | 'hat' | 'heb' | 'hin' | 'hrv' | 'hun' | 'iku' | 'ind' | 'isl' | 'ita' | 'ita_old' | 'jav' | 'jpn' | 'kan' | 'kat' | 'kat_old' | 'kaz' | 'khm' | 'kir' | 'kor' | 'kur' | 'lao' | 'lat' | 'lav' | 'lit' | 'mal' | 'mar' | 'mkd' | 'mlt' | 'msa' | 'mya' | 'nep' | 'nld' | 'nor' | 'ori' | 'pan' | 'pol' | 'por' | 'pus' | 'ron' | 'rus' | 'san' | 'sin' | 'slk' | 'slv' | 'spa' | 'spa_old' | 'sqi' | 'srp' | 'srp_latn' | 'swa' | 'swe' | 'syr' | 'tam' | 'tel' | 'tgk' | 'tgl' | 'tha' | 'tir' | 'tur' | 'uig' | 'ukr' | 'urd' | 'uzb' | 'uzb_cyrl' | 'vie' | 'yid';
|
|
33
16
|
type ExtractedXmlItem = {
|
|
34
17
|
type: string;
|
|
35
18
|
content: string;
|
|
@@ -40,8 +23,8 @@ declare class AnyExtractor {
|
|
|
40
23
|
constructor(extractorConfig?: ExtractorConfig);
|
|
41
24
|
private mimeParserMap;
|
|
42
25
|
addParser: (method: AnyParserMethod) => this;
|
|
43
|
-
parseFile: (input: string | Buffer, basicAuth?: string | null
|
|
44
|
-
parseConfluenceDoc: (pageId: string
|
|
26
|
+
parseFile: (input: string | Buffer, basicAuth?: string | null) => Promise<string>;
|
|
27
|
+
parseConfluenceDoc: (pageId: string) => Promise<string>;
|
|
45
28
|
}
|
|
46
29
|
|
|
47
30
|
/**
|
|
@@ -52,4 +35,4 @@ declare class AnyExtractor {
|
|
|
52
35
|
*/
|
|
53
36
|
declare const getAnyExtractor: (config?: ExtractorConfig) => AnyExtractor;
|
|
54
37
|
|
|
55
|
-
export { type AnyParserMethod, type
|
|
38
|
+
export { type AnyParserMethod, type ExtractedFile, type ExtractedXmlItem, type ExtractorConfig, getAnyExtractor };
|