ocr-ai 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +458 -339
- package/dist/index.js +4 -6
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +4 -6
- package/dist/index.mjs.map +1 -1
- package/package.json +68 -68
package/README.md
CHANGED
|
@@ -1,339 +1,458 @@
|
|
|
1
|
-
# ocr-ai
|
|
2
|
-
|
|
3
|
-
Multi-provider AI document extraction for Node.js. Extract text or structured JSON from documents using Gemini, OpenAI, Claude, Grok, or Vertex AI.
|
|
4
|
-
|
|
5
|
-
## Installation
|
|
6
|
-
|
|
7
|
-
```bash
|
|
8
|
-
npm install ocr-ai
|
|
9
|
-
```
|
|
10
|
-
|
|
11
|
-
## Quick Start
|
|
12
|
-
|
|
13
|
-
### Using Gemini
|
|
14
|
-
|
|
15
|
-
```typescript
|
|
16
|
-
import { OcrAI } from 'ocr-ai';
|
|
17
|
-
|
|
18
|
-
const ocr = new OcrAI({
|
|
19
|
-
provider: 'gemini',
|
|
20
|
-
apiKey: 'YOUR_GEMINI_API_KEY',
|
|
21
|
-
});
|
|
22
|
-
|
|
23
|
-
const result = await ocr.extract('./invoice.png');
|
|
24
|
-
|
|
25
|
-
if (result.success) {
|
|
26
|
-
const text = result.content;
|
|
27
|
-
console.log(text);
|
|
28
|
-
}
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
### Using OpenAI
|
|
32
|
-
|
|
33
|
-
```typescript
|
|
34
|
-
import { OcrAI } from 'ocr-ai';
|
|
35
|
-
|
|
36
|
-
const ocr = new OcrAI({
|
|
37
|
-
provider: 'openai',
|
|
38
|
-
apiKey: 'YOUR_OPENAI_API_KEY',
|
|
39
|
-
});
|
|
40
|
-
|
|
41
|
-
const result = await ocr.extract('./document.pdf');
|
|
42
|
-
|
|
43
|
-
if (result.success) {
|
|
44
|
-
const text = result.content;
|
|
45
|
-
console.log(text);
|
|
46
|
-
}
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
### Custom Model
|
|
50
|
-
|
|
51
|
-
You can specify a custom model for any provider:
|
|
52
|
-
|
|
53
|
-
```typescript
|
|
54
|
-
const ocr = new OcrAI({
|
|
55
|
-
provider: 'gemini',
|
|
56
|
-
apiKey: 'YOUR_GEMINI_API_KEY',
|
|
57
|
-
model: 'gemini-2.0-flash', // Use a specific model
|
|
58
|
-
});
|
|
59
|
-
|
|
60
|
-
// Or with OpenAI
|
|
61
|
-
const ocrOpenAI = new OcrAI({
|
|
62
|
-
provider: 'openai',
|
|
63
|
-
apiKey: 'YOUR_OPENAI_API_KEY',
|
|
64
|
-
model: 'gpt-4o-mini', // Use a different model
|
|
65
|
-
});
|
|
66
|
-
```
|
|
67
|
-
|
|
68
|
-
### From URL
|
|
69
|
-
|
|
70
|
-
Extract directly from a URL:
|
|
71
|
-
|
|
72
|
-
```typescript
|
|
73
|
-
const result = await ocr.extract('https://example.com/invoice.png');
|
|
74
|
-
|
|
75
|
-
if (result.success) {
|
|
76
|
-
console.log(result.content);
|
|
77
|
-
}
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
### Custom Instructions
|
|
81
|
-
|
|
82
|
-
You can provide custom instructions to guide the extraction:
|
|
83
|
-
|
|
84
|
-
```typescript
|
|
85
|
-
const result = await ocr.extract('./receipt.png', {
|
|
86
|
-
prompt: 'Extract only the total amount and date from this receipt',
|
|
87
|
-
});
|
|
88
|
-
|
|
89
|
-
if (result.success) {
|
|
90
|
-
console.log(result.content);
|
|
91
|
-
// Output: "Total: $154.06, Date: 11/02/2019"
|
|
92
|
-
}
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
### Output Format
|
|
96
|
-
|
|
97
|
-
By default, extraction returns text. You can also extract structured JSON:
|
|
98
|
-
|
|
99
|
-
```typescript
|
|
100
|
-
// Text output (default)
|
|
101
|
-
const textResult = await ocr.extract('./invoice.png', {
|
|
102
|
-
format: 'text',
|
|
103
|
-
});
|
|
104
|
-
|
|
105
|
-
if (textResult.success) {
|
|
106
|
-
console.log(textResult.content); // string
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
// JSON output with schema
|
|
110
|
-
const jsonResult = await ocr.extract('./invoice.png', {
|
|
111
|
-
format: 'json',
|
|
112
|
-
schema: {
|
|
113
|
-
invoice_number: 'string',
|
|
114
|
-
date: 'string',
|
|
115
|
-
total: 'number',
|
|
116
|
-
items: [{ name: 'string', quantity: 'number', price: 'number' }],
|
|
117
|
-
},
|
|
118
|
-
});
|
|
119
|
-
|
|
120
|
-
if (jsonResult.success) {
|
|
121
|
-
console.log(jsonResult.data); // { invoice_number: "US-001", date: "11/02/2019", total: 154.06, items: [...] }
|
|
122
|
-
}
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
### JSON Schema
|
|
126
|
-
|
|
127
|
-
The schema defines the structure of the data you want to extract. Use a simple object where keys are field names and values are types:
|
|
128
|
-
|
|
129
|
-
**Basic types:**
|
|
130
|
-
- `'string'` - Text values
|
|
131
|
-
- `'number'` - Numeric values
|
|
132
|
-
- `'boolean'` - True/false values
|
|
133
|
-
|
|
134
|
-
**Nested objects:**
|
|
135
|
-
```typescript
|
|
136
|
-
const schema = {
|
|
137
|
-
company: {
|
|
138
|
-
name: 'string',
|
|
139
|
-
address: 'string',
|
|
140
|
-
phone: 'string',
|
|
141
|
-
},
|
|
142
|
-
customer: {
|
|
143
|
-
name: 'string',
|
|
144
|
-
email: 'string',
|
|
145
|
-
},
|
|
146
|
-
};
|
|
147
|
-
```
|
|
148
|
-
|
|
149
|
-
**Arrays:**
|
|
150
|
-
```typescript
|
|
151
|
-
const schema = {
|
|
152
|
-
// Array of objects
|
|
153
|
-
items: [
|
|
154
|
-
{
|
|
155
|
-
description: 'string',
|
|
156
|
-
quantity: 'number',
|
|
157
|
-
unit_price: 'number',
|
|
158
|
-
total: 'number',
|
|
159
|
-
},
|
|
160
|
-
],
|
|
161
|
-
// Simple array
|
|
162
|
-
tags: ['string'],
|
|
163
|
-
};
|
|
164
|
-
```
|
|
165
|
-
|
|
166
|
-
**Complete example (invoice):**
|
|
167
|
-
```typescript
|
|
168
|
-
const invoiceSchema = {
|
|
169
|
-
invoice_number: 'string',
|
|
170
|
-
date: 'string',
|
|
171
|
-
due_date: 'string',
|
|
172
|
-
company: {
|
|
173
|
-
name: 'string',
|
|
174
|
-
address: 'string',
|
|
175
|
-
phone: 'string',
|
|
176
|
-
email: 'string',
|
|
177
|
-
},
|
|
178
|
-
bill_to: {
|
|
179
|
-
name: 'string',
|
|
180
|
-
address: 'string',
|
|
181
|
-
},
|
|
182
|
-
items: [
|
|
183
|
-
{
|
|
184
|
-
description: 'string',
|
|
185
|
-
quantity: 'number',
|
|
186
|
-
unit_price: 'number',
|
|
187
|
-
total: 'number',
|
|
188
|
-
},
|
|
189
|
-
],
|
|
190
|
-
subtotal: 'number',
|
|
191
|
-
tax: 'number',
|
|
192
|
-
total: 'number',
|
|
193
|
-
};
|
|
194
|
-
|
|
195
|
-
const result = await ocr.extract('./invoice.png', {
|
|
196
|
-
format: 'json',
|
|
197
|
-
schema: invoiceSchema,
|
|
198
|
-
prompt: 'Extract all invoice data from this document.',
|
|
199
|
-
});
|
|
200
|
-
```
|
|
201
|
-
|
|
202
|
-
### Model Configuration
|
|
203
|
-
|
|
204
|
-
You can pass model-specific parameters like temperature, max tokens, and more:
|
|
205
|
-
|
|
206
|
-
```typescript
|
|
207
|
-
// Gemini with model config
|
|
208
|
-
const result = await ocr.extract('./invoice.png', {
|
|
209
|
-
modelConfig: {
|
|
210
|
-
temperature: 0.2,
|
|
211
|
-
maxTokens: 4096,
|
|
212
|
-
topP: 0.8,
|
|
213
|
-
topK: 40,
|
|
214
|
-
},
|
|
215
|
-
});
|
|
216
|
-
|
|
217
|
-
// OpenAI with model config
|
|
218
|
-
const result = await ocr.extract('./invoice.png', {
|
|
219
|
-
modelConfig: {
|
|
220
|
-
temperature: 0,
|
|
221
|
-
maxTokens: 2048,
|
|
222
|
-
topP: 1,
|
|
223
|
-
},
|
|
224
|
-
});
|
|
225
|
-
```
|
|
226
|
-
|
|
227
|
-
Available options:
|
|
228
|
-
|
|
229
|
-
| Option | Description | Supported Providers |
|
|
230
|
-
|--------|-------------|---------------------|
|
|
231
|
-
| temperature | Controls randomness (0.0-1.0+) | All |
|
|
232
|
-
| maxTokens | Maximum tokens to generate | All |
|
|
233
|
-
| topP | Nucleus sampling | All |
|
|
234
|
-
| topK | Top-k sampling | Gemini, Claude, Vertex |
|
|
235
|
-
| stopSequences | Stop generation at these strings | All |
|
|
236
|
-
|
|
237
|
-
### Token Usage
|
|
238
|
-
|
|
239
|
-
Access token usage information from the metadata:
|
|
240
|
-
|
|
241
|
-
```typescript
|
|
242
|
-
const result = await ocr.extract('./invoice.png');
|
|
243
|
-
|
|
244
|
-
if (result.success) {
|
|
245
|
-
console.log(result.content);
|
|
246
|
-
|
|
247
|
-
// Access metadata
|
|
248
|
-
console.log(result.metadata.processingTimeMs); // 2351
|
|
249
|
-
console.log(result.metadata.tokens?.inputTokens); // 1855
|
|
250
|
-
console.log(result.metadata.tokens?.outputTokens); // 260
|
|
251
|
-
console.log(result.metadata.tokens?.totalTokens); // 2115
|
|
252
|
-
}
|
|
253
|
-
```
|
|
254
|
-
|
|
255
|
-
## Supported Providers
|
|
256
|
-
|
|
257
|
-
| Provider | Default Model | Auth |
|
|
258
|
-
|----------|---------------|------|
|
|
259
|
-
| gemini | gemini-1.5-flash | API Key |
|
|
260
|
-
| openai | gpt-4o | API Key |
|
|
261
|
-
| claude | claude-sonnet-4-20250514 | API Key |
|
|
262
|
-
| grok | grok-2-vision-1212 | API Key |
|
|
263
|
-
| vertex | gemini-2.0-flash | Google Cloud |
|
|
264
|
-
|
|
265
|
-
> **Note:** For enterprise OCR needs, see [Advanced: Vertex AI](#advanced-vertex-ai-google-cloud) section below.
|
|
266
|
-
|
|
267
|
-
## Supported Inputs
|
|
268
|
-
|
|
269
|
-
- **Local files**: `./invoice.png`, `./document.pdf`
|
|
270
|
-
- **URLs**: `https://example.com/invoice.png`
|
|
271
|
-
|
|
272
|
-
## Supported Files
|
|
273
|
-
|
|
274
|
-
- **Images**: jpg, png, gif, webp
|
|
275
|
-
- **Documents**: pdf
|
|
276
|
-
- **Text**: txt, md, csv, json, xml, html
|
|
277
|
-
|
|
278
|
-
## Advanced: Vertex AI (Google Cloud)
|
|
279
|
-
|
|
280
|
-
The `vertex` provider enables access to Google Cloud's AI infrastructure, which is useful for enterprise scenarios requiring:
|
|
281
|
-
|
|
282
|
-
- **Compliance**: Data residency and regulatory requirements
|
|
283
|
-
- **Integration**: Native integration with Google Cloud services (BigQuery, Cloud Storage, etc.)
|
|
284
|
-
- **Specialized OCR**: Access to Google's Document AI and Vision AI processors
|
|
285
|
-
|
|
286
|
-
### Basic Setup
|
|
287
|
-
|
|
288
|
-
Vertex AI uses Google Cloud authentication instead of API keys:
|
|
289
|
-
|
|
290
|
-
```typescript
|
|
291
|
-
import { OcrAI } from 'ocr-ai';
|
|
292
|
-
|
|
293
|
-
const ocr = new OcrAI({
|
|
294
|
-
provider: 'vertex',
|
|
295
|
-
vertexConfig: {
|
|
296
|
-
project: 'your-gcp-project-id',
|
|
297
|
-
location: 'us-central1',
|
|
298
|
-
},
|
|
299
|
-
});
|
|
300
|
-
|
|
301
|
-
const result = await ocr.extract('./invoice.png');
|
|
302
|
-
```
|
|
303
|
-
|
|
304
|
-
**Requirements:**
|
|
305
|
-
1. Install the [gcloud CLI](https://cloud.google.com/sdk/docs/install)
|
|
306
|
-
2. Run `gcloud auth application-default login`
|
|
307
|
-
3. Enable the Vertex AI API in your GCP project
|
|
308
|
-
|
|
309
|
-
### When to Use Vertex AI vs Gemini API
|
|
310
|
-
|
|
311
|
-
| Scenario | Recommended |
|
|
312
|
-
|----------|-------------|
|
|
313
|
-
| Quick prototyping | Gemini (API Key) |
|
|
314
|
-
| Personal projects | Gemini (API Key) |
|
|
315
|
-
| Enterprise/production | Vertex AI |
|
|
316
|
-
| Data residency requirements | Vertex AI |
|
|
317
|
-
| High-volume processing | Vertex AI |
|
|
318
|
-
|
|
319
|
-
### Related Google Cloud OCR Services
|
|
320
|
-
|
|
321
|
-
For specialized document processing beyond what Gemini models offer, Google Cloud provides dedicated OCR services:
|
|
322
|
-
|
|
323
|
-
**[Document AI](https://cloud.google.com/document-ai)** - Optimized for structured documents:
|
|
324
|
-
- Invoice Parser, Receipt Parser, Form Parser
|
|
325
|
-
- W2, 1040, Bank Statement processors
|
|
326
|
-
- Custom extractors for domain-specific documents
|
|
327
|
-
- Higher accuracy for tables, forms, and handwritten text
|
|
328
|
-
|
|
329
|
-
**[Vision API](https://cloud.google.com/vision/docs/ocr)** - Optimized for images:
|
|
330
|
-
- Real-time OCR with low latency
|
|
331
|
-
- 80+ language support
|
|
332
|
-
- Handwriting detection
|
|
333
|
-
- Simple integration, ~98% accuracy on clean documents
|
|
334
|
-
|
|
335
|
-
These services are separate from ocr-ai but can complement it for enterprise document pipelines.
|
|
336
|
-
|
|
337
|
-
##
|
|
338
|
-
|
|
339
|
-
|
|
1
|
+
# ocr-ai
|
|
2
|
+
|
|
3
|
+
Multi-provider AI document extraction for Node.js. Extract text or structured JSON from documents using Gemini, OpenAI, Claude, Grok, or Vertex AI.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install ocr-ai
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
### Using Gemini
|
|
14
|
+
|
|
15
|
+
```typescript
|
|
16
|
+
import { OcrAI } from 'ocr-ai';
|
|
17
|
+
|
|
18
|
+
const ocr = new OcrAI({
|
|
19
|
+
provider: 'gemini',
|
|
20
|
+
apiKey: 'YOUR_GEMINI_API_KEY',
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
const result = await ocr.extract('./invoice.png');
|
|
24
|
+
|
|
25
|
+
if (result.success) {
|
|
26
|
+
const text = result.content;
|
|
27
|
+
console.log(text);
|
|
28
|
+
}
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Using OpenAI
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
import { OcrAI } from 'ocr-ai';
|
|
35
|
+
|
|
36
|
+
const ocr = new OcrAI({
|
|
37
|
+
provider: 'openai',
|
|
38
|
+
apiKey: 'YOUR_OPENAI_API_KEY',
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
const result = await ocr.extract('./document.pdf');
|
|
42
|
+
|
|
43
|
+
if (result.success) {
|
|
44
|
+
const text = result.content;
|
|
45
|
+
console.log(text);
|
|
46
|
+
}
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Custom Model
|
|
50
|
+
|
|
51
|
+
You can specify a custom model for any provider:
|
|
52
|
+
|
|
53
|
+
```typescript
|
|
54
|
+
const ocr = new OcrAI({
|
|
55
|
+
provider: 'gemini',
|
|
56
|
+
apiKey: 'YOUR_GEMINI_API_KEY',
|
|
57
|
+
model: 'gemini-2.0-flash', // Use a specific model
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
// Or with OpenAI
|
|
61
|
+
const ocrOpenAI = new OcrAI({
|
|
62
|
+
provider: 'openai',
|
|
63
|
+
apiKey: 'YOUR_OPENAI_API_KEY',
|
|
64
|
+
model: 'gpt-4o-mini', // Use a different model
|
|
65
|
+
});
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### From URL
|
|
69
|
+
|
|
70
|
+
Extract directly from a URL:
|
|
71
|
+
|
|
72
|
+
```typescript
|
|
73
|
+
const result = await ocr.extract('https://example.com/invoice.png');
|
|
74
|
+
|
|
75
|
+
if (result.success) {
|
|
76
|
+
console.log(result.content);
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Custom Instructions
|
|
81
|
+
|
|
82
|
+
You can provide custom instructions to guide the extraction:
|
|
83
|
+
|
|
84
|
+
```typescript
|
|
85
|
+
const result = await ocr.extract('./receipt.png', {
|
|
86
|
+
prompt: 'Extract only the total amount and date from this receipt',
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
if (result.success) {
|
|
90
|
+
console.log(result.content);
|
|
91
|
+
// Output: "Total: $154.06, Date: 11/02/2019"
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Output Format
|
|
96
|
+
|
|
97
|
+
By default, extraction returns text. You can also extract structured JSON:
|
|
98
|
+
|
|
99
|
+
```typescript
|
|
100
|
+
// Text output (default)
|
|
101
|
+
const textResult = await ocr.extract('./invoice.png', {
|
|
102
|
+
format: 'text',
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
if (textResult.success) {
|
|
106
|
+
console.log(textResult.content); // string
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// JSON output with schema
|
|
110
|
+
const jsonResult = await ocr.extract('./invoice.png', {
|
|
111
|
+
format: 'json',
|
|
112
|
+
schema: {
|
|
113
|
+
invoice_number: 'string',
|
|
114
|
+
date: 'string',
|
|
115
|
+
total: 'number',
|
|
116
|
+
items: [{ name: 'string', quantity: 'number', price: 'number' }],
|
|
117
|
+
},
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
if (jsonResult.success) {
|
|
121
|
+
console.log(jsonResult.data); // { invoice_number: "US-001", date: "11/02/2019", total: 154.06, items: [...] }
|
|
122
|
+
}
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### JSON Schema
|
|
126
|
+
|
|
127
|
+
The schema defines the structure of the data you want to extract. Use a simple object where keys are field names and values are types:
|
|
128
|
+
|
|
129
|
+
**Basic types:**
|
|
130
|
+
- `'string'` - Text values
|
|
131
|
+
- `'number'` - Numeric values
|
|
132
|
+
- `'boolean'` - True/false values
|
|
133
|
+
|
|
134
|
+
**Nested objects:**
|
|
135
|
+
```typescript
|
|
136
|
+
const schema = {
|
|
137
|
+
company: {
|
|
138
|
+
name: 'string',
|
|
139
|
+
address: 'string',
|
|
140
|
+
phone: 'string',
|
|
141
|
+
},
|
|
142
|
+
customer: {
|
|
143
|
+
name: 'string',
|
|
144
|
+
email: 'string',
|
|
145
|
+
},
|
|
146
|
+
};
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**Arrays:**
|
|
150
|
+
```typescript
|
|
151
|
+
const schema = {
|
|
152
|
+
// Array of objects
|
|
153
|
+
items: [
|
|
154
|
+
{
|
|
155
|
+
description: 'string',
|
|
156
|
+
quantity: 'number',
|
|
157
|
+
unit_price: 'number',
|
|
158
|
+
total: 'number',
|
|
159
|
+
},
|
|
160
|
+
],
|
|
161
|
+
// Simple array
|
|
162
|
+
tags: ['string'],
|
|
163
|
+
};
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Complete example (invoice):**
|
|
167
|
+
```typescript
|
|
168
|
+
const invoiceSchema = {
|
|
169
|
+
invoice_number: 'string',
|
|
170
|
+
date: 'string',
|
|
171
|
+
due_date: 'string',
|
|
172
|
+
company: {
|
|
173
|
+
name: 'string',
|
|
174
|
+
address: 'string',
|
|
175
|
+
phone: 'string',
|
|
176
|
+
email: 'string',
|
|
177
|
+
},
|
|
178
|
+
bill_to: {
|
|
179
|
+
name: 'string',
|
|
180
|
+
address: 'string',
|
|
181
|
+
},
|
|
182
|
+
items: [
|
|
183
|
+
{
|
|
184
|
+
description: 'string',
|
|
185
|
+
quantity: 'number',
|
|
186
|
+
unit_price: 'number',
|
|
187
|
+
total: 'number',
|
|
188
|
+
},
|
|
189
|
+
],
|
|
190
|
+
subtotal: 'number',
|
|
191
|
+
tax: 'number',
|
|
192
|
+
total: 'number',
|
|
193
|
+
};
|
|
194
|
+
|
|
195
|
+
const result = await ocr.extract('./invoice.png', {
|
|
196
|
+
format: 'json',
|
|
197
|
+
schema: invoiceSchema,
|
|
198
|
+
prompt: 'Extract all invoice data from this document.',
|
|
199
|
+
});
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Model Configuration
|
|
203
|
+
|
|
204
|
+
You can pass model-specific parameters like temperature, max tokens, and more:
|
|
205
|
+
|
|
206
|
+
```typescript
|
|
207
|
+
// Gemini with model config
|
|
208
|
+
const result = await ocr.extract('./invoice.png', {
|
|
209
|
+
modelConfig: {
|
|
210
|
+
temperature: 0.2,
|
|
211
|
+
maxTokens: 4096,
|
|
212
|
+
topP: 0.8,
|
|
213
|
+
topK: 40,
|
|
214
|
+
},
|
|
215
|
+
});
|
|
216
|
+
|
|
217
|
+
// OpenAI with model config
|
|
218
|
+
const openaiResult = await ocr.extract('./invoice.png', {
|
|
219
|
+
modelConfig: {
|
|
220
|
+
temperature: 0,
|
|
221
|
+
maxTokens: 2048,
|
|
222
|
+
topP: 1,
|
|
223
|
+
},
|
|
224
|
+
});
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Available options:
|
|
228
|
+
|
|
229
|
+
| Option | Description | Supported Providers |
|
|
230
|
+
|--------|-------------|---------------------|
|
|
231
|
+
| temperature | Controls randomness (0.0 and up; lower values are more deterministic, and the upper bound depends on the provider) | All |
|
|
232
|
+
| maxTokens | Maximum tokens to generate | All |
|
|
233
|
+
| topP | Nucleus sampling | All |
|
|
234
|
+
| topK | Top-k sampling | Gemini, Claude, Vertex |
|
|
235
|
+
| stopSequences | Stop generation at these strings | All |
|
|
236
|
+
|
|
237
|
+
### Token Usage
|
|
238
|
+
|
|
239
|
+
Access token usage information from the metadata:
|
|
240
|
+
|
|
241
|
+
```typescript
|
|
242
|
+
const result = await ocr.extract('./invoice.png');
|
|
243
|
+
|
|
244
|
+
if (result.success) {
|
|
245
|
+
console.log(result.content);
|
|
246
|
+
|
|
247
|
+
// Access metadata
|
|
248
|
+
console.log(result.metadata.processingTimeMs); // 2351
|
|
249
|
+
console.log(result.metadata.tokens?.inputTokens); // 1855
|
|
250
|
+
console.log(result.metadata.tokens?.outputTokens); // 260
|
|
251
|
+
console.log(result.metadata.tokens?.totalTokens); // 2115
|
|
252
|
+
}
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
## Supported Providers
|
|
256
|
+
|
|
257
|
+
| Provider | Default Model | Auth |
|
|
258
|
+
|----------|---------------|------|
|
|
259
|
+
| gemini | gemini-1.5-flash | API Key |
|
|
260
|
+
| openai | gpt-4o | API Key |
|
|
261
|
+
| claude | claude-sonnet-4-20250514 | API Key |
|
|
262
|
+
| grok | grok-2-vision-1212 | API Key |
|
|
263
|
+
| vertex | gemini-2.0-flash | Google Cloud |
|
|
264
|
+
|
|
265
|
+
> **Note:** For enterprise OCR needs, see the [Advanced: Vertex AI](#advanced-vertex-ai-google-cloud) section below.
|
|
266
|
+
|
|
267
|
+
## Supported Inputs
|
|
268
|
+
|
|
269
|
+
- **Local files**: `./invoice.png`, `./document.pdf`
|
|
270
|
+
- **URLs**: `https://example.com/invoice.png`
|
|
271
|
+
|
|
272
|
+
## Supported Files
|
|
273
|
+
|
|
274
|
+
- **Images**: jpg, png, gif, webp
|
|
275
|
+
- **Documents**: pdf
|
|
276
|
+
- **Text**: txt, md, csv, json, xml, html
|
|
277
|
+
|
|
278
|
+
## Advanced: Vertex AI (Google Cloud)
|
|
279
|
+
|
|
280
|
+
The `vertex` provider enables access to Google Cloud's AI infrastructure, which is useful for enterprise scenarios requiring:
|
|
281
|
+
|
|
282
|
+
- **Compliance**: Data residency and regulatory requirements
|
|
283
|
+
- **Integration**: Native integration with Google Cloud services (BigQuery, Cloud Storage, etc.)
|
|
284
|
+
- **Specialized OCR**: Access to Google's Document AI and Vision AI processors
|
|
285
|
+
|
|
286
|
+
### Basic Setup
|
|
287
|
+
|
|
288
|
+
Vertex AI uses Google Cloud authentication instead of API keys:
|
|
289
|
+
|
|
290
|
+
```typescript
|
|
291
|
+
import { OcrAI } from 'ocr-ai';
|
|
292
|
+
|
|
293
|
+
const ocr = new OcrAI({
|
|
294
|
+
provider: 'vertex',
|
|
295
|
+
vertexConfig: {
|
|
296
|
+
project: 'your-gcp-project-id',
|
|
297
|
+
location: 'us-central1',
|
|
298
|
+
},
|
|
299
|
+
});
|
|
300
|
+
|
|
301
|
+
const result = await ocr.extract('./invoice.png');
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
**Requirements:**
|
|
305
|
+
1. Install the [gcloud CLI](https://cloud.google.com/sdk/docs/install)
|
|
306
|
+
2. Run `gcloud auth application-default login`
|
|
307
|
+
3. Enable the Vertex AI API in your GCP project
|
|
308
|
+
|
|
309
|
+
### When to Use Vertex AI vs Gemini API
|
|
310
|
+
|
|
311
|
+
| Scenario | Recommended |
|
|
312
|
+
|----------|-------------|
|
|
313
|
+
| Quick prototyping | Gemini (API Key) |
|
|
314
|
+
| Personal projects | Gemini (API Key) |
|
|
315
|
+
| Enterprise/production | Vertex AI |
|
|
316
|
+
| Data residency requirements | Vertex AI |
|
|
317
|
+
| High-volume processing | Vertex AI |
|
|
318
|
+
|
|
319
|
+
### Related Google Cloud OCR Services
|
|
320
|
+
|
|
321
|
+
For specialized document processing beyond what Gemini models offer, Google Cloud provides dedicated OCR services:
|
|
322
|
+
|
|
323
|
+
**[Document AI](https://cloud.google.com/document-ai)** - Optimized for structured documents:
|
|
324
|
+
- Invoice Parser, Receipt Parser, Form Parser
|
|
325
|
+
- W2, 1040, Bank Statement processors
|
|
326
|
+
- Custom extractors for domain-specific documents
|
|
327
|
+
- Higher accuracy for tables, forms, and handwritten text
|
|
328
|
+
|
|
329
|
+
**[Vision API](https://cloud.google.com/vision/docs/ocr)** - Optimized for images:
|
|
330
|
+
- Real-time OCR with low latency
|
|
331
|
+
- 80+ language support
|
|
332
|
+
- Handwriting detection
|
|
333
|
+
- Simple integration, ~98% accuracy on clean documents
|
|
334
|
+
|
|
335
|
+
These services are separate from ocr-ai but can complement it for enterprise document pipelines.
|
|
336
|
+
|
|
337
|
+
## Gemini Model Benchmarks
|
|
338
|
+
|
|
339
|
+
Response-time benchmarks for Gemini models extracting data from an invoice image (times in seconds; lower is better):
|
|
340
|
+
|
|
341
|
+
| Model | Text Extraction | JSON Extraction | Best For |
|
|
342
|
+
|-------|-----------------|-----------------|----------|
|
|
343
|
+
| `gemini-2.0-flash-lite` | 2.8s | 2.1s | High-volume processing, cost optimization |
|
|
344
|
+
| `gemini-2.5-flash-lite` | 2.2s | 1.9s | Fastest option, simple documents |
|
|
345
|
+
| `gemini-2.0-flash` | 3.9s | 2.9s | General purpose, good balance |
|
|
346
|
+
| `gemini-2.5-flash` | 5.0s | 5.0s | Standard documents, reliable |
|
|
347
|
+
| `gemini-3-flash-preview` | 12.3s | 10.6s | Complex layouts, newer capabilities |
|
|
348
|
+
| `gemini-3-pro-image-preview` | 8.0s | 11.9s | Image-heavy documents |
|
|
349
|
+
| `gemini-2.5-pro` | 12.6s | 5.5s | High accuracy, complex documents |
|
|
350
|
+
| `gemini-3-pro-preview` | 24.8s | 13.1s | Maximum accuracy, handwritten text |
|
|
351
|
+
|
|
352
|
+
### Model Recommendations
|
|
353
|
+
|
|
354
|
+
**For digital documents (invoices, receipts, forms):**
|
|
355
|
+
```typescript
|
|
356
|
+
// Fast and cost-effective
|
|
357
|
+
const ocr = new OcrAI({
|
|
358
|
+
provider: 'gemini',
|
|
359
|
+
apiKey: 'YOUR_API_KEY',
|
|
360
|
+
model: 'gemini-2.5-flash-lite', // ~2s response time
|
|
361
|
+
});
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
**For complex documents or when accuracy is critical:**
|
|
365
|
+
```typescript
|
|
366
|
+
// Higher accuracy, slower processing
|
|
367
|
+
const ocr = new OcrAI({
|
|
368
|
+
provider: 'gemini',
|
|
369
|
+
apiKey: 'YOUR_API_KEY',
|
|
370
|
+
model: 'gemini-2.5-pro', // Best accuracy/speed ratio
|
|
371
|
+
});
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
**For handwritten documents or poor quality scans:**
|
|
375
|
+
```typescript
|
|
376
|
+
// Maximum accuracy for difficult documents
|
|
377
|
+
const ocr = new OcrAI({
|
|
378
|
+
provider: 'gemini',
|
|
379
|
+
apiKey: 'YOUR_API_KEY',
|
|
380
|
+
model: 'gemini-3-pro-preview', // Best for handwriting
|
|
381
|
+
});
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
### Quick Reference
|
|
385
|
+
|
|
386
|
+
| Use Case | Recommended Model |
|
|
387
|
+
|----------|-------------------|
|
|
388
|
+
| High-volume batch processing | `gemini-2.5-flash-lite` |
|
|
389
|
+
| Standard invoices/receipts | `gemini-2.0-flash` |
|
|
390
|
+
| Complex tables and layouts | `gemini-2.5-pro` |
|
|
391
|
+
| Handwritten documents | `gemini-3-pro-preview` |
|
|
392
|
+
| Poor quality scans | `gemini-3-pro-preview` |
|
|
393
|
+
| Real-time applications | `gemini-2.5-flash-lite` |
|
|
394
|
+
|
|
395
|
+
## OpenAI Model Benchmarks
|
|
396
|
+
|
|
397
|
+
Response-time benchmarks for OpenAI models extracting data from an invoice image (times in seconds; lower is better):
|
|
398
|
+
|
|
399
|
+
| Model | Text Extraction | JSON Extraction | Best For |
|
|
400
|
+
|-------|-----------------|-----------------|----------|
|
|
401
|
+
| `gpt-4.1-nano` | 4.4s | 2.4s | Fastest, cost-effective |
|
|
402
|
+
| `gpt-4.1-mini` | 4.8s | 3.2s | Good balance speed/accuracy |
|
|
403
|
+
| `gpt-4.1` | 8.2s | 5.4s | High accuracy, reliable |
|
|
404
|
+
| `gpt-4o-mini` | 7.2s | 5.7s | Budget-friendly |
|
|
405
|
+
| `gpt-4o` | 12.3s | 10.7s | Standard high accuracy |
|
|
406
|
+
| `gpt-5.2` | 6.4s | 5.0s | Latest generation |
|
|
407
|
+
| `gpt-5-mini` | 12.2s | 7.9s | GPT-5 balanced option |
|
|
408
|
+
| `gpt-5-nano` | 19.9s | 16.1s | GPT-5 economy tier |
|
|
409
|
+
|
|
410
|
+
> **Note:** `gpt-5.2-pro` and `gpt-image-1` use different API endpoints and are not currently supported.
|
|
411
|
+
|
|
412
|
+
### Model Recommendations
|
|
413
|
+
|
|
414
|
+
**For digital documents (invoices, receipts, forms):**
|
|
415
|
+
```typescript
|
|
416
|
+
// Fast and cost-effective
|
|
417
|
+
const ocr = new OcrAI({
|
|
418
|
+
provider: 'openai',
|
|
419
|
+
apiKey: 'YOUR_API_KEY',
|
|
420
|
+
model: 'gpt-4.1-nano', // ~2-4s response time
|
|
421
|
+
});
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
**For complex documents or when accuracy is critical:**
|
|
425
|
+
```typescript
|
|
426
|
+
// Higher accuracy, reliable extraction
|
|
427
|
+
const ocr = new OcrAI({
|
|
428
|
+
provider: 'openai',
|
|
429
|
+
apiKey: 'YOUR_API_KEY',
|
|
430
|
+
model: 'gpt-4.1', // Best accuracy/speed ratio
|
|
431
|
+
});
|
|
432
|
+
```
|
|
433
|
+
|
|
434
|
+
**For handwritten documents or poor quality scans:**
|
|
435
|
+
```typescript
|
|
436
|
+
// Maximum accuracy for difficult documents
|
|
437
|
+
const ocr = new OcrAI({
|
|
438
|
+
provider: 'openai',
|
|
439
|
+
apiKey: 'YOUR_API_KEY',
|
|
440
|
+
model: 'gpt-5.2', // Latest generation, best accuracy
|
|
441
|
+
});
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
### Quick Reference
|
|
445
|
+
|
|
446
|
+
| Use Case | Recommended Model |
|
|
447
|
+
|----------|-------------------|
|
|
448
|
+
| High-volume batch processing | `gpt-4.1-nano` |
|
|
449
|
+
| Standard invoices/receipts | `gpt-4.1-mini` |
|
|
450
|
+
| Complex tables and layouts | `gpt-4.1` |
|
|
451
|
+
| Handwritten documents | `gpt-5.2` |
|
|
452
|
+
| Poor quality scans | `gpt-5.2` |
|
|
453
|
+
| Real-time applications | `gpt-4.1-nano` |
|
|
454
|
+
| Budget-conscious projects | `gpt-4o-mini` |
|
|
455
|
+
|
|
456
|
+
## License
|
|
457
|
+
|
|
458
|
+
MIT
|