parsefy 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +221 -125
- package/dist/index.cjs +2 -2
- package/dist/index.d.cts +78 -8
- package/dist/index.d.mts +78 -8
- package/dist/index.d.ts +78 -8
- package/dist/index.mjs +2 -2
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -1,8 +1,16 @@
|
|
|
1
|
-
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="/assets/logo.png" alt="Parsefy Logo" width="120" />
|
|
3
|
+
</p>
|
|
2
4
|
|
|
3
|
-
|
|
5
|
+
<h1 align="center">Parsefy TypeScript / JavaScript SDK</h1>
|
|
4
6
|
|
|
5
|
-
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Official TypeScript / JavaScript SDK for Parsefy - Financial Document Infrastructure for Developers</strong><br>
|
|
9
|
+
|
|
10
|
+
Turn financial PDFs (invoices, receipts, bills) into structured JSON with validation and risk signals.
|
|
11
|
+
</p>
|
|
12
|
+
|
|
13
|
+
---
|
|
6
14
|
|
|
7
15
|
## Installation
|
|
8
16
|
|
|
@@ -19,21 +27,142 @@ import * as z from 'zod';
|
|
|
19
27
|
const client = new Parsefy('pk_your_api_key');
|
|
20
28
|
|
|
21
29
|
const schema = z.object({
|
|
30
|
+
// REQUIRED - triggers fallback if below confidence threshold
|
|
22
31
|
invoice_number: z.string().describe('The invoice number'),
|
|
23
|
-
|
|
24
|
-
|
|
32
|
+
total: z.number().describe('Total amount including tax'),
|
|
33
|
+
|
|
34
|
+
// OPTIONAL - won't trigger fallback if missing or low confidence
|
|
35
|
+
vendor: z.string().optional().describe('Vendor name'),
|
|
36
|
+
due_date: z.string().optional().describe('Payment due date'),
|
|
25
37
|
});
|
|
26
38
|
|
|
27
|
-
const { object, error } = await client.extract({
|
|
39
|
+
const { object, metadata, error } = await client.extract({
|
|
28
40
|
file: './invoice.pdf',
|
|
29
41
|
schema,
|
|
30
42
|
});
|
|
31
43
|
|
|
32
44
|
if (!error && object) {
|
|
33
45
|
console.log(object.invoice_number); // Fully typed!
|
|
46
|
+
|
|
47
|
+
// Access field-level confidence and evidence
|
|
48
|
+
console.log(`Overall confidence: ${metadata.confidenceScore}`);
|
|
49
|
+
metadata.fieldConfidence.forEach((fc) => {
|
|
50
|
+
console.log(`${fc.field}: ${fc.score} (${fc.reason}) - "${fc.text}"`);
|
|
51
|
+
});
|
|
34
52
|
}
|
|
35
53
|
```
|
|
36
54
|
|
|
55
|
+
## ⚠️ Required vs Optional Fields (Important for Billing)
|
|
56
|
+
|
|
57
|
+
**All fields are required by default.** This is critical to understand:
|
|
58
|
+
|
|
59
|
+
| User writes (SDK) | SDK converts to (JSON Schema) | API interprets as |
|
|
60
|
+
|-------------------|-------------------------------|-------------------|
|
|
61
|
+
| `name: z.string()` | `required: ["name"]` | **Required** |
|
|
62
|
+
| `name: z.string().optional()` | `required: []` | **Optional** |
|
|
63
|
+
|
|
64
|
+
### Why This Matters
|
|
65
|
+
|
|
66
|
+
If a **required** field is returned as `null`, or its confidence score falls below the `confidenceThreshold`, the API triggers the **fallback model** (Tier 2), which is more expensive.
|
|
67
|
+
|
|
68
|
+
**To avoid unexpected high billing:**
|
|
69
|
+
|
|
70
|
+
```typescript
|
|
71
|
+
const schema = z.object({
|
|
72
|
+
// REQUIRED - Always present on invoices, keep required
|
|
73
|
+
invoice_number: z.string(),
|
|
74
|
+
total: z.number(),
|
|
75
|
+
|
|
76
|
+
// OPTIONAL - May not appear on all documents, mark optional!
|
|
77
|
+
vendor: z.string().optional(), // Not all invoices have vendor name
|
|
78
|
+
tax_id: z.string().optional(), // Rarely present
|
|
79
|
+
notes: z.string().optional(), // Usually empty
|
|
80
|
+
due_date: z.string().optional(), // Sometimes missing
|
|
81
|
+
});
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Rule of thumb:** If a field might be missing in >20% of your documents, mark it `.optional()`.
|
|
85
|
+
|
|
86
|
+
## Confidence Threshold
|
|
87
|
+
|
|
88
|
+
Control when the fallback model is triggered:
|
|
89
|
+
|
|
90
|
+
```typescript
|
|
91
|
+
const { object, metadata } = await client.extract({
|
|
92
|
+
file: './invoice.pdf',
|
|
93
|
+
schema,
|
|
94
|
+
confidenceThreshold: 0.85, // default
|
|
95
|
+
});
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
| Threshold | Behavior |
|
|
99
|
+
|-----------|----------|
|
|
100
|
+
| Lower (e.g., 0.70) | **Faster** – Accepts Tier 1 results more often |
|
|
101
|
+
| Higher (e.g., 0.95) | **More accurate** – Triggers Tier 2 fallback more often |
|
|
102
|
+
|
|
103
|
+
**Default:** `0.85`
|
|
104
|
+
|
|
105
|
+
## Response Format
|
|
106
|
+
|
|
107
|
+
```typescript
|
|
108
|
+
interface ExtractResult<T> {
|
|
109
|
+
// Extracted data matching your schema, or null if extraction failed
|
|
110
|
+
object: T | null;
|
|
111
|
+
|
|
112
|
+
// Metadata about the extraction
|
|
113
|
+
metadata: {
|
|
114
|
+
processingTimeMs: number; // Processing time in milliseconds
|
|
115
|
+
inputTokens: number; // Input tokens used
|
|
116
|
+
outputTokens: number; // Output tokens generated
|
|
117
|
+
credits: number; // Credits consumed (1 credit = 1 page)
|
|
118
|
+
fallbackTriggered: boolean; // Whether fallback model was used
|
|
119
|
+
|
|
120
|
+
// 🆕 Field-level confidence and evidence
|
|
121
|
+
confidenceScore: number; // Overall confidence (0.0 to 1.0)
|
|
122
|
+
fieldConfidence: Array<{
|
|
123
|
+
field: string; // JSON path (e.g., "$.invoice_number")
|
|
124
|
+
score: number; // Confidence score (0.0 to 1.0)
|
|
125
|
+
reason: string; // "Exact match", "Inferred from header", etc.
|
|
126
|
+
page: number; // Page number where found
|
|
127
|
+
text: string; // Source text evidence
|
|
128
|
+
}>;
|
|
129
|
+
issues: string[]; // Warnings or anomalies detected
|
|
130
|
+
};
|
|
131
|
+
|
|
132
|
+
// Error details if extraction failed
|
|
133
|
+
error: {
|
|
134
|
+
code: string;
|
|
135
|
+
message: string;
|
|
136
|
+
} | null;
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Example Response
|
|
141
|
+
|
|
142
|
+
```typescript
|
|
143
|
+
const { object, metadata } = await client.extract({ file, schema });
|
|
144
|
+
|
|
145
|
+
// object:
|
|
146
|
+
{
|
|
147
|
+
invoice_number: "INV-2024-0042",
|
|
148
|
+
date: "2024-01-15",
|
|
149
|
+
total: 1250.00,
|
|
150
|
+
vendor: "Acme Corp"
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// metadata.confidenceScore: 0.94
|
|
154
|
+
|
|
155
|
+
// metadata.fieldConfidence:
|
|
156
|
+
[
|
|
157
|
+
{ field: "$.invoice_number", score: 0.98, reason: "Exact match", page: 1, text: "Invoice # INV-2024-0042" },
|
|
158
|
+
{ field: "$.date", score: 0.95, reason: "Exact match", page: 1, text: "Date: 01/15/2024" },
|
|
159
|
+
{ field: "$.total", score: 0.92, reason: "Formatting ambiguous", page: 1, text: "Total: $1,250.00" },
|
|
160
|
+
{ field: "$.vendor", score: 0.90, reason: "Inferred from header", page: 1, text: "Acme Corp" }
|
|
161
|
+
]
|
|
162
|
+
|
|
163
|
+
// metadata.issues: []
|
|
164
|
+
```
|
|
165
|
+
|
|
37
166
|
## Configuration
|
|
38
167
|
|
|
39
168
|
### API Key
|
|
@@ -60,35 +189,19 @@ const client = new Parsefy({
|
|
|
60
189
|
| `apiKey` | `string` | `process.env.PARSEFY_API_KEY` | Your Parsefy API key |
|
|
61
190
|
| `timeout` | `number` | `60000` | Request timeout in ms |
|
|
62
191
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
### Basic Extraction
|
|
66
|
-
|
|
67
|
-
```typescript
|
|
68
|
-
import { Parsefy } from 'parsefy';
|
|
69
|
-
import * as z from 'zod';
|
|
70
|
-
|
|
71
|
-
const client = new Parsefy();
|
|
72
|
-
|
|
73
|
-
const schema = z.object({
|
|
74
|
-
name: z.string(),
|
|
75
|
-
email: z.string().email(),
|
|
76
|
-
phone: z.string().optional(),
|
|
77
|
-
});
|
|
192
|
+
### Extract Options
|
|
78
193
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
194
|
+
| Option | Type | Default | Description |
|
|
195
|
+
|--------|------|---------|-------------|
|
|
196
|
+
| `file` | `File \| Blob \| Buffer \| string` | required | Document to extract from |
|
|
197
|
+
| `schema` | `z.ZodType` | required | Zod schema defining extraction structure |
|
|
198
|
+
| `confidenceThreshold` | `number` | `0.85` | Minimum confidence before triggering fallback |
|
|
83
199
|
|
|
84
|
-
|
|
85
|
-
console.log(object);
|
|
86
|
-
}
|
|
87
|
-
```
|
|
200
|
+
## Usage
|
|
88
201
|
|
|
89
202
|
### File Input Options
|
|
90
203
|
|
|
91
|
-
The SDK supports multiple file input types. **Files don't need to be on disk** – you can work entirely in memory
|
|
204
|
+
The SDK supports multiple file input types. **Files don't need to be on disk** – you can work entirely in memory.
|
|
92
205
|
|
|
93
206
|
| Input Type | Usage | Environment |
|
|
94
207
|
|------------|-------|-------------|
|
|
@@ -96,19 +209,18 @@ The SDK supports multiple file input types. **Files don't need to be on disk**
|
|
|
96
209
|
| `Buffer` | In-memory bytes | Node.js |
|
|
97
210
|
| `File` | From file input or FormData | Browser, Node.js 20+, Edge |
|
|
98
211
|
| `Blob` | Raw binary with MIME type | Universal |
|
|
99
|
-
| `ArrayBuffer` | Wrap in `Blob` first | Universal |
|
|
100
212
|
|
|
101
213
|
```typescript
|
|
102
|
-
// Node.js: File path
|
|
214
|
+
// Node.js: File path
|
|
103
215
|
const result = await client.extract({
|
|
104
|
-
file: './
|
|
216
|
+
file: './invoice.pdf',
|
|
105
217
|
schema,
|
|
106
218
|
});
|
|
107
219
|
|
|
108
220
|
// Node.js: Buffer (in-memory)
|
|
109
221
|
import { readFileSync } from 'fs';
|
|
110
222
|
const result = await client.extract({
|
|
111
|
-
file: readFileSync('./
|
|
223
|
+
file: readFileSync('./invoice.pdf'),
|
|
112
224
|
schema,
|
|
113
225
|
});
|
|
114
226
|
|
|
@@ -118,18 +230,43 @@ const result = await client.extract({
|
|
|
118
230
|
file: fileInput.files[0],
|
|
119
231
|
schema,
|
|
120
232
|
});
|
|
233
|
+
```
|
|
121
234
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
235
|
+
### Complex Schemas for Financial Documents
|
|
236
|
+
|
|
237
|
+
Use `.describe()` to guide the AI extraction:
|
|
238
|
+
|
|
239
|
+
```typescript
|
|
240
|
+
const invoiceSchema = z.object({
|
|
241
|
+
// REQUIRED - Core financial data
|
|
242
|
+
invoice_number: z.string().describe('The invoice or receipt number'),
|
|
243
|
+
date: z.string().describe('Invoice date in YYYY-MM-DD format'),
|
|
244
|
+
total: z.number().describe('Total amount due including tax'),
|
|
245
|
+
currency: z.string().describe('3-letter currency code (USD, EUR, etc.)'),
|
|
246
|
+
|
|
247
|
+
// REQUIRED - Line items (usually present)
|
|
248
|
+
line_items: z.array(z.object({
|
|
249
|
+
description: z.string().describe('Item description'),
|
|
250
|
+
quantity: z.number().describe('Number of units'),
|
|
251
|
+
unit_price: z.number().describe('Price per unit'),
|
|
252
|
+
amount: z.number().describe('Total amount for this line'),
|
|
253
|
+
})).describe('List of items on the invoice'),
|
|
254
|
+
|
|
255
|
+
// OPTIONAL - May not appear on all invoices
|
|
256
|
+
vendor: z.object({
|
|
257
|
+
name: z.string().describe('Company name of the vendor'),
|
|
258
|
+
address: z.string().optional().describe('Full address'),
|
|
259
|
+
tax_id: z.string().optional().describe('Tax ID or VAT number'),
|
|
260
|
+
}).optional(),
|
|
261
|
+
subtotal: z.number().optional().describe('Subtotal before tax'),
|
|
262
|
+
tax: z.number().optional().describe('Tax amount'),
|
|
263
|
+
due_date: z.string().optional().describe('Payment due date'),
|
|
264
|
+
payment_terms: z.string().optional().describe('e.g., Net 30'),
|
|
126
265
|
});
|
|
127
266
|
```
|
|
128
267
|
|
|
129
268
|
### Server-Side / API Usage
|
|
130
269
|
|
|
131
|
-
When building APIs that receive file uploads, files are typically kept in memory. The SDK handles this seamlessly:
|
|
132
|
-
|
|
133
270
|
**Express with Multer:**
|
|
134
271
|
|
|
135
272
|
```typescript
|
|
@@ -137,38 +274,22 @@ import express from 'express';
|
|
|
137
274
|
import multer from 'multer';
|
|
138
275
|
import { Parsefy } from 'parsefy';
|
|
139
276
|
|
|
140
|
-
const upload = multer(); // Store in memory
|
|
277
|
+
const upload = multer(); // Store in memory
|
|
141
278
|
const client = new Parsefy();
|
|
142
279
|
|
|
143
280
|
app.post('/extract', upload.single('document'), async (req, res) => {
|
|
144
|
-
const { object, error } = await client.extract({
|
|
145
|
-
file: req.file.buffer,
|
|
281
|
+
const { object, metadata, error } = await client.extract({
|
|
282
|
+
file: req.file.buffer,
|
|
146
283
|
schema,
|
|
284
|
+
confidenceThreshold: 0.80, // Adjust based on your needs
|
|
147
285
|
});
|
|
148
|
-
res.json({ data: object, error });
|
|
149
|
-
});
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
**Fastify:**
|
|
153
|
-
|
|
154
|
-
```typescript
|
|
155
|
-
import Fastify from 'fastify';
|
|
156
|
-
import multipart from '@fastify/multipart';
|
|
157
|
-
import { Parsefy } from 'parsefy';
|
|
158
286
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
const data = await request.file();
|
|
165
|
-
const buffer = await data.toBuffer();
|
|
166
|
-
|
|
167
|
-
const { object, error } = await client.extract({
|
|
168
|
-
file: buffer,
|
|
169
|
-
schema,
|
|
287
|
+
res.json({
|
|
288
|
+
data: object,
|
|
289
|
+
confidence: metadata.confidenceScore,
|
|
290
|
+
fieldDetails: metadata.fieldConfidence,
|
|
291
|
+
error,
|
|
170
292
|
});
|
|
171
|
-
return { data: object, error };
|
|
172
293
|
});
|
|
173
294
|
```
|
|
174
295
|
|
|
@@ -183,43 +304,19 @@ const client = new Parsefy();
|
|
|
183
304
|
|
|
184
305
|
app.post('/extract', async (c) => {
|
|
185
306
|
const formData = await c.req.formData();
|
|
186
|
-
const file = formData.get('document');
|
|
307
|
+
const file = formData.get('document');
|
|
187
308
|
|
|
188
|
-
const { object, error } = await client.extract({
|
|
189
|
-
file,
|
|
309
|
+
const { object, metadata, error } = await client.extract({
|
|
310
|
+
file,
|
|
190
311
|
schema,
|
|
191
312
|
});
|
|
192
|
-
return c.json({ data: object, error });
|
|
193
|
-
});
|
|
194
|
-
```
|
|
195
|
-
|
|
196
|
-
### Complex Schemas
|
|
197
|
-
|
|
198
|
-
Use `.describe()` to guide the AI extraction:
|
|
199
|
-
|
|
200
|
-
```typescript
|
|
201
|
-
const invoiceSchema = z.object({
|
|
202
|
-
invoice_number: z.string().describe('The invoice or receipt number'),
|
|
203
|
-
date: z.string().describe('Invoice date in YYYY-MM-DD format'),
|
|
204
|
-
vendor: z.object({
|
|
205
|
-
name: z.string().describe('Company name of the vendor'),
|
|
206
|
-
address: z.string().describe('Full address of the vendor'),
|
|
207
|
-
}),
|
|
208
|
-
line_items: z.array(z.object({
|
|
209
|
-
description: z.string().describe('Item description'),
|
|
210
|
-
quantity: z.number().describe('Number of units'),
|
|
211
|
-
unit_price: z.number().describe('Price per unit'),
|
|
212
|
-
amount: z.number().describe('Total amount for this line'),
|
|
213
|
-
})).describe('List of items on the invoice'),
|
|
214
|
-
subtotal: z.number().describe('Subtotal before tax'),
|
|
215
|
-
tax: z.number().describe('Tax amount'),
|
|
216
|
-
total: z.number().describe('Total amount due'),
|
|
217
|
-
currency: z.string().describe('3-letter currency code (USD, EUR, etc.)'),
|
|
218
|
-
});
|
|
219
313
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
314
|
+
return c.json({
|
|
315
|
+
data: object,
|
|
316
|
+
confidence: metadata.confidenceScore,
|
|
317
|
+
issues: metadata.issues,
|
|
318
|
+
error,
|
|
319
|
+
});
|
|
223
320
|
});
|
|
224
321
|
```
|
|
225
322
|
|
|
@@ -230,17 +327,24 @@ import { Parsefy, APIError, ValidationError, ParsefyError } from 'parsefy';
|
|
|
230
327
|
|
|
231
328
|
try {
|
|
232
329
|
const { object, error, metadata } = await client.extract({
|
|
233
|
-
file: './
|
|
330
|
+
file: './invoice.pdf',
|
|
234
331
|
schema,
|
|
235
332
|
});
|
|
236
333
|
|
|
237
334
|
// Extraction-level errors (request succeeded, but extraction failed)
|
|
238
335
|
if (error) {
|
|
239
336
|
console.error(`Extraction failed: [${error.code}] ${error.message}`);
|
|
240
|
-
console.log(`
|
|
337
|
+
console.log(`Fallback triggered: ${metadata.fallbackTriggered}`);
|
|
338
|
+
console.log(`Issues: ${metadata.issues.join(', ')}`);
|
|
241
339
|
return;
|
|
242
340
|
}
|
|
243
341
|
|
|
342
|
+
// Check for low confidence fields
|
|
343
|
+
const lowConfidence = metadata.fieldConfidence.filter((fc) => fc.score < 0.80);
|
|
344
|
+
if (lowConfidence.length > 0) {
|
|
345
|
+
console.warn('Low confidence fields:', lowConfidence);
|
|
346
|
+
}
|
|
347
|
+
|
|
244
348
|
console.log('Success:', object);
|
|
245
349
|
} catch (err) {
|
|
246
350
|
// HTTP/Network errors
|
|
@@ -254,30 +358,6 @@ try {
|
|
|
254
358
|
}
|
|
255
359
|
```
|
|
256
360
|
|
|
257
|
-
## Response Format
|
|
258
|
-
|
|
259
|
-
```typescript
|
|
260
|
-
interface ExtractResult<T> {
|
|
261
|
-
// Extracted data matching your schema, or null if extraction failed
|
|
262
|
-
object: T | null;
|
|
263
|
-
|
|
264
|
-
// Metadata about the extraction
|
|
265
|
-
metadata: {
|
|
266
|
-
processingTimeMs: number; // Processing time in milliseconds
|
|
267
|
-
inputTokens: number; // Input tokens used
|
|
268
|
-
outputTokens: number; // Output tokens generated
|
|
269
|
-
credits: number; // Credits consumed (1 credit = 1 page)
|
|
270
|
-
fallbackTriggered: boolean; // Whether fallback model was used
|
|
271
|
-
};
|
|
272
|
-
|
|
273
|
-
// Error details if extraction failed
|
|
274
|
-
error: {
|
|
275
|
-
code: string; // EXTRACTION_FAILED, LLM_ERROR, PARSING_ERROR, TIMEOUT_ERROR
|
|
276
|
-
message: string;
|
|
277
|
-
} | null;
|
|
278
|
-
}
|
|
279
|
-
```
|
|
280
|
-
|
|
281
361
|
## Error Types
|
|
282
362
|
|
|
283
363
|
| Error Class | Description |
|
|
@@ -301,7 +381,23 @@ The API allows 1 request per second. The SDK automatically retries with exponent
|
|
|
301
381
|
- Node.js 18+ (for native `fetch` and `FormData`)
|
|
302
382
|
- Zod 3.x (peer dependency)
|
|
303
383
|
|
|
384
|
+
## TypeScript Types
|
|
385
|
+
|
|
386
|
+
All types are exported for your convenience:
|
|
387
|
+
|
|
388
|
+
```typescript
|
|
389
|
+
import type {
|
|
390
|
+
ParsefyConfig,
|
|
391
|
+
ExtractOptions,
|
|
392
|
+
ExtractResult,
|
|
393
|
+
ExtractionMetadata,
|
|
394
|
+
FieldConfidence,
|
|
395
|
+
APIErrorResponse,
|
|
396
|
+
} from 'parsefy';
|
|
397
|
+
|
|
398
|
+
import { DEFAULT_CONFIDENCE_THRESHOLD } from 'parsefy'; // 0.85
|
|
399
|
+
```
|
|
400
|
+
|
|
304
401
|
## License
|
|
305
402
|
|
|
306
403
|
MIT © [Parsefy](https://parsefy.io)
|
|
307
|
-
|
package/dist/index.cjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
'use strict';var zodToJsonSchema=require('zod-to-json-schema');var
|
|
2
|
-
exports.APIError=
|
|
1
|
+
'use strict';
var zodToJsonSchema = require('zod-to-json-schema');

// NOTE: the short top-level names (u, s, p, y, a, E) are kept because the
// `exports.*` assignments following this bundle body refer to them.

/** Default confidence threshold sent to the API when the caller supplies none. */
var u = 0.85,
  /** Supported file extensions mapped to the MIME type used for the upload. */
  d = {
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  },
  /** Maximum accepted file size in bytes (10 * 1024 * 1024). */
  l = 10 * 1024 * 1024,
  /** Base URL used when the client config does not provide `baseUrl`. */
  g = "https://api.parsefy.io",
  /** Request timeout in milliseconds used when the client config does not provide `timeout`. */
  x = 6e4;

/** Base error class (ParsefyError): carries an optional string `code`. */
var s = class extends Error {
    constructor(message, code) {
      super(message);
      this.name = "ParsefyError";
      this.code = code;
      // captureStackTrace is not available in every runtime, hence the guard.
      if (typeof Error.captureStackTrace == "function") {
        Error.captureStackTrace(this, this.constructor);
      }
    }
  },
  /** APIError: thrown for non-OK HTTP responses; keeps the status code and parsed error body. */
  p = class extends s {
    constructor(message, statusCode, response) {
      super(message);
      this.name = "APIError";
      this.statusCode = statusCode;
      this.response = response;
    }
  },
  /** ExtractionError: a ParsefyError that additionally carries extraction metadata. */
  y = class extends s {
    constructor(message, code, metadata) {
      super(message, code);
      this.name = "ExtractionError";
      this.metadata = metadata;
    }
  },
  /** ValidationError: thrown for invalid client-side input (file, API key, ...). */
  a = class extends s {
    constructor(message) {
      super(message);
      this.name = "ValidationError";
    }
  };

/** Returns true when running under Node.js (`process.versions.node` is defined). */
function h() {
  return typeof process !== "undefined" && process.versions?.node !== void 0;
}

/**
 * Converts a Zod schema to JSON Schema (draft 7 target, no `$ref`s) and strips
 * the top-level `$schema` key before returning it.
 */
function R(schema) {
  const jsonSchema = zodToJsonSchema.zodToJsonSchema(schema, {
    $refStrategy: "none",
    target: "jsonSchema7",
  });
  if ("$schema" in jsonSchema) {
    delete jsonSchema.$schema;
  }
  return jsonSchema;
}

/** Returns the MIME type for a filename based on its (lower-cased) extension, or null if unsupported. */
function b(filename) {
  const extension = filename.toLowerCase().match(/\.[^.]+$/)?.[0];
  return (extension && d[extension]) || null;
}

/** Throws a ValidationError when the filename's extension is not a supported type. */
function w(filename) {
  if (!b(filename)) {
    const supported = Object.keys(d).join(", ");
    throw new a(`Unsupported file type. Supported types: ${supported}`);
  }
}

/** Throws a ValidationError when the size (in bytes) is zero or exceeds the maximum. */
function m(size) {
  if (size === 0) throw new a("File is empty");
  if (size > l) {
    const limitMb = l / 1048576;
    throw new a(`File size exceeds maximum limit of ${limitMb}MB`);
  }
}

/**
 * Maps the snake_case API response to the camelCase result object.
 * When the response has no `_meta`, a default of confidence 1 with no
 * per-field entries and no issues is used.
 */
function F(response) {
  const meta = response._meta || { confidence_score: 1, field_confidence: [], issues: [] };
  return {
    object: response.object,
    metadata: {
      processingTimeMs: response.metadata.processing_time_ms,
      inputTokens: response.metadata.input_tokens,
      outputTokens: response.metadata.output_tokens,
      credits: response.metadata.credits,
      fallbackTriggered: response.metadata.fallback_triggered,
      confidenceScore: meta.confidence_score,
      fieldConfidence: meta.field_confidence.map((entry) => ({
        field: entry.field,
        score: entry.score,
        reason: entry.reason,
        page: entry.page,
        text: entry.text,
      })),
      issues: meta.issues,
    },
    error: response.error,
  };
}

/**
 * Wraps a Buffer in a File (or a Blob when `File` is not defined), typed by
 * the filename's extension with "application/octet-stream" as the fallback.
 */
function T(buffer, filename) {
  const mimeType = b(filename) || "application/octet-stream";
  // Copy exactly the bytes this view covers out of the underlying ArrayBuffer.
  const bytes = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
  return typeof File !== "undefined"
    ? new File([bytes], filename, { type: mimeType })
    : new Blob([bytes], { type: mimeType });
}

/**
 * Reads a file from disk (Node.js only), validating its extension and size.
 * Returns `{ buffer, filename }`; throws ValidationError otherwise.
 */
async function I(filePath) {
  if (!h()) {
    throw new a("File path strings are only supported in Node.js. Use File or Blob in the browser.");
  }
  const fs = await import('fs'),
    path = await import('path');
  if (!fs.existsSync(filePath)) throw new a(`File not found: ${filePath}`);
  const filename = path.basename(filePath);
  w(filename);
  const buffer = fs.readFileSync(filePath);
  m(buffer.length);
  return { buffer, filename };
}

/**
 * Normalises any supported file input (path string, Buffer, File, Blob) into
 * a File/Blob suitable for FormData. Throws ValidationError for anything else.
 */
async function _(file) {
  if (typeof file == "string") {
    const { buffer, filename } = await I(file);
    return T(buffer, filename);
  }
  // `Buffer` does not exist in browsers and `File` is not a global in every
  // Node.js version, so each global is checked before it is referenced;
  // an unguarded reference would throw a ReferenceError instead of
  // reaching the branch that actually matches the input.
  if (typeof Buffer !== "undefined" && Buffer.isBuffer(file)) {
    m(file.length);
    // A bare Buffer carries no filename; it is uploaded as "document.pdf".
    return T(file, "document.pdf");
  }
  if (typeof File !== "undefined" && file instanceof File) {
    w(file.name);
    m(file.size);
    return file;
  }
  if (typeof Blob !== "undefined" && file instanceof Blob) {
    m(file.size);
    return file;
  }
  throw new a("Invalid file input. Expected File, Blob, Buffer, or file path string.");
}

/** Resolves after `ms` milliseconds. */
function P(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Backoff delay for a retry attempt: `base * 2^attempt` plus up to 10% random
 * jitter, capped at 30000.
 */
function S(attempt, base = 1e3) {
  const delay = base * Math.pow(2, attempt),
    jitter = Math.random() * 0.1 * delay;
  return Math.min(delay + jitter, 3e4);
}

/** Parsefy client (exported as `Parsefy`). */
var E = class {
  /**
   * @param config Either an API key string or a config object
   *   (`apiKey`, `baseUrl`, `timeout`). Falls back to the PARSEFY_API_KEY
   *   environment variable under Node.js; throws ValidationError if no key is found.
   */
  constructor(config) {
    this.maxRetries = 3;
    let options = {};
    if (typeof config == "string") {
      options = { apiKey: config };
    } else if (config) {
      options = config;
    }
    this.apiKey = options.apiKey || this.getEnvApiKey();
    if (!this.apiKey) {
      throw new a("API key is required. Provide it in the constructor or set the PARSEFY_API_KEY environment variable.");
    }
    this.baseUrl = options.baseUrl || g;
    this.timeout = options.timeout || x;
  }

  /** Returns PARSEFY_API_KEY from the environment under Node.js, otherwise "". */
  getEnvApiKey() {
    return (h() && process.env.PARSEFY_API_KEY) || "";
  }

  /**
   * Builds the multipart request (file, JSON schema, confidence threshold)
   * and sends it, retrying on rate limiting.
   */
  async extract(options) {
    const { file, schema, confidenceThreshold } = options,
      jsonSchema = R(schema),
      upload = await _(file),
      form = new FormData();
    form.append("file", upload);
    form.append("output_schema", JSON.stringify(jsonSchema));
    form.append("confidence_threshold", String(confidenceThreshold ?? u));
    return this.makeRequestWithRetry(form);
  }

  /** Sends the request, retrying up to `maxRetries` times on HTTP 429 with backoff. */
  async makeRequestWithRetry(form, attempt = 0) {
    try {
      return await this.makeRequest(form);
    } catch (err) {
      if (err instanceof p && err.statusCode === 429 && attempt < this.maxRetries) {
        const delay = S(attempt);
        await P(delay);
        return this.makeRequestWithRetry(form, attempt + 1);
      }
      throw err;
    }
  }

  /**
   * Performs a single POST to `/v1/extract` with an abort-based timeout and
   * converts failures into ParsefyError/APIError instances.
   */
  async makeRequest(form) {
    const url = `${this.baseUrl}/v1/extract`,
      controller = new AbortController(),
      timer = setTimeout(() => controller.abort(), this.timeout);
    try {
      const response = await fetch(url, {
        method: "POST",
        headers: { Authorization: `Bearer ${this.apiKey}` },
        body: form,
        signal: controller.signal,
      });
      clearTimeout(timer);
      if (!response.ok) {
        const errorBody = await this.parseErrorResponse(response);
        throw new p(
          errorBody.message || `API request failed with status ${response.status}`,
          response.status,
          errorBody
        );
      }
      let payload;
      try {
        payload = await response.json();
      } catch {
        throw new s("Failed to parse API response as JSON. The API may have returned an invalid response.", "PARSE_ERROR");
      }
      try {
        return F(payload);
      } catch (err) {
        throw new s(
          `Failed to transform API response: ${err instanceof Error ? err.message : String(err)}`,
          "TRANSFORM_ERROR"
        );
      }
    } catch (err) {
      clearTimeout(timer);
      if (err instanceof Error && err.name === "AbortError") {
        throw new s(`Request timed out after ${this.timeout}ms`, "TIMEOUT");
      }
      // Errors already raised by this SDK (including APIError) pass through unchanged.
      if (err instanceof s) throw err;
      if (err instanceof TypeError && err.message.includes("fetch")) {
        throw new s(`Network error: Unable to connect to the Parsefy API. ${err.message}`, "NETWORK_ERROR");
      }
      if (err instanceof TypeError) {
        throw new s(`Type error: ${err.message}. This may indicate an API response format issue.`, "TYPE_ERROR");
      }
      throw new s(`Unexpected error: ${err instanceof Error ? err.message : String(err)}`, "UNKNOWN_ERROR");
    }
  }

  /**
   * Extracts an error payload from a failed response: the parsed JSON body if
   * it is valid JSON, otherwise `{ message }` built from the raw body text or
   * the status text.
   */
  async parseErrorResponse(response) {
    // A response body can be consumed only once, so read it as text a single
    // time and parse that; calling json() and then text() would make the
    // text fallback unreachable.
    let raw;
    try {
      raw = await response.text();
    } catch {
      return { message: response.statusText };
    }
    try {
      return JSON.parse(raw);
    } catch {
      return { message: raw || response.statusText };
    }
  }
};
|
|
2
|
+
// Public CommonJS exports: each bundle-internal binding is published under its public name.
exports.APIError = p;
exports.DEFAULT_CONFIDENCE_THRESHOLD = u;
exports.ExtractionError = y;
exports.Parsefy = E;
exports.ParsefyError = s;
exports.ValidationError = a;
|
package/dist/index.d.cts
CHANGED
|
@@ -11,6 +11,11 @@ interface ParsefyConfig {
|
|
|
11
11
|
/** Request timeout in milliseconds. Defaults to 60000 (60 seconds). */
|
|
12
12
|
timeout?: number;
|
|
13
13
|
}
|
|
14
|
+
/**
|
|
15
|
+
* Default confidence threshold for extraction.
|
|
16
|
+
* Required fields whose confidence falls below this threshold trigger the fallback model.
|
|
17
|
+
*/
|
|
18
|
+
declare const DEFAULT_CONFIDENCE_THRESHOLD = 0.85;
|
|
14
19
|
/**
|
|
15
20
|
* Options for the extract method.
|
|
16
21
|
*/
|
|
@@ -19,6 +24,35 @@ interface ExtractOptions<T extends z.ZodType> {
|
|
|
19
24
|
file: File | Blob | Buffer | string;
|
|
20
25
|
/** Zod schema defining the structure of data to extract. */
|
|
21
26
|
schema: T;
|
|
27
|
+
/**
|
|
28
|
+
* Confidence threshold for extraction (0.0 to 1.0). Defaults to 0.85.
|
|
29
|
+
*
|
|
30
|
+
* If a **required** field's confidence falls below this threshold (or the field is returned as null),
|
|
31
|
+
* the fallback model is triggered for higher accuracy.
|
|
32
|
+
*
|
|
33
|
+
* **Tip**: Lower threshold = faster (accepts Tier 1 more often).
|
|
34
|
+
* Higher threshold = more accurate (triggers Tier 2 fallback more often).
|
|
35
|
+
*
|
|
36
|
+
* **Important**: Mark fields as `.optional()` in your Zod schema if they might not
|
|
37
|
+
* appear in all documents. This prevents unnecessary fallback triggers and reduces costs.
|
|
38
|
+
*/
|
|
39
|
+
confidenceThreshold?: number;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Confidence details for a single extracted field.
|
|
43
|
+
* Provides evidence and explanation for each extraction.
|
|
44
|
+
*/
|
|
45
|
+
interface FieldConfidence {
|
|
46
|
+
/** JSON path to the field (e.g., "$.invoice_number"). */
|
|
47
|
+
field: string;
|
|
48
|
+
/** Confidence score for this field (0.0 to 1.0). */
|
|
49
|
+
score: number;
|
|
50
|
+
/** Explanation of how the value was extracted (e.g., "Exact match", "Inferred from header"). */
|
|
51
|
+
reason: string;
|
|
52
|
+
/** Page number where the field was found. */
|
|
53
|
+
page: number;
|
|
54
|
+
/** Source text evidence from the document. */
|
|
55
|
+
text: string;
|
|
22
56
|
}
|
|
23
57
|
/**
|
|
24
58
|
* Metadata about the extraction process.
|
|
@@ -34,6 +68,12 @@ interface ExtractionMetadata {
|
|
|
34
68
|
credits: number;
|
|
35
69
|
/** Whether the fallback model was triggered for higher accuracy. */
|
|
36
70
|
fallbackTriggered: boolean;
|
|
71
|
+
/** Overall confidence score for the extraction (0.0 to 1.0). */
|
|
72
|
+
confidenceScore: number;
|
|
73
|
+
/** Per-field confidence details with evidence and explanations. */
|
|
74
|
+
fieldConfidence: FieldConfidence[];
|
|
75
|
+
/** List of issues or warnings encountered during extraction. */
|
|
76
|
+
issues: string[];
|
|
37
77
|
}
|
|
38
78
|
/**
|
|
39
79
|
* Error response from the API.
|
|
@@ -57,7 +97,10 @@ interface ExtractResult<T> {
|
|
|
57
97
|
}
|
|
58
98
|
|
|
59
99
|
/**
|
|
60
|
-
* Parsefy client for extracting structured data from documents.
|
|
100
|
+
* Parsefy client for extracting structured data from financial documents.
|
|
101
|
+
*
|
|
102
|
+
* **Important**: All fields are **required by default**. Use `.optional()` for fields
|
|
103
|
+
* that may not appear in all documents to avoid triggering expensive fallback models.
|
|
61
104
|
*
|
|
62
105
|
* @example
|
|
63
106
|
* ```ts
|
|
@@ -67,13 +110,24 @@ interface ExtractResult<T> {
|
|
|
67
110
|
* const client = new Parsefy('pk_your_api_key');
|
|
68
111
|
*
|
|
69
112
|
* const schema = z.object({
|
|
70
|
-
*
|
|
113
|
+
* // REQUIRED - fallback triggered if below confidence threshold
|
|
114
|
+
* invoice_number: z.string(),
|
|
71
115
|
* total: z.number(),
|
|
116
|
+
*
|
|
117
|
+
* // OPTIONAL - won't trigger fallback if missing
|
|
118
|
+
* vendor: z.string().optional(),
|
|
119
|
+
* notes: z.string().optional(),
|
|
72
120
|
* });
|
|
73
121
|
*
|
|
74
|
-
* const { object, error } = await client.extract({
|
|
122
|
+
* const { object, metadata, error } = await client.extract({
|
|
75
123
|
* file: './invoice.pdf',
|
|
76
124
|
* schema,
|
|
125
|
+
* confidenceThreshold: 0.85, // default
|
|
126
|
+
* });
|
|
127
|
+
*
|
|
128
|
+
* // Check per-field confidence and evidence
|
|
129
|
+
* metadata.fieldConfidence.forEach((fc) => {
|
|
130
|
+
* console.log(`${fc.field}: ${fc.score} - "${fc.text}"`);
|
|
77
131
|
* });
|
|
78
132
|
* ```
|
|
79
133
|
*/
|
|
@@ -109,25 +163,41 @@ declare class Parsefy {
|
|
|
109
163
|
*/
|
|
110
164
|
private getEnvApiKey;
|
|
111
165
|
/**
|
|
112
|
-
* Extracts structured data from a document using the provided Zod schema.
|
|
166
|
+
* Extracts structured data from a financial document using the provided Zod schema.
|
|
167
|
+
*
|
|
168
|
+
* **Billing Warning**: All fields are **required by default**. If a required field
|
|
169
|
+
* returns `null` or falls below the `confidenceThreshold`, the fallback model is triggered,
|
|
170
|
+
* which is more expensive. Use `.optional()` for fields that may not appear in all documents.
|
|
113
171
|
*
|
|
114
|
-
* @param options - Extraction options including file and
|
|
115
|
-
* @returns Promise resolving to the extraction result with typed data.
|
|
172
|
+
* @param options - Extraction options including file, schema, and confidence threshold.
|
|
173
|
+
* @returns Promise resolving to the extraction result with typed data and field-level confidence.
|
|
116
174
|
*
|
|
117
175
|
* @example
|
|
118
176
|
* ```ts
|
|
119
177
|
* const schema = z.object({
|
|
178
|
+
* // REQUIRED - triggers fallback if confidence < threshold
|
|
120
179
|
* invoice_number: z.string().describe('The invoice number'),
|
|
121
|
-
* total: z.number().describe('Total amount'),
|
|
180
|
+
* total: z.number().describe('Total amount including tax'),
|
|
181
|
+
*
|
|
182
|
+
* // OPTIONAL - won't trigger fallback if missing or low confidence
|
|
183
|
+
* vendor: z.string().optional().describe('Vendor/supplier name'),
|
|
184
|
+
* due_date: z.string().optional().describe('Payment due date'),
|
|
122
185
|
* });
|
|
123
186
|
*
|
|
124
187
|
* const { object, metadata, error } = await client.extract({
|
|
125
188
|
* file: './invoice.pdf',
|
|
126
189
|
* schema,
|
|
190
|
+
* confidenceThreshold: 0.85, // Lower = faster, Higher = more accurate
|
|
127
191
|
* });
|
|
128
192
|
*
|
|
129
193
|
* if (!error && object) {
|
|
130
194
|
* console.log(object.invoice_number); // Fully typed!
|
|
195
|
+
*
|
|
196
|
+
* // Access field-level confidence and evidence
|
|
197
|
+
* console.log(`Overall confidence: ${metadata.confidenceScore}`);
|
|
198
|
+
* metadata.fieldConfidence.forEach((fc) => {
|
|
199
|
+
* console.log(`${fc.field}: ${fc.score} (${fc.reason}) - "${fc.text}"`);
|
|
200
|
+
* });
|
|
131
201
|
* }
|
|
132
202
|
* ```
|
|
133
203
|
*/
|
|
@@ -180,4 +250,4 @@ declare class ValidationError extends ParsefyError {
|
|
|
180
250
|
constructor(message: string);
|
|
181
251
|
}
|
|
182
252
|
|
|
183
|
-
export { APIError, type APIErrorResponse, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
|
253
|
+
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
package/dist/index.d.mts
CHANGED
|
@@ -11,6 +11,11 @@ interface ParsefyConfig {
|
|
|
11
11
|
/** Request timeout in milliseconds. Defaults to 60000 (60 seconds). */
|
|
12
12
|
timeout?: number;
|
|
13
13
|
}
|
|
14
|
+
/**
|
|
15
|
+
* Default confidence threshold for extraction.
|
|
16
|
+
* Required fields whose confidence falls below this threshold trigger the fallback model.
|
|
17
|
+
*/
|
|
18
|
+
declare const DEFAULT_CONFIDENCE_THRESHOLD = 0.85;
|
|
14
19
|
/**
|
|
15
20
|
* Options for the extract method.
|
|
16
21
|
*/
|
|
@@ -19,6 +24,35 @@ interface ExtractOptions<T extends z.ZodType> {
|
|
|
19
24
|
file: File | Blob | Buffer | string;
|
|
20
25
|
/** Zod schema defining the structure of data to extract. */
|
|
21
26
|
schema: T;
|
|
27
|
+
/**
|
|
28
|
+
* Confidence threshold for extraction (0.0 to 1.0). Defaults to 0.85.
|
|
29
|
+
*
|
|
30
|
+
* If a **required** field's confidence falls below this threshold (or returns null),
|
|
31
|
+
* the fallback model is triggered for higher accuracy.
|
|
32
|
+
*
|
|
33
|
+
* **Tip**: Lower threshold = faster (accepts Tier 1 more often).
|
|
34
|
+
* Higher threshold = more accurate (triggers Tier 2 fallback more often).
|
|
35
|
+
*
|
|
36
|
+
* **Important**: Mark fields as `.optional()` in your Zod schema if they might not
|
|
37
|
+
* appear in all documents. This prevents unnecessary fallback triggers and reduces costs.
|
|
38
|
+
*/
|
|
39
|
+
confidenceThreshold?: number;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Confidence details for a single extracted field.
|
|
43
|
+
* Provides evidence and explanation for each extraction.
|
|
44
|
+
*/
|
|
45
|
+
interface FieldConfidence {
|
|
46
|
+
/** JSON path to the field (e.g., "$.invoice_number"). */
|
|
47
|
+
field: string;
|
|
48
|
+
/** Confidence score for this field (0.0 to 1.0). */
|
|
49
|
+
score: number;
|
|
50
|
+
/** Explanation of how the value was extracted (e.g., "Exact match", "Inferred from header"). */
|
|
51
|
+
reason: string;
|
|
52
|
+
/** Page number where the field was found. */
|
|
53
|
+
page: number;
|
|
54
|
+
/** Source text evidence from the document. */
|
|
55
|
+
text: string;
|
|
22
56
|
}
|
|
23
57
|
/**
|
|
24
58
|
* Metadata about the extraction process.
|
|
@@ -34,6 +68,12 @@ interface ExtractionMetadata {
|
|
|
34
68
|
credits: number;
|
|
35
69
|
/** Whether the fallback model was triggered for higher accuracy. */
|
|
36
70
|
fallbackTriggered: boolean;
|
|
71
|
+
/** Overall confidence score for the extraction (0.0 to 1.0). */
|
|
72
|
+
confidenceScore: number;
|
|
73
|
+
/** Per-field confidence details with evidence and explanations. */
|
|
74
|
+
fieldConfidence: FieldConfidence[];
|
|
75
|
+
/** List of issues or warnings encountered during extraction. */
|
|
76
|
+
issues: string[];
|
|
37
77
|
}
|
|
38
78
|
/**
|
|
39
79
|
* Error response from the API.
|
|
@@ -57,7 +97,10 @@ interface ExtractResult<T> {
|
|
|
57
97
|
}
|
|
58
98
|
|
|
59
99
|
/**
|
|
60
|
-
* Parsefy client for extracting structured data from documents.
|
|
100
|
+
* Parsefy client for extracting structured data from financial documents.
|
|
101
|
+
*
|
|
102
|
+
* **Important**: All fields are **required by default**. Use `.optional()` for fields
|
|
103
|
+
* that may not appear in all documents to avoid triggering expensive fallback models.
|
|
61
104
|
*
|
|
62
105
|
* @example
|
|
63
106
|
* ```ts
|
|
@@ -67,13 +110,24 @@ interface ExtractResult<T> {
|
|
|
67
110
|
* const client = new Parsefy('pk_your_api_key');
|
|
68
111
|
*
|
|
69
112
|
* const schema = z.object({
|
|
70
|
-
*
|
|
113
|
+
* // REQUIRED - fallback triggered if below confidence threshold
|
|
114
|
+
* invoice_number: z.string(),
|
|
71
115
|
* total: z.number(),
|
|
116
|
+
*
|
|
117
|
+
* // OPTIONAL - won't trigger fallback if missing
|
|
118
|
+
* vendor: z.string().optional(),
|
|
119
|
+
* notes: z.string().optional(),
|
|
72
120
|
* });
|
|
73
121
|
*
|
|
74
|
-
* const { object, error } = await client.extract({
|
|
122
|
+
* const { object, metadata, error } = await client.extract({
|
|
75
123
|
* file: './invoice.pdf',
|
|
76
124
|
* schema,
|
|
125
|
+
* confidenceThreshold: 0.85, // default
|
|
126
|
+
* });
|
|
127
|
+
*
|
|
128
|
+
* // Check per-field confidence and evidence
|
|
129
|
+
* metadata.fieldConfidence.forEach((fc) => {
|
|
130
|
+
* console.log(`${fc.field}: ${fc.score} - "${fc.text}"`);
|
|
77
131
|
* });
|
|
78
132
|
* ```
|
|
79
133
|
*/
|
|
@@ -109,25 +163,41 @@ declare class Parsefy {
|
|
|
109
163
|
*/
|
|
110
164
|
private getEnvApiKey;
|
|
111
165
|
/**
|
|
112
|
-
* Extracts structured data from a document using the provided Zod schema.
|
|
166
|
+
* Extracts structured data from a financial document using the provided Zod schema.
|
|
167
|
+
*
|
|
168
|
+
     * **Billing Warning**: All fields are **required by default**. If a required field
|
|
169
|
+
* returns `null` or falls below the `confidenceThreshold`, the fallback model is triggered,
|
|
170
|
+
* which is more expensive. Use `.optional()` for fields that may not appear in all documents.
|
|
113
171
|
*
|
|
114
|
-
     * @param options - Extraction options including file and schema.
|
|
115
|
-
* @returns Promise resolving to the extraction result with typed data.
|
|
172
|
+
* @param options - Extraction options including file, schema, and confidence threshold.
|
|
173
|
+
* @returns Promise resolving to the extraction result with typed data and field-level confidence.
|
|
116
174
|
*
|
|
117
175
|
* @example
|
|
118
176
|
* ```ts
|
|
119
177
|
* const schema = z.object({
|
|
178
|
+
* // REQUIRED - triggers fallback if confidence < threshold
|
|
120
179
|
* invoice_number: z.string().describe('The invoice number'),
|
|
121
|
-
* total: z.number().describe('Total amount'),
|
|
180
|
+
* total: z.number().describe('Total amount including tax'),
|
|
181
|
+
*
|
|
182
|
+
* // OPTIONAL - won't trigger fallback if missing or low confidence
|
|
183
|
+
* vendor: z.string().optional().describe('Vendor/supplier name'),
|
|
184
|
+
* due_date: z.string().optional().describe('Payment due date'),
|
|
122
185
|
* });
|
|
123
186
|
*
|
|
124
187
|
* const { object, metadata, error } = await client.extract({
|
|
125
188
|
* file: './invoice.pdf',
|
|
126
189
|
* schema,
|
|
190
|
+
* confidenceThreshold: 0.85, // Lower = faster, Higher = more accurate
|
|
127
191
|
* });
|
|
128
192
|
*
|
|
129
193
|
* if (!error && object) {
|
|
130
194
|
* console.log(object.invoice_number); // Fully typed!
|
|
195
|
+
*
|
|
196
|
+
* // Access field-level confidence and evidence
|
|
197
|
+
* console.log(`Overall confidence: ${metadata.confidenceScore}`);
|
|
198
|
+
* metadata.fieldConfidence.forEach((fc) => {
|
|
199
|
+
* console.log(`${fc.field}: ${fc.score} (${fc.reason}) - "${fc.text}"`);
|
|
200
|
+
* });
|
|
131
201
|
* }
|
|
132
202
|
* ```
|
|
133
203
|
*/
|
|
@@ -180,4 +250,4 @@ declare class ValidationError extends ParsefyError {
|
|
|
180
250
|
constructor(message: string);
|
|
181
251
|
}
|
|
182
252
|
|
|
183
|
-
export { APIError, type APIErrorResponse, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
|
253
|
+
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
package/dist/index.d.ts
CHANGED
|
@@ -11,6 +11,11 @@ interface ParsefyConfig {
|
|
|
11
11
|
/** Request timeout in milliseconds. Defaults to 60000 (60 seconds). */
|
|
12
12
|
timeout?: number;
|
|
13
13
|
}
|
|
14
|
+
/**
|
|
15
|
+
* Default confidence threshold for extraction.
|
|
16
|
+
 * Required fields whose confidence falls below this threshold trigger the fallback model.
|
|
17
|
+
*/
|
|
18
|
+
declare const DEFAULT_CONFIDENCE_THRESHOLD = 0.85;
|
|
14
19
|
/**
|
|
15
20
|
* Options for the extract method.
|
|
16
21
|
*/
|
|
@@ -19,6 +24,35 @@ interface ExtractOptions<T extends z.ZodType> {
|
|
|
19
24
|
file: File | Blob | Buffer | string;
|
|
20
25
|
/** Zod schema defining the structure of data to extract. */
|
|
21
26
|
schema: T;
|
|
27
|
+
/**
|
|
28
|
+
* Confidence threshold for extraction (0.0 to 1.0). Defaults to 0.85.
|
|
29
|
+
*
|
|
30
|
+
* If a **required** field's confidence falls below this threshold (or returns null),
|
|
31
|
+
* the fallback model is triggered for higher accuracy.
|
|
32
|
+
*
|
|
33
|
+
* **Tip**: Lower threshold = faster (accepts Tier 1 more often).
|
|
34
|
+
* Higher threshold = more accurate (triggers Tier 2 fallback more often).
|
|
35
|
+
*
|
|
36
|
+
* **Important**: Mark fields as `.optional()` in your Zod schema if they might not
|
|
37
|
+
* appear in all documents. This prevents unnecessary fallback triggers and reduces costs.
|
|
38
|
+
*/
|
|
39
|
+
confidenceThreshold?: number;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Confidence details for a single extracted field.
|
|
43
|
+
* Provides evidence and explanation for each extraction.
|
|
44
|
+
*/
|
|
45
|
+
interface FieldConfidence {
|
|
46
|
+
/** JSON path to the field (e.g., "$.invoice_number"). */
|
|
47
|
+
field: string;
|
|
48
|
+
/** Confidence score for this field (0.0 to 1.0). */
|
|
49
|
+
score: number;
|
|
50
|
+
/** Explanation of how the value was extracted (e.g., "Exact match", "Inferred from header"). */
|
|
51
|
+
reason: string;
|
|
52
|
+
/** Page number where the field was found. */
|
|
53
|
+
page: number;
|
|
54
|
+
/** Source text evidence from the document. */
|
|
55
|
+
text: string;
|
|
22
56
|
}
|
|
23
57
|
/**
|
|
24
58
|
* Metadata about the extraction process.
|
|
@@ -34,6 +68,12 @@ interface ExtractionMetadata {
|
|
|
34
68
|
credits: number;
|
|
35
69
|
/** Whether the fallback model was triggered for higher accuracy. */
|
|
36
70
|
fallbackTriggered: boolean;
|
|
71
|
+
/** Overall confidence score for the extraction (0.0 to 1.0). */
|
|
72
|
+
confidenceScore: number;
|
|
73
|
+
/** Per-field confidence details with evidence and explanations. */
|
|
74
|
+
fieldConfidence: FieldConfidence[];
|
|
75
|
+
/** List of issues or warnings encountered during extraction. */
|
|
76
|
+
issues: string[];
|
|
37
77
|
}
|
|
38
78
|
/**
|
|
39
79
|
* Error response from the API.
|
|
@@ -57,7 +97,10 @@ interface ExtractResult<T> {
|
|
|
57
97
|
}
|
|
58
98
|
|
|
59
99
|
/**
|
|
60
|
-
* Parsefy client for extracting structured data from documents.
|
|
100
|
+
* Parsefy client for extracting structured data from financial documents.
|
|
101
|
+
*
|
|
102
|
+
* **Important**: All fields are **required by default**. Use `.optional()` for fields
|
|
103
|
+
* that may not appear in all documents to avoid triggering expensive fallback models.
|
|
61
104
|
*
|
|
62
105
|
* @example
|
|
63
106
|
* ```ts
|
|
@@ -67,13 +110,24 @@ interface ExtractResult<T> {
|
|
|
67
110
|
* const client = new Parsefy('pk_your_api_key');
|
|
68
111
|
*
|
|
69
112
|
* const schema = z.object({
|
|
70
|
-
*
|
|
113
|
+
* // REQUIRED - fallback triggered if below confidence threshold
|
|
114
|
+
* invoice_number: z.string(),
|
|
71
115
|
* total: z.number(),
|
|
116
|
+
*
|
|
117
|
+
* // OPTIONAL - won't trigger fallback if missing
|
|
118
|
+
* vendor: z.string().optional(),
|
|
119
|
+
* notes: z.string().optional(),
|
|
72
120
|
* });
|
|
73
121
|
*
|
|
74
|
-
* const { object, error } = await client.extract({
|
|
122
|
+
* const { object, metadata, error } = await client.extract({
|
|
75
123
|
* file: './invoice.pdf',
|
|
76
124
|
* schema,
|
|
125
|
+
* confidenceThreshold: 0.85, // default
|
|
126
|
+
* });
|
|
127
|
+
*
|
|
128
|
+
* // Check per-field confidence and evidence
|
|
129
|
+
* metadata.fieldConfidence.forEach((fc) => {
|
|
130
|
+
* console.log(`${fc.field}: ${fc.score} - "${fc.text}"`);
|
|
77
131
|
* });
|
|
78
132
|
* ```
|
|
79
133
|
*/
|
|
@@ -109,25 +163,41 @@ declare class Parsefy {
|
|
|
109
163
|
*/
|
|
110
164
|
private getEnvApiKey;
|
|
111
165
|
/**
|
|
112
|
-
* Extracts structured data from a document using the provided Zod schema.
|
|
166
|
+
* Extracts structured data from a financial document using the provided Zod schema.
|
|
167
|
+
*
|
|
168
|
+
     * **Billing Warning**: All fields are **required by default**. If a required field
|
|
169
|
+
* returns `null` or falls below the `confidenceThreshold`, the fallback model is triggered,
|
|
170
|
+
* which is more expensive. Use `.optional()` for fields that may not appear in all documents.
|
|
113
171
|
*
|
|
114
|
-
     * @param options - Extraction options including file and schema.
|
|
115
|
-
* @returns Promise resolving to the extraction result with typed data.
|
|
172
|
+
* @param options - Extraction options including file, schema, and confidence threshold.
|
|
173
|
+
* @returns Promise resolving to the extraction result with typed data and field-level confidence.
|
|
116
174
|
*
|
|
117
175
|
* @example
|
|
118
176
|
* ```ts
|
|
119
177
|
* const schema = z.object({
|
|
178
|
+
* // REQUIRED - triggers fallback if confidence < threshold
|
|
120
179
|
* invoice_number: z.string().describe('The invoice number'),
|
|
121
|
-
* total: z.number().describe('Total amount'),
|
|
180
|
+
* total: z.number().describe('Total amount including tax'),
|
|
181
|
+
*
|
|
182
|
+
* // OPTIONAL - won't trigger fallback if missing or low confidence
|
|
183
|
+
* vendor: z.string().optional().describe('Vendor/supplier name'),
|
|
184
|
+
* due_date: z.string().optional().describe('Payment due date'),
|
|
122
185
|
* });
|
|
123
186
|
*
|
|
124
187
|
* const { object, metadata, error } = await client.extract({
|
|
125
188
|
* file: './invoice.pdf',
|
|
126
189
|
* schema,
|
|
190
|
+
* confidenceThreshold: 0.85, // Lower = faster, Higher = more accurate
|
|
127
191
|
* });
|
|
128
192
|
*
|
|
129
193
|
* if (!error && object) {
|
|
130
194
|
* console.log(object.invoice_number); // Fully typed!
|
|
195
|
+
*
|
|
196
|
+
* // Access field-level confidence and evidence
|
|
197
|
+
* console.log(`Overall confidence: ${metadata.confidenceScore}`);
|
|
198
|
+
* metadata.fieldConfidence.forEach((fc) => {
|
|
199
|
+
* console.log(`${fc.field}: ${fc.score} (${fc.reason}) - "${fc.text}"`);
|
|
200
|
+
* });
|
|
131
201
|
* }
|
|
132
202
|
* ```
|
|
133
203
|
*/
|
|
@@ -180,4 +250,4 @@ declare class ValidationError extends ParsefyError {
|
|
|
180
250
|
constructor(message: string);
|
|
181
251
|
}
|
|
182
252
|
|
|
183
|
-
export { APIError, type APIErrorResponse, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
|
253
|
+
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
package/dist/index.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import {zodToJsonSchema}from'zod-to-json-schema';var
|
|
2
|
-
export{
|
|
1
|
+
import {zodToJsonSchema}from'zod-to-json-schema';var u=.85,d={".pdf":"application/pdf",".docx":"application/vnd.openxmlformats-officedocument.wordprocessingml.document"},l=10*1024*1024,g="https://api.parsefy.io",x=6e4;var s=class extends Error{constructor(e,r){super(e),this.name="ParsefyError",this.code=r,typeof Error.captureStackTrace=="function"&&Error.captureStackTrace(this,this.constructor);}},p=class extends s{constructor(e,r,o){super(e),this.name="APIError",this.statusCode=r,this.response=o;}},y=class extends s{constructor(e,r,o){super(e,r),this.name="ExtractionError",this.metadata=o;}},a=class extends s{constructor(e){super(e),this.name="ValidationError";}};function h(){return typeof process<"u"&&process.versions?.node!==void 0}function R(t){let e=zodToJsonSchema(t,{$refStrategy:"none",target:"jsonSchema7"});return "$schema"in e&&delete e.$schema,e}function b(t){let e=t.toLowerCase().match(/\.[^.]+$/)?.[0];return e&&d[e]||null}function w(t){if(!b(t)){let r=Object.keys(d).join(", ");throw new a(`Unsupported file type. Supported types: ${r}`)}}function m(t){if(t===0)throw new a("File is empty");if(t>l){let e=l/1048576;throw new a(`File size exceeds maximum limit of ${e}MB`)}}function F(t){let e=t._meta||{confidence_score:1,field_confidence:[],issues:[]};return {object:t.object,metadata:{processingTimeMs:t.metadata.processing_time_ms,inputTokens:t.metadata.input_tokens,outputTokens:t.metadata.output_tokens,credits:t.metadata.credits,fallbackTriggered:t.metadata.fallback_triggered,confidenceScore:e.confidence_score,fieldConfidence:e.field_confidence.map(r=>({field:r.field,score:r.score,reason:r.reason,page:r.page,text:r.text})),issues:e.issues},error:t.error}}function T(t,e){let r=b(e)||"application/octet-stream",o=t.buffer.slice(t.byteOffset,t.byteOffset+t.byteLength);return typeof File<"u"?new File([o],e,{type:r}):new Blob([o],{type:r})}async function I(t){if(!h())throw new a("File path strings are only supported in Node.js. 
Use File or Blob in the browser.");let e=await import('fs'),r=await import('path');if(!e.existsSync(t))throw new a(`File not found: ${t}`);let o=r.basename(t);w(o);let c=e.readFileSync(t);return m(c.length),{buffer:c,filename:o}}async function _(t){if(typeof t=="string"){let{buffer:e,filename:r}=await I(t);return T(e,r)}if(Buffer.isBuffer(t))return m(t.length),T(t,"document.pdf");if(t instanceof File)return w(t.name),m(t.size),t;if(t instanceof Blob)return m(t.size),t;throw new a("Invalid file input. Expected File, Blob, Buffer, or file path string.")}function P(t){return new Promise(e=>setTimeout(e,t))}function S(t,e=1e3){let r=e*Math.pow(2,t),o=Math.random()*.1*r;return Math.min(r+o,3e4)}var E=class{constructor(e){this.maxRetries=3;let r={};if(typeof e=="string"?r={apiKey:e}:e&&(r=e),this.apiKey=r.apiKey||this.getEnvApiKey(),!this.apiKey)throw new a("API key is required. Provide it in the constructor or set the PARSEFY_API_KEY environment variable.");this.baseUrl=r.baseUrl||g,this.timeout=r.timeout||x;}getEnvApiKey(){return h()&&process.env.PARSEFY_API_KEY||""}async extract(e){let{file:r,schema:o,confidenceThreshold:c}=e,n=R(o),f=await _(r),i=new FormData;return i.append("file",f),i.append("output_schema",JSON.stringify(n)),i.append("confidence_threshold",String(c??.85)),this.makeRequestWithRetry(i)}async makeRequestWithRetry(e,r=0){try{return await this.makeRequest(e)}catch(o){if(o instanceof p&&o.statusCode===429&&r<this.maxRetries){let c=S(r);return await P(c),this.makeRequestWithRetry(e,r+1)}throw o}}async makeRequest(e){let r=`${this.baseUrl}/v1/extract`,o=new AbortController,c=setTimeout(()=>o.abort(),this.timeout);try{let n=await fetch(r,{method:"POST",headers:{Authorization:`Bearer ${this.apiKey}`},body:e,signal:o.signal});if(clearTimeout(c),!n.ok){let i=await this.parseErrorResponse(n);throw new p(i.message||`API request failed with status ${n.status}`,n.status,i)}let f;try{f=await n.json();}catch{throw new s("Failed to parse API response as JSON. 
The API may have returned an invalid response.","PARSE_ERROR")}try{return F(f)}catch(i){throw new s(`Failed to transform API response: ${i instanceof Error?i.message:String(i)}`,"TRANSFORM_ERROR")}}catch(n){throw clearTimeout(c),n instanceof Error&&n.name==="AbortError"?new s(`Request timed out after ${this.timeout}ms`,"TIMEOUT"):n instanceof s?n:n instanceof TypeError&&n.message.includes("fetch")?new s(`Network error: Unable to connect to the Parsefy API. ${n.message}`,"NETWORK_ERROR"):n instanceof TypeError?new s(`Type error: ${n.message}. This may indicate an API response format issue.`,"TYPE_ERROR"):new s(`Unexpected error: ${n instanceof Error?n.message:String(n)}`,"UNKNOWN_ERROR")}}async parseErrorResponse(e){try{return await e.json()}catch{try{return {message:await e.text()||e.statusText}}catch{return {message:e.statusText}}}}};
|
|
2
|
+
export{p as APIError,u as DEFAULT_CONFIDENCE_THRESHOLD,y as ExtractionError,E as Parsefy,s as ParsefyError,a as ValidationError};
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "parsefy",
|
|
3
|
-
  "version": "1.0.1",
|
|
4
|
-
"description": "Official TypeScript SDK for Parsefy -
|
|
3
|
+
"version": "1.0.3",
|
|
4
|
+
"description": "Official TypeScript SDK for Parsefy - Financial Document Infrastructure for Developers",
|
|
5
5
|
"author": "",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"repository": {
|