parsefy 1.0.3 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +112 -17
- package/dist/index.cjs +2 -2
- package/dist/index.d.cts +66 -13
- package/dist/index.d.mts +66 -13
- package/dist/index.d.ts +66 -13
- package/dist/index.mjs +2 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -36,19 +36,28 @@ const schema = z.object({
|
|
|
36
36
|
due_date: z.string().optional().describe('Payment due date'),
|
|
37
37
|
});
|
|
38
38
|
|
|
39
|
-
const { object, metadata, error } = await client.extract({
|
|
39
|
+
const { object, metadata, verification, error } = await client.extract({
|
|
40
40
|
file: './invoice.pdf',
|
|
41
41
|
schema,
|
|
42
|
+
enableVerification: true, // Enable math verification
|
|
42
43
|
});
|
|
43
44
|
|
|
44
45
|
if (!error && object) {
|
|
45
46
|
console.log(object.invoice_number); // Fully typed!
|
|
46
47
|
|
|
47
48
|
// Access field-level confidence and evidence
|
|
48
|
-
console.log(`Overall confidence: ${metadata.
|
|
49
|
-
metadata.
|
|
49
|
+
console.log(`Overall confidence: ${metadata.confidence_score}`);
|
|
50
|
+
metadata.field_confidence.forEach((fc) => {
|
|
50
51
|
console.log(`${fc.field}: ${fc.score} (${fc.reason}) - "${fc.text}"`);
|
|
51
52
|
});
|
|
53
|
+
|
|
54
|
+
// Access verification results if enabled
|
|
55
|
+
if (verification) {
|
|
56
|
+
console.log(`Verification status: ${verification.status}`);
|
|
57
|
+
verification.checks_run.forEach((check) => {
|
|
58
|
+
console.log(`${check.type}: ${check.passed ? 'PASSED' : 'FAILED'}`);
|
|
59
|
+
});
|
|
60
|
+
}
|
|
52
61
|
}
|
|
53
62
|
```
|
|
54
63
|
|
|
@@ -102,6 +111,50 @@ const { object, metadata } = await client.extract({
|
|
|
102
111
|
|
|
103
112
|
**Default:** `0.85`
|
|
104
113
|
|
|
114
|
+
## Math Verification
|
|
115
|
+
|
|
116
|
+
Enable automatic math verification to ensure extracted numeric data is mathematically consistent:
|
|
117
|
+
|
|
118
|
+
```typescript
|
|
119
|
+
const { object, verification } = await client.extract({
|
|
120
|
+
file: './invoice.pdf',
|
|
121
|
+
schema,
|
|
122
|
+
enableVerification: true, // Enable math verification
|
|
123
|
+
});
|
|
124
|
+
|
|
125
|
+
if (verification) {
|
|
126
|
+
console.log(`Verification status: ${verification.status}`);
|
|
127
|
+
console.log(`Checks passed: ${verification.checks_passed}`);
|
|
128
|
+
console.log(`Checks failed: ${verification.checks_failed}`);
|
|
129
|
+
|
|
130
|
+
verification.checks_run.forEach((check) => {
|
|
131
|
+
console.log(`${check.type}: ${check.passed ? 'PASSED' : 'FAILED'}`);
|
|
132
|
+
console.log(` Fields: ${check.fields.join(', ')}`);
|
|
133
|
+
console.log(` Expected: ${check.expected}, Actual: ${check.actual}`);
|
|
134
|
+
console.log(` Delta: ${check.delta}`);
|
|
135
|
+
});
|
|
136
|
+
}
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Verification Status Values
|
|
140
|
+
|
|
141
|
+
| Status | Description |
|
|
142
|
+
|--------|-------------|
|
|
143
|
+
| `PASSED` | All math checks passed |
|
|
144
|
+
| `FAILED` | One or more math checks failed |
|
|
145
|
+
| `PARTIAL` | Some checks passed, some failed or couldn't be verified |
|
|
146
|
+
| `CANNOT_VERIFY` | Required fields are missing (not a math error) |
|
|
147
|
+
| `NO_RULES` | No verifiable fields detected in schema |
|
|
148
|
+
|
|
149
|
+
### Supported Verification Rules
|
|
150
|
+
|
|
151
|
+
- **HORIZONTAL_SUM**: Verifies `total = subtotal + tax`
|
|
152
|
+
- **VERTICAL_SUM**: Verifies `subtotal = sum(line_items)`
|
|
153
|
+
|
|
154
|
+
### Shadow Extraction
|
|
155
|
+
|
|
156
|
+
When `enableVerification: true` and only a single verifiable field is requested (e.g., just `total`), Parsefy automatically extracts supporting fields in the background for verification, then removes them from the response.
|
|
157
|
+
|
|
105
158
|
## Response Format
|
|
106
159
|
|
|
107
160
|
```typescript
|
|
@@ -111,15 +164,13 @@ interface ExtractResult<T> {
|
|
|
111
164
|
|
|
112
165
|
// Metadata about the extraction
|
|
113
166
|
metadata: {
|
|
114
|
-
|
|
115
|
-
inputTokens: number; // Input tokens used
|
|
116
|
-
outputTokens: number; // Output tokens generated
|
|
167
|
+
processing_time_ms: number; // Processing time in milliseconds
|
|
117
168
|
credits: number; // Credits consumed (1 credit = 1 page)
|
|
118
|
-
|
|
169
|
+
fallback_triggered: boolean; // Whether fallback model was used
|
|
119
170
|
|
|
120
171
|
// 🆕 Field-level confidence and evidence
|
|
121
|
-
|
|
122
|
-
|
|
172
|
+
confidence_score: number; // Overall confidence (0.0 to 1.0)
|
|
173
|
+
field_confidence: Array<{
|
|
123
174
|
field: string; // JSON path (e.g., "$.invoice_number")
|
|
124
175
|
score: number; // Confidence score (0.0 to 1.0)
|
|
125
176
|
reason: string; // "Exact match", "Inferred from header", etc.
|
|
@@ -129,6 +180,23 @@ interface ExtractResult<T> {
|
|
|
129
180
|
issues: string[]; // Warnings or anomalies detected
|
|
130
181
|
};
|
|
131
182
|
|
|
183
|
+
// Math verification results (only present if enableVerification was true)
|
|
184
|
+
verification?: {
|
|
185
|
+
status: "PASSED" | "FAILED" | "PARTIAL" | "CANNOT_VERIFY" | "NO_RULES";
|
|
186
|
+
checks_passed: number;
|
|
187
|
+
checks_failed: number;
|
|
188
|
+
cannot_verify_count: number;
|
|
189
|
+
checks_run: Array<{
|
|
190
|
+
type: string; // e.g., "HORIZONTAL_SUM", "VERTICAL_SUM"
|
|
191
|
+
status: string;
|
|
192
|
+
fields: string[];
|
|
193
|
+
passed: boolean;
|
|
194
|
+
delta: number;
|
|
195
|
+
expected: number;
|
|
196
|
+
actual: number;
|
|
197
|
+
}>;
|
|
198
|
+
};
|
|
199
|
+
|
|
132
200
|
// Error details if extraction failed
|
|
133
201
|
error: {
|
|
134
202
|
code: string;
|
|
@@ -140,7 +208,11 @@ interface ExtractResult<T> {
|
|
|
140
208
|
### Example Response
|
|
141
209
|
|
|
142
210
|
```typescript
|
|
143
|
-
const { object, metadata } = await client.extract({
|
|
211
|
+
const { object, metadata, verification } = await client.extract({
|
|
212
|
+
file,
|
|
213
|
+
schema,
|
|
214
|
+
enableVerification: true
|
|
215
|
+
});
|
|
144
216
|
|
|
145
217
|
// object:
|
|
146
218
|
{
|
|
@@ -150,9 +222,9 @@ const { object, metadata } = await client.extract({ file, schema });
|
|
|
150
222
|
vendor: "Acme Corp"
|
|
151
223
|
}
|
|
152
224
|
|
|
153
|
-
// metadata.
|
|
225
|
+
// metadata.confidence_score: 0.94
|
|
154
226
|
|
|
155
|
-
// metadata.
|
|
227
|
+
// metadata.field_confidence:
|
|
156
228
|
[
|
|
157
229
|
{ field: "$.invoice_number", score: 0.98, reason: "Exact match", page: 1, text: "Invoice # INV-2024-0042" },
|
|
158
230
|
{ field: "$.date", score: 0.95, reason: "Exact match", page: 1, text: "Date: 01/15/2024" },
|
|
@@ -161,6 +233,25 @@ const { object, metadata } = await client.extract({ file, schema });
|
|
|
161
233
|
]
|
|
162
234
|
|
|
163
235
|
// metadata.issues: []
|
|
236
|
+
|
|
237
|
+
// verification (only present if enableVerification was true):
|
|
238
|
+
{
|
|
239
|
+
status: "PASSED",
|
|
240
|
+
checks_passed: 1,
|
|
241
|
+
checks_failed: 0,
|
|
242
|
+
cannot_verify_count: 0,
|
|
243
|
+
checks_run: [
|
|
244
|
+
{
|
|
245
|
+
type: "HORIZONTAL_SUM",
|
|
246
|
+
status: "PASSED",
|
|
247
|
+
fields: ["total", "subtotal", "tax"],
|
|
248
|
+
passed: true,
|
|
249
|
+
delta: 0.0,
|
|
250
|
+
expected: 1250.00,
|
|
251
|
+
actual: 1250.00
|
|
252
|
+
}
|
|
253
|
+
]
|
|
254
|
+
}
|
|
164
255
|
```
|
|
165
256
|
|
|
166
257
|
## Configuration
|
|
@@ -196,6 +287,7 @@ const client = new Parsefy({
|
|
|
196
287
|
| `file` | `File \| Blob \| Buffer \| string` | required | Document to extract from |
|
|
197
288
|
| `schema` | `z.ZodType` | required | Zod schema defining extraction structure |
|
|
198
289
|
| `confidenceThreshold` | `number` | `0.85` | Minimum confidence before triggering fallback |
|
|
290
|
+
| `enableVerification` | `boolean` | `false` | Enable math verification (includes shadow extraction) |
|
|
199
291
|
|
|
200
292
|
## Usage
|
|
201
293
|
|
|
@@ -286,8 +378,8 @@ app.post('/extract', upload.single('document'), async (req, res) => {
|
|
|
286
378
|
|
|
287
379
|
res.json({
|
|
288
380
|
data: object,
|
|
289
|
-
confidence: metadata.
|
|
290
|
-
fieldDetails: metadata.
|
|
381
|
+
confidence: metadata.confidence_score,
|
|
382
|
+
fieldDetails: metadata.field_confidence,
|
|
291
383
|
error,
|
|
292
384
|
});
|
|
293
385
|
});
|
|
@@ -313,7 +405,7 @@ app.post('/extract', async (c) => {
|
|
|
313
405
|
|
|
314
406
|
return c.json({
|
|
315
407
|
data: object,
|
|
316
|
-
confidence: metadata.
|
|
408
|
+
confidence: metadata.confidence_score,
|
|
317
409
|
issues: metadata.issues,
|
|
318
410
|
error,
|
|
319
411
|
});
|
|
@@ -334,13 +426,13 @@ try {
|
|
|
334
426
|
// Extraction-level errors (request succeeded, but extraction failed)
|
|
335
427
|
if (error) {
|
|
336
428
|
console.error(`Extraction failed: [${error.code}] ${error.message}`);
|
|
337
|
-
console.log(`Fallback triggered: ${metadata.
|
|
429
|
+
console.log(`Fallback triggered: ${metadata.fallback_triggered}`);
|
|
338
430
|
console.log(`Issues: ${metadata.issues.join(', ')}`);
|
|
339
431
|
return;
|
|
340
432
|
}
|
|
341
433
|
|
|
342
434
|
// Check for low confidence fields
|
|
343
|
-
const lowConfidence = metadata.
|
|
435
|
+
const lowConfidence = metadata.field_confidence.filter((fc) => fc.score < 0.80);
|
|
344
436
|
if (lowConfidence.length > 0) {
|
|
345
437
|
console.warn('Low confidence fields:', lowConfidence);
|
|
346
438
|
}
|
|
@@ -392,6 +484,9 @@ import type {
|
|
|
392
484
|
ExtractResult,
|
|
393
485
|
ExtractionMetadata,
|
|
394
486
|
FieldConfidence,
|
|
487
|
+
Verification,
|
|
488
|
+
VerificationStatus,
|
|
489
|
+
VerificationCheck,
|
|
395
490
|
APIErrorResponse,
|
|
396
491
|
} from 'parsefy';
|
|
397
492
|
|
package/dist/index.cjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
'use strict';var zodToJsonSchema=require('zod-to-json-schema');var
|
|
2
|
-
exports.APIError=
|
|
1
|
+
/**
 * parsefy CommonJS bundle, expanded from the minified build output.
 *
 * Top-level identifiers keep their minified names because the export
 * statement that follows this block refers to them:
 *   m = DEFAULT_CONFIDENCE_THRESHOLD, s = ParsefyError, f = APIError,
 *   h = ExtractionError, i = ValidationError, E = Parsefy.
 */
'use strict';
var zodToJsonSchema = require('zod-to-json-schema');

// m: default confidence threshold sent with every extraction request.
var m = 0.85;
// l: supported file extensions mapped to the MIME type used for the upload.
var l = {
  ".pdf": "application/pdf",
  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
};
// y: maximum accepted file size in bytes (10 MB).
var y = 10 * 1024 * 1024;
// x: default API base URL.
var x = "https://api.parsefy.io";
// b: default request timeout in milliseconds.
var b = 6e4;

/** Base error type; `code` is an optional machine-readable error code. */
var s = class extends Error {
  constructor(t, r) {
    super(t);
    this.name = "ParsefyError";
    this.code = r;
    // captureStackTrace is not available in every runtime, so probe for it.
    if (typeof Error.captureStackTrace === "function") {
      Error.captureStackTrace(this, this.constructor);
    }
  }
};

/** Raised for non-2xx HTTP responses; carries the status code and parsed error body. */
var f = class extends s {
  constructor(t, r, n) {
    super(t);
    this.name = "APIError";
    this.statusCode = r;
    this.response = n;
  }
};

/** Error type that carries a code plus the extraction metadata passed to it. */
var h = class extends s {
  constructor(t, r, n) {
    super(t, r);
    this.name = "ExtractionError";
    this.metadata = n;
  }
};

/** Raised when client-side input validation fails (file type, size, API key, ...). */
var i = class extends s {
  constructor(t) {
    super(t);
    this.name = "ValidationError";
  }
};

/** Returns true when running under Node.js (a `process.versions.node` value exists). */
function g() {
  return typeof process !== "undefined" && process.versions?.node !== void 0;
}

/** Converts a Zod schema to JSON Schema (draft 7, refs inlined) and strips `$schema`. */
function R(e) {
  let t = zodToJsonSchema.zodToJsonSchema(e, { $refStrategy: "none", target: "jsonSchema7" });
  if ("$schema" in t) delete t.$schema;
  return t;
}

/** Looks up the MIME type for a filename by its last extension; null when unsupported. */
function T(e) {
  let t = e.toLowerCase().match(/\.[^.]+$/)?.[0];
  return (t && l[t]) || null;
}

/** Throws a ValidationError when the filename's extension is not in the supported table. */
function w(e) {
  if (!T(e)) {
    let r = Object.keys(l).join(", ");
    throw new i(`Unsupported file type. Supported types: ${r}`);
  }
}

/** Throws a ValidationError when the byte size is zero or exceeds the maximum. */
function d(e) {
  if (e === 0) throw new i("File is empty");
  if (e > y) {
    let t = y / 1048576;
    throw new i(`File size exceeds maximum limit of ${t}MB`);
  }
}

/**
 * Maps the raw API payload onto the public result shape.
 * Confidence data is read from `_meta`; when that key is absent a default of
 * full confidence with no per-field details and no issues is used.
 * `verification` is copied only when the payload contains it.
 */
function F(e) {
  let t = e._meta || { confidence_score: 1, field_confidence: [], issues: [] };
  let r = {
    object: e.object,
    metadata: {
      processing_time_ms: e.metadata.processing_time_ms,
      credits: e.metadata.credits,
      fallback_triggered: e.metadata.fallback_triggered,
      confidence_score: t.confidence_score,
      field_confidence: t.field_confidence.map(n => ({
        field: n.field,
        score: n.score,
        reason: n.reason,
        page: n.page,
        text: n.text
      })),
      issues: t.issues
    },
    error: e.error
  };
  if (e.verification) {
    r.verification = {
      status: e.verification.status,
      checks_passed: e.verification.checks_passed,
      checks_failed: e.verification.checks_failed,
      cannot_verify_count: e.verification.cannot_verify_count,
      checks_run: e.verification.checks_run.map(n => ({
        type: n.type,
        status: n.status,
        fields: n.fields,
        passed: n.passed,
        delta: n.delta,
        expected: n.expected,
        actual: n.actual
      }))
    };
  }
  return r;
}

/**
 * Wraps a Buffer in a File (or a Blob when the runtime has no File global).
 * Only the byte range covered by this Buffer view is sliced out of the
 * underlying ArrayBuffer.
 */
function _(e, t) {
  let r = T(t) || "application/octet-stream";
  let n = e.buffer.slice(e.byteOffset, e.byteOffset + e.byteLength);
  return typeof File !== "undefined" ? new File([n], t, { type: r }) : new Blob([n], { type: r });
}

/** Reads a file from disk (Node.js only) after validating existence, type and size. */
async function I(e) {
  if (!g()) {
    throw new i("File path strings are only supported in Node.js. Use File or Blob in the browser.");
  }
  let t = await import('fs');
  let r = await import('path');
  if (!t.existsSync(e)) throw new i(`File not found: ${e}`);
  let n = r.basename(e);
  w(n);
  let a = t.readFileSync(e);
  d(a.length);
  return { buffer: a, filename: n };
}

/**
 * Normalises the `file` option (path string, Buffer, File or Blob) into an
 * uploadable File/Blob, validating it along the way.
 *
 * Each runtime-specific global is probed with `typeof` before use: `Buffer`
 * does not exist in browsers and `File` is missing from some Node.js
 * versions, so referencing them unguarded throws a ReferenceError before the
 * matching branch can be reached.
 */
async function S(e) {
  if (typeof e === "string") {
    let { buffer: t, filename: r } = await I(e);
    return _(t, r);
  }
  if (typeof Buffer !== "undefined" && Buffer.isBuffer(e)) {
    d(e.length);
    // A bare Buffer has no filename, so a PDF name is assumed.
    return _(e, "document.pdf");
  }
  if (typeof File !== "undefined" && e instanceof File) {
    w(e.name);
    d(e.size);
    return e;
  }
  if (typeof Blob !== "undefined" && e instanceof Blob) {
    d(e.size);
    return e;
  }
  throw new i("Invalid file input. Expected File, Blob, Buffer, or file path string.");
}

/** Resolves after `e` milliseconds. */
function A(e) {
  return new Promise(t => setTimeout(t, e));
}

/** Exponential backoff delay: base * 2^attempt plus up to 10% random jitter, capped at 30 s. */
function P(e, t = 1e3) {
  let r = t * Math.pow(2, e);
  let n = Math.random() * 0.1 * r;
  return Math.min(r + n, 3e4);
}

/** Parsefy API client. */
var E = class {
  /**
   * @param t API key string, or a config object with `apiKey`, `baseUrl`, `timeout`.
   *          Falls back to the PARSEFY_API_KEY environment variable under Node.js.
   * @throws ValidationError when no API key can be resolved.
   */
  constructor(t) {
    this.maxRetries = 3;
    let r = {};
    if (typeof t === "string") {
      r = { apiKey: t };
    } else if (t) {
      r = t;
    }
    this.apiKey = r.apiKey || this.getEnvApiKey();
    if (!this.apiKey) {
      throw new i("API key is required. Provide it in the constructor or set the PARSEFY_API_KEY environment variable.");
    }
    this.baseUrl = r.baseUrl || x;
    this.timeout = r.timeout || b;
  }

  /** Reads PARSEFY_API_KEY from the environment under Node.js; empty string otherwise. */
  getEnvApiKey() {
    return (g() && process.env.PARSEFY_API_KEY) || "";
  }

  /**
   * Builds the multipart request from the options and submits it.
   * `enable_verification` is sent only when the caller set `enableVerification`.
   */
  async extract(t) {
    let { file: r, schema: n, confidenceThreshold: a, enableVerification: o } = t;
    let u = R(n);
    let c = await S(r);
    let p = new FormData();
    p.append("file", c);
    p.append("output_schema", JSON.stringify(u));
    p.append("confidence_threshold", String(a ?? m));
    if (o !== void 0) p.append("enable_verification", String(o));
    return this.makeRequestWithRetry(p);
  }

  /** Retries only on HTTP 429, up to `maxRetries` times, sleeping with backoff between attempts. */
  async makeRequestWithRetry(t, r = 0) {
    try {
      return await this.makeRequest(t);
    } catch (n) {
      if (n instanceof f && n.statusCode === 429 && r < this.maxRetries) {
        let a = P(r);
        await A(a);
        return this.makeRequestWithRetry(t, r + 1);
      }
      throw n;
    }
  }

  /**
   * Performs one POST to `/v1/extract` and maps failures onto ParsefyError codes:
   * TIMEOUT, PARSE_ERROR, TRANSFORM_ERROR, NETWORK_ERROR, TYPE_ERROR, UNKNOWN_ERROR.
   * Non-2xx responses become APIError.
   */
  async makeRequest(t) {
    let r = `${this.baseUrl}/v1/extract`;
    let n = new AbortController();
    let a = setTimeout(() => n.abort(), this.timeout);
    try {
      let o = await fetch(r, {
        method: "POST",
        headers: { Authorization: `Bearer ${this.apiKey}` },
        body: t,
        signal: n.signal
      });
      clearTimeout(a);
      if (!o.ok) {
        let c = await this.parseErrorResponse(o);
        throw new f(c.message || `API request failed with status ${o.status}`, o.status, c);
      }
      let u;
      try {
        u = await o.json();
      } catch {
        throw new s("Failed to parse API response as JSON. The API may have returned an invalid response.", "PARSE_ERROR");
      }
      try {
        return F(u);
      } catch (c) {
        throw new s(`Failed to transform API response: ${c instanceof Error ? c.message : String(c)}`, "TRANSFORM_ERROR");
      }
    } catch (o) {
      clearTimeout(a);
      if (o instanceof Error && o.name === "AbortError") {
        throw new s(`Request timed out after ${this.timeout}ms`, "TIMEOUT");
      }
      // Errors already raised as ParsefyError (including APIError) pass through unchanged.
      if (o instanceof s) throw o;
      if (o instanceof TypeError && o.message.includes("fetch")) {
        throw new s(`Network error: Unable to connect to the Parsefy API. ${o.message}`, "NETWORK_ERROR");
      }
      if (o instanceof TypeError) {
        throw new s(`Type error: ${o.message}. This may indicate an API response format issue.`, "TYPE_ERROR");
      }
      throw new s(`Unexpected error: ${o instanceof Error ? o.message : String(o)}`, "UNKNOWN_ERROR");
    }
  }

  /**
   * Extracts an error payload from a failed response.
   * The body is read once as text and then parsed, because a response body
   * can only be consumed a single time: calling `json()` and then `text()`
   * on the same response would make the text fallback fail every time.
   */
  async parseErrorResponse(t) {
    let raw;
    try {
      raw = await t.text();
    } catch {
      return { message: t.statusText };
    }
    try {
      return JSON.parse(raw);
    } catch {
      return { message: raw || t.statusText };
    }
  }
};
|
|
2
|
+
exports.APIError=f;exports.DEFAULT_CONFIDENCE_THRESHOLD=m;exports.ExtractionError=h;exports.Parsefy=E;exports.ParsefyError=s;exports.ValidationError=i;
|
package/dist/index.d.cts
CHANGED
|
@@ -37,6 +37,14 @@ interface ExtractOptions<T extends z.ZodType> {
|
|
|
37
37
|
* appear in all documents. This prevents unnecessary fallback triggers and reduces costs.
|
|
38
38
|
*/
|
|
39
39
|
confidenceThreshold?: number;
|
|
40
|
+
/**
|
|
41
|
+
* Enable math verification (includes shadow extraction). Defaults to false.
|
|
42
|
+
*
|
|
43
|
+
* When enabled, Parsefy automatically verifies mathematical consistency of numeric data
|
|
44
|
+
* (totals, subtotals, taxes, line items). If only a single verifiable field is requested,
|
|
45
|
+
* supporting fields are automatically extracted in the background for verification.
|
|
46
|
+
*/
|
|
47
|
+
enableVerification?: boolean;
|
|
40
48
|
}
|
|
41
49
|
/**
|
|
42
50
|
* Confidence details for a single extracted field.
|
|
@@ -59,22 +67,56 @@ interface FieldConfidence {
|
|
|
59
67
|
*/
|
|
60
68
|
interface ExtractionMetadata {
|
|
61
69
|
/** Time taken to process the document in milliseconds. */
|
|
62
|
-
|
|
63
|
-
/** Number of input tokens used. */
|
|
64
|
-
inputTokens: number;
|
|
65
|
-
/** Number of output tokens generated. */
|
|
66
|
-
outputTokens: number;
|
|
70
|
+
processing_time_ms: number;
|
|
67
71
|
/** Number of credits consumed (1 credit = 1 page). */
|
|
68
72
|
credits: number;
|
|
69
73
|
/** Whether the fallback model was triggered for higher accuracy. */
|
|
70
|
-
|
|
74
|
+
fallback_triggered: boolean;
|
|
71
75
|
/** Overall confidence score for the extraction (0.0 to 1.0). */
|
|
72
|
-
|
|
76
|
+
confidence_score: number;
|
|
73
77
|
/** Per-field confidence details with evidence and explanations. */
|
|
74
|
-
|
|
78
|
+
field_confidence: FieldConfidence[];
|
|
75
79
|
/** List of issues or warnings encountered during extraction. */
|
|
76
80
|
issues: string[];
|
|
77
81
|
}
|
|
82
|
+
/**
|
|
83
|
+
* Verification status values.
|
|
84
|
+
*/
|
|
85
|
+
type VerificationStatus = 'PASSED' | 'FAILED' | 'PARTIAL' | 'CANNOT_VERIFY' | 'NO_RULES';
|
|
86
|
+
/**
|
|
87
|
+
* Individual verification check result.
|
|
88
|
+
*/
|
|
89
|
+
interface VerificationCheck {
|
|
90
|
+
/** Type of verification check (e.g., "HORIZONTAL_SUM", "VERTICAL_SUM"). */
|
|
91
|
+
type: string;
|
|
92
|
+
/** Status of this check (e.g., "PASSED"). */
|
|
93
|
+
status: string;
|
|
94
|
+
/** Fields involved in this check. */
|
|
95
|
+
fields: string[];
|
|
96
|
+
/** Whether this check passed. */
|
|
97
|
+
passed: boolean;
|
|
98
|
+
/** Difference between expected and actual values. */
|
|
99
|
+
delta: number;
|
|
100
|
+
/** Expected value based on the rule. */
|
|
101
|
+
expected: number;
|
|
102
|
+
/** Actual extracted value. */
|
|
103
|
+
actual: number;
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Math verification results.
|
|
107
|
+
*/
|
|
108
|
+
interface Verification {
|
|
109
|
+
/** Overall verification status. */
|
|
110
|
+
status: VerificationStatus;
|
|
111
|
+
/** Number of checks that passed. */
|
|
112
|
+
checks_passed: number;
|
|
113
|
+
/** Number of checks that failed. */
|
|
114
|
+
checks_failed: number;
|
|
115
|
+
/** Number of checks that could not be verified. */
|
|
116
|
+
cannot_verify_count: number;
|
|
117
|
+
/** Detailed results for each check that was run. */
|
|
118
|
+
checks_run: VerificationCheck[];
|
|
119
|
+
}
|
|
78
120
|
/**
|
|
79
121
|
* Error response from the API.
|
|
80
122
|
*/
|
|
@@ -92,6 +134,8 @@ interface ExtractResult<T> {
|
|
|
92
134
|
object: T | null;
|
|
93
135
|
/** Metadata about the extraction process. */
|
|
94
136
|
metadata: ExtractionMetadata;
|
|
137
|
+
/** Math verification results (only present if enableVerification was true). */
|
|
138
|
+
verification?: Verification;
|
|
95
139
|
/** Error details if extraction failed, or null on success. */
|
|
96
140
|
error: APIErrorResponse | null;
|
|
97
141
|
}
|
|
@@ -126,7 +170,7 @@ interface ExtractResult<T> {
|
|
|
126
170
|
* });
|
|
127
171
|
*
|
|
128
172
|
* // Check per-field confidence and evidence
|
|
129
|
-
* metadata.
|
|
173
|
+
* metadata.field_confidence.forEach((fc) => {
|
|
130
174
|
* console.log(`${fc.field}: ${fc.score} - "${fc.text}"`);
|
|
131
175
|
* });
|
|
132
176
|
* ```
|
|
@@ -184,20 +228,29 @@ declare class Parsefy {
|
|
|
184
228
|
* due_date: z.string().optional().describe('Payment due date'),
|
|
185
229
|
* });
|
|
186
230
|
*
|
|
187
|
-
* const { object, metadata, error } = await client.extract({
|
|
231
|
+
* const { object, metadata, verification, error } = await client.extract({
|
|
188
232
|
* file: './invoice.pdf',
|
|
189
233
|
* schema,
|
|
190
234
|
* confidenceThreshold: 0.85, // Lower = faster, Higher = more accurate
|
|
235
|
+
* enableVerification: true, // Enable math verification
|
|
191
236
|
* });
|
|
192
237
|
*
|
|
193
238
|
* if (!error && object) {
|
|
194
239
|
* console.log(object.invoice_number); // Fully typed!
|
|
195
240
|
*
|
|
196
241
|
* // Access field-level confidence and evidence
|
|
197
|
-
* console.log(`Overall confidence: ${metadata.
|
|
198
|
-
* metadata.
|
|
242
|
+
* console.log(`Overall confidence: ${metadata.confidence_score}`);
|
|
243
|
+
* metadata.field_confidence.forEach((fc) => {
|
|
199
244
|
* console.log(`${fc.field}: ${fc.score} (${fc.reason}) - "${fc.text}"`);
|
|
200
245
|
* });
|
|
246
|
+
*
|
|
247
|
+
* // Access verification results if enabled
|
|
248
|
+
* if (verification) {
|
|
249
|
+
* console.log(`Verification status: ${verification.status}`);
|
|
250
|
+
* verification.checks_run.forEach((check) => {
|
|
251
|
+
* console.log(`${check.type}: ${check.passed ? 'PASSED' : 'FAILED'}`);
|
|
252
|
+
* });
|
|
253
|
+
* }
|
|
201
254
|
* }
|
|
202
255
|
* ```
|
|
203
256
|
*/
|
|
@@ -250,4 +303,4 @@ declare class ValidationError extends ParsefyError {
|
|
|
250
303
|
constructor(message: string);
|
|
251
304
|
}
|
|
252
305
|
|
|
253
|
-
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
|
306
|
+
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError, type Verification, type VerificationCheck, type VerificationStatus };
|
package/dist/index.d.mts
CHANGED
|
@@ -37,6 +37,14 @@ interface ExtractOptions<T extends z.ZodType> {
|
|
|
37
37
|
* appear in all documents. This prevents unnecessary fallback triggers and reduces costs.
|
|
38
38
|
*/
|
|
39
39
|
confidenceThreshold?: number;
|
|
40
|
+
/**
|
|
41
|
+
* Enable math verification (includes shadow extraction). Defaults to false.
|
|
42
|
+
*
|
|
43
|
+
* When enabled, Parsefy automatically verifies mathematical consistency of numeric data
|
|
44
|
+
* (totals, subtotals, taxes, line items). If only a single verifiable field is requested,
|
|
45
|
+
* supporting fields are automatically extracted in the background for verification.
|
|
46
|
+
*/
|
|
47
|
+
enableVerification?: boolean;
|
|
40
48
|
}
|
|
41
49
|
/**
|
|
42
50
|
* Confidence details for a single extracted field.
|
|
@@ -59,22 +67,56 @@ interface FieldConfidence {
|
|
|
59
67
|
*/
|
|
60
68
|
interface ExtractionMetadata {
|
|
61
69
|
/** Time taken to process the document in milliseconds. */
|
|
62
|
-
|
|
63
|
-
/** Number of input tokens used. */
|
|
64
|
-
inputTokens: number;
|
|
65
|
-
/** Number of output tokens generated. */
|
|
66
|
-
outputTokens: number;
|
|
70
|
+
processing_time_ms: number;
|
|
67
71
|
/** Number of credits consumed (1 credit = 1 page). */
|
|
68
72
|
credits: number;
|
|
69
73
|
/** Whether the fallback model was triggered for higher accuracy. */
|
|
70
|
-
|
|
74
|
+
fallback_triggered: boolean;
|
|
71
75
|
/** Overall confidence score for the extraction (0.0 to 1.0). */
|
|
72
|
-
|
|
76
|
+
confidence_score: number;
|
|
73
77
|
/** Per-field confidence details with evidence and explanations. */
|
|
74
|
-
|
|
78
|
+
field_confidence: FieldConfidence[];
|
|
75
79
|
/** List of issues or warnings encountered during extraction. */
|
|
76
80
|
issues: string[];
|
|
77
81
|
}
|
|
82
|
+
/**
|
|
83
|
+
* Verification status values.
|
|
84
|
+
*/
|
|
85
|
+
type VerificationStatus = 'PASSED' | 'FAILED' | 'PARTIAL' | 'CANNOT_VERIFY' | 'NO_RULES';
|
|
86
|
+
/**
|
|
87
|
+
* Individual verification check result.
|
|
88
|
+
*/
|
|
89
|
+
interface VerificationCheck {
|
|
90
|
+
/** Type of verification check (e.g., "HORIZONTAL_SUM", "VERTICAL_SUM"). */
|
|
91
|
+
type: string;
|
|
92
|
+
/** Status of this check (e.g., "PASSED"). */
|
|
93
|
+
status: string;
|
|
94
|
+
/** Fields involved in this check. */
|
|
95
|
+
fields: string[];
|
|
96
|
+
/** Whether this check passed. */
|
|
97
|
+
passed: boolean;
|
|
98
|
+
/** Difference between expected and actual values. */
|
|
99
|
+
delta: number;
|
|
100
|
+
/** Expected value based on the rule. */
|
|
101
|
+
expected: number;
|
|
102
|
+
/** Actual extracted value. */
|
|
103
|
+
actual: number;
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Math verification results.
|
|
107
|
+
*/
|
|
108
|
+
interface Verification {
|
|
109
|
+
/** Overall verification status. */
|
|
110
|
+
status: VerificationStatus;
|
|
111
|
+
/** Number of checks that passed. */
|
|
112
|
+
checks_passed: number;
|
|
113
|
+
/** Number of checks that failed. */
|
|
114
|
+
checks_failed: number;
|
|
115
|
+
/** Number of checks that could not be verified. */
|
|
116
|
+
cannot_verify_count: number;
|
|
117
|
+
/** Detailed results for each check that was run. */
|
|
118
|
+
checks_run: VerificationCheck[];
|
|
119
|
+
}
|
|
78
120
|
/**
|
|
79
121
|
* Error response from the API.
|
|
80
122
|
*/
|
|
@@ -92,6 +134,8 @@ interface ExtractResult<T> {
|
|
|
92
134
|
object: T | null;
|
|
93
135
|
/** Metadata about the extraction process. */
|
|
94
136
|
metadata: ExtractionMetadata;
|
|
137
|
+
/** Math verification results (only present if enableVerification was true). */
|
|
138
|
+
verification?: Verification;
|
|
95
139
|
/** Error details if extraction failed, or null on success. */
|
|
96
140
|
error: APIErrorResponse | null;
|
|
97
141
|
}
|
|
@@ -126,7 +170,7 @@ interface ExtractResult<T> {
|
|
|
126
170
|
* });
|
|
127
171
|
*
|
|
128
172
|
* // Check per-field confidence and evidence
|
|
129
|
-
* metadata.
|
|
173
|
+
* metadata.field_confidence.forEach((fc) => {
|
|
130
174
|
* console.log(`${fc.field}: ${fc.score} - "${fc.text}"`);
|
|
131
175
|
* });
|
|
132
176
|
* ```
|
|
@@ -184,20 +228,29 @@ declare class Parsefy {
|
|
|
184
228
|
* due_date: z.string().optional().describe('Payment due date'),
|
|
185
229
|
* });
|
|
186
230
|
*
|
|
187
|
-
* const { object, metadata, error } = await client.extract({
|
|
231
|
+
* const { object, metadata, verification, error } = await client.extract({
|
|
188
232
|
* file: './invoice.pdf',
|
|
189
233
|
* schema,
|
|
190
234
|
* confidenceThreshold: 0.85, // Lower = faster, Higher = more accurate
|
|
235
|
+
* enableVerification: true, // Enable math verification
|
|
191
236
|
* });
|
|
192
237
|
*
|
|
193
238
|
* if (!error && object) {
|
|
194
239
|
* console.log(object.invoice_number); // Fully typed!
|
|
195
240
|
*
|
|
196
241
|
* // Access field-level confidence and evidence
|
|
197
|
-
* console.log(`Overall confidence: ${metadata.
|
|
198
|
-
* metadata.
|
|
242
|
+
* console.log(`Overall confidence: ${metadata.confidence_score}`);
|
|
243
|
+
* metadata.field_confidence.forEach((fc) => {
|
|
199
244
|
* console.log(`${fc.field}: ${fc.score} (${fc.reason}) - "${fc.text}"`);
|
|
200
245
|
* });
|
|
246
|
+
*
|
|
247
|
+
* // Access verification results if enabled
|
|
248
|
+
* if (verification) {
|
|
249
|
+
* console.log(`Verification status: ${verification.status}`);
|
|
250
|
+
* verification.checks_run.forEach((check) => {
|
|
251
|
+
* console.log(`${check.type}: ${check.passed ? 'PASSED' : 'FAILED'}`);
|
|
252
|
+
* });
|
|
253
|
+
* }
|
|
201
254
|
* }
|
|
202
255
|
* ```
|
|
203
256
|
*/
|
|
@@ -250,4 +303,4 @@ declare class ValidationError extends ParsefyError {
|
|
|
250
303
|
constructor(message: string);
|
|
251
304
|
}
|
|
252
305
|
|
|
253
|
-
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
|
306
|
+
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError, type Verification, type VerificationCheck, type VerificationStatus };
|
package/dist/index.d.ts
CHANGED
|
@@ -37,6 +37,14 @@ interface ExtractOptions<T extends z.ZodType> {
|
|
|
37
37
|
* appear in all documents. This prevents unnecessary fallback triggers and reduces costs.
|
|
38
38
|
*/
|
|
39
39
|
confidenceThreshold?: number;
|
|
40
|
+
/**
|
|
41
|
+
* Enable math verification (includes shadow extraction). Defaults to false.
|
|
42
|
+
*
|
|
43
|
+
* When enabled, Parsefy automatically verifies mathematical consistency of numeric data
|
|
44
|
+
* (totals, subtotals, taxes, line items). If only a single verifiable field is requested,
|
|
45
|
+
* supporting fields are automatically extracted in the background for verification.
|
|
46
|
+
*/
|
|
47
|
+
enableVerification?: boolean;
|
|
40
48
|
}
|
|
41
49
|
/**
|
|
42
50
|
* Confidence details for a single extracted field.
|
|
@@ -59,22 +67,56 @@ interface FieldConfidence {
|
|
|
59
67
|
*/
|
|
60
68
|
interface ExtractionMetadata {
|
|
61
69
|
/** Time taken to process the document in milliseconds. */
|
|
62
|
-
|
|
63
|
-
/** Number of input tokens used. */
|
|
64
|
-
inputTokens: number;
|
|
65
|
-
/** Number of output tokens generated. */
|
|
66
|
-
outputTokens: number;
|
|
70
|
+
processing_time_ms: number;
|
|
67
71
|
/** Number of credits consumed (1 credit = 1 page). */
|
|
68
72
|
credits: number;
|
|
69
73
|
/** Whether the fallback model was triggered for higher accuracy. */
|
|
70
|
-
|
|
74
|
+
fallback_triggered: boolean;
|
|
71
75
|
/** Overall confidence score for the extraction (0.0 to 1.0). */
|
|
72
|
-
|
|
76
|
+
confidence_score: number;
|
|
73
77
|
/** Per-field confidence details with evidence and explanations. */
|
|
74
|
-
|
|
78
|
+
field_confidence: FieldConfidence[];
|
|
75
79
|
/** List of issues or warnings encountered during extraction. */
|
|
76
80
|
issues: string[];
|
|
77
81
|
}
|
|
82
|
+
/**
|
|
83
|
+
* Verification status values.
|
|
84
|
+
*/
|
|
85
|
+
type VerificationStatus = 'PASSED' | 'FAILED' | 'PARTIAL' | 'CANNOT_VERIFY' | 'NO_RULES';
|
|
86
|
+
/**
|
|
87
|
+
* Individual verification check result.
|
|
88
|
+
*/
|
|
89
|
+
interface VerificationCheck {
|
|
90
|
+
/** Type of verification check (e.g., "HORIZONTAL_SUM", "VERTICAL_SUM"). */
|
|
91
|
+
type: string;
|
|
92
|
+
/** Status of this check (e.g., "PASSED"). */
|
|
93
|
+
status: string;
|
|
94
|
+
/** Fields involved in this check. */
|
|
95
|
+
fields: string[];
|
|
96
|
+
/** Whether this check passed. */
|
|
97
|
+
passed: boolean;
|
|
98
|
+
/** Difference between expected and actual values. */
|
|
99
|
+
delta: number;
|
|
100
|
+
/** Expected value based on the rule. */
|
|
101
|
+
expected: number;
|
|
102
|
+
/** Actual extracted value. */
|
|
103
|
+
actual: number;
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Math verification results.
|
|
107
|
+
*/
|
|
108
|
+
interface Verification {
|
|
109
|
+
/** Overall verification status. */
|
|
110
|
+
status: VerificationStatus;
|
|
111
|
+
/** Number of checks that passed. */
|
|
112
|
+
checks_passed: number;
|
|
113
|
+
/** Number of checks that failed. */
|
|
114
|
+
checks_failed: number;
|
|
115
|
+
/** Number of checks that could not be verified. */
|
|
116
|
+
cannot_verify_count: number;
|
|
117
|
+
/** Detailed results for each check that was run. */
|
|
118
|
+
checks_run: VerificationCheck[];
|
|
119
|
+
}
|
|
78
120
|
/**
|
|
79
121
|
* Error response from the API.
|
|
80
122
|
*/
|
|
@@ -92,6 +134,8 @@ interface ExtractResult<T> {
|
|
|
92
134
|
object: T | null;
|
|
93
135
|
/** Metadata about the extraction process. */
|
|
94
136
|
metadata: ExtractionMetadata;
|
|
137
|
+
/** Math verification results (only present if enableVerification was true). */
|
|
138
|
+
verification?: Verification;
|
|
95
139
|
/** Error details if extraction failed, or null on success. */
|
|
96
140
|
error: APIErrorResponse | null;
|
|
97
141
|
}
|
|
@@ -126,7 +170,7 @@ interface ExtractResult<T> {
|
|
|
126
170
|
* });
|
|
127
171
|
*
|
|
128
172
|
* // Check per-field confidence and evidence
|
|
129
|
-
* metadata.
|
|
173
|
+
* metadata.field_confidence.forEach((fc) => {
|
|
130
174
|
* console.log(`${fc.field}: ${fc.score} - "${fc.text}"`);
|
|
131
175
|
* });
|
|
132
176
|
* ```
|
|
@@ -184,20 +228,29 @@ declare class Parsefy {
|
|
|
184
228
|
* due_date: z.string().optional().describe('Payment due date'),
|
|
185
229
|
* });
|
|
186
230
|
*
|
|
187
|
-
* const { object, metadata, error } = await client.extract({
|
|
231
|
+
* const { object, metadata, verification, error } = await client.extract({
|
|
188
232
|
* file: './invoice.pdf',
|
|
189
233
|
* schema,
|
|
190
234
|
* confidenceThreshold: 0.85, // Lower = faster, Higher = more accurate
|
|
235
|
+
* enableVerification: true, // Enable math verification
|
|
191
236
|
* });
|
|
192
237
|
*
|
|
193
238
|
* if (!error && object) {
|
|
194
239
|
* console.log(object.invoice_number); // Fully typed!
|
|
195
240
|
*
|
|
196
241
|
* // Access field-level confidence and evidence
|
|
197
|
-
* console.log(`Overall confidence: ${metadata.
|
|
198
|
-
* metadata.
|
|
242
|
+
* console.log(`Overall confidence: ${metadata.confidence_score}`);
|
|
243
|
+
* metadata.field_confidence.forEach((fc) => {
|
|
199
244
|
* console.log(`${fc.field}: ${fc.score} (${fc.reason}) - "${fc.text}"`);
|
|
200
245
|
* });
|
|
246
|
+
*
|
|
247
|
+
* // Access verification results if enabled
|
|
248
|
+
* if (verification) {
|
|
249
|
+
* console.log(`Verification status: ${verification.status}`);
|
|
250
|
+
* verification.checks_run.forEach((check) => {
|
|
251
|
+
* console.log(`${check.type}: ${check.passed ? 'PASSED' : 'FAILED'}`);
|
|
252
|
+
* });
|
|
253
|
+
* }
|
|
201
254
|
* }
|
|
202
255
|
* ```
|
|
203
256
|
*/
|
|
@@ -250,4 +303,4 @@ declare class ValidationError extends ParsefyError {
|
|
|
250
303
|
constructor(message: string);
|
|
251
304
|
}
|
|
252
305
|
|
|
253
|
-
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError };
|
|
306
|
+
export { APIError, type APIErrorResponse, DEFAULT_CONFIDENCE_THRESHOLD, type ExtractOptions, type ExtractResult, ExtractionError, type ExtractionMetadata, type FieldConfidence, Parsefy, type ParsefyConfig, ParsefyError, ValidationError, type Verification, type VerificationCheck, type VerificationStatus };
|
package/dist/index.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import {zodToJsonSchema}from'zod-to-json-schema';var
|
|
2
|
-
export{
|
|
1
|
+
import {zodToJsonSchema}from'zod-to-json-schema';var m=.85,l={".pdf":"application/pdf",".docx":"application/vnd.openxmlformats-officedocument.wordprocessingml.document"},y=10*1024*1024,x="https://api.parsefy.io",b=6e4;var s=class extends Error{constructor(t,r){super(t),this.name="ParsefyError",this.code=r,typeof Error.captureStackTrace=="function"&&Error.captureStackTrace(this,this.constructor);}},f=class extends s{constructor(t,r,n){super(t),this.name="APIError",this.statusCode=r,this.response=n;}},h=class extends s{constructor(t,r,n){super(t,r),this.name="ExtractionError",this.metadata=n;}},i=class extends s{constructor(t){super(t),this.name="ValidationError";}};function g(){return typeof process<"u"&&process.versions?.node!==void 0}function R(e){let t=zodToJsonSchema(e,{$refStrategy:"none",target:"jsonSchema7"});return "$schema"in t&&delete t.$schema,t}function T(e){let t=e.toLowerCase().match(/\.[^.]+$/)?.[0];return t&&l[t]||null}function w(e){if(!T(e)){let r=Object.keys(l).join(", ");throw new i(`Unsupported file type. 
Supported types: ${r}`)}}function d(e){if(e===0)throw new i("File is empty");if(e>y){let t=y/1048576;throw new i(`File size exceeds maximum limit of ${t}MB`)}}function F(e){let t=e._meta||{confidence_score:1,field_confidence:[],issues:[]},r={object:e.object,metadata:{processing_time_ms:e.metadata.processing_time_ms,credits:e.metadata.credits,fallback_triggered:e.metadata.fallback_triggered,confidence_score:t.confidence_score,field_confidence:t.field_confidence.map(n=>({field:n.field,score:n.score,reason:n.reason,page:n.page,text:n.text})),issues:t.issues},error:e.error};return e.verification&&(r.verification={status:e.verification.status,checks_passed:e.verification.checks_passed,checks_failed:e.verification.checks_failed,cannot_verify_count:e.verification.cannot_verify_count,checks_run:e.verification.checks_run.map(n=>({type:n.type,status:n.status,fields:n.fields,passed:n.passed,delta:n.delta,expected:n.expected,actual:n.actual}))}),r}function _(e,t){let r=T(t)||"application/octet-stream",n=e.buffer.slice(e.byteOffset,e.byteOffset+e.byteLength);return typeof File<"u"?new File([n],t,{type:r}):new Blob([n],{type:r})}async function I(e){if(!g())throw new i("File path strings are only supported in Node.js. Use File or Blob in the browser.");let t=await import('fs'),r=await import('path');if(!t.existsSync(e))throw new i(`File not found: ${e}`);let n=r.basename(e);w(n);let a=t.readFileSync(e);return d(a.length),{buffer:a,filename:n}}async function S(e){if(typeof e=="string"){let{buffer:t,filename:r}=await I(e);return _(t,r)}if(Buffer.isBuffer(e))return d(e.length),_(e,"document.pdf");if(e instanceof File)return w(e.name),d(e.size),e;if(e instanceof Blob)return d(e.size),e;throw new i("Invalid file input. 
Expected File, Blob, Buffer, or file path string.")}function A(e){return new Promise(t=>setTimeout(t,e))}function P(e,t=1e3){let r=t*Math.pow(2,e),n=Math.random()*.1*r;return Math.min(r+n,3e4)}var E=class{constructor(t){this.maxRetries=3;let r={};if(typeof t=="string"?r={apiKey:t}:t&&(r=t),this.apiKey=r.apiKey||this.getEnvApiKey(),!this.apiKey)throw new i("API key is required. Provide it in the constructor or set the PARSEFY_API_KEY environment variable.");this.baseUrl=r.baseUrl||x,this.timeout=r.timeout||b;}getEnvApiKey(){return g()&&process.env.PARSEFY_API_KEY||""}async extract(t){let{file:r,schema:n,confidenceThreshold:a,enableVerification:o}=t,u=R(n),c=await S(r),p=new FormData;return p.append("file",c),p.append("output_schema",JSON.stringify(u)),p.append("confidence_threshold",String(a??.85)),o!==void 0&&p.append("enable_verification",String(o)),this.makeRequestWithRetry(p)}async makeRequestWithRetry(t,r=0){try{return await this.makeRequest(t)}catch(n){if(n instanceof f&&n.statusCode===429&&r<this.maxRetries){let a=P(r);return await A(a),this.makeRequestWithRetry(t,r+1)}throw n}}async makeRequest(t){let r=`${this.baseUrl}/v1/extract`,n=new AbortController,a=setTimeout(()=>n.abort(),this.timeout);try{let o=await fetch(r,{method:"POST",headers:{Authorization:`Bearer ${this.apiKey}`},body:t,signal:n.signal});if(clearTimeout(a),!o.ok){let c=await this.parseErrorResponse(o);throw new f(c.message||`API request failed with status ${o.status}`,o.status,c)}let u;try{u=await o.json();}catch{throw new s("Failed to parse API response as JSON. 
The API may have returned an invalid response.","PARSE_ERROR")}try{return F(u)}catch(c){throw new s(`Failed to transform API response: ${c instanceof Error?c.message:String(c)}`,"TRANSFORM_ERROR")}}catch(o){throw clearTimeout(a),o instanceof Error&&o.name==="AbortError"?new s(`Request timed out after ${this.timeout}ms`,"TIMEOUT"):o instanceof s?o:o instanceof TypeError&&o.message.includes("fetch")?new s(`Network error: Unable to connect to the Parsefy API. ${o.message}`,"NETWORK_ERROR"):o instanceof TypeError?new s(`Type error: ${o.message}. This may indicate an API response format issue.`,"TYPE_ERROR"):new s(`Unexpected error: ${o instanceof Error?o.message:String(o)}`,"UNKNOWN_ERROR")}}async parseErrorResponse(t){try{return await t.json()}catch{try{return {message:await t.text()||t.statusText}}catch{return {message:t.statusText}}}}};
|
|
2
|
+
export{f as APIError,m as DEFAULT_CONFIDENCE_THRESHOLD,h as ExtractionError,E as Parsefy,s as ParsefyError,i as ValidationError};
|