@okrapdf/cli 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +9 -7
- package/dist/cli.js.map +1 -1
- package/dist/commands/docs.d.ts.map +1 -1
- package/dist/commands/docs.js +1 -49
- package/dist/commands/docs.js.map +1 -1
- package/dist/commands/jobs.d.ts.map +1 -1
- package/dist/commands/jobs.js +6 -8
- package/dist/commands/jobs.js.map +1 -1
- package/dist/commands/shortcuts.d.ts.map +1 -1
- package/dist/commands/shortcuts.js +33 -31
- package/dist/commands/shortcuts.js.map +1 -1
- package/dist/lib/scorer.d.ts +305 -0
- package/dist/lib/scorer.d.ts.map +1 -0
- package/dist/lib/scorer.js +796 -0
- package/dist/lib/scorer.js.map +1 -0
- package/dist/lib/scorer.test.d.ts +8 -0
- package/dist/lib/scorer.test.d.ts.map +1 -0
- package/dist/lib/scorer.test.js +974 -0
- package/dist/lib/scorer.test.js.map +1 -0
- package/dist/types.d.ts +6 -3
- package/dist/types.d.ts.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,796 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OCR Review Scorer - Deterministic scoring for prioritizing page review
|
|
3
|
+
*
|
|
4
|
+
* Composable scoring strategies inspired by document understanding research:
|
|
5
|
+
* - Direct scoring: page-level metrics only
|
|
6
|
+
* - Structure-aware: considers table density and layout
|
|
7
|
+
* - Comparison-based: OCR layer vs rendered markdown
|
|
8
|
+
*
|
|
9
|
+
* Financial verification strategies from industry best practices:
|
|
10
|
+
* - Cross-querying: verify related fields match (e.g., Assets = Liabilities + Equity)
|
|
11
|
+
* - Check-sum logic: validate mathematical relationships between cells
|
|
12
|
+
* - Confidence filtering: auto-approve >95%, spot-check 70-95%, manual <70%
|
|
13
|
+
* - Anomaly detection: flag statistical outliers (e.g., decimal point errors)
|
|
14
|
+
*
|
|
15
|
+
* @see https://arxiv.org/html/2510.10138v1
|
|
16
|
+
*/
|
|
17
|
+
// ============================================================================
|
|
18
|
+
// Default Configuration
|
|
19
|
+
// ============================================================================
|
|
20
|
+
/**
 * Baseline scorer settings. The OcrReviewScorer constructor overlays any
 * caller-supplied `weights` / `thresholds` on top of these, key by key.
 */
export const DEFAULT_CONFIG = {
    // Point values combined by scorePage into a page's review priority.
    weights: {
        // Added once for every table registered on the page.
        tableCount: 10,
        // Multiplied by (1 - avgConfidence), so lower confidence scores higher.
        inverseConfidence: 50,
        // Flat amount added when the page reports coverage gaps.
        coverageGap: 25,
        // Added once for every flagged entity on the page.
        flaggedEntity: 30,
        // Multiplied by the share of the page's entities that are unverified.
        structurePenalty: 20,
        // Weight reserved for OCR/markdown mismatch.
        // NOTE(review): not consumed by scorePage in this file — confirm usage.
        comparisonDelta: 15,
    },
    // Cut-offs used when classifying pages.
    thresholds: {
        // Minimum average confidence for a page to be auto-approvable.
        autoApproveConfidence: 0.95,
        // Pages below this average confidence are treated as low confidence.
        requireReviewConfidence: 0.7,
        // Largest table count a page may have and still be auto-approved.
        maxTablesForAutoApprove: 0,
        // NOTE(review): not read anywhere in this file — presumably a minimum
        // OCR line count for a page to count as having content; confirm.
        minOcrLinesForContent: 5,
    },
};
|
|
36
|
+
// ============================================================================
|
|
37
|
+
// Core Scorer Class
|
|
38
|
+
// ============================================================================
|
|
39
|
+
/**
 * Deterministic review-priority scorer for OCR'd pages.
 *
 * Table records are registered with `loadTables` and looked up by page
 * number; a page with no registered tables is treated as having zero tables
 * by every method below.
 */
export class OcrReviewScorer {
    /** Effective configuration: DEFAULT_CONFIG overlaid with constructor overrides. */
    config;
    /** Maps a page number to the array of table records loaded for that page. */
    tablesByPage = new Map();

    /**
     * @param {object} [config] - Partial overrides; `config.weights` and
     *   `config.thresholds` are merged key by key over the defaults.
     */
    constructor(config = {}) {
        this.config = {
            weights: { ...DEFAULT_CONFIG.weights, ...config.weights },
            thresholds: { ...DEFAULT_CONFIG.thresholds, ...config.thresholds },
        };
    }

    /**
     * Replace the registered tables, grouping them by `table.page_number`.
     * @param {Iterable<object>} tables - Table records to register.
     */
    loadTables(tables) {
        this.tablesByPage.clear();
        for (const table of tables) {
            const bucket = this.tablesByPage.get(table.page_number);
            if (bucket) {
                bucket.push(table);
            }
            else {
                this.tablesByPage.set(table.page_number, [table]);
            }
        }
    }

    /**
     * Score a single page. The same page and loaded tables always yield the
     * same result.
     * @param {object} page - Reads `page`, `avgConfidence`, `hasCoverageGaps`,
     *   `flagged`, `total`, `verified`, `status` and `isStale`.
     * @returns {object} `{ page, score, breakdown, tableCount, avgConfidence, status, flags }`.
     */
    scorePage(page) {
        const tables = this.tablesByPage.get(page.page) || [];
        const tableCount = tables.length;
        const { weights, thresholds } = this.config;
        // Structure: the larger the unverified share of entities, the higher the penalty.
        const verificationRatio = page.total > 0 ? page.verified / page.total : 1;
        const breakdown = {
            tableScore: tableCount * weights.tableCount,
            confidenceScore: (1 - page.avgConfidence) * weights.inverseConfidence,
            coverageScore: page.hasCoverageGaps ? weights.coverageGap : 0,
            flaggedScore: page.flagged * weights.flaggedEntity,
            structureScore: page.total > 0 ? (1 - verificationRatio) * weights.structurePenalty : 0,
            // Placeholder: the comparison is computed separately via
            // compareOcrToMarkdown and is not part of the total below.
            comparisonScore: 0,
        };
        const score = breakdown.tableScore +
            breakdown.confidenceScore +
            breakdown.coverageScore +
            breakdown.flaggedScore +
            breakdown.structureScore;
        // Flags give callers a cheap way to filter without re-deriving conditions.
        const flags = [];
        if (tableCount > 0) {
            flags.push('has_tables');
        }
        if (page.avgConfidence < thresholds.requireReviewConfidence) {
            flags.push('low_confidence');
        }
        if (page.hasCoverageGaps) {
            flags.push('coverage_gaps');
        }
        if (page.flagged > 0) {
            flags.push('has_flagged');
        }
        if (page.isStale) {
            flags.push('stale');
        }
        if (tables.some(t => t.verification_status === 'flagged')) {
            flags.push('flagged_table');
        }
        return {
            page: page.page,
            score,
            breakdown,
            tableCount,
            avgConfidence: page.avgConfidence,
            status: page.status,
            flags,
        };
    }

    /**
     * Score every page and return the results sorted by score, highest first.
     * @param {object[]} pages
     * @returns {object[]} Scored pages in descending score order.
     */
    scoreAll(pages) {
        return pages
            .map(p => this.scorePage(p))
            .sort((a, b) => b.score - a.score);
    }

    /**
     * Keep only the pages that satisfy every supplied criterion.
     * @param {object[]} pages
     * @param {object} [options] - Optional criteria: `status` (string or array),
     *   `minConfidence`, `maxConfidence`, `minTables`, `maxTables`, `hasGaps`,
     *   `hasTables`, `isStale`. Omitting it (or passing null) keeps every page.
     * @returns {object[]} The matching pages, in their original order.
     */
    filter(pages, options = {}) {
        // A missing options bag used to throw on the first property read.
        const criteria = options ?? {};
        return pages.filter(page => {
            const tableCount = (this.tablesByPage.get(page.page) || []).length;
            if (criteria.status) {
                const statuses = Array.isArray(criteria.status) ? criteria.status : [criteria.status];
                if (!statuses.includes(page.status)) {
                    return false;
                }
            }
            if (criteria.minConfidence !== undefined && page.avgConfidence < criteria.minConfidence) {
                return false;
            }
            if (criteria.maxConfidence !== undefined && page.avgConfidence > criteria.maxConfidence) {
                return false;
            }
            if (criteria.minTables !== undefined && tableCount < criteria.minTables) {
                return false;
            }
            if (criteria.maxTables !== undefined && tableCount > criteria.maxTables) {
                return false;
            }
            if (criteria.hasGaps === true && !page.hasCoverageGaps) {
                return false;
            }
            if (criteria.hasGaps === false && page.hasCoverageGaps) {
                return false;
            }
            if (criteria.hasTables === true && tableCount === 0) {
                return false;
            }
            if (criteria.hasTables === false && tableCount > 0) {
                return false;
            }
            if (criteria.isStale !== undefined && page.isStale !== criteria.isStale) {
                return false;
            }
            return true;
        });
    }

    /**
     * Pages that may be approved without review: pending, confident enough,
     * within the table limit, with no coverage gaps and nothing flagged.
     * @param {object[]} pages
     * @returns {object[]}
     */
    getAutoApprovable(pages) {
        const { thresholds } = this.config;
        return pages.filter(page => {
            const tableCount = (this.tablesByPage.get(page.page) || []).length;
            return (page.avgConfidence >= thresholds.autoApproveConfidence &&
                tableCount <= thresholds.maxTablesForAutoApprove &&
                page.status === 'pending' &&
                !page.hasCoverageGaps &&
                page.flagged === 0);
        });
    }

    /**
     * Pages that need a human: low confidence, any table, coverage gaps or
     * flagged entities. Returned scored and sorted, highest priority first.
     * @param {object[]} pages
     * @returns {object[]} Scored pages.
     */
    getRequireReview(pages) {
        const { thresholds } = this.config;
        const needsReview = pages.filter(page => {
            const tableCount = (this.tablesByPage.get(page.page) || []).length;
            return (page.avgConfidence < thresholds.requireReviewConfidence ||
                tableCount > 0 ||
                page.hasCoverageGaps ||
                page.flagged > 0);
        });
        return this.scoreAll(needsReview);
    }

    /**
     * Aggregate statistics for a set of pages.
     * `totalTables` counts every loaded table, including tables on pages that
     * are not in `pages`.
     * @param {object[]} pages
     * @returns {object} Totals, status and priority histograms, and a review-time estimate.
     */
    computeStats(pages) {
        const { requireReviewConfidence } = this.config.thresholds;
        const HIGH_THRESHOLD = 50;
        const MEDIUM_THRESHOLD = 20;
        // One pass over the scored pages instead of three separate filters.
        const byPriority = { high: 0, medium: 0, low: 0 };
        for (const { score } of this.scoreAll(pages)) {
            if (score >= HIGH_THRESHOLD) {
                byPriority.high += 1;
            }
            else if (score >= MEDIUM_THRESHOLD) {
                byPriority.medium += 1;
            }
            else if (score < MEDIUM_THRESHOLD) {
                // Explicit comparison so a NaN score lands in no bucket.
                byPriority.low += 1;
            }
        }
        let totalTables = 0;
        for (const tables of this.tablesByPage.values()) {
            totalTables += tables.length;
        }
        const byStatus = {};
        for (const page of pages) {
            byStatus[page.status] = (byStatus[page.status] || 0) + 1;
        }
        const avgConfidence = pages.length > 0
            ? pages.reduce((sum, p) => sum + p.avgConfidence, 0) / pages.length
            : 0;
        return {
            totalPages: pages.length,
            pagesWithTables: pages.filter(p => (this.tablesByPage.get(p.page) || []).length > 0).length,
            totalTables,
            avgConfidence,
            lowConfidencePages: pages.filter(p => p.avgConfidence < requireReviewConfidence).length,
            pagesWithGaps: pages.filter(p => p.hasCoverageGaps).length,
            byStatus,
            byPriority,
            estimatedReviewTime: {
                pages: byPriority.high + byPriority.medium,
                // 2 minutes per high-priority page, 1 per medium, none for low.
                minutes: byPriority.high * 2 + byPriority.medium * 1,
            },
        };
    }
}
|
|
232
|
+
// ============================================================================
|
|
233
|
+
// OCR vs Markdown Comparison (Composable Strategy)
|
|
234
|
+
// ============================================================================
|
|
235
|
+
/**
 * Measure how closely a page's OCR text blocks agree with its rendered
 * markdown.
 *
 * @param {object[]} ocrBlocks - Blocks with a `text` string and optional
 *   `bbox` carrying `x` / `y` coordinates.
 * @param {string} markdownContent - Rendered markdown for the same page.
 * @returns {object} Character counts, their delta, similarity metrics, flags
 *   and a composite `score` where lower is better (0 = perfect match).
 *   `page` is always 0; the caller is expected to fill it in.
 */
export function compareOcrToMarkdown(ocrBlocks, markdownContent) {
    const withoutWhitespace = (text) => text.replace(/\s+/g, '');
    // OCR side: only non-whitespace characters are counted.
    const ocrCharCount = withoutWhitespace(ocrBlocks.map(block => block.text).join(' ')).length;
    // Markdown side: drop markup punctuation, then collapse whitespace.
    const cleanMarkdown = markdownContent
        .replace(/[#*_\[\]()|\-]+/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
    const markdownCharCount = withoutWhitespace(cleanMarkdown).length;
    // Relative character delta; the trailing 1 guards against division by zero.
    const delta = Math.abs(ocrCharCount - markdownCharCount);
    const deltaPct = delta / Math.max(ocrCharCount, markdownCharCount, 1);
    // Spatial integrity: order blocks by y then x (when both have a bbox) and
    // compare that text with the cleaned markdown. Higher = better preserved.
    const readingOrder = Array.from(ocrBlocks).sort((first, second) => {
        if (first.bbox && second.bbox) {
            return first.bbox.y - second.bbox.y || first.bbox.x - second.bbox.x;
        }
        return 0;
    });
    const spatialIntegrity = computeStringSimilarity(readingOrder.map(block => block.text).join(' '), cleanMarkdown);
    // Table structure: how well table delimiters appear to be preserved.
    const tableStructureScore = computeTableStructureScore(ocrBlocks, markdownContent);
    // Each entry pairs a condition with the flag raised when it holds.
    const flags = [
        [deltaPct > 0.2, 'high_char_delta'],
        [spatialIntegrity < 0.7, 'spatial_mismatch'],
        [tableStructureScore < 0.5, 'table_structure_issue'],
        [ocrCharCount === 0 && markdownCharCount > 0, 'missing_ocr'],
        [markdownCharCount === 0 && ocrCharCount > 0, 'missing_markdown'],
    ]
        .filter(([raised]) => raised)
        .map(([, label]) => label);
    const score = (deltaPct * 40) + ((1 - spatialIntegrity) * 35) + ((1 - tableStructureScore) * 25);
    return {
        page: 0, // Caller should set
        ocrCharCount,
        markdownCharCount,
        delta,
        deltaPct,
        spatialIntegrity,
        tableStructureScore,
        flags,
        score,
    };
}
|
|
290
|
+
/**
 * Case-insensitive similarity of two strings, computed as a Dice coefficient
 * over their sets of distinct character bigrams.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} A value in [0, 1]; 1 when both strings are empty or share
 *   every distinct bigram.
 */
function computeStringSimilarity(a, b) {
    // Fold case once up front so the short-string comparison below agrees with
    // the bigram comparison (previously "A" vs "a" scored 0 but "AB" vs "ab"
    // scored 1).
    const left = a.toLowerCase();
    const right = b.toLowerCase();
    if (left.length === 0 && right.length === 0) {
        return 1;
    }
    // A string shorter than two characters has no bigrams; fall back to equality.
    if (left.length < 2 || right.length < 2) {
        return left === right ? 1 : 0;
    }
    const collectBigrams = (text) => {
        const bigrams = new Set();
        for (let i = 0; i < text.length - 1; i++) {
            bigrams.add(text.slice(i, i + 2));
        }
        return bigrams;
    };
    const bigramsA = collectBigrams(left);
    const bigramsB = collectBigrams(right);
    let intersection = 0;
    for (const bigram of bigramsA) {
        if (bigramsB.has(bigram)) {
            intersection += 1;
        }
    }
    return (2 * intersection) / (bigramsA.size + bigramsB.size);
}
|
|
313
|
+
/**
 * Heuristic score for how well table structure appears to be preserved.
 *
 * Returns 1 when the markdown contains no line with two pipe characters
 * (nothing to evaluate). Otherwise starts at 0.5 and adds 0.2 for more than
 * four pipes, 0.15 for at least one run of three or more dashes, and 0.15
 * when the OCR blocks look columnar, capped at 1.
 *
 * @param {object[]} ocrBlocks - OCR blocks; only `bbox.x` is inspected.
 * @param {string} markdown - Rendered markdown for the page.
 * @returns {number} Score in [0.5, 1].
 */
function computeTableStructureScore(ocrBlocks, markdown) {
    // `.` does not cross newlines, so both pipes must sit on the same line.
    if (!/\|.*\|/.test(markdown)) {
        return 1; // No table to evaluate
    }
    const pipeCount = (markdown.match(/\|/g) || []).length;
    const dashCount = (markdown.match(/-{3,}/g) || []).length;
    // Columnar hint: several blocks, at least one positioned beyond x = 100.
    // NOTE(review): the unit of bbox.x is not visible here — confirm the 100 cut-off.
    const ocrHasColumns = ocrBlocks.length > 3 && ocrBlocks.some(b => b.bbox && b.bbox.x > 100);
    let score = 0.5; // Base score
    if (pipeCount > 4) {
        score += 0.2;
    }
    if (dashCount > 0) {
        score += 0.15;
    }
    if (ocrHasColumns) {
        score += 0.15;
    }
    return Math.min(score, 1);
}
|
|
337
|
+
/**
 * Build an OcrReviewScorer preconfigured for a named strategy.
 *
 * - 'direct': page-level metrics only (table and structure weights zeroed)
 * - 'structure': emphasizes tables and structure analysis
 * - 'comparison': emphasizes OCR vs markdown comparison
 * - 'combined' or anything else: balanced, `config` is passed through as given
 *
 * Caller-supplied `config.weights` always win over the strategy's presets.
 *
 * @param {string} strategy - Strategy name.
 * @param {object} [config] - Partial scorer configuration.
 * @returns {OcrReviewScorer}
 */
export function createScorer(strategy, config = {}) {
    // Weight presets per strategy; strategies not listed apply no preset.
    const presets = new Map([
        ['direct', { tableCount: 0, structurePenalty: 0 }],
        ['structure', { tableCount: 20, structurePenalty: 30, inverseConfidence: 30 }],
        ['comparison', { comparisonDelta: 40, tableCount: 5 }],
    ]);
    const preset = presets.get(strategy);
    if (preset === undefined) {
        // Balanced approach: hand over a shallow copy of the caller's config.
        return new OcrReviewScorer({ ...config });
    }
    return new OcrReviewScorer({
        ...config,
        weights: { ...DEFAULT_CONFIG.weights, ...preset, ...config.weights },
    });
}
|
|
378
|
+
// ============================================================================
|
|
379
|
+
// Utility Functions for CLI Integration
|
|
380
|
+
// ============================================================================
|
|
381
|
+
/**
 * Parse pages JSON from CLI output.
 *
 * Accepts either a bare JSON array of pages or an object with a `pages`
 * property. Any other JSON value (including `null`) yields an empty array.
 *
 * @param {string} json - JSON text.
 * @returns {Array} The parsed pages.
 * @throws {SyntaxError} When `json` is not valid JSON.
 */
export function parsePagesJson(json) {
    const data = JSON.parse(json);
    if (Array.isArray(data)) {
        return data;
    }
    // `?.` covers JSON `null`, which previously threw on the property read.
    return data?.pages || [];
}
|
|
388
|
+
/**
 * Parse tables JSON from CLI output.
 *
 * Accepts either a bare JSON array of tables or an object with a `tables`
 * property. Any other JSON value (including `null`) yields an empty array.
 *
 * @param {string} json - JSON text.
 * @returns {Array} The parsed tables.
 * @throws {SyntaxError} When `json` is not valid JSON.
 */
export function parseTablesJson(json) {
    const data = JSON.parse(json);
    if (Array.isArray(data)) {
        return data;
    }
    // `?.` covers JSON `null`, which previously threw on the property read.
    return data?.tables || [];
}
|
|
395
|
+
/**
 * Render scored pages in one of the supported output formats.
 *
 * @param {object[]} pages - Scored pages (`page`, `score`, `tableCount`,
 *   `avgConfidence`, `status`, `flags`).
 * @param {string} [format='table'] - 'json', 'jsonl', 'csv' or 'table'; any
 *   other value falls back to 'table'.
 * @returns {string} The formatted text.
 */
export function formatScoredPages(pages, format = 'table') {
    switch (format) {
        case 'json':
            return JSON.stringify(pages, null, 2);
        case 'jsonl':
            return pages.map(p => JSON.stringify(p)).join('\n');
        // Each declaring case gets its own block so its `const`s do not share
        // the switch-wide scope.
        case 'csv': {
            const headers = ['page', 'score', 'tableCount', 'avgConfidence', 'status', 'flags'];
            const rows = pages.map(p => [
                p.page,
                p.score.toFixed(2),
                p.tableCount,
                `${(p.avgConfidence * 100).toFixed(1)}%`,
                p.status,
                // ';' keeps the flag list inside a single comma-separated cell.
                p.flags.join(';'),
            ]);
            return [headers.join(','), ...rows.map(r => r.join(','))].join('\n');
        }
        case 'table':
        default: {
            const colWidths = { page: 6, score: 8, tables: 8, conf: 8, status: 12, flags: 30 };
            const header = [
                'Page'.padEnd(colWidths.page),
                'Score'.padEnd(colWidths.score),
                'Tables'.padEnd(colWidths.tables),
                'Conf'.padEnd(colWidths.conf),
                'Status'.padEnd(colWidths.status),
                'Flags',
            ].join(' ');
            const divider = '-'.repeat(header.length);
            const tableRows = pages.map(p => [
                String(p.page).padEnd(colWidths.page),
                p.score.toFixed(1).padEnd(colWidths.score),
                String(p.tableCount).padEnd(colWidths.tables),
                `${(p.avgConfidence * 100).toFixed(0)}%`.padEnd(colWidths.conf),
                // A page without a status renders as an empty cell, as in CSV,
                // instead of throwing.
                String(p.status ?? '').padEnd(colWidths.status),
                // Only the first three flags are shown to keep rows short.
                p.flags.slice(0, 3).join(', '),
            ].join(' '));
            return [header, divider, ...tableRows].join('\n');
        }
    }
}
|
|
438
|
+
/**
 * Render review statistics (the shape returned by `computeStats`) for output.
 *
 * @param {object} stats - Statistics object.
 * @param {string} [format='table'] - 'json' for pretty-printed JSON; any other
 *   value produces the plain-text summary.
 * @returns {string} The formatted text.
 */
export function formatStats(stats, format = 'table') {
    if (format === 'json') {
        return JSON.stringify(stats, null, 2);
    }
    // With no pages the share is reported as 0.0% rather than "NaN%".
    const tableSharePct = stats.totalPages > 0
        ? (stats.pagesWithTables / stats.totalPages) * 100
        : 0;
    const lines = [
        `Total pages: ${stats.totalPages}`,
        `Pages with tables: ${stats.pagesWithTables} (${tableSharePct.toFixed(1)}%)`,
        `Total tables: ${stats.totalTables}`,
        `Average confidence: ${(stats.avgConfidence * 100).toFixed(1)}%`,
        `Low confidence pages: ${stats.lowConfidencePages}`,
        `Pages with coverage gaps: ${stats.pagesWithGaps}`,
        '',
        'By Status:',
        ...Object.entries(stats.byStatus).map(([k, v]) => ` ${k}: ${v}`),
        '',
        'Review Priority:',
        ` High (score >= 50): ${stats.byPriority.high} pages`,
        ` Medium (score 20-49): ${stats.byPriority.medium} pages`,
        ` Low (score < 20): ${stats.byPriority.low} pages`,
        '',
        `Estimated review: ${stats.estimatedReviewTime.pages} pages, ~${stats.estimatedReviewTime.minutes} minutes`,
    ];
    return lines.join('\n');
}
|
|
465
|
+
// ============================================================================
|
|
466
|
+
// Financial Verification Strategies
|
|
467
|
+
// ============================================================================
|
|
468
|
+
/**
 * Confidence cut-offs (as fractions of 1) used to route a result into an
 * action tier. `getConfidenceTier` compares with `>=`, so a value exactly on a
 * boundary belongs to the higher tier.
 */
export const CONFIDENCE_TIERS = {
    // At or above this: auto-approve.
    AUTO_APPROVE: 0.95,
    // At or above this (and below AUTO_APPROVE): spot-check.
    SPOT_CHECK: 0.7,
    // Below this: mandatory manual verification. Same value as SPOT_CHECK.
    MANUAL_REVIEW: 0.7,
};
|
|
477
|
+
/**
 * Map a confidence value to its action tier.
 *
 * @param {number} confidence - Confidence as a fraction of 1.
 * @returns {'auto_approve'|'spot_check'|'manual_review'} The tier; values that
 *   reach neither cut-off (including NaN) fall through to 'manual_review'.
 */
export function getConfidenceTier(confidence) {
    const { AUTO_APPROVE, SPOT_CHECK } = CONFIDENCE_TIERS;
    const reachesAutoApprove = confidence >= AUTO_APPROVE;
    const reachesSpotCheck = confidence >= SPOT_CHECK;
    if (!reachesAutoApprove && !reachesSpotCheck) {
        return 'manual_review';
    }
    return reachesAutoApprove ? 'auto_approve' : 'spot_check';
}
|
|
487
|
+
/**
 * Split pages into buckets by the confidence tier of their `avgConfidence`.
 *
 * @param {Iterable<object>} pages - Pages exposing `avgConfidence`.
 * @returns {{auto_approve: object[], spot_check: object[], manual_review: object[]}}
 *   The pages of each tier, in their original relative order.
 */
export function categorizeByConfidence(pages) {
    const auto_approve = [];
    const spot_check = [];
    const manual_review = [];
    const buckets = { auto_approve, spot_check, manual_review };
    for (const entry of pages) {
        buckets[getConfidenceTier(entry.avgConfidence)].push(entry);
    }
    return buckets;
}
|
|
502
|
+
/**
 * Build a check-sum result from a reported value and the value the formula
 * predicts.
 *
 * The deviation is expressed relative to `primaryBase`. When that is zero the
 * magnitude of `secondaryBase` is used instead, so a mismatch against a
 * zero/missing figure is reported as a failure rather than as 0% off.
 *
 * @param {number} actual - Reported value.
 * @param {number} expected - Value implied by the formula.
 * @param {number} primaryBase - Preferred denominator for the percentage.
 * @param {number} secondaryBase - Denominator used when the primary is zero.
 * @returns {{passed: boolean, expected: number, actual: number, difference: number, tolerancePct: number}}
 */
function summarizeChecksum(actual, expected, primaryBase, secondaryBase) {
    const difference = Math.abs(actual - expected);
    const base = Math.abs(primaryBase) > 0 ? Math.abs(primaryBase) : Math.abs(secondaryBase);
    // Both bases zero means actual and expected are both zero: a perfect match.
    const tolerancePct = base > 0 ? (difference / base) * 100 : 0;
    return {
        passed: tolerancePct < 1, // 1% tolerance for rounding
        expected,
        actual,
        difference,
        tolerancePct,
    };
}
/**
 * Common financial check-sum rules.
 *
 * Each rule lists the `fields` it involves and a `validate(values)` function
 * that treats missing or falsy fields as 0 and returns
 * `{ passed, expected, actual, difference, tolerancePct }`.
 */
export const FINANCIAL_CHECKSUMS = [
    {
        name: 'Balance Sheet Identity',
        formula: 'Total Assets = Total Liabilities + Equity',
        fields: ['total_assets', 'total_liabilities', 'equity', 'shareholders_equity'],
        validate: (v) => {
            const assets = v.total_assets || 0;
            const liabilities = v.total_liabilities || 0;
            // Either name for equity is accepted; `equity` takes precedence.
            const equity = v.equity || v.shareholders_equity || 0;
            const expected = liabilities + equity;
            // Deviation is measured relative to the reported assets.
            return summarizeChecksum(assets, expected, assets, expected);
        },
    },
    {
        name: 'Net Income Check',
        formula: 'Net Income = Revenue - Expenses',
        fields: ['net_income', 'total_revenue', 'total_expenses'],
        validate: (v) => {
            const netIncome = v.net_income || 0;
            const revenue = v.total_revenue || 0;
            const expenses = v.total_expenses || 0;
            const expected = revenue - expenses;
            // Deviation is measured relative to the expected figure.
            return summarizeChecksum(netIncome, expected, expected, netIncome);
        },
    },
    {
        name: 'Working Capital',
        formula: 'Working Capital = Current Assets - Current Liabilities',
        fields: ['working_capital', 'current_assets', 'current_liabilities'],
        validate: (v) => {
            const workingCapital = v.working_capital || 0;
            const currentAssets = v.current_assets || 0;
            const currentLiabilities = v.current_liabilities || 0;
            const expected = currentAssets - currentLiabilities;
            // Deviation is measured relative to the expected figure.
            return summarizeChecksum(workingCapital, expected, expected, workingCapital);
        },
    },
];
|
|
567
|
+
/**
 * Apply every check-sum rule for which enough data is available.
 *
 * A rule is evaluated only when at least two of its `fields` are present
 * (not `undefined`) in `values`; other rules are skipped.
 *
 * @param {object} values - Extracted values keyed by field name.
 * @param {Iterable<object>} [rules=FINANCIAL_CHECKSUMS] - Rules exposing
 *   `fields` and `validate(values)`.
 * @returns {Array<{rule: object, result: object}>} One entry per evaluated
 *   rule, in rule order.
 */
export function runCheckSums(values, rules = FINANCIAL_CHECKSUMS) {
    const hasEnoughFields = (candidate) => {
        const available = candidate.fields.filter(name => values[name] !== undefined);
        return available.length >= 2;
    };
    // flatMap keeps the per-rule order of "check, then validate" and drops
    // skipped rules by contributing an empty array.
    return [...rules].flatMap(candidate => hasEnoughFields(candidate)
        ? [{ rule: candidate, result: candidate.validate(values) }]
        : []);
}
|
|
584
|
+
/**
 * Flag current values that are statistical outliers against historical data,
 * and guess at a likely OCR cause (decimal point shift or sign error).
 *
 * For each field in `currentValues` the historical mean and population
 * standard deviation are computed from the finite numeric values that field
 * has in `historicalValues`. Fields with fewer than two such values are
 * skipped. When the historical values are all identical (stdDev 0) the z-score
 * is defined as 0, so such fields are never flagged.
 *
 * @param {object} currentValues - Current values keyed by field name.
 * @param {object[]} historicalValues - Earlier records keyed the same way.
 * @param {number} [zScoreThreshold=3.0] - Minimum |z| (exclusive) to flag.
 * @returns {object[]} One entry per anomalous field: `field`, `currentValue`,
 *   `historicalMean`, `historicalStdDev`, `zScore`, `isAnomaly`, `possibleCause`.
 */
export function detectAnomalies(currentValues, historicalValues, zScoreThreshold = 3.0) {
    const anomalies = [];
    if (historicalValues.length < 2) {
        return anomalies;
    }
    // Scale factors that would indicate a shifted decimal point.
    const decimalShiftRatios = [10, 100, 1000, 0.1, 0.01, 0.001];
    for (const [field, currentValue] of Object.entries(currentValues)) {
        // Keep only real numbers: `null`, strings or NaN in the history used
        // to be folded into the sums and skewed the mean and spread.
        const historicalData = historicalValues
            .map(h => h[field])
            .filter(v => typeof v === 'number' && Number.isFinite(v));
        if (historicalData.length < 2) {
            continue;
        }
        const mean = historicalData.reduce((a, b) => a + b, 0) / historicalData.length;
        const variance = historicalData.reduce((sum, val) => sum + (val - mean) ** 2, 0) / historicalData.length;
        const stdDev = Math.sqrt(variance);
        // Distance from the mean in standard deviations (0 when there is no spread).
        const distanceFromMean = (value) => (stdDev > 0 ? Math.abs(value - mean) / stdDev : 0);
        const zScore = distanceFromMean(currentValue);
        if (!(zScore > zScoreThreshold)) {
            continue;
        }
        // From here on stdDev > 0, otherwise zScore would have been 0.
        let possibleCause = 'Unexpected value';
        // Decimal point error: rescaling by a power of ten lands near the mean.
        const matchingRatio = decimalShiftRatios.find(ratio => distanceFromMean(currentValue / ratio) < 1);
        if (matchingRatio !== undefined) {
            possibleCause = `Possible decimal point error (value appears to be ${matchingRatio}x expected)`;
        }
        // Sign error: flipping the sign lands near the mean. Checked last so it
        // takes precedence over the decimal-shift explanation.
        if (distanceFromMean(-currentValue) < 1) {
            possibleCause = 'Possible sign error (negative vs positive)';
        }
        anomalies.push({
            field,
            currentValue,
            historicalMean: mean,
            historicalStdDev: stdDev,
            zScore,
            isAnomaly: true,
            possibleCause,
        });
    }
    return anomalies;
}
|
|
633
|
+
/**
 * Total-versus-components rules used by `validateCrossQuery`.
 *
 * Each rule names a total field, the component fields expected to add up to
 * it, and a `tolerance` — the largest percentage difference (compared with
 * `<=`) that still passes. Every rule here uses a tolerance of 2.
 */
export const CROSS_QUERY_RULES = [
    ['Total Assets Components', 'total_assets', ['current_assets', 'non_current_assets', 'fixed_assets', 'intangible_assets']],
    ['Total Revenue Components', 'total_revenue', ['product_revenue', 'service_revenue', 'other_revenue']],
    ['Total Liabilities Components', 'total_liabilities', ['current_liabilities', 'non_current_liabilities', 'long_term_debt']],
].map(([name, totalField, componentFields]) => ({
    name,
    totalField,
    componentFields,
    tolerance: 2,
}));
|
|
653
|
+
/**
 * Validate that total fields match the sum of their components.
 *
 * A rule is skipped when its total is `undefined` or none of its components
 * are present. Only the components that are present are summed.
 *
 * @param {object} values - Extracted values keyed by field name.
 * @param {Iterable<object>} [rules=CROSS_QUERY_RULES] - Rules exposing
 *   `totalField`, `componentFields` and `tolerance` (a percentage).
 * @returns {object[]} One entry per evaluated rule: `rule`, `passed`,
 *   `totalValue`, `componentSum`, `difference`, `differencePercent`,
 *   `presentComponents`.
 */
export function validateCrossQuery(values, rules = CROSS_QUERY_RULES) {
    const results = [];
    for (const rule of rules) {
        const totalValue = values[rule.totalField];
        if (totalValue === undefined) {
            continue;
        }
        const presentComponents = rule.componentFields.filter(f => values[f] !== undefined);
        if (presentComponents.length === 0) {
            continue;
        }
        const componentSum = presentComponents.reduce((sum, f) => sum + (values[f] || 0), 0);
        const difference = Math.abs(totalValue - componentSum);
        // Measure relative to the total's magnitude; when the total is zero use
        // the component sum instead. A zero or negative total used to yield 0%
        // and therefore always passed, whatever the components said.
        const base = Math.abs(totalValue) > 0 ? Math.abs(totalValue) : Math.abs(componentSum);
        const differencePercent = base > 0 ? (difference / base) * 100 : 0;
        results.push({
            rule,
            passed: differencePercent <= rule.tolerance,
            totalValue,
            componentSum,
            difference,
            differencePercent,
            presentComponents,
        });
    }
    return results;
}
|
|
680
|
+
/**
 * Run every financial verification and fold the outcomes into one report.
 *
 * Combines the confidence tier, check-sum rules, anomaly detection and
 * cross-query validation. `overallStatus` is 'pass' when no issue was
 * recorded; 'fail' when the tier is manual review, a check-sum failed or an
 * anomaly was found; otherwise 'warning' (issues from cross-queries only).
 *
 * @param {object} extractedValues - Extracted values keyed by field name.
 * @param {number} confidence - OCR confidence as a fraction of 1.
 * @param {object[]} [historicalValues=[]] - Earlier records for anomaly detection.
 * @returns {object} `{ confidenceTier, checkSums, anomalies, crossQueries,
 *   overallStatus, issues, recommendations }`.
 */
export function runFinancialVerification(extractedValues, confidence, historicalValues = []) {
    const confidenceTier = getConfidenceTier(confidence);
    const checkSums = runCheckSums(extractedValues);
    const anomalies = detectAnomalies(extractedValues, historicalValues);
    const crossQueries = validateCrossQuery(extractedValues);
    const issues = [];
    const recommendations = [];
    // Confidence tier: manual review is an issue, spot-check only a recommendation.
    const confidencePct = (confidence * 100).toFixed(1);
    switch (confidenceTier) {
        case 'manual_review':
            issues.push(`Low OCR confidence (${confidencePct}%) - mandatory manual verification required`);
            recommendations.push('Review original document alongside extracted data');
            break;
        case 'spot_check':
            recommendations.push(`Moderate OCR confidence (${confidencePct}%) - spot-check recommended`);
            break;
        default:
            break;
    }
    // Check-sums: report each rule that did not hold.
    const failedCheckSums = checkSums.filter(entry => !entry.result.passed);
    failedCheckSums.forEach(({ rule, result }) => {
        issues.push(`${rule.name} failed: expected ${result.expected.toLocaleString()}, got ${result.actual.toLocaleString()} (${result.tolerancePct.toFixed(2)}% off)`);
        recommendations.push(`Verify ${rule.formula}`);
    });
    // Anomalies: every detected anomaly is an issue.
    anomalies.forEach((anomaly) => {
        issues.push(`Anomaly detected in "${anomaly.field}": ${anomaly.possibleCause}`);
        recommendations.push(`Compare "${anomaly.field}" value (${anomaly.currentValue.toLocaleString()}) with source document`);
    });
    // Cross-queries: report totals that disagree with their components.
    crossQueries
        .filter(cq => !cq.passed)
        .forEach((cq) => {
            issues.push(`${cq.rule.name}: total (${cq.totalValue.toLocaleString()}) doesn't match component sum (${cq.componentSum.toLocaleString()})`);
            recommendations.push(`Verify components: ${cq.presentComponents.join(', ')}`);
        });
    // Overall status: hard failures outrank cross-query-only warnings.
    const hasHardFailure = confidenceTier === 'manual_review' ||
        failedCheckSums.length > 0 ||
        anomalies.length > 0;
    let overallStatus = 'warning';
    if (issues.length === 0) {
        overallStatus = 'pass';
    }
    else if (hasHardFailure) {
        overallStatus = 'fail';
    }
    return {
        confidenceTier,
        checkSums,
        anomalies,
        crossQueries,
        overallStatus,
        issues,
        recommendations,
    };
}
|
|
738
|
+
/**
 * Render a verification report (the shape returned by
 * `runFinancialVerification`) as plain text for display.
 *
 * Sections for check-sums, anomalies, cross-queries, issues and
 * recommendations are emitted only when they have entries. Each section
 * except the last (recommendations) is followed by a blank line.
 *
 * @param {object} report - Verification report.
 * @returns {string} Multi-line text.
 */
export function formatVerificationReport(report) {
    const mark = (passed) => (passed ? '✓' : '✗');
    // Wrap a section body with its title and trailing blank line, or emit
    // nothing at all when the section has no entries.
    const section = (title, entries, renderEntry, trailingBlank = true) => {
        if (!(entries.length > 0)) {
            return [];
        }
        const body = [];
        for (const entry of entries) {
            body.push(...renderEntry(entry));
        }
        return trailingBlank ? [title, ...body, ''] : [title, ...body];
    };
    const banner = [
        '═══════════════════════════════════════════════════════════',
        ' FINANCIAL VERIFICATION REPORT ',
        '═══════════════════════════════════════════════════════════',
        '',
        `Overall Status: ${report.overallStatus.toUpperCase()}`,
        `Confidence Tier: ${report.confidenceTier.replace('_', ' ').toUpperCase()}`,
        '',
    ];
    const checkSumLines = section('Check-Sum Validations:', report.checkSums, ({ rule, result }) => {
        const rendered = [` ${mark(result.passed)} ${rule.name}`];
        if (!result.passed) {
            rendered.push(` Expected: ${result.expected.toLocaleString()}, Actual: ${result.actual.toLocaleString()}`);
        }
        return rendered;
    });
    const anomalyLines = section('Anomalies Detected:', report.anomalies, (anomaly) => [
        ` ⚠ ${anomaly.field}: ${anomaly.possibleCause}`,
        ` Value: ${anomaly.currentValue.toLocaleString()}, Expected ~${anomaly.historicalMean.toLocaleString()}`,
    ]);
    const crossQueryLines = section('Cross-Query Validations:', report.crossQueries, (cq) => {
        const rendered = [` ${mark(cq.passed)} ${cq.rule.name}`];
        if (!cq.passed) {
            rendered.push(` Total: ${cq.totalValue.toLocaleString()}, Sum: ${cq.componentSum.toLocaleString()} (${cq.differencePercent.toFixed(1)}% off)`);
        }
        return rendered;
    });
    const issueLines = section('Issues:', report.issues, (issue) => [` • ${issue}`]);
    // Recommendations close the report, so no blank line follows them.
    const recommendationLines = section('Recommendations:', report.recommendations, (rec) => [` → ${rec}`], false);
    return [
        ...banner,
        ...checkSumLines,
        ...anomalyLines,
        ...crossQueryLines,
        ...issueLines,
        ...recommendationLines,
    ].join('\n');
}
|
|
796
|
+
//# sourceMappingURL=scorer.js.map
|