@okrapdf/cli 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,796 @@
1
+ /**
2
+ * OCR Review Scorer - Deterministic scoring for prioritizing page review
3
+ *
4
+ * Composable scoring strategies inspired by document understanding research:
5
+ * - Direct scoring: page-level metrics only
6
+ * - Structure-aware: considers table density and layout
7
+ * - Comparison-based: OCR layer vs rendered markdown
8
+ *
9
+ * Financial verification strategies from industry best practices:
10
+ * - Cross-querying: verify related fields match (e.g., Assets = Liabilities + Equity)
11
+ * - Check-sum logic: validate mathematical relationships between cells
12
+ * - Confidence filtering: auto-approve >95%, spot-check 70-95%, manual <70%
13
+ * - Anomaly detection: flag statistical outliers (e.g., decimal point errors)
14
+ *
15
+ * @see https://arxiv.org/html/2510.10138v1
16
+ */
17
+ // ============================================================================
18
+ // Default Configuration
19
+ // ============================================================================
20
/**
 * Baseline scorer configuration.
 *
 * `weights` are the point values each scoring component contributes to a
 * page's review-priority score; `thresholds` are the cut-offs used when
 * deciding between auto-approval and mandatory review.
 */
export const DEFAULT_CONFIG = {
  weights: {
    // Points per table on page
    tableCount: 10,
    // Points for low confidence (scaled by 1-conf)
    inverseConfidence: 50,
    // Points if page has coverage gaps
    coverageGap: 25,
    // Points per flagged entity
    flaggedEntity: 30,
    // Points for structural issues
    structurePenalty: 20,
    // Points for OCR/markdown mismatch
    comparisonDelta: 15,
  },
  thresholds: {
    // Minimum average confidence for a page to be auto-approvable.
    autoApproveConfidence: 0.95,
    // Pages below this average confidence are treated as low confidence.
    requireReviewConfidence: 0.7,
    // Largest table count a page may have and still be auto-approved.
    maxTablesForAutoApprove: 0,
    // NOTE(review): not read anywhere in this file — presumably consumed by a caller.
    minOcrLinesForContent: 5,
  },
};
36
+ // ============================================================================
37
+ // Core Scorer Class
38
+ // ============================================================================
39
/**
 * Deterministic review-priority scorer for OCR'd pages.
 *
 * Table metadata is loaded once via `loadTables`; every other method combines
 * that per-page table index with the page summaries passed in.
 */
export class OcrReviewScorer {
  /** Effective configuration: DEFAULT_CONFIG overlaid with constructor overrides. */
  config;
  /** Map from `page_number` to the array of tables loaded for that page. */
  tablesByPage = new Map();

  /**
   * @param {object} [config] - Optional partial `weights` / `thresholds` overrides.
   */
  constructor(config = {}) {
    this.config = {
      weights: { ...DEFAULT_CONFIG.weights, ...config.weights },
      thresholds: { ...DEFAULT_CONFIG.thresholds, ...config.thresholds },
    };
  }

  /**
   * Replace the table index with `tables`, grouped by `table.page_number`.
   * @param {Iterable<object>} tables
   */
  loadTables(tables) {
    this.tablesByPage.clear();
    for (const table of tables) {
      const bucket = this.tablesByPage.get(table.page_number);
      if (bucket) {
        bucket.push(table);
      } else {
        this.tablesByPage.set(table.page_number, [table]);
      }
    }
  }

  /**
   * Score a single page. Higher score means higher review priority.
   * @param {object} page - Page summary (`page`, `avgConfidence`, `hasCoverageGaps`,
   *   `flagged`, `total`, `verified`, `status`, `isStale`).
   * @returns {{page: *, score: number, breakdown: object, tableCount: number,
   *   avgConfidence: number, status: *, flags: string[]}}
   */
  scorePage(page) {
    const tables = this.tablesByPage.get(page.page) || [];
    const tableCount = tables.length;
    const { weights, thresholds } = this.config;

    // Share of entities already verified; a page with no entities counts as fully verified.
    const verificationRatio = page.total > 0 ? page.verified / page.total : 1;

    const breakdown = {
      tableScore: tableCount * weights.tableCount,
      confidenceScore: (1 - page.avgConfidence) * weights.inverseConfidence,
      coverageScore: page.hasCoverageGaps ? weights.coverageGap : 0,
      flaggedScore: page.flagged * weights.flaggedEntity,
      structureScore: page.total > 0 ? (1 - verificationRatio) * weights.structurePenalty : 0,
      // Placeholder: the OCR/markdown comparison is computed separately and is
      // not part of the total below.
      comparisonScore: 0,
    };
    const score =
      breakdown.tableScore +
      breakdown.confidenceScore +
      breakdown.coverageScore +
      breakdown.flaggedScore +
      breakdown.structureScore;

    // Flags allow quick filtering without re-deriving the conditions.
    const flags = [];
    if (tableCount > 0) flags.push('has_tables');
    if (page.avgConfidence < thresholds.requireReviewConfidence) flags.push('low_confidence');
    if (page.hasCoverageGaps) flags.push('coverage_gaps');
    if (page.flagged > 0) flags.push('has_flagged');
    if (page.isStale) flags.push('stale');
    if (tables.some((t) => t.verification_status === 'flagged')) flags.push('flagged_table');

    return {
      page: page.page,
      score,
      breakdown,
      tableCount,
      avgConfidence: page.avgConfidence,
      status: page.status,
      flags,
    };
  }

  /**
   * Score every page and return the results sorted by descending score.
   * @param {object[]} pages
   */
  scoreAll(pages) {
    return pages.map((p) => this.scorePage(p)).sort((a, b) => b.score - a.score);
  }

  /**
   * Keep only the pages matching every criterion present in `options`.
   * @param {object[]} pages
   * @param {object} [options] - `status` (string or array), `minConfidence`,
   *   `maxConfidence`, `minTables`, `maxTables`, `hasGaps`, `hasTables`, `isStale`.
   *   Omitting it returns all pages.
   */
  filter(pages, options = {}) {
    return pages.filter((page) => {
      const tableCount = (this.tablesByPage.get(page.page) || []).length;
      if (options.status) {
        const statuses = Array.isArray(options.status) ? options.status : [options.status];
        if (!statuses.includes(page.status)) return false;
      }
      if (options.minConfidence !== undefined && page.avgConfidence < options.minConfidence) return false;
      if (options.maxConfidence !== undefined && page.avgConfidence > options.maxConfidence) return false;
      if (options.minTables !== undefined && tableCount < options.minTables) return false;
      if (options.maxTables !== undefined && tableCount > options.maxTables) return false;
      if (options.hasGaps === true && !page.hasCoverageGaps) return false;
      if (options.hasGaps === false && page.hasCoverageGaps) return false;
      if (options.hasTables === true && tableCount === 0) return false;
      if (options.hasTables === false && tableCount > 0) return false;
      if (options.isStale !== undefined && page.isStale !== options.isStale) return false;
      return true;
    });
  }

  /**
   * Pages that satisfy every auto-approval condition: high confidence, few
   * enough tables, still pending, no coverage gaps and nothing flagged.
   * @param {object[]} pages
   */
  getAutoApprovable(pages) {
    const { thresholds } = this.config;
    return pages.filter((page) => {
      const tableCount = (this.tablesByPage.get(page.page) || []).length;
      return (
        page.avgConfidence >= thresholds.autoApproveConfidence &&
        tableCount <= thresholds.maxTablesForAutoApprove &&
        page.status === 'pending' &&
        !page.hasCoverageGaps &&
        page.flagged === 0
      );
    });
  }

  /**
   * Pages that need a human: low confidence, any table, coverage gaps or
   * flagged entities. Returned scored and sorted by descending priority.
   * @param {object[]} pages
   */
  getRequireReview(pages) {
    const { thresholds } = this.config;
    const needsReview = pages.filter((page) => {
      const tableCount = (this.tablesByPage.get(page.page) || []).length;
      return (
        page.avgConfidence < thresholds.requireReviewConfidence ||
        tableCount > 0 ||
        page.hasCoverageGaps ||
        page.flagged > 0
      );
    });
    return this.scoreAll(needsReview);
  }

  /**
   * Aggregate review statistics for `pages`.
   *
   * Note: `totalTables` counts every table loaded via `loadTables`, not only
   * the tables on the pages passed in.
   * @param {object[]} pages
   */
  computeStats(pages) {
    const HIGH_THRESHOLD = 50;
    const MEDIUM_THRESHOLD = 20;
    const { requireReviewConfidence } = this.config.thresholds;

    // Single pass over the scored pages; a NaN score lands in no bucket,
    // matching what three independent range checks would produce.
    const byPriority = { high: 0, medium: 0, low: 0 };
    for (const { score } of this.scoreAll(pages)) {
      if (score >= HIGH_THRESHOLD) {
        byPriority.high += 1;
      } else if (score >= MEDIUM_THRESHOLD) {
        byPriority.medium += 1;
      } else if (score < MEDIUM_THRESHOLD) {
        byPriority.low += 1;
      }
    }

    let totalTables = 0;
    for (const tables of this.tablesByPage.values()) {
      totalTables += tables.length;
    }

    const byStatus = {};
    for (const page of pages) {
      byStatus[page.status] = (byStatus[page.status] || 0) + 1;
    }

    const avgConfidence =
      pages.length > 0 ? pages.reduce((sum, p) => sum + p.avgConfidence, 0) / pages.length : 0;

    return {
      totalPages: pages.length,
      pagesWithTables: pages.filter((p) => (this.tablesByPage.get(p.page) || []).length > 0).length,
      totalTables,
      avgConfidence,
      lowConfidencePages: pages.filter((p) => p.avgConfidence < requireReviewConfidence).length,
      pagesWithGaps: pages.filter((p) => p.hasCoverageGaps).length,
      byStatus,
      byPriority,
      estimatedReviewTime: {
        pages: byPriority.high + byPriority.medium,
        // Estimate 2 min per high priority, 1 min per medium, 0 for low (auto-approve)
        minutes: byPriority.high * 2 + byPriority.medium * 1,
      },
    };
  }
}
232
+ // ============================================================================
233
+ // OCR vs Markdown Comparison (Composable Strategy)
234
+ // ============================================================================
235
/**
 * Compare a page's OCR text blocks against its rendered markdown.
 *
 * Produces character counts for both sides, their absolute and relative
 * difference, a similarity figure between the position-ordered OCR text and
 * the markdown with its syntax characters stripped, a table-structure figure,
 * a list of flags and a composite score (0 = perfect match, lower is better).
 *
 * @param {Array<{text: string, bbox?: {x: number, y: number}}>} ocrBlocks
 * @param {string} markdownContent
 * @returns {object} Comparison result; `page` is always 0 and is meant to be
 *   filled in by the caller.
 */
export function compareOcrToMarkdown(ocrBlocks, markdownContent) {
  const countNonSpace = (text) => text.replace(/\s+/g, '').length;
  const joinText = (blocks) => blocks.map((b) => b.text).join(' ');

  const ocrCharCount = countNonSpace(joinText(ocrBlocks));

  // Strip markdown syntax characters, then normalise whitespace.
  const cleanMarkdown = markdownContent
    .replace(/[#*_\[\]()|\-]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  const markdownCharCount = countNonSpace(cleanMarkdown);

  const delta = Math.abs(ocrCharCount - markdownCharCount);
  // The trailing 1 keeps the denominator non-zero when both sides are empty.
  const deltaPct = delta / Math.max(ocrCharCount, markdownCharCount, 1);

  // Order blocks top-to-bottom, then left-to-right; blocks lacking a bbox
  // compare as equal to everything.
  const byPosition = (a, b) => {
    if (!a.bbox || !b.bbox) {
      return 0;
    }
    const dy = a.bbox.y - b.bbox.y;
    return dy || a.bbox.x - b.bbox.x;
  };
  const spatialIntegrity = computeStringSimilarity(joinText([...ocrBlocks].sort(byPosition)), cleanMarkdown);

  const tableStructureScore = computeTableStructureScore(ocrBlocks, markdownContent);

  const flags = [
    [deltaPct > 0.2, 'high_char_delta'],
    [spatialIntegrity < 0.7, 'spatial_mismatch'],
    [tableStructureScore < 0.5, 'table_structure_issue'],
    [ocrCharCount === 0 && markdownCharCount > 0, 'missing_ocr'],
    [markdownCharCount === 0 && ocrCharCount > 0, 'missing_markdown'],
  ]
    .filter(([raised]) => raised)
    .map(([, label]) => label);

  // Weighted blend: 40 for character delta, 35 for spatial mismatch, 25 for table structure.
  const score = (deltaPct * 40) + ((1 - spatialIntegrity) * 35) + ((1 - tableStructureScore) * 25);

  return {
    page: 0, // Caller should set
    ocrCharCount,
    markdownCharCount,
    delta,
    deltaPct,
    spatialIntegrity,
    tableStructureScore,
    flags,
    score,
  };
}
290
/**
 * Case-insensitive string similarity: the Dice coefficient over the sets of
 * distinct character bigrams of each input.
 *
 * Strings too short to have a bigram are compared for case-insensitive
 * equality, so short and long inputs follow the same case rule.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} A value in [0, 1]; 1 when both strings are empty.
 */
function computeStringSimilarity(a, b) {
  if (a.length === 0 && b.length === 0) {
    return 1;
  }
  if (a.length < 2 || b.length < 2) {
    return a.toLowerCase() === b.toLowerCase() ? 1 : 0;
  }

  // Distinct lower-cased two-character windows of `text`.
  const bigramSet = (text) => {
    const grams = new Set();
    for (let i = 0; i < text.length - 1; i++) {
      grams.add(text.slice(i, i + 2).toLowerCase());
    }
    return grams;
  };

  const bigramsA = bigramSet(a);
  const bigramsB = bigramSet(b);
  let intersection = 0;
  for (const gram of bigramsA) {
    if (bigramsB.has(gram)) {
      intersection++;
    }
  }
  // Both sets are non-empty here because each string has length >= 2.
  return (2 * intersection) / (bigramsA.size + bigramsB.size);
}
313
/**
 * Heuristic score for how well table structure is represented.
 *
 * Returns 1 when the markdown contains no `|...|` pattern (nothing to
 * evaluate). Otherwise starts from 0.5 and adds a bonus for each indicator
 * present, capped at 1. The result is therefore never below 0.5.
 *
 * @param {Array<{bbox?: {x: number}}>} ocrBlocks
 * @param {string} markdown
 * @returns {number} A value in [0.5, 1].
 */
function computeTableStructureScore(ocrBlocks, markdown) {
  const hasMarkdownTable = /\|.*\|/.test(markdown);
  if (!hasMarkdownTable) {
    return 1; // No table to evaluate
  }

  const pipeCount = (markdown.match(/\|/g) || []).length;
  const dashCount = (markdown.match(/-{3,}/g) || []).length;
  // Treat OCR as columnar when there are more than three blocks and at least
  // one of them starts past x = 100.
  const ocrHasColumns = ocrBlocks.length > 3 && ocrBlocks.some((b) => b.bbox && b.bbox.x > 100);

  let score = 0.5; // Base score
  if (pipeCount > 4) {
    score += 0.2;
  }
  if (dashCount > 0) {
    score += 0.15;
  }
  if (ocrHasColumns) {
    score += 0.15;
  }
  return Math.min(score, 1);
}
337
/**
 * Build an OcrReviewScorer pre-configured for a named strategy.
 *
 * Recognised strategies overlay a weight preset on the default weights;
 * weights supplied by the caller always win over the preset. 'combined' and
 * any unrecognised name pass `config` through unchanged (balanced approach).
 *
 * @param {string} strategy - 'direct' | 'structure' | 'comparison' | 'combined'.
 * @param {object} [config] - Optional `weights` / `thresholds` overrides.
 * @returns {OcrReviewScorer}
 */
export function createScorer(strategy, config = {}) {
  const presets = new Map([
    // Page-level metrics only, no structure analysis
    ['direct', { tableCount: 0, structurePenalty: 0 }],
    // Emphasize table and structure analysis
    ['structure', { tableCount: 20, structurePenalty: 30, inverseConfidence: 30 }],
    // Emphasize OCR vs markdown comparison
    ['comparison', { comparisonDelta: 40, tableCount: 5 }],
  ]);

  const strategyConfig = { ...config };
  const preset = presets.get(strategy);
  if (preset !== undefined) {
    strategyConfig.weights = {
      ...DEFAULT_CONFIG.weights,
      ...preset,
      ...config.weights,
    };
  }
  return new OcrReviewScorer(strategyConfig);
}
378
+ // ============================================================================
379
+ // Utility Functions for CLI Integration
380
+ // ============================================================================
381
/**
 * Parse pages JSON from CLI output.
 *
 * Accepts either a bare JSON array or an object with a `pages` array. Any
 * other valid JSON (including `null` or a non-array `pages`) yields an empty
 * array, so callers can always iterate the result.
 *
 * @param {string} json
 * @returns {Array} The parsed pages.
 * @throws {SyntaxError} If `json` is not valid JSON.
 */
export function parsePagesJson(json) {
  const data = JSON.parse(json);
  if (Array.isArray(data)) {
    return data;
  }
  return Array.isArray(data?.pages) ? data.pages : [];
}
388
/**
 * Parse tables JSON from CLI output.
 *
 * Accepts either a bare JSON array or an object with a `tables` array. Any
 * other valid JSON (including `null` or a non-array `tables`) yields an empty
 * array, so callers can always iterate the result.
 *
 * @param {string} json
 * @returns {Array} The parsed tables.
 * @throws {SyntaxError} If `json` is not valid JSON.
 */
export function parseTablesJson(json) {
  const data = JSON.parse(json);
  if (Array.isArray(data)) {
    return data;
  }
  return Array.isArray(data?.tables) ? data.tables : [];
}
395
/**
 * Render one CSV field, quoting it (and doubling embedded quotes) when it
 * contains a comma, a double quote or a line break.
 */
function toCsvField(value) {
  const text = String(value);
  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
}

/**
 * Format scored pages for output.
 *
 * @param {Array<object>} pages - Results as produced by the scorer (`page`,
 *   `score`, `tableCount`, `avgConfidence`, `status`, `flags`).
 * @param {string} [format='table'] - 'json', 'jsonl', 'csv' or 'table'; any
 *   other value falls back to 'table'.
 * @returns {string}
 */
export function formatScoredPages(pages, format = 'table') {
  switch (format) {
    case 'json':
      return JSON.stringify(pages, null, 2);
    case 'jsonl':
      return pages.map((p) => JSON.stringify(p)).join('\n');
    case 'csv': {
      const headers = ['page', 'score', 'tableCount', 'avgConfidence', 'status', 'flags'];
      const rows = pages.map((p) => [
        p.page,
        p.score.toFixed(2),
        p.tableCount,
        `${(p.avgConfidence * 100).toFixed(1)}%`,
        p.status,
        // Flags share one column, separated by ';'.
        p.flags.join(';'),
      ]);
      // Escape each field so a comma or quote in a value cannot shift columns.
      return [headers.join(','), ...rows.map((r) => r.map(toCsvField).join(','))].join('\n');
    }
    case 'table':
    default: {
      const colWidths = { page: 6, score: 8, tables: 8, conf: 8, status: 12, flags: 30 };
      const header = [
        'Page'.padEnd(colWidths.page),
        'Score'.padEnd(colWidths.score),
        'Tables'.padEnd(colWidths.tables),
        'Conf'.padEnd(colWidths.conf),
        'Status'.padEnd(colWidths.status),
        'Flags',
      ].join(' ');
      const divider = '-'.repeat(header.length);
      const tableRows = pages.map((p) => [
        String(p.page).padEnd(colWidths.page),
        p.score.toFixed(1).padEnd(colWidths.score),
        String(p.tableCount).padEnd(colWidths.tables),
        `${(p.avgConfidence * 100).toFixed(0)}%`.padEnd(colWidths.conf),
        p.status.padEnd(colWidths.status),
        // Only the first three flags are shown to keep rows short.
        p.flags.slice(0, 3).join(', '),
      ].join(' '));
      return [header, divider, ...tableRows].join('\n');
    }
  }
}
438
/**
 * Format review statistics for output.
 *
 * @param {object} stats - Statistics object with `totalPages`,
 *   `pagesWithTables`, `totalTables`, `avgConfidence`, `lowConfidencePages`,
 *   `pagesWithGaps`, `byStatus`, `byPriority` and `estimatedReviewTime`.
 * @param {string} [format='table'] - 'json' for pretty-printed JSON; anything
 *   else produces the plain-text summary.
 * @returns {string}
 */
export function formatStats(stats, format = 'table') {
  if (format === 'json') {
    return JSON.stringify(stats, null, 2);
  }
  // With zero pages the share would be 0/0; report 0% rather than "NaN%".
  const tablePagePct = stats.totalPages > 0 ? (stats.pagesWithTables / stats.totalPages) * 100 : 0;
  const lines = [
    `Total pages: ${stats.totalPages}`,
    `Pages with tables: ${stats.pagesWithTables} (${tablePagePct.toFixed(1)}%)`,
    `Total tables: ${stats.totalTables}`,
    `Average confidence: ${(stats.avgConfidence * 100).toFixed(1)}%`,
    `Low confidence pages: ${stats.lowConfidencePages}`,
    `Pages with coverage gaps: ${stats.pagesWithGaps}`,
    '',
    'By Status:',
    ...Object.entries(stats.byStatus).map(([k, v]) => ` ${k}: ${v}`),
    '',
    'Review Priority:',
    ` High (score >= 50): ${stats.byPriority.high} pages`,
    ` Medium (score 20-49): ${stats.byPriority.medium} pages`,
    ` Low (score < 20): ${stats.byPriority.low} pages`,
    '',
    `Estimated review: ${stats.estimatedReviewTime.pages} pages, ~${stats.estimatedReviewTime.minutes} minutes`,
  ];
  return lines.join('\n');
}
465
+ // ============================================================================
466
+ // Financial Verification Strategies
467
+ // ============================================================================
468
/**
 * Confidence cut-offs (fractions in [0, 1]) that drive the review action
 * tiers, based on industry best practices for financial document
 * verification.
 *
 * SPOT_CHECK and MANUAL_REVIEW share one boundary: at or above it a value is
 * spot-checked (unless it reaches AUTO_APPROVE); below it manual
 * verification is mandatory.
 */
export const CONFIDENCE_TIERS = {
  // 95% and above: auto-approve
  AUTO_APPROVE: 0.95,
  // 70% up to 95%: spot-check
  SPOT_CHECK: 0.70,
  // Below 70%: mandatory manual verification
  MANUAL_REVIEW: 0.70,
};
477
/**
 * Map a confidence value onto its action tier.
 *
 * @param {number} confidence - Confidence as a fraction (e.g. 0.82).
 * @returns {'auto_approve'|'spot_check'|'manual_review'} 'manual_review' is
 *   also the result for anything that fails both comparisons (e.g. NaN).
 */
export function getConfidenceTier(confidence) {
  const { AUTO_APPROVE, SPOT_CHECK } = CONFIDENCE_TIERS;
  if (confidence >= AUTO_APPROVE) {
    return 'auto_approve';
  }
  return confidence >= SPOT_CHECK ? 'spot_check' : 'manual_review';
}
487
/**
 * Group pages into confidence tiers based on each page's `avgConfidence`.
 *
 * @param {Array<{avgConfidence: number}>} pages
 * @returns {{auto_approve: object[], spot_check: object[], manual_review: object[]}}
 *   All three keys are always present; input order is kept within each tier.
 */
export function categorizeByConfidence(pages) {
  return pages.reduce(
    (buckets, page) => {
      buckets[getConfidenceTier(page.avgConfidence)].push(page);
      return buckets;
    },
    { auto_approve: [], spot_check: [], manual_review: [] },
  );
}
502
/**
 * Build a check-sum result for `actual` versus `expected`.
 *
 * The percentage is measured against `reference`. When the reference is zero
 * the larger magnitude of the two compared values is used instead, so a
 * missing or zero reference can no longer make a large mismatch look like an
 * exact match. Only when both values are zero is the percentage 0.
 */
function buildChecksumResult(actual, expected, reference) {
  const difference = Math.abs(actual - expected);
  const base = Math.abs(reference) || Math.max(Math.abs(actual), Math.abs(expected));
  const tolerancePct = base > 0 ? (difference / base) * 100 : 0;
  return {
    passed: tolerancePct < 1, // 1% tolerance for rounding
    expected,
    actual,
    difference,
    tolerancePct,
  };
}

/**
 * Common financial check-sum rules.
 *
 * Each rule lists the value keys it reads (`fields`) and a `validate(values)`
 * function returning `{ passed, expected, actual, difference, tolerancePct }`.
 * Missing or falsy inputs are treated as 0.
 */
export const FINANCIAL_CHECKSUMS = [
  {
    name: 'Balance Sheet Identity',
    formula: 'Total Assets = Total Liabilities + Equity',
    fields: ['total_assets', 'total_liabilities', 'equity', 'shareholders_equity'],
    validate: (v) => {
      const assets = v.total_assets || 0;
      const liabilities = v.total_liabilities || 0;
      // `shareholders_equity` is accepted as an alias when `equity` is absent or 0.
      const equity = v.equity || v.shareholders_equity || 0;
      // Measured relative to reported total assets.
      return buildChecksumResult(assets, liabilities + equity, assets);
    },
  },
  {
    name: 'Net Income Check',
    formula: 'Net Income = Revenue - Expenses',
    fields: ['net_income', 'total_revenue', 'total_expenses'],
    validate: (v) => {
      const netIncome = v.net_income || 0;
      const expected = (v.total_revenue || 0) - (v.total_expenses || 0);
      // Measured relative to the computed (expected) figure.
      return buildChecksumResult(netIncome, expected, expected);
    },
  },
  {
    name: 'Working Capital',
    formula: 'Working Capital = Current Assets - Current Liabilities',
    fields: ['working_capital', 'current_assets', 'current_liabilities'],
    validate: (v) => {
      const workingCapital = v.working_capital || 0;
      const expected = (v.current_assets || 0) - (v.current_liabilities || 0);
      // Measured relative to the computed (expected) figure.
      return buildChecksumResult(workingCapital, expected, expected);
    },
  },
];
567
/**
 * Run every applicable check-sum rule against the extracted values.
 *
 * A rule applies when at least two of its `fields` are present in `values`
 * (present meaning not `undefined`).
 *
 * @param {object} values - Extracted values keyed by field name.
 * @param {Iterable<object>} [rules=FINANCIAL_CHECKSUMS] - Rules with `fields` and `validate`.
 * @returns {Array<{rule: object, result: object}>} One entry per applicable rule, in rule order.
 */
export function runCheckSums(values, rules = FINANCIAL_CHECKSUMS) {
  const MIN_PRESENT_FIELDS = 2;
  const outcomes = [];
  for (const rule of rules) {
    let present = 0;
    for (const field of rule.fields) {
      if (values[field] !== undefined) {
        present += 1;
      }
    }
    if (present < MIN_PRESENT_FIELDS) {
      continue;
    }
    outcomes.push({ rule, result: rule.validate(values) });
  }
  return outcomes;
}
584
/**
 * Detect anomalies in extracted values using a z-score against history.
 * Flags potential OCR errors such as decimal point shifts and sign flips.
 *
 * Only finite numbers take part: non-numeric history entries (e.g. `null`)
 * are ignored, and a current value that is not a finite number is skipped. A
 * field needs at least two usable historical values. When all historical
 * values are identical the standard deviation is 0 and the z-score is taken
 * as 0, so that field is never flagged with a non-negative threshold.
 *
 * @param {object} currentValues - Field name to current value.
 * @param {object[]} historicalValues - Earlier records with the same field names.
 * @param {number} [zScoreThreshold=3.0] - Flag values whose z-score exceeds this.
 * @returns {Array<{field: string, currentValue: number, historicalMean: number,
 *   historicalStdDev: number, zScore: number, isAnomaly: boolean, possibleCause: string}>}
 */
export function detectAnomalies(currentValues, historicalValues, zScoreThreshold = 3.0) {
  const anomalies = [];
  if (historicalValues.length < 2) {
    return anomalies;
  }
  // Factors tried when probing for a misplaced decimal point.
  const DECIMAL_SHIFT_RATIOS = [10, 100, 1000, 0.1, 0.01, 0.001];

  for (const [field, currentValue] of Object.entries(currentValues)) {
    if (!Number.isFinite(currentValue)) {
      continue;
    }
    const historicalData = historicalValues
      .map((h) => h?.[field])
      .filter((v) => Number.isFinite(v));
    if (historicalData.length < 2) {
      continue;
    }

    // Mean and population standard deviation of the history.
    const mean = historicalData.reduce((a, b) => a + b, 0) / historicalData.length;
    const variance =
      historicalData.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / historicalData.length;
    const stdDev = Math.sqrt(variance);

    const zScore = stdDev > 0 ? Math.abs(currentValue - mean) / stdDev : 0;
    if (zScore <= zScoreThreshold) {
      continue;
    }

    let possibleCause = 'Unexpected value';
    // Decimal point error: dividing by a power of ten brings the value within
    // one standard deviation of the mean.
    for (const ratio of DECIMAL_SHIFT_RATIOS) {
      const adjustedZScore = stdDev > 0 ? Math.abs((currentValue / ratio) - mean) / stdDev : 0;
      if (adjustedZScore < 1) {
        possibleCause = `Possible decimal point error (value appears to be ${ratio}x expected)`;
        break;
      }
    }
    // Sign error: the negated value sits within one standard deviation. This
    // check runs last and takes precedence over the decimal-shift guess.
    if (stdDev > 0 && Math.abs((-currentValue) - mean) / stdDev < 1) {
      possibleCause = 'Possible sign error (negative vs positive)';
    }

    anomalies.push({
      field,
      currentValue,
      historicalMean: mean,
      historicalStdDev: stdDev,
      zScore,
      isAnomaly: true,
      possibleCause,
    });
  }
  return anomalies;
}
633
/**
 * Cross-query rules: each names a total field and the component fields whose
 * sum is compared against it, with a tolerance expressed in percent.
 */
export const CROSS_QUERY_RULES = [
  ['Total Assets Components', 'total_assets', ['current_assets', 'non_current_assets', 'fixed_assets', 'intangible_assets']],
  ['Total Revenue Components', 'total_revenue', ['product_revenue', 'service_revenue', 'other_revenue']],
  ['Total Liabilities Components', 'total_liabilities', ['current_liabilities', 'non_current_liabilities', 'long_term_debt']],
].map(([name, totalField, componentFields]) => ({
  name,
  totalField,
  componentFields,
  // Every built-in rule allows a 2% difference.
  tolerance: 2,
}));
653
/**
 * Validate that total fields match the sum of their components.
 *
 * A rule is skipped when its total is `undefined` or none of its components
 * are present. The difference is measured relative to the magnitude of the
 * total; when the total is 0 the component sum is used as the base, so a zero
 * or negative total cannot mask a mismatch.
 *
 * @param {object} values - Extracted values keyed by field name.
 * @param {Iterable<object>} [rules=CROSS_QUERY_RULES] - Rules with `totalField`,
 *   `componentFields` and `tolerance` (percent).
 * @returns {Array<{rule: object, passed: boolean, totalValue: number, componentSum: number,
 *   difference: number, differencePercent: number, presentComponents: string[]}>}
 */
export function validateCrossQuery(values, rules = CROSS_QUERY_RULES) {
  const results = [];
  for (const rule of rules) {
    const totalValue = values[rule.totalField];
    if (totalValue === undefined) {
      continue;
    }
    const presentComponents = rule.componentFields.filter((f) => values[f] !== undefined);
    if (presentComponents.length === 0) {
      continue;
    }
    // Falsy component values (e.g. null) count as present but add 0.
    const componentSum = presentComponents.reduce((sum, f) => sum + (values[f] || 0), 0);
    const difference = Math.abs(totalValue - componentSum);
    const base = Math.abs(totalValue) || Math.abs(componentSum);
    const differencePercent = base > 0 ? (difference / base) * 100 : 0;
    results.push({
      rule,
      passed: differencePercent <= rule.tolerance,
      totalValue,
      componentSum,
      difference,
      differencePercent,
      presentComponents,
    });
  }
  return results;
}
680
/**
 * Run the full financial verification pipeline on one set of extracted values.
 *
 * Combines the confidence tier, check-sum results, anomaly detection and
 * cross-query validation into a single report with human-readable issues and
 * recommendations. Overall status is 'pass' with no issues; 'fail' when the
 * tier is manual review, any check-sum failed or any anomaly was found;
 * otherwise 'warning'.
 *
 * @param {object} extractedValues - Field name to extracted value.
 * @param {number} confidence - OCR confidence as a fraction.
 * @param {object[]} [historicalValues=[]] - Earlier records for anomaly detection.
 * @returns {{confidenceTier: string, checkSums: object[], anomalies: object[],
 *   crossQueries: object[], overallStatus: string, issues: string[], recommendations: string[]}}
 */
export function runFinancialVerification(extractedValues, confidence, historicalValues = []) {
  const confidenceTier = getConfidenceTier(confidence);
  const checkSums = runCheckSums(extractedValues);
  const anomalies = detectAnomalies(extractedValues, historicalValues);
  const crossQueries = validateCrossQuery(extractedValues);

  const issues = [];
  const recommendations = [];
  const needsManualReview = confidenceTier === 'manual_review';

  // Confidence tier
  switch (confidenceTier) {
    case 'manual_review':
      issues.push(`Low OCR confidence (${(confidence * 100).toFixed(1)}%) - mandatory manual verification required`);
      recommendations.push('Review original document alongside extracted data');
      break;
    case 'spot_check':
      recommendations.push(`Moderate OCR confidence (${(confidence * 100).toFixed(1)}%) - spot-check recommended`);
      break;
    default:
      break;
  }

  // Check-sums
  const failedCheckSums = checkSums.filter(({ result }) => !result.passed);
  failedCheckSums.forEach(({ rule, result }) => {
    issues.push(`${rule.name} failed: expected ${result.expected.toLocaleString()}, got ${result.actual.toLocaleString()} (${result.tolerancePct.toFixed(2)}% off)`);
    recommendations.push(`Verify ${rule.formula}`);
  });

  // Anomalies
  anomalies.forEach((anomaly) => {
    issues.push(`Anomaly detected in "${anomaly.field}": ${anomaly.possibleCause}`);
    recommendations.push(`Compare "${anomaly.field}" value (${anomaly.currentValue.toLocaleString()}) with source document`);
  });

  // Cross-queries
  crossQueries
    .filter((cq) => !cq.passed)
    .forEach((cq) => {
      issues.push(`${cq.rule.name}: total (${cq.totalValue.toLocaleString()}) doesn't match component sum (${cq.componentSum.toLocaleString()})`);
      recommendations.push(`Verify components: ${cq.presentComponents.join(', ')}`);
    });

  // Overall status
  const hasHardFailure = needsManualReview || failedCheckSums.length > 0 || anomalies.length > 0;
  let overallStatus = 'warning';
  if (issues.length === 0) {
    overallStatus = 'pass';
  } else if (hasHardFailure) {
    overallStatus = 'fail';
  }

  return {
    confidenceTier,
    checkSums,
    anomalies,
    crossQueries,
    overallStatus,
    issues,
    recommendations,
  };
}
738
/**
 * Render a verification report as plain text for display.
 *
 * Sections (check-sums, anomalies, cross-queries, issues, recommendations)
 * are emitted only when they have entries; failed validations get an extra
 * detail line. Numbers are formatted with `toLocaleString()`.
 *
 * @param {object} report - Report with `overallStatus`, `confidenceTier`,
 *   `checkSums`, `anomalies`, `crossQueries`, `issues`, `recommendations`.
 * @returns {string}
 */
export function formatVerificationReport(report) {
  const banner = '═══════════════════════════════════════════════════════════';
  const mark = (ok) => (ok ? '✓' : '✗');
  const out = [
    banner,
    ' FINANCIAL VERIFICATION REPORT ',
    banner,
    '',
    `Overall Status: ${report.overallStatus.toUpperCase()}`,
    // Only the first underscore is replaced.
    `Confidence Tier: ${report.confidenceTier.replace('_', ' ').toUpperCase()}`,
    '',
  ];

  if (report.checkSums.length > 0) {
    out.push('Check-Sum Validations:');
    report.checkSums.forEach(({ rule, result }) => {
      out.push(` ${mark(result.passed)} ${rule.name}`);
      if (result.passed) {
        return;
      }
      out.push(` Expected: ${result.expected.toLocaleString()}, Actual: ${result.actual.toLocaleString()}`);
    });
    out.push('');
  }

  if (report.anomalies.length > 0) {
    out.push('Anomalies Detected:');
    report.anomalies.forEach((anomaly) => {
      out.push(
        ` ⚠ ${anomaly.field}: ${anomaly.possibleCause}`,
        ` Value: ${anomaly.currentValue.toLocaleString()}, Expected ~${anomaly.historicalMean.toLocaleString()}`,
      );
    });
    out.push('');
  }

  if (report.crossQueries.length > 0) {
    out.push('Cross-Query Validations:');
    report.crossQueries.forEach((cq) => {
      out.push(` ${mark(cq.passed)} ${cq.rule.name}`);
      if (cq.passed) {
        return;
      }
      out.push(` Total: ${cq.totalValue.toLocaleString()}, Sum: ${cq.componentSum.toLocaleString()} (${cq.differencePercent.toFixed(1)}% off)`);
    });
    out.push('');
  }

  if (report.issues.length > 0) {
    out.push('Issues:', ...report.issues.map((issue) => ` • ${issue}`), '');
  }

  if (report.recommendations.length > 0) {
    out.push('Recommendations:', ...report.recommendations.map((rec) => ` → ${rec}`));
  }

  return out.join('\n');
}
796
+ //# sourceMappingURL=scorer.js.map