nodebench-mcp 2.11.0 → 2.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NODEBENCH_AGENTS.md +809 -809
- package/README.md +443 -431
- package/STYLE_GUIDE.md +477 -477
- package/dist/__tests__/evalHarness.test.js +1 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +9 -14
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityEval.test.js +88 -14
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +9 -5
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +165 -17
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
- package/dist/__tests__/helpers/answerMatch.d.ts +36 -7
- package/dist/__tests__/helpers/answerMatch.js +224 -35
- package/dist/__tests__/helpers/answerMatch.js.map +1 -1
- package/dist/__tests__/helpers/textLlm.d.ts +1 -1
- package/dist/__tests__/presetRealWorldBench.test.d.ts +1 -0
- package/dist/__tests__/presetRealWorldBench.test.js +850 -0
- package/dist/__tests__/presetRealWorldBench.test.js.map +1 -0
- package/dist/__tests__/tools.test.js +20 -7
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/__tests__/toolsetGatingEval.test.js +21 -11
- package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
- package/dist/db.js +21 -0
- package/dist/db.js.map +1 -1
- package/dist/index.js +424 -327
- package/dist/index.js.map +1 -1
- package/dist/tools/agentBootstrapTools.js +258 -258
- package/dist/tools/boilerplateTools.js +144 -144
- package/dist/tools/cCompilerBenchmarkTools.js +33 -33
- package/dist/tools/documentationTools.js +59 -59
- package/dist/tools/flywheelTools.js +6 -6
- package/dist/tools/gitWorkflowTools.d.ts +11 -0
- package/dist/tools/gitWorkflowTools.js +580 -0
- package/dist/tools/gitWorkflowTools.js.map +1 -0
- package/dist/tools/learningTools.js +26 -26
- package/dist/tools/localFileTools.d.ts +3 -0
- package/dist/tools/localFileTools.js +3164 -125
- package/dist/tools/localFileTools.js.map +1 -1
- package/dist/tools/metaTools.js +82 -0
- package/dist/tools/metaTools.js.map +1 -1
- package/dist/tools/parallelAgentTools.js +228 -0
- package/dist/tools/parallelAgentTools.js.map +1 -1
- package/dist/tools/patternTools.d.ts +13 -0
- package/dist/tools/patternTools.js +456 -0
- package/dist/tools/patternTools.js.map +1 -0
- package/dist/tools/reconTools.js +31 -31
- package/dist/tools/selfEvalTools.js +44 -44
- package/dist/tools/seoTools.d.ts +16 -0
- package/dist/tools/seoTools.js +866 -0
- package/dist/tools/seoTools.js.map +1 -0
- package/dist/tools/sessionMemoryTools.d.ts +15 -0
- package/dist/tools/sessionMemoryTools.js +348 -0
- package/dist/tools/sessionMemoryTools.js.map +1 -0
- package/dist/tools/toolRegistry.d.ts +4 -0
- package/dist/tools/toolRegistry.js +489 -0
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/toonTools.d.ts +15 -0
- package/dist/tools/toonTools.js +94 -0
- package/dist/tools/toonTools.js.map +1 -0
- package/dist/tools/verificationTools.js +41 -41
- package/dist/tools/visionTools.js +17 -17
- package/dist/tools/voiceBridgeTools.d.ts +15 -0
- package/dist/tools/voiceBridgeTools.js +1427 -0
- package/dist/tools/voiceBridgeTools.js.map +1 -0
- package/dist/tools/webTools.js +18 -18
- package/package.json +102 -101
|
@@ -91,6 +91,67 @@ function toIntegerOrNull(value) {
|
|
|
91
91
|
const n = Number.parseInt(m[0], 10);
|
|
92
92
|
return Number.isFinite(n) ? n : null;
|
|
93
93
|
}
|
|
94
|
+
/**
 * Extract integers from OCR text, splitting over-long digit runs into fixed-width chunks.
 *
 * Digit runs no longer than `chunkSize` are parsed whole. Longer runs are treated as
 * adjacent numbers that OCR concatenated (e.g. "247428") and are cut into
 * `chunkSize`-digit pieces. Values outside the inclusive range [min, max] are dropped.
 *
 * @param {unknown} text - OCR output; null/undefined is treated as an empty string.
 * @param {{ chunkSize?: number, min?: number, max?: number }} [opts]
 *   chunkSize: digits per token (default 2). Values that truncate below 1 fall back to 2.
 *   min / max: inclusive accepted range (defaults 0 and 200).
 * @returns {number[]} Accepted integers in order of appearance.
 */
function extractChunkedIntsFromText(text, opts) {
    // Validate AFTER truncation: a fractional size such as 0.5 truncates to 0, and a
    // zero step would make the generic chunk loop below spin forever.
    const truncatedChunk = typeof opts?.chunkSize === "number" ? Math.trunc(opts.chunkSize) : Number.NaN;
    const chunkSize = truncatedChunk >= 1 ? truncatedChunk : 2;
    const min = typeof opts?.min === "number" && Number.isFinite(opts.min) ? opts.min : 0;
    const max = typeof opts?.max === "number" && Number.isFinite(opts.max) ? opts.max : 200;
    const inRange = (n) => Number.isFinite(n) && n >= min && n <= max;
    const out = [];
    const pushIfOk = (n) => {
        if (inRange(n)) {
            out.push(n);
        }
    };
    const runs = String(text ?? "").match(/\d+/g) ?? [];
    for (const run of runs) {
        let s = run;
        // Short runs (including isolated OCR-noise digits) and exact-width runs are parsed whole.
        if (s.length <= chunkSize) {
            pushIfOk(Number.parseInt(s, 10));
            continue;
        }
        if (chunkSize === 2) {
            // Fix common leading/trailing zero artifacts: "074" -> "74", "580" -> "58".
            if (s.length === 3 && s.startsWith("0")) {
                s = s.slice(1);
            }
            if (s.length === 3 && s.endsWith("0")) {
                s = s.slice(0, 2);
            }
            const pairsFrom = (start) => {
                const nums = [];
                for (let i = start; i + 2 <= s.length; i += 2) {
                    nums.push(Number.parseInt(s.slice(i, i + 2), 10));
                }
                return nums;
            };
            let chosen = pairsFrom(0);
            if (s.length % 2 === 1) {
                // Odd-length run: one digit must be dropped. Try dropping the last digit
                // (offset 0) or the first (offset 1) and keep whichever yields strictly
                // more in-range values; ties keep offset 0.
                const shifted = pairsFrom(1);
                const score = (arr) => arr.filter(inRange).length;
                if (score(shifted) > score(chosen)) {
                    chosen = shifted;
                }
            }
            for (const n of chosen) {
                pushIfOk(n);
            }
            continue;
        }
        // Generic fixed-width chunking; a trailing partial chunk is discarded.
        for (let i = 0; i + chunkSize <= s.length; i += chunkSize) {
            pushIfOk(Number.parseInt(s.slice(i, i + chunkSize), 10));
        }
    }
    return out;
}
|
|
94
155
|
function gcdInt(a, b) {
|
|
95
156
|
let x = Math.abs(Math.trunc(a));
|
|
96
157
|
let y = Math.abs(Math.trunc(b));
|
|
@@ -405,35 +466,189 @@ function toOcrBbox(raw) {
|
|
|
405
466
|
return null;
|
|
406
467
|
return { x0, y0, x1, y1 };
|
|
407
468
|
}
|
|
408
|
-
|
|
409
|
-
const
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
469
|
+
/**
 * Parse Tesseract TSV output into word and line records.
 *
 * The first row is treated as the header and skipped. Only rows whose first column
 * is 5 contribute; rows with fewer than 12 columns, non-numeric geometry, or empty
 * text are ignored. Lines are rebuilt by grouping words that share the same
 * (page, block, paragraph, line) columns.
 *
 * @param {unknown} tsv - Raw TSV text; null/undefined is treated as empty.
 * @returns {{ words: Array<{text: string, confidence: number|null, bbox: object}>,
 *             lines: Array<{text: string, confidence: number|null, bbox: object}> }}
 *   `words` keeps TSV row order; `lines` is sorted top-to-bottom, then left-to-right.
 */
function parseTesseractTsv(tsv) {
    const body = String(tsv ?? "").trim();
    const rows = body ? body.split(/\r?\n/) : [];
    // A lone row is only the header; there is nothing to parse.
    if (rows.length <= 1) {
        return { words: [], lines: [] };
    }
    const WORD_LEVEL = 5;
    const parsedWords = [];
    for (const row of rows.slice(1)) {
        const cols = row ? row.split("\t") : [];
        if (cols.length < 12 || Number.parseInt(cols[0], 10) !== WORD_LEVEL) {
            continue;
        }
        const [left, top, width, height] = cols.slice(6, 10).map((c) => Number.parseInt(c, 10));
        // The text column may itself contain tabs, so re-join everything after column 11.
        const content = cols.slice(11).join("\t").trim();
        if (!content || ![left, top, width, height].every((v) => Number.isFinite(v))) {
            continue;
        }
        const rawConf = Number.parseFloat(cols[10]);
        parsedWords.push({
            key: cols.slice(1, 5).join(":"),
            left,
            top,
            width,
            height,
            conf: Number.isFinite(rawConf) ? rawConf : null,
            text: content,
        });
    }
    const words = parsedWords.map(({ text, conf, left, top, width, height }) => ({
        text,
        confidence: conf,
        bbox: { x0: left, y0: top, x1: left + width, y1: top + height },
    }));
    // Bucket words by their line key, preserving first-seen order of the keys.
    const groups = new Map();
    for (const word of parsedWords) {
        if (!groups.has(word.key)) {
            groups.set(word.key, []);
        }
        groups.get(word.key).push(word);
    }
    const lines = [];
    for (const members of groups.values()) {
        const ordered = members.slice().sort((a, b) => a.left - b.left);
        const lineText = ordered.map((m) => m.text).join(" ").trim();
        if (!lineText) {
            continue;
        }
        const bbox = {
            x0: Number.POSITIVE_INFINITY,
            y0: Number.POSITIVE_INFINITY,
            x1: Number.NEGATIVE_INFINITY,
            y1: Number.NEGATIVE_INFINITY,
        };
        let confSum = 0;
        let confCount = 0;
        for (const m of ordered) {
            bbox.x0 = Math.min(bbox.x0, m.left);
            bbox.y0 = Math.min(bbox.y0, m.top);
            bbox.x1 = Math.max(bbox.x1, m.left + m.width);
            bbox.y1 = Math.max(bbox.y1, m.top + m.height);
            // Words without a numeric confidence do not participate in the average.
            if (m.conf !== null) {
                confSum += m.conf;
                confCount += 1;
            }
        }
        lines.push({ text: lineText, confidence: confCount ? confSum / confCount : null, bbox });
    }
    // Reading order: top edge first, then left edge.
    lines.sort((a, b) => a.bbox.y0 - b.bbox.y0 || a.bbox.x0 - b.bbox.x0);
    return { words, lines };
}
|
|
546
|
+
// Pool of OCR worker entries, keyed by the string produced by ocrWorkerKey().
const OCR_WORKER_POOL = new Map();
// Delay (ms) after an entry's last in-flight job before its worker is torn down.
const OCR_WORKER_IDLE_TERMINATE_MS = 3000;
|
|
548
|
+
/**
 * Build the pool key for a worker configuration.
 *
 * @param {string} lang - Language identifier passed to the OCR worker.
 * @param {string|null|undefined} langPathEffective - Language data path; nullish becomes "".
 * @returns {string} `<lang>::<langPath>`.
 */
function ocrWorkerKey(lang, langPathEffective) {
    const langPath = langPathEffective ?? "";
    return `${lang}::${langPath}`;
}
|
|
551
|
+
/**
 * Return the pooled worker entry for a language configuration, creating it on first use.
 *
 * An entry holds the (possibly still pending) worker promise, a promise chain used to
 * serialize jobs, an in-flight job counter, and the idle-teardown timer handle.
 *
 * If worker creation fails, the entry is removed from the pool so the next request
 * retries with a fresh worker. Without that, the rejected promise would stay cached
 * for as long as requests keep resetting the idle timer. Callers that already hold
 * the entry still observe the rejection through `entry.workerPromise`.
 *
 * @param {{ lang: string, langPathEffective?: string|null }} args
 * @returns {Promise<{ key: string, entry: object }>}
 */
async function getOrCreateOcrWorkerEntry(args) {
    const key = ocrWorkerKey(args.lang, args.langPathEffective);
    const existing = OCR_WORKER_POOL.get(key);
    if (existing) {
        return { key, entry: existing };
    }
    const workerPromise = (async () => {
        const tesseract = await getTesseract();
        const createWorker = tesseract?.createWorker;
        if (typeof createWorker !== "function") {
            throw new Error("tesseract.js missing createWorker() export (unsupported version)");
        }
        // createWorker() returns a wrapper that manages a worker_threads Worker under the hood.
        return await createWorker(args.lang, undefined, {
            ...(args.langPathEffective ? { langPath: args.langPathEffective } : {}),
            logger: () => {
                // silence
            },
        });
    })();
    const entry = {
        workerPromise,
        chain: Promise.resolve(),
        activeCount: 0,
        idleTimer: null,
    };
    OCR_WORKER_POOL.set(key, entry);
    // Evict a failed entry (only if it is still the one registered under this key).
    // Attaching this handler also keeps a creation failure from surfacing as an
    // unhandled rejection when no job ends up awaiting the promise.
    workerPromise.catch(() => {
        if (OCR_WORKER_POOL.get(key) === entry) {
            OCR_WORKER_POOL.delete(key);
        }
    });
    return { key, entry };
}
|
|
579
|
+
/**
 * (Re)arm the idle timer that tears down an entry's worker.
 *
 * When the timer fires, the entry is removed from the pool and its worker is
 * terminated; teardown errors are deliberately ignored (best effort).
 *
 * The pool slot is only cleared when it still refers to this entry, so a stale
 * timer cannot evict a different entry that was registered under the same key.
 *
 * @param {string} key - Pool key of the entry.
 * @param {object} entry - Pool entry whose `idleTimer` and `workerPromise` are used.
 * @returns {void}
 */
function scheduleOcrWorkerIdleTerminate(key, entry) {
    if (entry.idleTimer) {
        clearTimeout(entry.idleTimer);
    }
    entry.idleTimer = setTimeout(() => {
        // The handle is spent once the callback runs; do not leave it dangling on the entry.
        entry.idleTimer = null;
        if (OCR_WORKER_POOL.get(key) === entry) {
            OCR_WORKER_POOL.delete(key);
        }
        // Fire-and-forget; do not block the event loop on teardown.
        void (async () => {
            try {
                const worker = await entry.workerPromise;
                if (worker && typeof worker.terminate === "function") {
                    await worker.terminate();
                }
            }
            catch {
                // ignore: teardown is best effort and the worker may never have been created
            }
        })();
    }, OCR_WORKER_IDLE_TERMINATE_MS);
    // Keep the timer from preventing exit, while the worker thread still keeps the loop alive.
    entry.idleTimer?.unref?.();
}
|
|
599
|
+
/**
 * Run `fn` with the pooled worker for `args`, one job at a time per pool entry.
 *
 * Jobs are queued on the entry's promise chain so they execute sequentially. A failed
 * job does not block later ones, because the chain swallows the outcome of each job.
 * When the last in-flight job settles, the idle teardown timer is (re)armed.
 *
 * @param {{ lang: string, langPathEffective?: string|null }} args - Worker configuration.
 * @param {(worker: any) => any} fn - Job to run with the resolved worker.
 * @returns {Promise<any>} Settles with the outcome of `fn`.
 */
async function withOcrWorker(args, fn) {
    const { key, entry } = await getOrCreateOcrWorkerEntry(args);
    // If we were about to tear down, keep the worker alive for this request burst.
    if (entry.idleTimer) {
        clearTimeout(entry.idleTimer);
        entry.idleTimer = null;
    }
    entry.activeCount += 1;
    const job = async () => fn(await entry.workerPromise);
    // Start after the previous job regardless of how it ended.
    const pending = entry.chain.then(job, job);
    const settle = () => undefined;
    entry.chain = pending.then(settle, settle);
    try {
        return await pending;
    }
    finally {
        entry.activeCount -= 1;
        if (entry.activeCount <= 0) {
            entry.activeCount = 0;
            scheduleOcrWorkerIdleTerminate(key, entry);
        }
    }
}
|
|
621
|
+
/**
 * Run OCR on an image buffer and normalize the result.
 *
 * Structured `data.words` / `data.lines` arrays are used when either is present;
 * otherwise a non-blank `data.tsv` string is parsed with parseTesseractTsv().
 *
 * @param {{ buffer: any, lang: string, langPathEffective?: string|null,
 *           tessOptions?: object, output?: object }} args
 *   tessOptions defaults to `{}`; output defaults to `{ text: true, tsv: true }`.
 * @returns {Promise<{ text: string, confidence: number|null, words: object[], lines: object[] }>}
 */
async function ocrRecognizeBuffer(args) {
    // NOTE: Tesseract.recognize() (top-level) does not accept OutputFormats like TSV.
    // We must use a worker's recognize() and request output.tsv explicitly.
    const workerConfig = { lang: args.lang, langPathEffective: args.langPathEffective };
    const result = await withOcrWorker(workerConfig, (worker) => {
        const options = args.tessOptions ?? {};
        const outputFormats = args.output ?? { text: true, tsv: true };
        return worker.recognize(args.buffer, options, outputFormats);
    });
    const data = result?.data ?? {};
    const text = String(data.text ?? "").trim();
    const confidence = typeof data.confidence === "number" ? data.confidence : null;
    // Shared normalizer for structured word/line items.
    const normalize = (item) => ({
        text: String(item?.text ?? ""),
        confidence: typeof item?.confidence === "number" ? item.confidence : null,
        bbox: toOcrBbox(item?.bbox),
    });
    const hasWords = Array.isArray(data.words);
    const hasLines = Array.isArray(data.lines);
    // tesseract.js v7 returns layout data primarily via TSV/HOCR; earlier versions may populate data.words/lines.
    if (hasWords || hasLines) {
        return {
            text,
            confidence,
            words: hasWords ? data.words.map((w) => normalize(w)) : [],
            lines: hasLines ? data.lines.map((l) => normalize(l)) : [],
        };
    }
    if (typeof data.tsv === "string" && data.tsv.trim()) {
        const { words, lines } = parseTesseractTsv(String(data.tsv));
        return { text, confidence, words, lines };
    }
    return { text, confidence, words: [], lines: [] };
}
|
|
439
654
|
async function ocrRecognizeImageFile(args) {
|
|
@@ -473,6 +688,8 @@ async function ocrRecognizeImageFileWithColorMask(args) {
|
|
|
473
688
|
const { data, info } = await image.ensureAlpha().raw().toBuffer({ resolveWithObject: true });
|
|
474
689
|
const out = Buffer.alloc(info.width * info.height);
|
|
475
690
|
// Convert matching colored pixels to black ink on a white background.
|
|
691
|
+
const minPrimary = typeof args.minPrimary === "number" && Number.isFinite(args.minPrimary) ? args.minPrimary : 80;
|
|
692
|
+
const minDelta = typeof args.minDelta === "number" && Number.isFinite(args.minDelta) ? args.minDelta : 25;
|
|
476
693
|
for (let i = 0, j = 0; i < data.length; i += 4, j++) {
|
|
477
694
|
const r = data[i];
|
|
478
695
|
const g = data[i + 1];
|
|
@@ -480,117 +697,1108 @@ async function ocrRecognizeImageFileWithColorMask(args) {
|
|
|
480
697
|
const a = data[i + 3];
|
|
481
698
|
let match = false;
|
|
482
699
|
if (a >= 40) {
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
else {
|
|
487
|
-
match = g >= 90 && g - r >= 35 && g - b >= 35;
|
|
488
|
-
}
|
|
700
|
+
const primary = args.color === "red" ? r : g;
|
|
701
|
+
const other = args.color === "red" ? g : r;
|
|
702
|
+
match = primary >= minPrimary && primary - other >= minDelta && primary - b >= minDelta;
|
|
489
703
|
}
|
|
490
704
|
out[j] = match ? 0 : 255;
|
|
491
705
|
}
|
|
492
|
-
// Upscale
|
|
493
|
-
const
|
|
494
|
-
.
|
|
495
|
-
.
|
|
496
|
-
|
|
497
|
-
|
|
706
|
+
// Upscale for OCR; keep it deterministic.
|
|
707
|
+
const requestedUpscale = typeof args.upscale === "number" && Number.isFinite(args.upscale) && args.upscale >= 1
|
|
708
|
+
? Math.trunc(args.upscale)
|
|
709
|
+
: info.width < 900
|
|
710
|
+
? 4
|
|
711
|
+
: info.width < 1600
|
|
712
|
+
? 3
|
|
713
|
+
: 2;
|
|
714
|
+
// Don't allow the upscaled image to explode in size.
|
|
715
|
+
const scaledMaxPixels = Math.max(maxPixels, Math.floor(maxPixels * 4));
|
|
716
|
+
let upscale = Math.max(1, Math.min(10, requestedUpscale));
|
|
717
|
+
while (upscale > 1 && info.width * upscale * info.height * upscale > scaledMaxPixels)
|
|
718
|
+
upscale--;
|
|
719
|
+
const blurSigma = typeof args.blurSigma === "number" && Number.isFinite(args.blurSigma) ? clampNumber(args.blurSigma, 0, 10) : 0.3;
|
|
720
|
+
const threshold = typeof args.threshold === "number" && Number.isFinite(args.threshold) ? clampInt(args.threshold, 180, 1, 254) : 180;
|
|
721
|
+
let pipeline = sharp(out, { raw: { width: info.width, height: info.height, channels: 1 } }).resize({
|
|
722
|
+
width: info.width * upscale,
|
|
723
|
+
height: info.height * upscale,
|
|
724
|
+
kernel: "nearest",
|
|
725
|
+
});
|
|
726
|
+
if (blurSigma >= 0.3)
|
|
727
|
+
pipeline = pipeline.blur(blurSigma);
|
|
728
|
+
const masked = await pipeline.threshold(threshold).png().toBuffer();
|
|
498
729
|
const result = await ocrRecognizeBuffer({
|
|
499
730
|
buffer: masked,
|
|
500
731
|
lang: args.lang,
|
|
501
732
|
langPathEffective: args.langPathEffective,
|
|
733
|
+
tessOptions: {
|
|
734
|
+
tessedit_char_whitelist: "0123456789",
|
|
735
|
+
// Sparse text works better for number grids (keeps tokens separate, reduces concatenation).
|
|
736
|
+
tessedit_pageseg_mode: "11",
|
|
737
|
+
user_defined_dpi: "300",
|
|
738
|
+
},
|
|
739
|
+
output: { text: true, tsv: false },
|
|
502
740
|
});
|
|
503
741
|
return { text: result.text, confidence: result.confidence, usedSharp: true };
|
|
504
742
|
}
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
743
|
+
/**
 * OCR only the purple-colored pixels of an image file.
 *
 * Pixels that look purple (high R and B, relatively low G, sufficiently opaque) are
 * turned into black ink on a white background; the mask is then upscaled, optionally
 * blurred, thresholded, and passed to ocrRecognizeBuffer() with a digits-and-dot
 * whitelist in sparse-text page segmentation mode.
 *
 * @param {object} args
 *   filePath, lang, langPathEffective: input file and OCR language settings.
 *   maxPixels (default 6_000_000): images with more pixels are refused.
 *   minPrimary (90), maxGreen (170), minDelta (25): color-match thresholds.
 *   upscale: integer factor >= 1; when absent it is chosen from the image width.
 *   blurSigma (0.3), threshold (180): post-processing of the mask.
 * @returns {Promise<object>} The OCR result plus `usedSharp: true` and the `upscale` used.
 * @throws {Error} When sharp is unavailable, dimensions are unreadable, or the image is too large.
 */
async function ocrRecognizeImageFileWithPurpleMask(args) {
    const sharp = await getSharpOptional();
    if (!sharp) {
        throw new Error("Missing optional dependency: sharp. Install it to use color-masked OCR.");
    }
    const fileBytes = await readFile(args.filePath);
    const image = sharp(fileBytes);
    const meta = await image.metadata();
    const w = meta.width ?? 0;
    const h = meta.height ?? 0;
    if (!w || !h) {
        throw new Error("Unable to read image dimensions");
    }
    const maxPixels = typeof args.maxPixels === "number" && args.maxPixels > 0 ? args.maxPixels : 6_000_000;
    if (w * h > maxPixels) {
        throw new Error(`Refusing huge image (${w}x${h}) for masked OCR (maxPixels=${maxPixels})`);
    }
    const { data, info } = await image.ensureAlpha().raw().toBuffer({ resolveWithObject: true });
    const finiteOr = (value, fallback) => (typeof value === "number" && Number.isFinite(value) ? value : fallback);
    // Purple labels: high R and B, relatively low G.
    const minPrimary = finiteOr(args.minPrimary, 90);
    const maxGreen = finiteOr(args.maxGreen, 170);
    const minDelta = finiteOr(args.minDelta, 25);
    // One mask byte per RGBA pixel: 0 = ink (purple), 255 = background.
    const mask = Buffer.alloc(info.width * info.height);
    for (let src = 0, dst = 0; src < data.length; src += 4, dst += 1) {
        const red = data[src];
        const green = data[src + 1];
        const blue = data[src + 2];
        const alpha = data[src + 3];
        const isPurple = alpha >= 40 &&
            red >= minPrimary &&
            blue >= minPrimary &&
            green <= maxGreen &&
            red - green >= minDelta &&
            blue - green >= minDelta;
        mask[dst] = isPurple ? 0 : 255;
    }
    // Upscale for OCR; keep it deterministic.
    let requestedUpscale;
    if (typeof args.upscale === "number" && Number.isFinite(args.upscale) && args.upscale >= 1) {
        requestedUpscale = Math.trunc(args.upscale);
    }
    else if (info.width < 900) {
        requestedUpscale = 4;
    }
    else if (info.width < 1600) {
        requestedUpscale = 3;
    }
    else {
        requestedUpscale = 2;
    }
    // Don't allow the upscaled image to explode in size.
    const scaledMaxPixels = Math.max(maxPixels, Math.floor(maxPixels * 4));
    let upscale = Math.max(1, Math.min(10, requestedUpscale));
    while (upscale > 1 && info.width * upscale * info.height * upscale > scaledMaxPixels) {
        upscale -= 1;
    }
    const hasBlurSigma = typeof args.blurSigma === "number" && Number.isFinite(args.blurSigma);
    const blurSigma = hasBlurSigma ? clampNumber(args.blurSigma, 0, 10) : 0.3;
    const hasThreshold = typeof args.threshold === "number" && Number.isFinite(args.threshold);
    const threshold = hasThreshold ? clampInt(args.threshold, 180, 1, 254) : 180;
    const rawMask = sharp(mask, { raw: { width: info.width, height: info.height, channels: 1 } });
    let pipeline = rawMask.resize({
        width: info.width * upscale,
        height: info.height * upscale,
        kernel: "nearest",
    });
    // Blur is skipped entirely for sigmas below 0.3.
    if (blurSigma >= 0.3) {
        pipeline = pipeline.blur(blurSigma);
    }
    const masked = await pipeline.threshold(threshold).png().toBuffer();
    const result = await ocrRecognizeBuffer({
        buffer: masked,
        lang: args.lang,
        langPathEffective: args.langPathEffective,
        tessOptions: {
            tessedit_char_whitelist: "0123456789.",
            // Sparse text works better for isolated numeric labels.
            tessedit_pageseg_mode: "11",
            user_defined_dpi: "300",
        },
        output: { text: true, tsv: true },
    });
    return { ...result, usedSharp: true, upscale };
}
|
|
810
|
+
/**
 * Report whether a rectangle of a black/white raster contains at least `minCount` ink pixels.
 *
 * A pixel counts as ink when its value is below 128. The rectangle is inclusive on
 * all sides and is clamped to the image via clampInt().
 *
 * @param {ArrayLike<number>} bw - Row-major single-channel pixels (indexed as y * width + x).
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @param {{ x0: number, y0: number, x1: number, y1: number }} rect - Inclusive region.
 * @param {number} minCount - Required ink pixels; values below 1, or a missing/NaN
 *   value, are treated as 1.
 * @returns {boolean} True as soon as enough ink pixels have been seen.
 */
function rectHasInk(bw, width, height, rect, minCount) {
    if (!width || !height) {
        return false;
    }
    // Math.max(1, NaN) is NaN, and `count >= NaN` is never true, so an omitted
    // minCount used to make this function always answer false. Default it to 1,
    // which is also how rectInkBounds() effectively behaves for the same input.
    const requested = Math.trunc(minCount);
    const need = Number.isNaN(requested) ? 1 : Math.max(1, requested);
    const x0 = clampInt(Math.trunc(rect.x0), 0, 0, width - 1);
    const x1 = clampInt(Math.trunc(rect.x1), width - 1, 0, width - 1);
    const y0 = clampInt(Math.trunc(rect.y0), 0, 0, height - 1);
    const y1 = clampInt(Math.trunc(rect.y1), height - 1, 0, height - 1);
    if (x1 < x0 || y1 < y0) {
        return false;
    }
    let count = 0;
    for (let y = y0; y <= y1; y++) {
        const rowStart = y * width;
        for (let x = x0; x <= x1; x++) {
            if (bw[rowStart + x] < 128) {
                count += 1;
                // Early exit: the exact total is not needed.
                if (count >= need) {
                    return true;
                }
            }
        }
    }
    return false;
}
|
|
833
|
+
/**
 * Compute the tight bounding box of ink pixels inside a rectangle of a black/white raster.
 *
 * A pixel counts as ink when its value is below 128. The rectangle is inclusive on
 * all sides and is clamped to the image via clampInt().
 *
 * @param {ArrayLike<number>} bw - Row-major single-channel pixels (indexed as y * width + x).
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @param {{ x0: number, y0: number, x1: number, y1: number }} rect - Inclusive search region.
 * @param {number} minCount - Minimum ink pixels required (values below 1 act as 1).
 * @returns {{ x0: number, y0: number, x1: number, y1: number, count: number }|null}
 *   Inclusive ink bounds and the ink pixel count, or null when there is too little ink.
 */
function rectInkBounds(bw, width, height, rect, minCount) {
    if (!width || !height) {
        return null;
    }
    const need = Math.max(1, Math.trunc(minCount));
    const left = clampInt(Math.trunc(rect.x0), 0, 0, width - 1);
    const right = clampInt(Math.trunc(rect.x1), width - 1, 0, width - 1);
    const topRow = clampInt(Math.trunc(rect.y0), 0, 0, height - 1);
    const bottomRow = clampInt(Math.trunc(rect.y1), height - 1, 0, height - 1);
    if (right < left || bottomRow < topRow) {
        return null;
    }
    const found = {
        x0: Number.POSITIVE_INFINITY,
        y0: Number.POSITIVE_INFINITY,
        x1: Number.NEGATIVE_INFINITY,
        y1: Number.NEGATIVE_INFINITY,
        count: 0,
    };
    for (let y = topRow; y <= bottomRow; y++) {
        for (let x = left; x <= right; x++) {
            if (!(bw[y * width + x] < 128)) {
                continue;
            }
            found.count += 1;
            found.x0 = Math.min(found.x0, x);
            found.x1 = Math.max(found.x1, x);
            found.y0 = Math.min(found.y0, y);
            found.y1 = Math.max(found.y1, y);
        }
    }
    if (found.count < need) {
        return null;
    }
    // With no ink at all the bounds are still infinite; report that as "nothing found".
    const edges = [found.x0, found.y0, found.x1, found.y1];
    if (!edges.every((v) => Number.isFinite(v))) {
        return null;
    }
    return found;
}
|
|
870
|
+
/**
 * Detect thin horizontal bars in a black/white raster.
 *
 * Pass 1 collects, row by row, every run of consecutive ink pixels (value < 128)
 * that is at least `minRun` long. Pass 2 merges each run into the first existing
 * bar that is within `mergeY` rows of it and overlaps it horizontally by at least
 * `overlapRatio` of the shorter of the two. Bars thicker than `maxThickness` rows
 * are discarded.
 *
 * Pixels that are not a number below 128 (including reads past the end of `bw`)
 * are treated as background. Previously such a read stalled both scan loops without
 * advancing, hanging the caller when `bw` was shorter than `width * height`.
 *
 * @param {ArrayLike<number>} bw - Row-major single-channel pixels (indexed as y * width + x).
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @param {{ minRun: number, maxThickness?: number, mergeY?: number, overlapRatio?: number }} opts
 *   Defaults: maxThickness 12, mergeY 2, overlapRatio 0.6 (clamped to [0.1, 1]).
 * @returns {Array<{ x0: number, x1: number, y0: number, y1: number, cx: number, cy: number, len: number }>}
 *   Bars with inclusive bounds, sorted by center y and then center x.
 */
function detectThinHorizontalBarsFromBw(bw, width, height, opts) {
    const minRun = Math.max(1, Math.trunc(opts.minRun));
    const maxThickness = typeof opts.maxThickness === "number" ? Math.max(1, Math.trunc(opts.maxThickness)) : 12;
    const mergeY = typeof opts.mergeY === "number" ? Math.max(0, Math.trunc(opts.mergeY)) : 2;
    const overlapRatio = typeof opts.overlapRatio === "number" ? clampNumber(opts.overlapRatio, 0.1, 1) : 0.6;
    if (!width || !height) {
        return [];
    }
    // Pass 1: horizontal ink runs per row. Scanning one step past the row end
    // (x === width) closes a run that touches the right edge.
    const segments = [];
    for (let y = 0; y < height; y++) {
        const rowOff = y * width;
        let runStart = -1;
        for (let x = 0; x <= width; x++) {
            const isInk = x < width && bw[rowOff + x] < 128;
            if (isInk) {
                if (runStart < 0) {
                    runStart = x;
                }
                continue;
            }
            if (runStart >= 0) {
                if (x - runStart >= minRun) {
                    segments.push({ x0: runStart, x1: x - 1, y });
                }
                runStart = -1;
            }
        }
    }
    // Pass 2: merge segments across adjacent rows if they overlap significantly in X.
    const bars = [];
    for (const s of segments) {
        const target = bars.find((b) => {
            if (s.y > b.y1 + mergeY || s.y < b.y0 - mergeY) {
                return false;
            }
            const overlap = Math.max(0, Math.min(b.x1, s.x1) - Math.max(b.x0, s.x0) + 1);
            const minLen = Math.max(1, Math.min(b.x1 - b.x0 + 1, s.x1 - s.x0 + 1));
            return overlap >= minLen * overlapRatio;
        });
        if (target) {
            target.x0 = Math.min(target.x0, s.x0);
            target.x1 = Math.max(target.x1, s.x1);
            target.y0 = Math.min(target.y0, s.y);
            target.y1 = Math.max(target.y1, s.y);
        }
        else {
            bars.push({ x0: s.x0, x1: s.x1, y0: s.y, y1: s.y });
        }
    }
    return bars
        .filter((b) => b.x1 - b.x0 + 1 >= minRun)
        .filter((b) => b.y1 - b.y0 + 1 <= maxThickness)
        .map((b) => {
            const x0 = clampInt(b.x0, 0, 0, width - 1);
            const x1 = clampInt(b.x1, width - 1, 0, width - 1);
            const y0 = clampInt(b.y0, 0, 0, height - 1);
            const y1 = clampInt(b.y1, height - 1, 0, height - 1);
            const len = x1 - x0 + 1;
            return { x0, x1, y0, y1, cx: (x0 + x1) / 2, cy: (y0 + y1) / 2, len };
        })
        .sort((a, b) => a.cy - b.cy || a.cx - b.cx);
}
|
|
931
|
+
/**
 * Detect thin vertical bars in a black/white raster.
 *
 * Pass 1 collects, column by column, every run of consecutive ink pixels (value < 128)
 * that is at least `minRun` long. Pass 2 merges each run into the first existing
 * bar that is within `mergeX` columns of it and overlaps it vertically by at least
 * `overlapRatio` of the shorter of the two. Bars wider than `maxThickness` columns
 * are discarded.
 *
 * Pixels that are not a number below 128 (including reads past the end of `bw`)
 * are treated as background. Previously such a read stalled both scan loops without
 * advancing, hanging the caller when `bw` was shorter than `width * height`.
 *
 * @param {ArrayLike<number>} bw - Row-major single-channel pixels (indexed as y * width + x).
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @param {{ minRun: number, maxThickness?: number, mergeX?: number, overlapRatio?: number }} opts
 *   Defaults: maxThickness 12, mergeX 2, overlapRatio 0.6 (clamped to [0.1, 1]).
 * @returns {Array<{ x0: number, x1: number, y0: number, y1: number, cx: number, cy: number, len: number }>}
 *   Bars with inclusive bounds, sorted by center x and then center y.
 */
function detectThinVerticalBarsFromBw(bw, width, height, opts) {
    const minRun = Math.max(1, Math.trunc(opts.minRun));
    const maxThickness = typeof opts.maxThickness === "number" ? Math.max(1, Math.trunc(opts.maxThickness)) : 12;
    const mergeX = typeof opts.mergeX === "number" ? Math.max(0, Math.trunc(opts.mergeX)) : 2;
    const overlapRatio = typeof opts.overlapRatio === "number" ? clampNumber(opts.overlapRatio, 0.1, 1) : 0.6;
    if (!width || !height) {
        return [];
    }
    // Pass 1: vertical ink runs per column. Scanning one step past the column end
    // (y === height) closes a run that touches the bottom edge.
    const segments = [];
    for (let x = 0; x < width; x++) {
        let runStart = -1;
        for (let y = 0; y <= height; y++) {
            const isInk = y < height && bw[y * width + x] < 128;
            if (isInk) {
                if (runStart < 0) {
                    runStart = y;
                }
                continue;
            }
            if (runStart >= 0) {
                if (y - runStart >= minRun) {
                    segments.push({ y0: runStart, y1: y - 1, x });
                }
                runStart = -1;
            }
        }
    }
    // Pass 2: merge segments across adjacent columns if they overlap significantly in Y.
    const bars = [];
    for (const s of segments) {
        const target = bars.find((b) => {
            if (s.x > b.x1 + mergeX || s.x < b.x0 - mergeX) {
                return false;
            }
            const overlap = Math.max(0, Math.min(b.y1, s.y1) - Math.max(b.y0, s.y0) + 1);
            const minLen = Math.max(1, Math.min(b.y1 - b.y0 + 1, s.y1 - s.y0 + 1));
            return overlap >= minLen * overlapRatio;
        });
        if (target) {
            target.x0 = Math.min(target.x0, s.x);
            target.x1 = Math.max(target.x1, s.x);
            target.y0 = Math.min(target.y0, s.y0);
            target.y1 = Math.max(target.y1, s.y1);
        }
        else {
            bars.push({ x0: s.x, x1: s.x, y0: s.y0, y1: s.y1 });
        }
    }
    return bars
        .filter((b) => b.y1 - b.y0 + 1 >= minRun)
        .filter((b) => b.x1 - b.x0 + 1 <= maxThickness)
        .map((b) => {
            const x0 = clampInt(b.x0, 0, 0, width - 1);
            const x1 = clampInt(b.x1, width - 1, 0, width - 1);
            const y0 = clampInt(b.y0, 0, 0, height - 1);
            const y1 = clampInt(b.y1, height - 1, 0, height - 1);
            const len = y1 - y0 + 1;
            return { x0, x1, y0, y1, cx: (x0 + x1) / 2, cy: (y0 + y1) / 2, len };
        })
        .sort((a, b) => a.cx - b.cx || a.cy - b.cy);
}
|
|
991
|
+
/**
 * OCR a single integer out of a rectangular region of an image.
 *
 * The region is cropped via `args.sharp(args.source)`, converted to grayscale,
 * normalized, upscaled when it is narrower than 320px, and binarized once per
 * threshold. Each binarized crop is passed to `ocrRecognizeBuffer` once per
 * page-segmentation mode with a digits-only whitelist. Across all attempts the
 * candidate with the most digits wins; OCR confidence breaks ties.
 *
 * @param {object} args
 * @param {Function} args.sharp - Image pipeline factory, called as `args.sharp(args.source)`.
 * @param {*} args.source - Image input handed to `args.sharp` unchanged.
 * @param {number} args.left - Crop origin X (truncated to an integer).
 * @param {number} args.top - Crop origin Y (truncated to an integer).
 * @param {number} args.width - Crop width (truncated); must be > 0.
 * @param {number} args.height - Crop height (truncated); must be > 0.
 * @param {number} [args.threshold] - Single binarization threshold, used when `thresholds` is absent or empty.
 * @param {number[]} [args.thresholds] - Binarization thresholds to try, in order.
 * @param {Iterable<number>} args.psms - Page-segmentation modes to try for each threshold.
 * @param {number} [args.minValue] - Inclusive lower bound on accepted values.
 * @param {number} [args.maxValue] - Inclusive upper bound on accepted values.
 * @param {*} args.lang - Forwarded to `ocrRecognizeBuffer`.
 * @param {*} args.langPathEffective - Forwarded to `ocrRecognizeBuffer`.
 * @returns {Promise<number|null>} The best integer found, or null when the region or
 *   the PSM list is unusable or no attempt produced an in-range integer.
 */
async function ocrIntegerFromImageRegion(args) {
    const left = Math.trunc(args.left);
    const top = Math.trunc(args.top);
    const width = Math.trunc(args.width);
    const height = Math.trunc(args.height);
    if (width <= 0 || height <= 0)
        return null;
    // Without any page-segmentation mode there is nothing to run. Bail out before doing
    // image work instead of throwing a TypeError halfway through the threshold loop
    // (missing psms) or cropping/thresholding every variant for no result (empty psms).
    const psmList = Array.from(args.psms ?? []);
    if (!psmList.length)
        return null;
    const minValue = typeof args.minValue === "number" && Number.isFinite(args.minValue) ? args.minValue : null;
    const maxValue = typeof args.maxValue === "number" && Number.isFinite(args.maxValue) ? args.maxValue : null;
    const thresholdListRaw = Array.isArray(args.thresholds) && args.thresholds.length ? args.thresholds : [args.threshold];
    // Clamp each threshold (falling back to 200) and drop duplicates, keeping first-seen order.
    const thresholdList = thresholdListRaw
        .map((t) => clampInt(t, 200, 1, 254))
        .filter((t, i, arr) => arr.indexOf(t) === i);
    // Many GAIA image tasks contain tiny digits; upscaling materially improves OCR recall.
    // The factor depends only on the crop width, so compute it once for all thresholds.
    const targetW = 320;
    const scale = width < targetW ? Math.max(1, Math.min(8, Math.ceil(targetW / width))) : 1;
    let best = null;
    for (const thr of thresholdList) {
        let buf;
        try {
            let pipeline = args.sharp(args.source).extract({ left, top, width, height }).grayscale().normalize();
            if (scale > 1) {
                pipeline = pipeline.resize({ width: width * scale, height: height * scale, kernel: "nearest" });
            }
            buf = await pipeline.threshold(thr).png().toBuffer();
        }
        catch {
            // Best effort: a crop that cannot be produced at this threshold is skipped.
            continue;
        }
        for (const psm of psmList) {
            const out = await ocrRecognizeBuffer({
                buffer: buf,
                lang: args.lang,
                langPathEffective: args.langPathEffective,
                tessOptions: {
                    tessedit_char_whitelist: "0123456789",
                    tessedit_pageseg_mode: String(Math.trunc(psm)),
                    user_defined_dpi: "300",
                },
                output: { text: true, tsv: false },
            });
            const rawText = String(out.text ?? "").trim();
            const runs = rawText.match(/-?\d+/g) ?? [];
            if (!runs.length)
                continue;
            // Prefer longer digit runs (avoids picking the row index "1" when the crop also contains "29").
            // Also add suffix candidates to repair common OCR concatenation like "129" (index + number).
            const candidates = [];
            for (const r0 of runs) {
                const r = String(r0 ?? "").trim();
                const unsigned = r.replace(/^-/, "");
                const digits = unsigned.length;
                const n0 = Number.parseInt(r, 10);
                if (Number.isFinite(n0))
                    candidates.push({ n: n0, digits });
                if (digits >= 3) {
                    const n2 = Number.parseInt(unsigned.slice(-2), 10);
                    if (Number.isFinite(n2))
                        candidates.push({ n: n2, digits: 2 });
                    const n1 = Number.parseInt(unsigned.slice(-1), 10);
                    if (Number.isFinite(n1))
                        candidates.push({ n: n1, digits: 1 });
                }
            }
            // Every candidate is already finite; only the caller's value range is left to enforce.
            const filtered = candidates
                .filter((c) => (minValue !== null ? c.n >= minValue : true))
                .filter((c) => (maxValue !== null ? c.n <= maxValue : true));
            if (!filtered.length)
                continue;
            // Most digits first, then the larger value.
            filtered.sort((a, b) => b.digits - a.digits || b.n - a.n);
            const picked = filtered[0];
            const conf = typeof out.confidence === "number" && Number.isFinite(out.confidence) ? out.confidence : -1;
            // Primary key: digits (implicit via selection); secondary: OCR confidence.
            // Use confidence to break ties across different threshold/PSM attempts.
            if (!best || picked.digits > best.digits || (picked.digits === best.digits && conf > best.confidence)) {
                best = { n: picked.n, confidence: conf, digits: picked.digits };
            }
        }
    }
    return best ? best.n : null;
}
|
|
1074
|
+
async function gradeFractionQuizFromImageRowBands(args) {
|
|
1075
|
+
const debugEnabled = process.env.NODEBENCH_DEBUG_FRACTION_QUIZ === "1";
|
|
1076
|
+
const meta = await args.sharp(args.filePath).metadata();
|
|
1077
|
+
const w0 = meta.width ?? 0;
|
|
1078
|
+
const h0 = meta.height ?? 0;
|
|
1079
|
+
if (!w0 || !h0)
|
|
1080
|
+
return null;
|
|
1081
|
+
const scale = w0 < 1200 ? 3 : w0 < 2000 ? 2 : 1;
|
|
1082
|
+
const width = w0 * scale;
|
|
1083
|
+
const height = h0 * scale;
|
|
1084
|
+
const base = await args.sharp(args.filePath)
|
|
1085
|
+
.grayscale()
|
|
1086
|
+
.resize({ width, height, kernel: "lanczos3" })
|
|
1087
|
+
.normalize()
|
|
1088
|
+
.png()
|
|
1089
|
+
.toBuffer();
|
|
1090
|
+
// Detect question row centers from a narrow left strip so math digits don't dominate.
|
|
1091
|
+
const { data: bw, info } = await args.sharp(base)
|
|
1092
|
+
.grayscale()
|
|
1093
|
+
.threshold(210)
|
|
1094
|
+
.raw()
|
|
1095
|
+
.toBuffer({ resolveWithObject: true });
|
|
1096
|
+
const bwW = info.width;
|
|
1097
|
+
const bwH = info.height;
|
|
1098
|
+
const leftW = Math.max(20, Math.floor(bwW * 0.04));
|
|
1099
|
+
const yMin = Math.floor(bwH * 0.08);
|
|
1100
|
+
const yMax = Math.floor(bwH * 0.92);
|
|
1101
|
+
const rowCounts = new Array(bwH).fill(0);
|
|
1102
|
+
let maxRow = 0;
|
|
1103
|
+
for (let y = yMin; y < yMax; y++) {
|
|
1104
|
+
let c = 0;
|
|
1105
|
+
const off = y * bwW;
|
|
1106
|
+
for (let x = 0; x < leftW; x++)
|
|
1107
|
+
if (bw[off + x] < 128)
|
|
1108
|
+
c++;
|
|
1109
|
+
rowCounts[y] = c;
|
|
1110
|
+
if (c > maxRow)
|
|
1111
|
+
maxRow = c;
|
|
1112
|
+
}
|
|
1113
|
+
if (maxRow <= 0)
|
|
1114
|
+
return null;
|
|
1115
|
+
const peakThresh = Math.max(2, Math.floor(maxRow * 0.45));
|
|
1116
|
+
const segs = [];
|
|
1117
|
+
for (let y = yMin; y < yMax; y++) {
|
|
1118
|
+
if (rowCounts[y] < peakThresh)
|
|
1119
|
+
continue;
|
|
1120
|
+
let y2 = y;
|
|
1121
|
+
let peak = rowCounts[y];
|
|
1122
|
+
while (y2 + 1 < yMax && rowCounts[y2 + 1] >= peakThresh) {
|
|
1123
|
+
y2++;
|
|
1124
|
+
peak = Math.max(peak, rowCounts[y2]);
|
|
1125
|
+
}
|
|
1126
|
+
segs.push({ cy: (y + y2) / 2, peak });
|
|
1127
|
+
y = y2;
|
|
1128
|
+
}
|
|
1129
|
+
if (segs.length < 6)
|
|
1130
|
+
return null;
|
|
1131
|
+
segs.sort((a, b) => a.cy - b.cy);
|
|
1132
|
+
// Merge nearby segments (digits like "10" can produce multiple peaks).
|
|
1133
|
+
const merged = [];
|
|
1134
|
+
const mergeTol = Math.max(20, Math.round(bwH * 0.018));
|
|
1135
|
+
for (const s of segs) {
|
|
1136
|
+
const last = merged[merged.length - 1];
|
|
1137
|
+
if (!last || Math.abs(s.cy - last.cy) > mergeTol) {
|
|
1138
|
+
merged.push({ ...s });
|
|
1139
|
+
continue;
|
|
1140
|
+
}
|
|
1141
|
+
const wA = Math.max(1, last.peak);
|
|
1142
|
+
const wB = Math.max(1, s.peak);
|
|
1143
|
+
last.cy = (last.cy * wA + s.cy * wB) / (wA + wB);
|
|
1144
|
+
last.peak = Math.max(last.peak, s.peak);
|
|
1145
|
+
}
|
|
1146
|
+
const pickBestWindow = (centers, k) => {
|
|
1147
|
+
if (centers.length <= k)
|
|
1148
|
+
return centers;
|
|
1149
|
+
let best = centers.slice(0, k);
|
|
1150
|
+
let bestScore = Number.POSITIVE_INFINITY;
|
|
1151
|
+
for (let i = 0; i + k - 1 < centers.length; i++) {
|
|
1152
|
+
const cand = centers.slice(i, i + k);
|
|
1153
|
+
const spacings = cand.slice(1).map((y, j) => y - cand[j]);
|
|
1154
|
+
const avg = spacings.reduce((s, n) => s + n, 0) / spacings.length;
|
|
1155
|
+
const variance = spacings.reduce((s, n) => s + (n - avg) * (n - avg), 0) / spacings.length;
|
|
1156
|
+
if (variance < bestScore) {
|
|
1157
|
+
bestScore = variance;
|
|
1158
|
+
best = cand;
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
1161
|
+
return best;
|
|
1162
|
+
};
|
|
1163
|
+
const centersAll = merged.map((m) => m.cy).filter((n) => Number.isFinite(n));
|
|
1164
|
+
const targetK = centersAll.length >= 10 ? 10 : centersAll.length;
|
|
1165
|
+
const centers = pickBestWindow(centersAll, targetK).sort((a, b) => a - b);
|
|
1166
|
+
if (centers.length < 6)
|
|
1167
|
+
return null;
|
|
1168
|
+
// Use padded bands around each detected row-center. Midpoint-only bands were too tight on
|
|
1169
|
+
// GAIA's fraction quiz screenshots and could clip stacked numerators/denominators or the
|
|
1170
|
+
// answer box (conversion rows often place the box on the next line).
|
|
1171
|
+
const spacingAt = (i) => {
|
|
1172
|
+
const prev = i > 0 ? centers[i] - centers[i - 1] : centers.length > 1 ? centers[1] - centers[0] : height;
|
|
1173
|
+
const next = i + 1 < centers.length ? centers[i + 1] - centers[i] : prev;
|
|
1174
|
+
const s = Math.max(1, Math.min(prev, next));
|
|
1175
|
+
return s;
|
|
1176
|
+
};
|
|
1177
|
+
const rowBand = (i) => {
|
|
1178
|
+
const spacing = spacingAt(i);
|
|
1179
|
+
const padTop = Math.max(24, Math.round(spacing * 0.45));
|
|
1180
|
+
const padBot = Math.max(24, Math.round(spacing * 0.75));
|
|
1181
|
+
const y0 = clampInt(Math.floor(centers[i] - padTop), 0, 0, height - 1);
|
|
1182
|
+
const y1 = clampInt(Math.ceil(centers[i] + padBot), height, y0 + 1, height);
|
|
1183
|
+
return { y0, y1, spacing };
|
|
1184
|
+
};
|
|
1185
|
+
// These ratios are tuned to GAIA's fraction quiz screenshot layout.
|
|
1186
|
+
// Expression region must be wide enough to include stacked fractions.
|
|
1187
|
+
// Expression region must be wide enough to include stacked fractions.
|
|
1188
|
+
const exprX1 = Math.min(width, Math.floor(width * 0.55));
|
|
1189
|
+
const answerX0 = Math.floor(width * 0.12);
|
|
1190
|
+
const answerX1 = Math.min(width, Math.floor(width * 0.46));
|
|
1191
|
+
const perQuestion = [];
|
|
1192
|
+
let total = 0;
|
|
1193
|
+
const parseAllFractionsLoose = (text) => {
|
|
1194
|
+
const out = [];
|
|
1195
|
+
for (const m of String(text ?? "").matchAll(/(-?\d+)\s*\/\s*(\d+)/g)) {
|
|
1196
|
+
const n = Number.parseInt(m[1], 10);
|
|
1197
|
+
const d = Number.parseInt(m[2], 10);
|
|
1198
|
+
if (!Number.isFinite(n) || !Number.isFinite(d) || d === 0)
|
|
1199
|
+
continue;
|
|
1200
|
+
try {
|
|
1201
|
+
out.push(normalizeFraction({ n, d }));
|
|
1202
|
+
}
|
|
1203
|
+
catch {
|
|
1204
|
+
// ignore
|
|
1205
|
+
}
|
|
1206
|
+
}
|
|
1207
|
+
return out;
|
|
1208
|
+
};
|
|
1209
|
+
const parseAllMixedNumbersLoose = (text) => {
|
|
1210
|
+
const out = [];
|
|
1211
|
+
for (const m of String(text ?? "").matchAll(/(-?\d+)\s+(-?\d+)\s*\/\s*(\d+)/g)) {
|
|
1212
|
+
const whole = Number.parseInt(m[1], 10);
|
|
1213
|
+
const n = Number.parseInt(m[2], 10);
|
|
1214
|
+
const d = Number.parseInt(m[3], 10);
|
|
1215
|
+
if (!Number.isFinite(whole) || !Number.isFinite(n) || !Number.isFinite(d) || d === 0)
|
|
1216
|
+
continue;
|
|
1217
|
+
try {
|
|
1218
|
+
out.push({ whole, frac: normalizeFraction({ n, d }) });
|
|
1219
|
+
}
|
|
1220
|
+
catch {
|
|
1221
|
+
// ignore
|
|
1222
|
+
}
|
|
1223
|
+
}
|
|
1224
|
+
return out;
|
|
1225
|
+
};
|
|
1226
|
+
for (let i = 0; i < centers.length; i++) {
|
|
1227
|
+
const idx = i + 1;
|
|
1228
|
+
const band = rowBand(i);
|
|
1229
|
+
const y0 = band.y0;
|
|
1230
|
+
const y1 = band.y1;
|
|
1231
|
+
const rowH = Math.max(1, y1 - y0);
|
|
1232
|
+
// Detect the answer box X-range for this row. We use it to:
|
|
1233
|
+
// 1) Exclude answer-box fraction bars from operand extraction (critical for conversion rows)
|
|
1234
|
+
// 2) Narrow answer OCR to the box region for cleaner parsing.
|
|
1235
|
+
const rowGrayFull = await args.sharp(base)
|
|
1236
|
+
.extract({ left: 0, top: y0, width, height: rowH })
|
|
1237
|
+
.png()
|
|
1238
|
+
.toBuffer();
|
|
1239
|
+
const { data: rowBwFull, info: rowInfo } = await args.sharp(rowGrayFull)
|
|
1240
|
+
.grayscale()
|
|
1241
|
+
.threshold(210)
|
|
1242
|
+
.raw()
|
|
1243
|
+
.toBuffer({ resolveWithObject: true });
|
|
1244
|
+
// Light OCR on the row text to detect conversion prompts ("Turn ... into ...").
|
|
1245
|
+
// We keep this cheap (no TSV) and deterministic.
|
|
1246
|
+
const rowTextBuf = await args.sharp(base)
|
|
1247
|
+
.extract({ left: 0, top: y0, width: Math.max(1, Math.floor(width * 0.62)), height: rowH })
|
|
1248
|
+
.grayscale()
|
|
1249
|
+
.normalize()
|
|
1250
|
+
.threshold(200)
|
|
1251
|
+
.png()
|
|
1252
|
+
.toBuffer();
|
|
1253
|
+
const rowOcr = await ocrRecognizeBuffer({
|
|
1254
|
+
buffer: rowTextBuf,
|
|
1255
|
+
lang: args.lang,
|
|
1256
|
+
langPathEffective: args.langPathEffective,
|
|
1257
|
+
tessOptions: { tessedit_pageseg_mode: "6", user_defined_dpi: "300" },
|
|
1258
|
+
output: { text: true, tsv: false },
|
|
1259
|
+
});
|
|
1260
|
+
const rowLower = rowOcr.text.toLowerCase();
|
|
1261
|
+
const rowHasTurn = rowLower.includes("turn");
|
|
1262
|
+
const rowHintMixed = rowLower.includes("mixed");
|
|
1263
|
+
const rowHintImproper = rowLower.includes("improper");
|
|
1264
|
+
const detectAnswerBoxRect = () => {
|
|
1265
|
+
// Answer boxes are drawn as rectangles with long-ish horizontal borders.
|
|
1266
|
+
// Detect the longest horizontal bar that isn't the page border, then expand to all overlapping bars.
|
|
1267
|
+
const minRun = Math.max(30, Math.round(rowInfo.width * 0.06));
|
|
1268
|
+
const maxThickness = Math.max(2, Math.round(rowInfo.height * 0.18));
|
|
1269
|
+
const bars = detectThinHorizontalBarsFromBw(rowBwFull, rowInfo.width, rowInfo.height, {
|
|
1270
|
+
minRun,
|
|
1271
|
+
maxThickness,
|
|
1272
|
+
mergeY: 1,
|
|
1273
|
+
overlapRatio: 0.65,
|
|
1274
|
+
})
|
|
1275
|
+
// Exclude anything that starts in the left margin (question numbers).
|
|
1276
|
+
.filter((b) => b.x0 >= Math.round(rowInfo.width * 0.08))
|
|
1277
|
+
// Exclude extremely long lines (page borders / separators).
|
|
1278
|
+
.filter((b) => b.len <= Math.round(rowInfo.width * 0.5));
|
|
1279
|
+
if (!bars.length)
|
|
1280
|
+
return null;
|
|
1281
|
+
const best = [...bars].sort((a, b) => b.len - a.len)[0];
|
|
1282
|
+
const overlap = (a, b) => Math.max(0, Math.min(a.x1, b.x1) - Math.max(a.x0, b.x0) + 1);
|
|
1283
|
+
const overlapsBest = bars.filter((b) => {
|
|
1284
|
+
const ov = overlap(best, b);
|
|
1285
|
+
const minLen = Math.max(1, Math.min(best.len, b.len));
|
|
1286
|
+
return ov >= minLen * 0.72;
|
|
1287
|
+
});
|
|
1288
|
+
let x0 = best.x0;
|
|
1289
|
+
let x1 = best.x1;
|
|
1290
|
+
let y0 = best.y0;
|
|
1291
|
+
let y1 = best.y1;
|
|
1292
|
+
for (const b of overlapsBest) {
|
|
1293
|
+
if (b.x0 < x0)
|
|
1294
|
+
x0 = b.x0;
|
|
1295
|
+
if (b.x1 > x1)
|
|
1296
|
+
x1 = b.x1;
|
|
1297
|
+
if (b.y0 < y0)
|
|
1298
|
+
y0 = b.y0;
|
|
1299
|
+
if (b.y1 > y1)
|
|
1300
|
+
y1 = b.y1;
|
|
1301
|
+
}
|
|
1302
|
+
// Sanity: require a reasonable box width.
|
|
1303
|
+
const boxW = x1 - x0 + 1;
|
|
1304
|
+
if (boxW < Math.round(rowInfo.width * 0.06) || boxW > Math.round(rowInfo.width * 0.45))
|
|
1305
|
+
return null;
|
|
1306
|
+
const boxH = y1 - y0 + 1;
|
|
1307
|
+
if (boxH < Math.max(10, Math.round(rowInfo.height * 0.12)) || boxH > Math.round(rowInfo.height * 0.9)) {
|
|
1308
|
+
return null;
|
|
1309
|
+
}
|
|
1310
|
+
return { x0, x1, y0, y1 };
|
|
1311
|
+
};
|
|
1312
|
+
const answerBox = detectAnswerBoxRect();
|
|
1313
|
+
// Pixel-run fraction-bar detection is more reliable than OCR token geometry for stacked fractions.
|
|
1314
|
+
// OCR struggles to emit numerator/denominator tokens consistently for tiny stacked fractions.
|
|
1315
|
+
const exprGray = await args.sharp(base)
|
|
1316
|
+
.extract({
|
|
1317
|
+
left: 0,
|
|
1318
|
+
top: y0,
|
|
1319
|
+
width: Math.max(1, Math.min(exprX1,
|
|
1320
|
+
// Prefer excluding the answer box so we don't treat student-answer fractions as operands.
|
|
1321
|
+
answerBox ? Math.max(1, Math.round(answerBox.x0 - Math.max(10, rowInfo.width * 0.01))) : exprX1)),
|
|
1322
|
+
height: rowH,
|
|
1323
|
+
})
|
|
1324
|
+
.png()
|
|
1325
|
+
.toBuffer();
|
|
1326
|
+
const { data: exprBw, info: exprInfo } = await args.sharp(exprGray)
|
|
1327
|
+
.grayscale()
|
|
1328
|
+
.threshold(210)
|
|
1329
|
+
.raw()
|
|
1330
|
+
.toBuffer({ resolveWithObject: true });
|
|
1331
|
+
// Fraction bars are short relative to the row width; keep minRun small and filter by ink above/below.
|
|
1332
|
+
// Raise the minimum run length to avoid misclassifying operator glyphs (notably the "÷" bar)
|
|
1333
|
+
// as fraction bars. Stacked fraction bars in this screenshot are materially longer.
|
|
1334
|
+
const minBarRun = Math.max(18, Math.round(exprInfo.width * 0.02));
|
|
1335
|
+
const bars = detectThinHorizontalBarsFromBw(exprBw, exprInfo.width, exprInfo.height, {
|
|
1336
|
+
minRun: minBarRun,
|
|
1337
|
+
maxThickness: Math.max(2, Math.round(exprInfo.height * 0.14)),
|
|
1338
|
+
});
|
|
1339
|
+
const rowCenterLocal = centers[i] - y0;
|
|
1340
|
+
// Use row spacing (not row crop height) to keep parsing stable even if bands overlap slightly.
|
|
1341
|
+
const yWindow = Math.max(30, Math.round(band.spacing * 0.42));
|
|
1342
|
+
const aboveH = Math.max(8, Math.round(exprInfo.height * 0.18));
|
|
1343
|
+
const belowH = aboveH;
|
|
1344
|
+
const candidateBars = bars
|
|
1345
|
+
.filter((b) => Math.abs(b.cy - rowCenterLocal) <= yWindow)
|
|
1346
|
+
.filter((b) => b.len >= minBarRun)
|
|
1347
|
+
// Avoid giant horizontal lines like answer-box borders.
|
|
1348
|
+
.filter((b) => b.len <= Math.max(minBarRun, Math.round(exprInfo.width * 0.28)))
|
|
1349
|
+
.filter((b) => {
|
|
1350
|
+
const padX = Math.max(2, Math.round(b.len * 0.25));
|
|
1351
|
+
const x0 = clampInt(b.x0 - padX, 0, 0, exprInfo.width - 1);
|
|
1352
|
+
const x1 = clampInt(b.x1 + padX, exprInfo.width - 1, 0, exprInfo.width - 1);
|
|
1353
|
+
const aboveY0 = clampInt(b.y0 - aboveH, 0, 0, exprInfo.height - 1);
|
|
1354
|
+
const aboveY1 = clampInt(b.y0 - 1, exprInfo.height - 1, 0, exprInfo.height - 1);
|
|
1355
|
+
const belowY0 = clampInt(b.y1 + 1, exprInfo.height - 1, 0, exprInfo.height - 1);
|
|
1356
|
+
const belowY1 = clampInt(b.y1 + belowH, exprInfo.height - 1, 0, exprInfo.height - 1);
|
|
1357
|
+
const minInk = Math.max(4, Math.round(b.len * 0.05));
|
|
1358
|
+
return (rectHasInk(exprBw, exprInfo.width, exprInfo.height, { x0, y0: aboveY0, x1, y1: aboveY1 }, minInk) &&
|
|
1359
|
+
rectHasInk(exprBw, exprInfo.width, exprInfo.height, { x0, y0: belowY0, x1, y1: belowY1 }, minInk));
|
|
1360
|
+
})
|
|
1361
|
+
.sort((a, b) => Math.abs(a.cy - rowCenterLocal) - Math.abs(b.cy - rowCenterLocal) || b.len - a.len || a.cx - b.cx);
|
|
1362
|
+
const fracPairs = [];
|
|
1363
|
+
for (const bar of candidateBars) {
|
|
1364
|
+
// Adaptive crop sizes: longer fraction bars generally mean larger numerators/denominators (more digits).
|
|
1365
|
+
const padY = Math.max(1, Math.round(exprInfo.height * 0.012));
|
|
1366
|
+
const boxH = clampInt(Math.round(Math.max(exprInfo.height * 0.18, bar.len * 0.85)), 44, 14, 80);
|
|
1367
|
+
// Bar detection sometimes yields a short segment of the true fraction bar (especially under thin fonts).
|
|
1368
|
+
// Use a wider, center-based crop so 2-digit numerators/denominators aren't clipped.
|
|
1369
|
+
const desiredW = clampInt(Math.round(bar.len * 2.4), 80, Math.max(44, Math.round(exprInfo.width * 0.035)), Math.max(80, Math.round(exprInfo.width * 0.22)));
|
|
1370
|
+
const cropX0 = clampInt(Math.round(bar.cx - desiredW / 2), 0, 0, exprInfo.width - 1);
|
|
1371
|
+
const cropX1 = clampInt(cropX0 + desiredW - 1, exprInfo.width - 1, 0, exprInfo.width - 1);
|
|
1372
|
+
const cropW = Math.max(1, cropX1 - cropX0 + 1);
|
|
1373
|
+
const numY1 = clampInt(bar.y0 - padY, exprInfo.height, 0, exprInfo.height);
|
|
1374
|
+
const numY0 = clampInt(numY1 - boxH, 0, 0, numY1);
|
|
1375
|
+
const numH = numY1 - numY0;
|
|
1376
|
+
const denY0 = clampInt(bar.y1 + padY, exprInfo.height - 1, 0, exprInfo.height - 1);
|
|
1377
|
+
const denY1 = clampInt(denY0 + boxH, exprInfo.height, denY0, exprInfo.height);
|
|
1378
|
+
const denH = denY1 - denY0;
|
|
1379
|
+
if (numH < 6 || denH < 6)
|
|
1380
|
+
continue;
|
|
1381
|
+
const tightenForOcr = (rect) => {
|
|
1382
|
+
const minInk = Math.max(2, Math.round(bar.len * 0.03));
|
|
1383
|
+
const bounds = rectInkBounds(exprBw, exprInfo.width, exprInfo.height, rect, minInk);
|
|
1384
|
+
if (!bounds) {
|
|
1385
|
+
return {
|
|
1386
|
+
left: rect.x0,
|
|
1387
|
+
top: rect.y0,
|
|
1388
|
+
width: Math.max(1, rect.x1 - rect.x0 + 1),
|
|
1389
|
+
height: Math.max(1, rect.y1 - rect.y0 + 1),
|
|
1390
|
+
};
|
|
1391
|
+
}
|
|
1392
|
+
const pad = Math.max(2, Math.round(bar.len * 0.08));
|
|
1393
|
+
const x0 = clampInt(bounds.x0 - pad, 0, 0, exprInfo.width - 1);
|
|
1394
|
+
const x1 = clampInt(bounds.x1 + pad, exprInfo.width - 1, 0, exprInfo.width - 1);
|
|
1395
|
+
const y0 = clampInt(bounds.y0 - pad, 0, 0, exprInfo.height - 1);
|
|
1396
|
+
const y1 = clampInt(bounds.y1 + pad, exprInfo.height - 1, 0, exprInfo.height - 1);
|
|
1397
|
+
return {
|
|
1398
|
+
left: x0,
|
|
1399
|
+
top: y0,
|
|
1400
|
+
width: Math.max(1, x1 - x0 + 1),
|
|
1401
|
+
height: Math.max(1, y1 - y0 + 1),
|
|
1402
|
+
};
|
|
1403
|
+
};
|
|
1404
|
+
const numCrop = tightenForOcr({ x0: cropX0, y0: numY0, x1: cropX0 + cropW - 1, y1: numY0 + numH - 1 });
|
|
1405
|
+
const denCrop = tightenForOcr({ x0: cropX0, y0: denY0, x1: cropX0 + cropW - 1, y1: denY0 + denH - 1 });
|
|
1406
|
+
const n = await ocrIntegerFromImageRegion({
|
|
1407
|
+
sharp: args.sharp,
|
|
1408
|
+
source: exprGray,
|
|
1409
|
+
left: numCrop.left,
|
|
1410
|
+
top: numCrop.top,
|
|
1411
|
+
width: numCrop.width,
|
|
1412
|
+
height: numCrop.height,
|
|
1413
|
+
threshold: 200,
|
|
1414
|
+
thresholds: [150, 170, 190, 210],
|
|
1415
|
+
lang: args.lang,
|
|
1416
|
+
langPathEffective: args.langPathEffective,
|
|
1417
|
+
psms: [11, 7],
|
|
1418
|
+
minValue: 0,
|
|
1419
|
+
maxValue: 99,
|
|
1420
|
+
});
|
|
1421
|
+
const d = await ocrIntegerFromImageRegion({
|
|
1422
|
+
sharp: args.sharp,
|
|
1423
|
+
source: exprGray,
|
|
1424
|
+
left: denCrop.left,
|
|
1425
|
+
top: denCrop.top,
|
|
1426
|
+
width: denCrop.width,
|
|
1427
|
+
height: denCrop.height,
|
|
1428
|
+
threshold: 200,
|
|
1429
|
+
thresholds: [150, 170, 190, 210],
|
|
1430
|
+
lang: args.lang,
|
|
1431
|
+
langPathEffective: args.langPathEffective,
|
|
1432
|
+
psms: [7, 6],
|
|
1433
|
+
minValue: 1,
|
|
1434
|
+
maxValue: 99,
|
|
1435
|
+
});
|
|
1436
|
+
if (n === null || d === null || d === 0)
|
|
1437
|
+
continue;
|
|
1438
|
+
try {
|
|
1439
|
+
fracPairs.push({ x: bar.cx, frac: normalizeFraction({ n, d }), bar });
|
|
1440
|
+
}
|
|
1441
|
+
catch {
|
|
1442
|
+
// ignore
|
|
1443
|
+
}
|
|
1444
|
+
// Keep scanning a bit: OCR can fail for one bar; grabbing multiple candidates improves recall.
|
|
1445
|
+
if (fracPairs.length >= 4)
|
|
1446
|
+
break;
|
|
1447
|
+
}
|
|
1448
|
+
fracPairs.sort((a, b) => a.x - b.x);
|
|
1449
|
+
const fracs = [];
|
|
1450
|
+
const minDx = Math.max(10, Math.round(exprInfo.width * 0.035));
|
|
1451
|
+
for (const fp of fracPairs) {
|
|
1452
|
+
const last = fracs[fracs.length - 1];
|
|
1453
|
+
if (last && Math.abs(fp.x - last.x) < minDx)
|
|
1454
|
+
continue;
|
|
1455
|
+
fracs.push(fp);
|
|
1456
|
+
if (fracs.length >= 2)
|
|
1457
|
+
break;
|
|
1458
|
+
}
|
|
1459
|
+
// OCR answer box (digits + slash). Some rows place the answer box on the next line and slightly more
|
|
1460
|
+
// to the left; if the primary crop yields nothing parseable, do a second pass on the lower portion.
|
|
1461
|
+
const ocrAnswerFromCrop = async (crop) => {
|
|
1462
|
+
const buf = await args.sharp(base).extract(crop).png().toBuffer();
|
|
1463
|
+
const out = await ocrRecognizeBuffer({
|
|
1464
|
+
buffer: buf,
|
|
1465
|
+
lang: args.lang,
|
|
1466
|
+
langPathEffective: args.langPathEffective,
|
|
1467
|
+
tessOptions: {
|
|
1468
|
+
tessedit_char_whitelist: "0123456789/- ",
|
|
1469
|
+
tessedit_pageseg_mode: "7",
|
|
1470
|
+
user_defined_dpi: "300",
|
|
1471
|
+
},
|
|
1472
|
+
output: { text: true, tsv: false },
|
|
1473
|
+
});
|
|
1474
|
+
return out.text.replace(/\r/g, "\n").trim();
|
|
1475
|
+
};
|
|
1476
|
+
const defaultAnswerCrop = {
|
|
1477
|
+
left: answerX0,
|
|
1478
|
+
top: y0,
|
|
1479
|
+
width: Math.max(1, answerX1 - answerX0),
|
|
1480
|
+
height: rowH,
|
|
1481
|
+
};
|
|
1482
|
+
const answerCrop = (() => {
|
|
1483
|
+
if (!answerBox)
|
|
1484
|
+
return defaultAnswerCrop;
|
|
1485
|
+
// Crop tightly to the answer box's vertical range to avoid including operand fractions
|
|
1486
|
+
// (which can prepend extra digits like the row index).
|
|
1487
|
+
const padX = Math.max(6, Math.round(rowInfo.width * 0.01));
|
|
1488
|
+
const padY = Math.max(6, Math.round(rowInfo.height * 0.06));
|
|
1489
|
+
const left = clampInt(answerBox.x0 - padX, 0, 0, width - 1);
|
|
1490
|
+
const right = clampInt(answerBox.x1 + padX, width - 1, 0, width - 1);
|
|
1491
|
+
const w = Math.max(1, right - left + 1);
|
|
1492
|
+
const top = clampInt(y0 + answerBox.y0 - padY, 0, 0, height - 1);
|
|
1493
|
+
const bottom = clampInt(y0 + answerBox.y1 + padY, height, top + 1, height);
|
|
1494
|
+
return { left, top, width: w, height: Math.max(1, bottom - top) };
|
|
1495
|
+
})();
|
|
1496
|
+
let ansText = await ocrAnswerFromCrop({
|
|
1497
|
+
left: answerCrop.left,
|
|
1498
|
+
top: answerCrop.top,
|
|
1499
|
+
width: answerCrop.width,
|
|
1500
|
+
height: answerCrop.height,
|
|
1501
|
+
});
|
|
1502
|
+
let studentMixedAll = parseAllMixedNumbersLoose(ansText);
|
|
1503
|
+
let studentFracAll = parseAllFractionsLoose(ansText);
|
|
1504
|
+
if (!studentMixedAll.length && !studentFracAll.length) {
|
|
1505
|
+
const altLeft = Math.floor(width * 0.05);
|
|
1506
|
+
const altRight = Math.min(width, Math.floor(width * 0.45));
|
|
1507
|
+
const altTop = y0 + Math.floor(rowH * 0.35);
|
|
1508
|
+
if (altRight > altLeft && altTop < y1) {
|
|
1509
|
+
const altText = await ocrAnswerFromCrop({
|
|
1510
|
+
left: altLeft,
|
|
1511
|
+
top: altTop,
|
|
1512
|
+
width: Math.max(1, altRight - altLeft),
|
|
1513
|
+
height: Math.max(1, y1 - altTop),
|
|
1514
|
+
});
|
|
1515
|
+
const mixedAlt = parseAllMixedNumbersLoose(altText);
|
|
1516
|
+
const fracAlt = parseAllFractionsLoose(altText);
|
|
1517
|
+
if (mixedAlt.length || fracAlt.length) {
|
|
1518
|
+
ansText = altText;
|
|
1519
|
+
studentMixedAll = mixedAlt;
|
|
1520
|
+
studentFracAll = fracAlt;
|
|
1521
|
+
}
|
|
1522
|
+
}
|
|
1523
|
+
}
|
|
1524
|
+
// Conversion rows can place the answer box below the row midpoint band.
|
|
1525
|
+
// If we suspect a conversion prompt and still found nothing, scan lower (but stop before the next row center).
|
|
1526
|
+
if (rowHasTurn && !studentMixedAll.length && !studentFracAll.length && i + 1 < centers.length) {
|
|
1527
|
+
const spacing = centers[i + 1] - centers[i];
|
|
1528
|
+
const y1Ext = clampInt(Math.ceil(centers[i] + spacing * 0.9), height, y1, height);
|
|
1529
|
+
const extLeft = Math.floor(width * 0.05);
|
|
1530
|
+
const extRight = Math.min(width, Math.floor(width * 0.55));
|
|
1531
|
+
if (y1Ext > y0 + 8 && extRight > extLeft + 20) {
|
|
1532
|
+
const extText = await ocrAnswerFromCrop({
|
|
1533
|
+
left: extLeft,
|
|
1534
|
+
top: y0,
|
|
1535
|
+
width: Math.max(1, extRight - extLeft),
|
|
1536
|
+
height: Math.max(1, y1Ext - y0),
|
|
1537
|
+
});
|
|
1538
|
+
const mixedExt = parseAllMixedNumbersLoose(extText);
|
|
1539
|
+
const fracExt = parseAllFractionsLoose(extText);
|
|
1540
|
+
if (mixedExt.length || fracExt.length) {
|
|
1541
|
+
ansText = extText;
|
|
1542
|
+
studentMixedAll = mixedExt;
|
|
1543
|
+
studentFracAll = fracExt;
|
|
1544
|
+
}
|
|
1545
|
+
}
|
|
1546
|
+
}
|
|
1547
|
+
// Decide if this is a conversion row ("Turn ..."). Prefer explicit row text.
|
|
1548
|
+
const conversionModeHint = rowHasTurn
|
|
1549
|
+
? rowHintImproper && !rowHintMixed
|
|
1550
|
+
? "improper"
|
|
1551
|
+
: rowHintMixed && !rowHintImproper
|
|
1552
|
+
? "mixed"
|
|
1553
|
+
: null
|
|
1554
|
+
: null;
|
|
1555
|
+
let conversionMode = null;
|
|
1556
|
+
if (conversionModeHint)
|
|
1557
|
+
conversionMode = conversionModeHint;
|
|
1558
|
+
else if (rowHasTurn) {
|
|
1559
|
+
// If OCR missed the keywords, infer from the student's answer shape.
|
|
1560
|
+
if (studentMixedAll.length > 0)
|
|
1561
|
+
conversionMode = "mixed";
|
|
1562
|
+
else if (studentFracAll.length > 0)
|
|
1563
|
+
conversionMode = "improper";
|
|
1564
|
+
}
|
|
1565
|
+
// Grade conversion tasks first. Even if bar detection accidentally yields 2 fractions, "Turn" rows are conversions.
|
|
1566
|
+
if (rowHasTurn && conversionMode && fracs.length >= 1) {
|
|
1567
|
+
const givenEntry = [...fracs].sort((a, b) => b.bar.len - a.bar.len || a.x - b.x)[0];
|
|
1568
|
+
const given = givenEntry.frac;
|
|
1569
|
+
const givenBar = givenEntry.bar;
|
|
1570
|
+
if (conversionMode === "improper") {
|
|
1571
|
+
// Turn W N/D into an improper fraction.
|
|
1572
|
+
let whole = null;
|
|
1573
|
+
const m = rowLower.match(/turn\s+(-?\d+)/);
|
|
1574
|
+
const wholeFromText = m ? toIntegerOrNull(m[1]) : null;
|
|
1575
|
+
if (typeof wholeFromText === "number" && Number.isFinite(wholeFromText)) {
|
|
1576
|
+
whole = wholeFromText;
|
|
1577
|
+
}
|
|
1578
|
+
// OCR the number just to the left of the fraction (avoid the question index column). Prefer pixel OCR if it succeeds.
|
|
1579
|
+
const regionX1 = givenBar.x0 - 1;
|
|
1580
|
+
const regionW = Math.max(24, Math.round(exprInfo.width * 0.26));
|
|
1581
|
+
const idxColX1 = Math.round(exprInfo.width * 0.12);
|
|
1582
|
+
const regionX0 = Math.max(idxColX1, Math.max(0, regionX1 - regionW));
|
|
1583
|
+
const cropW = regionX1 - regionX0 + 1;
|
|
1584
|
+
if (regionX1 >= regionX0 && cropW >= 10) {
|
|
1585
|
+
const boxH = clampInt(Math.round(givenBar.len * 1.2), Math.round(exprInfo.height * 0.28), 14, Math.max(14, Math.round(exprInfo.height * 0.55)));
|
|
1586
|
+
const y0w = clampInt(Math.round(givenBar.cy - boxH), 0, 0, exprInfo.height - 1);
|
|
1587
|
+
const y1w = clampInt(Math.round(givenBar.cy + boxH), exprInfo.height, y0w + 1, exprInfo.height);
|
|
1588
|
+
const h = y1w - y0w;
|
|
1589
|
+
if (h >= 8) {
|
|
1590
|
+
const wholeFromPixels = await ocrIntegerFromImageRegion({
|
|
1591
|
+
sharp: args.sharp,
|
|
1592
|
+
source: exprGray,
|
|
1593
|
+
left: regionX0,
|
|
1594
|
+
top: y0w,
|
|
1595
|
+
width: cropW,
|
|
1596
|
+
height: h,
|
|
1597
|
+
threshold: 200,
|
|
1598
|
+
thresholds: [150, 170, 190, 210],
|
|
1599
|
+
lang: args.lang,
|
|
1600
|
+
langPathEffective: args.langPathEffective,
|
|
1601
|
+
psms: [7, 6, 11],
|
|
1602
|
+
minValue: 0,
|
|
1603
|
+
maxValue: 200,
|
|
1604
|
+
});
|
|
1605
|
+
if (wholeFromPixels !== null)
|
|
1606
|
+
whole = wholeFromPixels;
|
|
1607
|
+
}
|
|
1608
|
+
}
|
|
1609
|
+
if (whole === null)
|
|
1610
|
+
continue;
|
|
1611
|
+
const correct = normalizeFraction({ n: whole * given.d + Math.abs(given.n), d: given.d });
|
|
1612
|
+
const picked = studentFracAll.find((f) => fractionsEqual(f, correct)) ?? null;
|
|
1613
|
+
const ok = !!picked;
|
|
1614
|
+
const pts = ok ? args.pointsImproperFraction : 0;
|
|
1615
|
+
total += pts;
|
|
1616
|
+
perQuestion.push({
|
|
1617
|
+
index: idx,
|
|
1618
|
+
type: "improper_fraction",
|
|
1619
|
+
correct: ok,
|
|
1620
|
+
points: pts,
|
|
1621
|
+
studentAnswer: picked
|
|
1622
|
+
? fractionToString(picked)
|
|
1623
|
+
: studentFracAll[0]
|
|
1624
|
+
? fractionToString(studentFracAll[0])
|
|
1625
|
+
: null,
|
|
1626
|
+
correctAnswer: fractionToString(correct),
|
|
1627
|
+
});
|
|
1628
|
+
continue;
|
|
1629
|
+
}
|
|
1630
|
+
// conversionMode === "mixed"
|
|
1631
|
+
const wholeOut = Math.trunc(given.n / given.d);
|
|
1632
|
+
const rem = Math.abs(given.n % given.d);
|
|
1633
|
+
const correctFrac = normalizeFraction({ n: rem, d: given.d });
|
|
1634
|
+
const picked = studentMixedAll.find((m) => m.whole === wholeOut && fractionsEqual(m.frac, correctFrac)) ?? null;
|
|
1635
|
+
const ok = !!picked;
|
|
1636
|
+
const pts = ok ? args.pointsMixedNumber : 0;
|
|
1637
|
+
total += pts;
|
|
1638
|
+
perQuestion.push({
|
|
1639
|
+
index: idx,
|
|
1640
|
+
type: "mixed_number",
|
|
1641
|
+
correct: ok,
|
|
1642
|
+
points: pts,
|
|
1643
|
+
studentAnswer: picked
|
|
1644
|
+
? mixedNumberToString(picked.whole, picked.frac)
|
|
1645
|
+
: studentMixedAll[0]
|
|
1646
|
+
? mixedNumberToString(studentMixedAll[0].whole, studentMixedAll[0].frac)
|
|
1647
|
+
: null,
|
|
1648
|
+
correctAnswer: mixedNumberToString(wholeOut, correctFrac),
|
|
1649
|
+
});
|
|
1650
|
+
continue;
|
|
1651
|
+
}
|
|
1652
|
+
// Grade arithmetic by matching the student's answer against all possible ops.
|
|
1653
|
+
if (fracs.length >= 2) {
|
|
1654
|
+
const a = fracs[0].frac;
|
|
1655
|
+
const b = fracs[1].frac;
|
|
1656
|
+
const candidates = [];
|
|
1657
|
+
try {
|
|
1658
|
+
candidates.push({ kind: "add_subtract", frac: addFractions(a, b), pts: args.pointsAddSubtract });
|
|
1659
|
+
}
|
|
1660
|
+
catch { }
|
|
1661
|
+
try {
|
|
1662
|
+
candidates.push({ kind: "add_subtract", frac: subFractions(a, b), pts: args.pointsAddSubtract });
|
|
1663
|
+
}
|
|
1664
|
+
catch { }
|
|
1665
|
+
try {
|
|
1666
|
+
candidates.push({ kind: "multiply_divide", frac: mulFractions(a, b), pts: args.pointsMultiplyDivide });
|
|
1667
|
+
}
|
|
1668
|
+
catch { }
|
|
1669
|
+
try {
|
|
1670
|
+
candidates.push({ kind: "multiply_divide", frac: divFractions(a, b), pts: args.pointsMultiplyDivide });
|
|
1671
|
+
}
|
|
1672
|
+
catch { }
|
|
1673
|
+
let match = null;
|
|
1674
|
+
let studentPicked = null;
|
|
1675
|
+
for (const sf of studentFracAll) {
|
|
1676
|
+
const m = candidates.find((c) => fractionsEqual(sf, c.frac));
|
|
1677
|
+
if (m) {
|
|
1678
|
+
match = m;
|
|
1679
|
+
studentPicked = sf;
|
|
1680
|
+
break;
|
|
1681
|
+
}
|
|
1682
|
+
}
|
|
1683
|
+
const ok = !!match;
|
|
1684
|
+
const pts = ok && match ? match.pts : 0;
|
|
1685
|
+
total += pts;
|
|
1686
|
+
perQuestion.push({
|
|
1687
|
+
index: idx,
|
|
1688
|
+
type: match?.kind ?? "arithmetic",
|
|
1689
|
+
correct: ok,
|
|
1690
|
+
points: pts,
|
|
1691
|
+
studentAnswer: studentPicked
|
|
1692
|
+
? fractionToString(studentPicked)
|
|
1693
|
+
: studentFracAll[0]
|
|
1694
|
+
? fractionToString(studentFracAll[0])
|
|
1695
|
+
: null,
|
|
1696
|
+
correctAnswer: match ? fractionToString(match.frac) : fractionToString(addFractions(a, b)),
|
|
1697
|
+
debug: debugEnabled
|
|
1698
|
+
? {
|
|
1699
|
+
operands: [fractionToString(a), fractionToString(b)],
|
|
1700
|
+
studentCandidates: studentFracAll.map(fractionToString).slice(0, 8),
|
|
1701
|
+
}
|
|
1702
|
+
: undefined,
|
|
1703
|
+
});
|
|
1704
|
+
continue;
|
|
1705
|
+
}
|
|
586
1706
|
}
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
1707
|
+
total += args.bonusPoints;
|
|
1708
|
+
// Return only if we graded a meaningful number of rows (avoids false positives on unrelated images).
|
|
1709
|
+
if (perQuestion.length < Math.min(centers.length, 6))
|
|
1710
|
+
return null;
|
|
1711
|
+
return { extractedQuestionCount: perQuestion.length, score: total, perQuestion };
|
|
1712
|
+
}
|
|
1713
|
+
/**
 * Source text of the Python helper that transcribes an audio file with faster-whisper.
 *
 * The script parses --path (required), --model, --language, --task, --beam-size,
 * --vad-filter, --max-chars and --include-segments, runs WhisperModel on CPU with
 * int8 compute, and writes one JSON object to stdout with the keys: path, model,
 * task, language, languageProbability, durationSeconds, beamSize, vadFilter,
 * maxChars, truncated, text, and (only when --include-segments is non-zero) segments.
 *
 * If faster-whisper cannot be imported, an install hint is written to stderr and
 * the import error is re-raised.
 *
 * stdout is switched to UTF-8 before the JSON is written: the payload is produced
 * with ensure_ascii=False, so with a non-UTF-8 default stream encoding a non-ASCII
 * transcript could raise UnicodeEncodeError or reach the caller mis-encoded.
 *
 * NOTE(review): assumes the Node side decodes the child's stdout as UTF-8 and
 * rewrites the temp script when this text changes — confirm against the caller.
 */
const FASTER_WHISPER_PY_SCRIPT_V1 = `# NodeBench MCP audio transcription helper (faster-whisper)
# This file is written to a temp directory at runtime.
import argparse
import json
import sys


def main() -> None:
    p = argparse.ArgumentParser()
    p.add_argument("--path", required=True)
    p.add_argument("--model", default="tiny.en")
    p.add_argument("--language", default="")
    p.add_argument("--task", default="transcribe")
    p.add_argument("--beam-size", type=int, default=5)
    p.add_argument("--vad-filter", type=int, default=0)
    p.add_argument("--max-chars", type=int, default=12000)
    p.add_argument("--include-segments", type=int, default=0)
    args = p.parse_args()

    try:
        from faster_whisper import WhisperModel
    except Exception:
        sys.stderr.write(
            "Missing python dependency: faster-whisper. Install with: pip install faster-whisper\\n"
        )
        raise

    model = WhisperModel(args.model, device="cpu", compute_type="int8")
    segments, info = model.transcribe(
        args.path,
        beam_size=max(1, int(args.beam_size)),
        language=(args.language or None),
        task=(args.task or "transcribe"),
        vad_filter=bool(int(args.vad_filter)),
        word_timestamps=False,
        temperature=0.0,
    )

    include_segments = bool(int(args.include_segments))
    max_chars = max(200, int(args.max_chars))

    parts = []
    segs = []
    char_budget = 0
    truncated = False

    for seg in segments:
        t = str(getattr(seg, "text", "") or "")
        if not t:
            continue
        parts.append(t)
        if include_segments:
            segs.append(
                {
                    "start": float(getattr(seg, "start", 0.0) or 0.0),
                    "end": float(getattr(seg, "end", 0.0) or 0.0),
                    "text": t,
                }
            )
        char_budget += len(t)
        if char_budget >= max_chars:
            truncated = True
            break

    text = "".join(parts).strip()
    if len(text) > max_chars:
        text = text[:max_chars]
        truncated = True

    out = {
        "path": args.path,
        "model": args.model,
        "task": args.task,
        "language": getattr(info, "language", None),
        "languageProbability": getattr(info, "language_probability", None),
        "durationSeconds": getattr(info, "duration", None),
        "beamSize": int(args.beam_size),
        "vadFilter": bool(int(args.vad_filter)),
        "maxChars": max_chars,
        "truncated": truncated,
        "text": text,
    }
    if include_segments:
        out["segments"] = segs
    # The JSON is emitted with ensure_ascii=False, so force UTF-8 on stdout; the
    # platform default encoding may not be able to represent the transcript.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
    sys.stdout.write(json.dumps(out, ensure_ascii=False))


if __name__ == "__main__":
    main()
`;
|
|
595
1803
|
function findPythonExecutable() {
|
|
596
1804
|
const override = process.env.NODEBENCH_PYTHON ||
|
|
@@ -998,7 +2206,15 @@ async function loadXlsxTable(args, opts) {
|
|
|
998
2206
|
dataRows,
|
|
999
2207
|
};
|
|
1000
2208
|
}
|
|
1001
|
-
|
|
2209
|
+
/**
 * Names of the image-based solver tools, collected in a Set for constant-time
 * membership checks.
 * NOTE(review): presumably used to gate or filter these tools out of the exported
 * tool list; the consumer is not visible here — confirm.
 */
const GAIA_SOLVER_NAMES = new Set()
    .add("solve_red_green_deviation_average_from_image")
    .add("solve_green_polygon_area_from_image")
    .add("grade_fraction_quiz_from_image")
    .add("extract_fractions_and_simplify_from_image")
    .add("solve_bass_clef_age_from_image")
    .add("solve_storage_upgrade_cost_per_file_from_image");
|
|
2217
|
+
const _ALL_LOCAL_FILE_TOOLS = [
|
|
1002
2218
|
{
|
|
1003
2219
|
name: "read_csv_file",
|
|
1004
2220
|
description: "Read a local CSV file and return a bounded table preview (headers + rows). Deterministic, no network.",
|
|
@@ -2850,6 +4066,1825 @@ export const localFileTools = [
|
|
|
2850
4066
|
};
|
|
2851
4067
|
},
|
|
2852
4068
|
},
|
|
4069
|
+
{
|
|
4070
|
+
name: "solve_red_green_deviation_average_from_image",
|
|
4071
|
+
description: "Extract red and green numbers from an image, compute population stdev(red) and sample stdev(green), then return their average. Deterministic, no network.",
|
|
4072
|
+
inputSchema: {
|
|
4073
|
+
type: "object",
|
|
4074
|
+
properties: {
|
|
4075
|
+
path: {
|
|
4076
|
+
type: "string",
|
|
4077
|
+
description: "Path to a local image file (absolute or relative to current working directory).",
|
|
4078
|
+
},
|
|
4079
|
+
decimals: {
|
|
4080
|
+
type: "number",
|
|
4081
|
+
description: "Decimal places to round to (default: 3).",
|
|
4082
|
+
default: 3,
|
|
4083
|
+
},
|
|
4084
|
+
lang: {
|
|
4085
|
+
type: "string",
|
|
4086
|
+
description: "Tesseract language code (default: eng).",
|
|
4087
|
+
default: "eng",
|
|
4088
|
+
},
|
|
4089
|
+
langPath: {
|
|
4090
|
+
type: "string",
|
|
4091
|
+
description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
|
|
4092
|
+
},
|
|
4093
|
+
maxPixels: {
|
|
4094
|
+
type: "number",
|
|
4095
|
+
description: "Safety cap on pixels to process (default: 6,000,000).",
|
|
4096
|
+
default: 6000000,
|
|
4097
|
+
},
|
|
4098
|
+
debug: {
|
|
4099
|
+
type: "boolean",
|
|
4100
|
+
description: "If true, include detailed debug info (labels + segment assignments).",
|
|
4101
|
+
default: false,
|
|
4102
|
+
},
|
|
4103
|
+
},
|
|
4104
|
+
required: ["path"],
|
|
4105
|
+
},
|
|
4106
|
+
handler: async (args) => {
|
|
4107
|
+
const filePath = resolveLocalPath(args?.path);
|
|
4108
|
+
if (!existsSync(filePath))
|
|
4109
|
+
throw new Error(`File not found: ${filePath}`);
|
|
4110
|
+
const decimals = clampInt(args?.decimals, 3, 0, 8);
|
|
4111
|
+
const lang = String(args?.lang ?? "eng").trim() || "eng";
|
|
4112
|
+
const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
|
|
4113
|
+
const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
|
|
4114
|
+
const langPathEffective = langPathArg
|
|
4115
|
+
? resolveLocalPath(langPathArg)
|
|
4116
|
+
: existsSync(defaultLangPath)
|
|
4117
|
+
? defaultLangPath
|
|
4118
|
+
: null;
|
|
4119
|
+
const maxPixels = clampInt(args?.maxPixels, 6000000, 10000, 100_000_000);
|
|
4120
|
+
const redOcr = await ocrRecognizeImageFileWithColorMask({
|
|
4121
|
+
filePath,
|
|
4122
|
+
color: "red",
|
|
4123
|
+
lang,
|
|
4124
|
+
langPathEffective,
|
|
4125
|
+
maxPixels,
|
|
4126
|
+
});
|
|
4127
|
+
const greenOcr = await ocrRecognizeImageFileWithColorMask({
|
|
4128
|
+
filePath,
|
|
4129
|
+
color: "green",
|
|
4130
|
+
lang,
|
|
4131
|
+
langPathEffective,
|
|
4132
|
+
maxPixels,
|
|
4133
|
+
});
|
|
4134
|
+
// OCR on masked grids can concatenate adjacent numbers. Recover by chunking digit runs.
|
|
4135
|
+
const repairTwoDigitGrid = (nums) => {
|
|
4136
|
+
const ones = nums.filter((n) => Number.isFinite(n) && n >= 0 && n <= 9);
|
|
4137
|
+
const twos = nums.filter((n) => Number.isFinite(n) && n >= 10 && n <= 99);
|
|
4138
|
+
// If OCR dropped the leading digit for a handful of tokens (common with '5'),
|
|
4139
|
+
// repair by choosing the 2-digit candidate closest to the distribution median.
|
|
4140
|
+
if (twos.length < 10 || ones.length === 0 || ones.length > 6)
|
|
4141
|
+
return twos;
|
|
4142
|
+
const sorted = [...twos].sort((a, b) => a - b);
|
|
4143
|
+
const med = sorted[Math.floor(sorted.length / 2)];
|
|
4144
|
+
const repaired = [...twos];
|
|
4145
|
+
for (const d of ones) {
|
|
4146
|
+
let best = 10 + d;
|
|
4147
|
+
let bestDist = Math.abs(best - med);
|
|
4148
|
+
for (let k = 2; k <= 9; k++) {
|
|
4149
|
+
const cand = k * 10 + d;
|
|
4150
|
+
const dist = Math.abs(cand - med);
|
|
4151
|
+
if (dist < bestDist) {
|
|
4152
|
+
best = cand;
|
|
4153
|
+
bestDist = dist;
|
|
4154
|
+
}
|
|
4155
|
+
}
|
|
4156
|
+
repaired.push(best);
|
|
4157
|
+
}
|
|
4158
|
+
return repaired;
|
|
4159
|
+
};
|
|
4160
|
+
const redRaw = extractChunkedIntsFromText(redOcr.text, { chunkSize: 2, min: 0, max: 99 });
|
|
4161
|
+
const greenRaw = extractChunkedIntsFromText(greenOcr.text, { chunkSize: 2, min: 0, max: 99 });
|
|
4162
|
+
const redNums = repairTwoDigitGrid(redRaw);
|
|
4163
|
+
const greenNums = repairTwoDigitGrid(greenRaw);
|
|
4164
|
+
if (!redNums.length)
|
|
4165
|
+
throw new Error("No red numbers found via OCR");
|
|
4166
|
+
if (greenNums.length < 2)
|
|
4167
|
+
throw new Error("Need at least 2 green numbers via OCR");
|
|
4168
|
+
const redDev = pstdev(redNums);
|
|
4169
|
+
const greenDev = stdev(greenNums);
|
|
4170
|
+
const avg = (redDev + greenDev) / 2;
|
|
4171
|
+
const rounded = Number(avg.toFixed(decimals));
|
|
4172
|
+
return {
|
|
4173
|
+
path: filePath,
|
|
4174
|
+
decimals,
|
|
4175
|
+
redCount: redNums.length,
|
|
4176
|
+
greenCount: greenNums.length,
|
|
4177
|
+
redPstdev: Number(redDev.toFixed(decimals + 3)),
|
|
4178
|
+
greenStdev: Number(greenDev.toFixed(decimals + 3)),
|
|
4179
|
+
average: rounded,
|
|
4180
|
+
answer: rounded.toFixed(decimals),
|
|
4181
|
+
};
|
|
4182
|
+
},
|
|
4183
|
+
},
|
|
4184
|
+
{
|
|
4185
|
+
name: "solve_green_polygon_area_from_image",
|
|
4186
|
+
description: "Compute the area of a green filled polygon in an image by pixel segmentation, calibrating pixel-to-unit scale from nearby purple length labels. Deterministic, no network.",
|
|
4187
|
+
inputSchema: {
|
|
4188
|
+
type: "object",
|
|
4189
|
+
properties: {
|
|
4190
|
+
path: {
|
|
4191
|
+
type: "string",
|
|
4192
|
+
description: "Path to a local image file (absolute or relative to current working directory).",
|
|
4193
|
+
},
|
|
4194
|
+
lang: {
|
|
4195
|
+
type: "string",
|
|
4196
|
+
description: "Tesseract language code for reading purple numeric labels (default: eng).",
|
|
4197
|
+
default: "eng",
|
|
4198
|
+
},
|
|
4199
|
+
langPath: {
|
|
4200
|
+
type: "string",
|
|
4201
|
+
description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
|
|
4202
|
+
},
|
|
4203
|
+
maxPixels: {
|
|
4204
|
+
type: "number",
|
|
4205
|
+
description: "Safety cap on pixels to process (default: 6,000,000).",
|
|
4206
|
+
default: 6000000,
|
|
4207
|
+
},
|
|
4208
|
+
},
|
|
4209
|
+
required: ["path"],
|
|
4210
|
+
},
|
|
4211
|
+
handler: async (args) => {
|
|
4212
|
+
const filePath = resolveLocalPath(args?.path);
|
|
4213
|
+
if (!existsSync(filePath))
|
|
4214
|
+
throw new Error(`File not found: ${filePath}`);
|
|
4215
|
+
const sharp = await getSharpOptional();
|
|
4216
|
+
if (!sharp)
|
|
4217
|
+
throw new Error("Missing optional dependency: sharp. Install it to use polygon area parsing.");
|
|
4218
|
+
const lang = String(args?.lang ?? "eng").trim() || "eng";
|
|
4219
|
+
const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
|
|
4220
|
+
const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
|
|
4221
|
+
const langPathEffective = langPathArg
|
|
4222
|
+
? resolveLocalPath(langPathArg)
|
|
4223
|
+
: existsSync(defaultLangPath)
|
|
4224
|
+
? defaultLangPath
|
|
4225
|
+
: null;
|
|
4226
|
+
const maxPixels = clampInt(args?.maxPixels, 6000000, 10000, 100_000_000);
|
|
4227
|
+
const debug = args?.debug === true || process.env.NODEBENCH_DEBUG_GREEN_POLYGON === "1";
|
|
4228
|
+
// Segment green pixels (filled polygon).
|
|
4229
|
+
const image = sharp(await readFile(filePath));
|
|
4230
|
+
const meta = await image.metadata();
|
|
4231
|
+
const w = meta.width ?? 0;
|
|
4232
|
+
const h = meta.height ?? 0;
|
|
4233
|
+
if (!w || !h)
|
|
4234
|
+
throw new Error("Unable to read image dimensions");
|
|
4235
|
+
if (w * h > maxPixels) {
|
|
4236
|
+
throw new Error(`Refusing huge image (${w}x${h}) for polygon parsing (maxPixels=${maxPixels})`);
|
|
4237
|
+
}
|
|
4238
|
+
const { data, info } = await image.ensureAlpha().raw().toBuffer({ resolveWithObject: true });
|
|
4239
|
+
const width = info.width;
|
|
4240
|
+
const height = info.height;
|
|
4241
|
+
const green = new Uint8Array(width * height);
|
|
4242
|
+
let areaPx = 0;
|
|
4243
|
+
for (let i = 0, j = 0; i < data.length; i += 4, j++) {
|
|
4244
|
+
const r = data[i];
|
|
4245
|
+
const g = data[i + 1];
|
|
4246
|
+
const b = data[i + 2];
|
|
4247
|
+
const a = data[i + 3];
|
|
4248
|
+
// Conservative "green" heuristic: G is high and dominates R/B.
|
|
4249
|
+
const isGreen = a >= 40 && g >= 110 && g - r >= 25 && g - b >= 25;
|
|
4250
|
+
if (isGreen) {
|
|
4251
|
+
green[j] = 1;
|
|
4252
|
+
areaPx++;
|
|
4253
|
+
}
|
|
4254
|
+
}
|
|
4255
|
+
if (!areaPx)
|
|
4256
|
+
throw new Error("No green region detected");
|
|
4257
|
+
// Trace the polygon boundary on a grid, then assign labeled lengths to segments and compute area in units.
|
|
4258
|
+
const vertW = width + 1;
|
|
4259
|
+
const adj = new Map();
|
|
4260
|
+
const pushNeighbor = (a, b) => {
|
|
4261
|
+
const arr = adj.get(a);
|
|
4262
|
+
if (arr) {
|
|
4263
|
+
if (!arr.includes(b))
|
|
4264
|
+
arr.push(b);
|
|
4265
|
+
}
|
|
4266
|
+
else {
|
|
4267
|
+
adj.set(a, [b]);
|
|
4268
|
+
}
|
|
4269
|
+
};
|
|
4270
|
+
const addEdge = (ax, ay, bx, by) => {
|
|
4271
|
+
const aId = ay * vertW + ax;
|
|
4272
|
+
const bId = by * vertW + bx;
|
|
4273
|
+
pushNeighbor(aId, bId);
|
|
4274
|
+
pushNeighbor(bId, aId);
|
|
4275
|
+
};
|
|
4276
|
+
const pix = (x, y) => green[y * width + x] === 1;
|
|
4277
|
+
for (let y = 0; y < height; y++) {
|
|
4278
|
+
for (let x = 0; x < width; x++) {
|
|
4279
|
+
if (!pix(x, y))
|
|
4280
|
+
continue;
|
|
4281
|
+
// Add boundary edges where the neighbor pixel is empty/out-of-bounds.
|
|
4282
|
+
if (y === 0 || !pix(x, y - 1))
|
|
4283
|
+
addEdge(x, y, x + 1, y); // top edge
|
|
4284
|
+
if (y === height - 1 || !pix(x, y + 1))
|
|
4285
|
+
addEdge(x, y + 1, x + 1, y + 1); // bottom edge
|
|
4286
|
+
if (x === 0 || !pix(x - 1, y))
|
|
4287
|
+
addEdge(x, y, x, y + 1); // left edge
|
|
4288
|
+
if (x === width - 1 || !pix(x + 1, y))
|
|
4289
|
+
addEdge(x + 1, y, x + 1, y + 1); // right edge
|
|
4290
|
+
}
|
|
4291
|
+
}
|
|
4292
|
+
if (!adj.size)
|
|
4293
|
+
throw new Error("Failed to build polygon boundary graph");
|
|
4294
|
+
const idToXY = (id) => ({ x: id % vertW, y: Math.floor(id / vertW) });
|
|
4295
|
+
// Extract all boundary cycles (outer boundary + holes). Filled pixel regions can contain holes,
|
|
4296
|
+
// and we must subtract them from the outer boundary area.
|
|
4297
|
+
// Order-independent key for the edge between vertex ids `a` and `b`:
// the smaller id always comes first, so (a, b) and (b, a) map to the same string.
const edgeKey = (a, b) => {
    const aFirst = a < b;
    const lo = aFirst ? a : b;
    const hi = aFirst ? b : a;
    return `${lo}-${hi}`;
};
|
|
4298
|
+
const visitedEdge = new Set();
|
|
4299
|
+
const maxSteps = Math.max(10_000, width * height);
|
|
4300
|
+
const cycleIds = [];
|
|
4301
|
+
for (const [u, ns] of adj.entries()) {
|
|
4302
|
+
for (const v of ns) {
|
|
4303
|
+
const k0 = edgeKey(u, v);
|
|
4304
|
+
if (visitedEdge.has(k0))
|
|
4305
|
+
continue;
|
|
4306
|
+
const pathIds = [u];
|
|
4307
|
+
let prev = u;
|
|
4308
|
+
let curr = v;
|
|
4309
|
+
visitedEdge.add(k0);
|
|
4310
|
+
for (let step = 0; step < maxSteps; step++) {
|
|
4311
|
+
pathIds.push(curr);
|
|
4312
|
+
if (curr === u)
|
|
4313
|
+
break;
|
|
4314
|
+
const nbrs = adj.get(curr) ?? [];
|
|
4315
|
+
if (!nbrs.length)
|
|
4316
|
+
break;
|
|
4317
|
+
// Prefer an unvisited edge continuing forward, else fall back to "not prev".
|
|
4318
|
+
let next = nbrs.find((n) => n !== prev && !visitedEdge.has(edgeKey(curr, n))) ??
|
|
4319
|
+
nbrs.find((n) => n !== prev) ??
|
|
4320
|
+
nbrs[0];
|
|
4321
|
+
if (typeof next !== "number")
|
|
4322
|
+
break;
|
|
4323
|
+
visitedEdge.add(edgeKey(curr, next));
|
|
4324
|
+
prev = curr;
|
|
4325
|
+
curr = next;
|
|
4326
|
+
}
|
|
4327
|
+
if (pathIds.length >= 4 && pathIds[pathIds.length - 1] === u) {
|
|
4328
|
+
cycleIds.push(pathIds);
|
|
4329
|
+
}
|
|
4330
|
+
}
|
|
4331
|
+
}
|
|
4332
|
+
if (!cycleIds.length)
|
|
4333
|
+
throw new Error("Failed to extract boundary cycles");
|
|
4334
|
+
const cycleData = [];
|
|
4335
|
+
for (const ids of cycleIds) {
|
|
4336
|
+
const pts = ids.map(idToXY);
|
|
4337
|
+
if (pts.length < 4)
|
|
4338
|
+
continue;
|
|
4339
|
+
let x0 = pts[0].x, x1 = pts[0].x, y0 = pts[0].y, y1 = pts[0].y;
|
|
4340
|
+
for (const p of pts) {
|
|
4341
|
+
if (p.x < x0)
|
|
4342
|
+
x0 = p.x;
|
|
4343
|
+
if (p.x > x1)
|
|
4344
|
+
x1 = p.x;
|
|
4345
|
+
if (p.y < y0)
|
|
4346
|
+
y0 = p.y;
|
|
4347
|
+
if (p.y > y1)
|
|
4348
|
+
y1 = p.y;
|
|
4349
|
+
}
|
|
4350
|
+
let twiceArea = 0;
|
|
4351
|
+
for (let i = 0; i < pts.length - 1; i++) {
|
|
4352
|
+
twiceArea += pts[i].x * pts[i + 1].y - pts[i + 1].x * pts[i].y;
|
|
4353
|
+
}
|
|
4354
|
+
const areaPx2 = Math.abs(twiceArea) / 2;
|
|
4355
|
+
if (!Number.isFinite(areaPx2) || areaPx2 < 10)
|
|
4356
|
+
continue;
|
|
4357
|
+
cycleData.push({ pts, areaPx2, box: { x0, y0, x1, y1 } });
|
|
4358
|
+
}
|
|
4359
|
+
if (!cycleData.length)
|
|
4360
|
+
throw new Error("No valid boundary cycles after filtering");
|
|
4361
|
+
const allSegs = [];
|
|
4362
|
+
const cycleSegs = [];
|
|
4363
|
+
for (let cycleIndex = 0; cycleIndex < cycleData.length; cycleIndex++) {
|
|
4364
|
+
const pts = cycleData[cycleIndex].pts;
|
|
4365
|
+
const segs = [];
|
|
4366
|
+
let runDir = null;
|
|
4367
|
+
let runLen = 0;
|
|
4368
|
+
let sx = pts[0].x;
|
|
4369
|
+
let sy = pts[0].y;
|
|
4370
|
+
for (let i = 1; i < pts.length; i++) {
|
|
4371
|
+
const a = pts[i - 1];
|
|
4372
|
+
const b = pts[i];
|
|
4373
|
+
const dx = b.x - a.x;
|
|
4374
|
+
const dy = b.y - a.y;
|
|
4375
|
+
const dir = dx === 1 ? "R" : dx === -1 ? "L" : dy === 1 ? "D" : "U";
|
|
4376
|
+
if (runDir === null) {
|
|
4377
|
+
runDir = dir;
|
|
4378
|
+
runLen = 1;
|
|
4379
|
+
sx = a.x;
|
|
4380
|
+
sy = a.y;
|
|
4381
|
+
continue;
|
|
4382
|
+
}
|
|
4383
|
+
if (dir === runDir) {
|
|
4384
|
+
runLen++;
|
|
4385
|
+
continue;
|
|
4386
|
+
}
|
|
4387
|
+
segs.push({
|
|
4388
|
+
dir: runDir,
|
|
4389
|
+
pxLen: runLen,
|
|
4390
|
+
x0: sx,
|
|
4391
|
+
y0: sy,
|
|
4392
|
+
x1: a.x,
|
|
4393
|
+
y1: a.y,
|
|
4394
|
+
unitLen: null,
|
|
4395
|
+
labelRaw: null,
|
|
4396
|
+
cycleIndex,
|
|
4397
|
+
});
|
|
4398
|
+
runDir = dir;
|
|
4399
|
+
runLen = 1;
|
|
4400
|
+
sx = a.x;
|
|
4401
|
+
sy = a.y;
|
|
4402
|
+
}
|
|
4403
|
+
if (runDir !== null && runLen > 0) {
|
|
4404
|
+
const lastA = pts[pts.length - 2];
|
|
4405
|
+
segs.push({
|
|
4406
|
+
dir: runDir,
|
|
4407
|
+
pxLen: runLen,
|
|
4408
|
+
x0: sx,
|
|
4409
|
+
y0: sy,
|
|
4410
|
+
x1: lastA.x,
|
|
4411
|
+
y1: lastA.y,
|
|
4412
|
+
unitLen: null,
|
|
4413
|
+
labelRaw: null,
|
|
4414
|
+
cycleIndex,
|
|
4415
|
+
});
|
|
4416
|
+
}
|
|
4417
|
+
cycleSegs.push(segs);
|
|
4418
|
+
allSegs.push(...segs);
|
|
4419
|
+
}
|
|
4420
|
+
if (allSegs.length < 4)
|
|
4421
|
+
throw new Error("Failed to simplify boundary into segments");
|
|
4422
|
+
// Build a purple-only binary mask for labels.
|
|
4423
|
+
const purpleBw = new Uint8Array(width * height);
|
|
4424
|
+
for (let i = 0, j = 0; i < data.length; i += 4, j++) {
|
|
4425
|
+
const r = data[i];
|
|
4426
|
+
const g = data[i + 1];
|
|
4427
|
+
const b = data[i + 2];
|
|
4428
|
+
const a = data[i + 3];
|
|
4429
|
+
// Purple length labels: relatively high R and B, lower G. Keep this somewhat permissive to
|
|
4430
|
+
// avoid dropping thin digits (e.g. "4") due to anti-aliasing.
|
|
4431
|
+
const isPurple = a >= 40 && r >= 120 && b >= 120 && g <= 220 && r - g >= 10 && b - g >= 10;
|
|
4432
|
+
purpleBw[j] = isPurple ? 0 : 255;
|
|
4433
|
+
}
|
|
4434
|
+
/**
 * Turn raw OCR text of a length label into a number.
 *
 * Commas are treated as decimal points, every character other than a digit or a
 * dot is dropped, and when several dots remain only the first one is kept as the
 * decimal separator.
 *
 * @param {unknown} raw OCR output; null/undefined are treated as an empty string.
 * @returns {number|null} The parsed finite number, or null when nothing numeric is left.
 */
const parseLabelValue = (raw) => {
    const digitsAndDots = String(raw ?? "")
        .trim()
        .replace(/,/g, ".")
        .replace(/[^0-9.]/g, "");
    if (digitsAndDots === "") {
        return null;
    }
    const [head, ...tail] = digitsAndDots.split(".");
    // More than one dot: keep the first as the separator and merge the rest.
    const candidate = tail.length > 1 ? `${head}.${tail.join("")}` : digitsAndDots;
    const value = Number.parseFloat(candidate);
    if (!Number.isFinite(value)) {
        return null;
    }
    return value;
};
|
|
4446
|
+
const purpleBuf = Buffer.from(purpleBw);
|
|
4447
|
+
const visited = new Uint8Array(width * height);
|
|
4448
|
+
const inBounds = (x, y) => x >= 0 && x < width && y >= 0 && y < height;
|
|
4449
|
+
const neighbors = [
|
|
4450
|
+
[1, 0],
|
|
4451
|
+
[-1, 0],
|
|
4452
|
+
[0, 1],
|
|
4453
|
+
[0, -1],
|
|
4454
|
+
[1, 1],
|
|
4455
|
+
[-1, -1],
|
|
4456
|
+
[1, -1],
|
|
4457
|
+
[-1, 1],
|
|
4458
|
+
];
|
|
4459
|
+
const comps = [];
|
|
4460
|
+
const idx2 = (x, y) => y * width + x;
|
|
4461
|
+
for (let y = 0; y < height; y++) {
|
|
4462
|
+
for (let x = 0; x < width; x++) {
|
|
4463
|
+
const startIdx = idx2(x, y);
|
|
4464
|
+
if (visited[startIdx])
|
|
4465
|
+
continue;
|
|
4466
|
+
visited[startIdx] = 1;
|
|
4467
|
+
if (purpleBw[startIdx] >= 128)
|
|
4468
|
+
continue;
|
|
4469
|
+
let area = 0;
|
|
4470
|
+
let sx2 = 0;
|
|
4471
|
+
let sy2 = 0;
|
|
4472
|
+
let x0 = x, x1 = x, y0 = y, y1 = y;
|
|
4473
|
+
const qx = [x];
|
|
4474
|
+
const qy = [y];
|
|
4475
|
+
for (let qi = 0; qi < qx.length; qi++) {
|
|
4476
|
+
const px = qx[qi];
|
|
4477
|
+
const py = qy[qi];
|
|
4478
|
+
const pidx = idx2(px, py);
|
|
4479
|
+
if (purpleBw[pidx] >= 128)
|
|
4480
|
+
continue;
|
|
4481
|
+
area++;
|
|
4482
|
+
sx2 += px;
|
|
4483
|
+
sy2 += py;
|
|
4484
|
+
if (px < x0)
|
|
4485
|
+
x0 = px;
|
|
4486
|
+
if (px > x1)
|
|
4487
|
+
x1 = px;
|
|
4488
|
+
if (py < y0)
|
|
4489
|
+
y0 = py;
|
|
4490
|
+
if (py > y1)
|
|
4491
|
+
y1 = py;
|
|
4492
|
+
for (const [dx, dy] of neighbors) {
|
|
4493
|
+
const nx = px + dx;
|
|
4494
|
+
const ny = py + dy;
|
|
4495
|
+
if (!inBounds(nx, ny))
|
|
4496
|
+
continue;
|
|
4497
|
+
const nidx = idx2(nx, ny);
|
|
4498
|
+
if (visited[nidx])
|
|
4499
|
+
continue;
|
|
4500
|
+
visited[nidx] = 1;
|
|
4501
|
+
if (purpleBw[nidx] < 128) {
|
|
4502
|
+
qx.push(nx);
|
|
4503
|
+
qy.push(ny);
|
|
4504
|
+
}
|
|
4505
|
+
}
|
|
4506
|
+
}
|
|
4507
|
+
const bw = x1 - x0 + 1;
|
|
4508
|
+
const bh = y1 - y0 + 1;
|
|
4509
|
+
if (area < 6)
|
|
4510
|
+
continue;
|
|
4511
|
+
if (bw < 2 || bh < 2)
|
|
4512
|
+
continue;
|
|
4513
|
+
if (bw > Math.round(width * 0.25) || bh > Math.round(height * 0.25))
|
|
4514
|
+
continue;
|
|
4515
|
+
const cx = sx2 / area;
|
|
4516
|
+
const cy = sy2 / area;
|
|
4517
|
+
comps.push({ area, x0, y0, x1, y1, cx, cy });
|
|
4518
|
+
}
|
|
4519
|
+
}
|
|
4520
|
+
// Group digit components into label boxes.
|
|
4521
|
+
comps.sort((a, b) => a.cy - b.cy || a.cx - b.cx);
|
|
4522
|
+
const rowTol = Math.max(8, Math.round(height * 0.03));
|
|
4523
|
+
const rows = [];
|
|
4524
|
+
for (const c of comps) {
|
|
4525
|
+
const row = rows.find((r) => Math.abs(r.cy - c.cy) <= rowTol);
|
|
4526
|
+
if (!row) {
|
|
4527
|
+
rows.push({ cy: c.cy, comps: [c] });
|
|
4528
|
+
continue;
|
|
4529
|
+
}
|
|
4530
|
+
row.comps.push(c);
|
|
4531
|
+
row.cy = (row.cy * (row.comps.length - 1) + c.cy) / row.comps.length;
|
|
4532
|
+
}
|
|
4533
|
+
const labelBoxes = [];
|
|
4534
|
+
const xGapTol = Math.max(6, Math.round(width * 0.015));
|
|
4535
|
+
for (const r of rows) {
|
|
4536
|
+
const cs = [...r.comps].sort((a, b) => a.cx - b.cx);
|
|
4537
|
+
let group = [];
|
|
4538
|
+
const flush = () => {
|
|
4539
|
+
if (!group.length)
|
|
4540
|
+
return;
|
|
4541
|
+
let x0 = group[0].x0, y0 = group[0].y0, x1 = group[0].x1, y1 = group[0].y1;
|
|
4542
|
+
for (const c of group) {
|
|
4543
|
+
if (c.x0 < x0)
|
|
4544
|
+
x0 = c.x0;
|
|
4545
|
+
if (c.y0 < y0)
|
|
4546
|
+
y0 = c.y0;
|
|
4547
|
+
if (c.x1 > x1)
|
|
4548
|
+
x1 = c.x1;
|
|
4549
|
+
if (c.y1 > y1)
|
|
4550
|
+
y1 = c.y1;
|
|
4551
|
+
}
|
|
4552
|
+
labelBoxes.push({ x0, y0, x1, y1, cx: (x0 + x1) / 2, cy: (y0 + y1) / 2 });
|
|
4553
|
+
group = [];
|
|
4554
|
+
};
|
|
4555
|
+
for (const c of cs) {
|
|
4556
|
+
const last = group[group.length - 1];
|
|
4557
|
+
if (!last) {
|
|
4558
|
+
group.push(c);
|
|
4559
|
+
continue;
|
|
4560
|
+
}
|
|
4561
|
+
const gap = c.x0 - last.x1;
|
|
4562
|
+
if (gap <= xGapTol) {
|
|
4563
|
+
group.push(c);
|
|
4564
|
+
}
|
|
4565
|
+
else {
|
|
4566
|
+
flush();
|
|
4567
|
+
group.push(c);
|
|
4568
|
+
}
|
|
4569
|
+
}
|
|
4570
|
+
flush();
|
|
4571
|
+
}
|
|
4572
|
+
const ocrLabelBox = async (box) => {
|
|
4573
|
+
const pad = 2;
|
|
4574
|
+
const left = clampInt(box.x0 - pad, 0, 0, width - 1);
|
|
4575
|
+
const top = clampInt(box.y0 - pad, 0, 0, height - 1);
|
|
4576
|
+
const right = clampInt(box.x1 + pad, width - 1, 0, width - 1);
|
|
4577
|
+
const bottom = clampInt(box.y1 + pad, height - 1, 0, height - 1);
|
|
4578
|
+
const w2 = right - left + 1;
|
|
4579
|
+
const h2 = bottom - top + 1;
|
|
4580
|
+
if (w2 < 4 || h2 < 4)
|
|
4581
|
+
return null;
|
|
4582
|
+
const targetW = 140;
|
|
4583
|
+
const scale = w2 < targetW ? Math.max(1, Math.min(8, Math.ceil(targetW / w2))) : 1;
|
|
4584
|
+
const buf = await sharp(purpleBuf, { raw: { width, height, channels: 1 } })
|
|
4585
|
+
.extract({ left, top, width: w2, height: h2 })
|
|
4586
|
+
.resize({ width: w2 * scale, height: h2 * scale, kernel: "nearest" })
|
|
4587
|
+
// sharp blur() requires sigma >= 0.3. Use 0.5 for safety margin above the minimum.
|
|
4588
|
+
.blur(0.5)
|
|
4589
|
+
.threshold(180)
|
|
4590
|
+
.png()
|
|
4591
|
+
.toBuffer();
|
|
4592
|
+
const psms = [7, 8, 11];
|
|
4593
|
+
let best = null;
|
|
4594
|
+
for (const psm of psms) {
|
|
4595
|
+
const out = await ocrRecognizeBuffer({
|
|
4596
|
+
buffer: buf,
|
|
4597
|
+
lang,
|
|
4598
|
+
langPathEffective,
|
|
4599
|
+
tessOptions: {
|
|
4600
|
+
tessedit_char_whitelist: "0123456789.",
|
|
4601
|
+
tessedit_pageseg_mode: String(psm),
|
|
4602
|
+
user_defined_dpi: "300",
|
|
4603
|
+
},
|
|
4604
|
+
output: { text: true, tsv: false },
|
|
4605
|
+
});
|
|
4606
|
+
const raw = String(out.text ?? "").trim();
|
|
4607
|
+
const v = parseLabelValue(raw);
|
|
4608
|
+
if (v === null)
|
|
4609
|
+
continue;
|
|
4610
|
+
const conf = typeof out.confidence === "number" && Number.isFinite(out.confidence) ? out.confidence : -1;
|
|
4611
|
+
if (!best || conf > best.conf)
|
|
4612
|
+
best = { value: v, raw, conf };
|
|
4613
|
+
}
|
|
4614
|
+
return best ? { value: best.value, raw: best.raw } : null;
|
|
4615
|
+
};
|
|
4616
|
+
const labels = [];
|
|
4617
|
+
for (const b of labelBoxes) {
|
|
4618
|
+
const o = await ocrLabelBox(b);
|
|
4619
|
+
if (!o)
|
|
4620
|
+
continue;
|
|
4621
|
+
// Basic sanity filter (avoid spurious large numbers).
|
|
4622
|
+
if (o.value <= 0 || o.value > 1000)
|
|
4623
|
+
continue;
|
|
4624
|
+
labels.push({ value: o.value, cx: b.cx, cy: b.cy, raw: o.raw });
|
|
4625
|
+
}
|
|
4626
|
+
if (!labels.length)
|
|
4627
|
+
throw new Error("Failed to extract any purple labels");
|
|
4628
|
+
/**
 * Squared distance from point (px, py) to the bounding box spanned by the endpoints
 * (x0, y0)-(x1, y1) of segment `s`. For a horizontal or vertical segment that box is
 * the segment itself.
 *
 * @returns {number} 0 when the point lies within the box, otherwise the squared gap.
 */
const dist2PointToSeg = (px, py, s) => {
    // Distance from `v` to the closed interval between `a` and `b` (0 when inside).
    const gapToInterval = (v, a, b) => {
        const lo = Math.min(a, b);
        const hi = Math.max(a, b);
        if (v < lo) {
            return lo - v;
        }
        if (v > hi) {
            return v - hi;
        }
        return 0;
    };
    const gapX = gapToInterval(px, s.x0, s.x1);
    const gapY = gapToInterval(py, s.y0, s.y1);
    return gapX * gapX + gapY * gapY;
};
|
|
4637
|
+
const segBestDist = new Array(allSegs.length).fill(Number.POSITIVE_INFINITY);
|
|
4638
|
+
for (const lab of labels) {
|
|
4639
|
+
let bestIdx = -1;
|
|
4640
|
+
let bestD = Number.POSITIVE_INFINITY;
|
|
4641
|
+
for (let i = 0; i < allSegs.length; i++) {
|
|
4642
|
+
const d2 = dist2PointToSeg(lab.cx, lab.cy, allSegs[i]);
|
|
4643
|
+
if (d2 < bestD) {
|
|
4644
|
+
bestD = d2;
|
|
4645
|
+
bestIdx = i;
|
|
4646
|
+
}
|
|
4647
|
+
}
|
|
4648
|
+
if (bestIdx < 0)
|
|
4649
|
+
continue;
|
|
4650
|
+
if (bestD < segBestDist[bestIdx]) {
|
|
4651
|
+
segBestDist[bestIdx] = bestD;
|
|
4652
|
+
allSegs[bestIdx].unitLen = lab.value;
|
|
4653
|
+
allSegs[bestIdx].labelRaw = lab.raw;
|
|
4654
|
+
}
|
|
4655
|
+
}
|
|
4656
|
+
const labeled = allSegs.filter((s) => s.unitLen !== null && s.unitLen > 0);
|
|
4657
|
+
if (!labeled.length)
|
|
4658
|
+
throw new Error("No segments received labels; cannot compute area");
|
|
4659
|
+
/**
 * Median of a list of numbers. The input is copied before sorting, so the caller's
 * array is left untouched. An even-length list yields the mean of the two middle
 * values; an empty list yields NaN.
 */
const median = (xs) => {
    const ordered = [...xs];
    ordered.sort((lhs, rhs) => lhs - rhs);
    const count = ordered.length;
    const upper = Math.floor(count / 2);
    if (count % 2 !== 0) {
        return ordered[upper];
    }
    return (ordered[upper - 1] + ordered[upper]) / 2;
};
|
|
4664
|
+
// Calibrate px-per-unit from labeled segments (best effort). Use per-orientation medians so
|
|
4665
|
+
// one bad label doesn't poison all inferred segments.
|
|
4666
|
+
const pxPerUnitAll = median(labeled.map((s) => s.pxLen / s.unitLen));
|
|
4667
|
+
const labeledH = labeled.filter((s) => s.dir === "R" || s.dir === "L");
|
|
4668
|
+
const labeledV = labeled.filter((s) => s.dir === "U" || s.dir === "D");
|
|
4669
|
+
const pxPerUnitH = labeledH.length ? median(labeledH.map((s) => s.pxLen / s.unitLen)) : pxPerUnitAll;
|
|
4670
|
+
const pxPerUnitV = labeledV.length ? median(labeledV.map((s) => s.pxLen / s.unitLen)) : pxPerUnitAll;
|
|
4671
|
+
// Infer missing segment lengths in *units* for unlabeled segments (fallback only).
|
|
4672
|
+
for (const s of allSegs) {
|
|
4673
|
+
if (s.unitLen !== null)
|
|
4674
|
+
continue;
|
|
4675
|
+
const ppu = s.dir === "R" || s.dir === "L" ? pxPerUnitH : pxPerUnitV;
|
|
4676
|
+
const safe = ppu > 0 ? ppu : 1;
|
|
4677
|
+
s.unitLen = s.pxLen / safe;
|
|
4678
|
+
}
|
|
4679
|
+
// Compute area in *unit* coordinates via shoelace on each cycle.
|
|
4680
|
+
const cycleAreasUnits = [];
|
|
4681
|
+
for (let cycleIndex = 0; cycleIndex < cycleSegs.length; cycleIndex++) {
|
|
4682
|
+
const segs = cycleSegs[cycleIndex] ?? [];
|
|
4683
|
+
if (!segs.length) {
|
|
4684
|
+
cycleAreasUnits.push(0);
|
|
4685
|
+
continue;
|
|
4686
|
+
}
|
|
4687
|
+
let ux = 0;
|
|
4688
|
+
let uy = 0;
|
|
4689
|
+
const verts = [{ x: ux, y: uy }];
|
|
4690
|
+
for (const s of segs) {
|
|
4691
|
+
const len = s.unitLen ?? 0;
|
|
4692
|
+
if (s.dir === "R")
|
|
4693
|
+
ux += len;
|
|
4694
|
+
else if (s.dir === "L")
|
|
4695
|
+
ux -= len;
|
|
4696
|
+
else if (s.dir === "D")
|
|
4697
|
+
uy += len;
|
|
4698
|
+
else
|
|
4699
|
+
uy -= len;
|
|
4700
|
+
verts.push({ x: ux, y: uy });
|
|
4701
|
+
}
|
|
4702
|
+
let twiceArea = 0;
|
|
4703
|
+
for (let i = 0; i < verts.length - 1; i++) {
|
|
4704
|
+
twiceArea += verts[i].x * verts[i + 1].y - verts[i + 1].x * verts[i].y;
|
|
4705
|
+
}
|
|
4706
|
+
cycleAreasUnits.push(Math.abs(twiceArea) / 2);
|
|
4707
|
+
}
|
|
4708
|
+
// Subtract holes that are strictly inside the outer boundary bbox (same heuristic as px-space).
|
|
4709
|
+
const outerIdx = cycleData.reduce((bestIdx, c, i) => (c.areaPx2 > cycleData[bestIdx].areaPx2 ? i : bestIdx), 0);
|
|
4710
|
+
const outerBox = cycleData[outerIdx].box;
|
|
4711
|
+
let totalAreaUnits = 0;
|
|
4712
|
+
for (let i = 0; i < cycleData.length; i++) {
|
|
4713
|
+
const c = cycleData[i];
|
|
4714
|
+
const isInsideOuter = i !== outerIdx &&
|
|
4715
|
+
c.box.x0 > outerBox.x0 &&
|
|
4716
|
+
c.box.x1 < outerBox.x1 &&
|
|
4717
|
+
c.box.y0 > outerBox.y0 &&
|
|
4718
|
+
c.box.y1 < outerBox.y1;
|
|
4719
|
+
totalAreaUnits += i === outerIdx ? (cycleAreasUnits[i] ?? 0) : isInsideOuter ? -(cycleAreasUnits[i] ?? 0) : (cycleAreasUnits[i] ?? 0);
|
|
4720
|
+
}
|
|
4721
|
+
const areaUnits = Math.abs(totalAreaUnits);
|
|
4722
|
+
const rounded = Math.round(areaUnits);
|
|
4723
|
+
return {
|
|
4724
|
+
path: filePath,
|
|
4725
|
+
width,
|
|
4726
|
+
height,
|
|
4727
|
+
areaPx,
|
|
4728
|
+
cyclesDetected: cycleData.length,
|
|
4729
|
+
segments: allSegs.length,
|
|
4730
|
+
labeledSegments: labeled.length,
|
|
4731
|
+
labelsDetected: labels.length,
|
|
4732
|
+
pxPerUnit: Number((pxPerUnitAll > 0 ? pxPerUnitAll : 1).toFixed(4)),
|
|
4733
|
+
areaUnits: Number(areaUnits.toFixed(4)),
|
|
4734
|
+
answer: String(rounded),
|
|
4735
|
+
...(debug
|
|
4736
|
+
? {
|
|
4737
|
+
labels: labels
|
|
4738
|
+
.map((l) => ({
|
|
4739
|
+
value: l.value,
|
|
4740
|
+
raw: l.raw,
|
|
4741
|
+
cx: Math.round(l.cx),
|
|
4742
|
+
cy: Math.round(l.cy),
|
|
4743
|
+
}))
|
|
4744
|
+
.sort((a, b) => a.cy - b.cy || a.cx - b.cx),
|
|
4745
|
+
outerCycleIndex: outerIdx,
|
|
4746
|
+
outerCycleSegments: (cycleSegs[outerIdx] ?? []).map((s) => ({
|
|
4747
|
+
dir: s.dir,
|
|
4748
|
+
pxLen: s.pxLen,
|
|
4749
|
+
unitLen: s.unitLen,
|
|
4750
|
+
labelRaw: s.labelRaw,
|
|
4751
|
+
})),
|
|
4752
|
+
}
|
|
4753
|
+
: {}),
|
|
4754
|
+
};
|
|
4755
|
+
},
|
|
4756
|
+
},
|
|
4757
|
+
{
|
|
4758
|
+
name: "grade_fraction_quiz_from_image",
|
|
4759
|
+
description: "Grade a fraction quiz shown in an image by OCRing the problems + student answers, computing correct answers, and scoring by problem type. Deterministic, no network.",
|
|
4760
|
+
inputSchema: {
|
|
4761
|
+
type: "object",
|
|
4762
|
+
properties: {
|
|
4763
|
+
path: {
|
|
4764
|
+
type: "string",
|
|
4765
|
+
description: "Path to a local image file (absolute or relative to current working directory).",
|
|
4766
|
+
},
|
|
4767
|
+
bonusPoints: {
|
|
4768
|
+
type: "number",
|
|
4769
|
+
description: "Bonus points added to the final total (default: 0).",
|
|
4770
|
+
default: 0,
|
|
4771
|
+
},
|
|
4772
|
+
pointsAddSubtract: {
|
|
4773
|
+
type: "number",
|
|
4774
|
+
description: "Points for add/subtract fraction problems (default: 5).",
|
|
4775
|
+
default: 5,
|
|
4776
|
+
},
|
|
4777
|
+
pointsMultiplyDivide: {
|
|
4778
|
+
type: "number",
|
|
4779
|
+
description: "Points for multiply/divide fraction problems (default: 10).",
|
|
4780
|
+
default: 10,
|
|
4781
|
+
},
|
|
4782
|
+
pointsImproperFraction: {
|
|
4783
|
+
type: "number",
|
|
4784
|
+
description: "Points for forming an improper fraction (default: 15).",
|
|
4785
|
+
default: 15,
|
|
4786
|
+
},
|
|
4787
|
+
pointsMixedNumber: {
|
|
4788
|
+
type: "number",
|
|
4789
|
+
description: "Points for forming a mixed number (default: 20).",
|
|
4790
|
+
default: 20,
|
|
4791
|
+
},
|
|
4792
|
+
lang: {
|
|
4793
|
+
type: "string",
|
|
4794
|
+
description: "Tesseract language code (default: eng).",
|
|
4795
|
+
default: "eng",
|
|
4796
|
+
},
|
|
4797
|
+
langPath: {
|
|
4798
|
+
type: "string",
|
|
4799
|
+
description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
|
|
4800
|
+
},
|
|
4801
|
+
preprocess: {
|
|
4802
|
+
type: "boolean",
|
|
4803
|
+
description: "If true (default), basic sharp preprocessing is applied before OCR.",
|
|
4804
|
+
default: true,
|
|
4805
|
+
},
|
|
4806
|
+
maxChars: {
|
|
4807
|
+
type: "number",
|
|
4808
|
+
description: "Maximum OCR text characters to consider.",
|
|
4809
|
+
default: 80000,
|
|
4810
|
+
},
|
|
4811
|
+
maxQuestions: {
|
|
4812
|
+
type: "number",
|
|
4813
|
+
description: "Maximum question count to scan for (default: 30).",
|
|
4814
|
+
default: 30,
|
|
4815
|
+
},
|
|
4816
|
+
},
|
|
4817
|
+
required: ["path"],
|
|
4818
|
+
},
|
|
4819
|
+
handler: async (args) => {
|
|
4820
|
+
const filePath = resolveLocalPath(args?.path);
|
|
4821
|
+
if (!existsSync(filePath))
|
|
4822
|
+
throw new Error(`File not found: ${filePath}`);
|
|
4823
|
+
const bonusPoints = clampInt(args?.bonusPoints, 0, -100000, 100000);
|
|
4824
|
+
const pointsAddSubtract = clampInt(args?.pointsAddSubtract, 5, 0, 1000);
|
|
4825
|
+
const pointsMultiplyDivide = clampInt(args?.pointsMultiplyDivide, 10, 0, 1000);
|
|
4826
|
+
const pointsImproperFraction = clampInt(args?.pointsImproperFraction, 15, 0, 1000);
|
|
4827
|
+
const pointsMixedNumber = clampInt(args?.pointsMixedNumber, 20, 0, 1000);
|
|
4828
|
+
const maxQuestions = clampInt(args?.maxQuestions, 30, 1, 200);
|
|
4829
|
+
const lang = String(args?.lang ?? "eng").trim() || "eng";
|
|
4830
|
+
const preprocess = args?.preprocess !== false;
|
|
4831
|
+
const maxChars = clampInt(args?.maxChars, 80000, 5000, 200000);
|
|
4832
|
+
const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
|
|
4833
|
+
const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
|
|
4834
|
+
const langPathEffective = langPathArg
|
|
4835
|
+
? resolveLocalPath(langPathArg)
|
|
4836
|
+
: existsSync(defaultLangPath)
|
|
4837
|
+
? defaultLangPath
|
|
4838
|
+
: null;
|
|
4839
|
+
// Preferred path (deterministic + robust): detect row bands from pixels, then OCR each row separately.
|
|
4840
|
+
const sharp = await getSharpOptional();
|
|
4841
|
+
if (sharp && preprocess) {
|
|
4842
|
+
const rowRes = await gradeFractionQuizFromImageRowBands({
|
|
4843
|
+
sharp,
|
|
4844
|
+
filePath,
|
|
4845
|
+
lang,
|
|
4846
|
+
langPathEffective,
|
|
4847
|
+
bonusPoints,
|
|
4848
|
+
pointsAddSubtract,
|
|
4849
|
+
pointsMultiplyDivide,
|
|
4850
|
+
pointsImproperFraction,
|
|
4851
|
+
pointsMixedNumber,
|
|
4852
|
+
});
|
|
4853
|
+
if (rowRes) {
|
|
4854
|
+
return {
|
|
4855
|
+
path: filePath,
|
|
4856
|
+
bonusPoints,
|
|
4857
|
+
extractedQuestionCount: rowRes.extractedQuestionCount,
|
|
4858
|
+
score: rowRes.score,
|
|
4859
|
+
answer: String(rowRes.score),
|
|
4860
|
+
perQuestion: rowRes.perQuestion,
|
|
4861
|
+
ocr: {
|
|
4862
|
+
lang,
|
|
4863
|
+
langPath: langPathEffective,
|
|
4864
|
+
preprocess,
|
|
4865
|
+
usedSharp: true,
|
|
4866
|
+
confidence: null,
|
|
4867
|
+
},
|
|
4868
|
+
};
|
|
4869
|
+
}
|
|
4870
|
+
// Fallback: older whole-page OCR geometry approach.
|
|
4871
|
+
const meta = await sharp(filePath).metadata();
|
|
4872
|
+
const w0 = meta.width ?? 0;
|
|
4873
|
+
const h0 = meta.height ?? 0;
|
|
4874
|
+
if (w0 && h0) {
|
|
4875
|
+
const scale = w0 < 1200 ? 3 : w0 < 2000 ? 2 : 1;
|
|
4876
|
+
const processed = await sharp(filePath)
|
|
4877
|
+
.grayscale()
|
|
4878
|
+
.resize({ width: w0 * scale, height: h0 * scale, kernel: "lanczos3" })
|
|
4879
|
+
.normalize()
|
|
4880
|
+
.threshold(180)
|
|
4881
|
+
.png()
|
|
4882
|
+
.toBuffer();
|
|
4883
|
+
const ocrGeomBase = await ocrRecognizeBuffer({ buffer: processed, lang, langPathEffective });
|
|
4884
|
+
const width = w0 * scale;
|
|
4885
|
+
const height = h0 * scale;
|
|
4886
|
+
const tokens = ocrGeomBase.words
|
|
4887
|
+
.filter((w) => !!w.bbox && !!w.text && w.text.trim())
|
|
4888
|
+
.map((w) => {
|
|
4889
|
+
const bbox = w.bbox;
|
|
4890
|
+
return {
|
|
4891
|
+
text: w.text.trim(),
|
|
4892
|
+
bbox,
|
|
4893
|
+
cx: (bbox.x0 + bbox.x1) / 2,
|
|
4894
|
+
cy: (bbox.y0 + bbox.y1) / 2,
|
|
4895
|
+
};
|
|
4896
|
+
});
|
|
4897
|
+
const leftMaxX = width * 0.08;
|
|
4898
|
+
const answerRegionX0 = width * 0.32;
|
|
4899
|
+
const candidates = tokens
|
|
4900
|
+
.map((t) => ({ t, v: toIntegerOrNull(t.text) }))
|
|
4901
|
+
.filter((x) => typeof x.v === "number" && Number.isFinite(x.v))
|
|
4902
|
+
.filter((x) => x.v >= 1 && x.v <= maxQuestions)
|
|
4903
|
+
.filter((x) => x.t.bbox.x1 <= leftMaxX)
|
|
4904
|
+
.sort((a, b) => a.t.cy - b.t.cy);
|
|
4905
|
+
const candidates10 = candidates.filter((c) => c.v <= 10);
|
|
4906
|
+
const anchorsRaw = candidates10.length >= 6 ? candidates10 : candidates;
|
|
4907
|
+
const anchors = [];
|
|
4908
|
+
const yTol = Math.max(12, Math.round(height * 0.01));
|
|
4909
|
+
for (const a of anchorsRaw) {
|
|
4910
|
+
const last = anchors[anchors.length - 1];
|
|
4911
|
+
if (last && Math.abs(a.t.cy - last.cy) <= yTol)
|
|
4912
|
+
continue;
|
|
4913
|
+
anchors.push({ index: a.v, cy: a.t.cy });
|
|
4914
|
+
}
|
|
4915
|
+
anchors.sort((a, b) => a.cy - b.cy);
|
|
4916
|
+
if (anchors.length >= 2) {
|
|
4917
|
+
const bandTop = (i) => (i === 0 ? 0 : (anchors[i - 1].cy + anchors[i].cy) / 2);
|
|
4918
|
+
const bandBottom = (i) => i === anchors.length - 1 ? height : (anchors[i].cy + anchors[i + 1].cy) / 2;
|
|
4919
|
+
const perQuestion = [];
|
|
4920
|
+
let total = 0;
|
|
4921
|
+
for (let i = 0; i < anchors.length; i++) {
|
|
4922
|
+
const idx = anchors[i].index;
|
|
4923
|
+
const y0 = bandTop(i);
|
|
4924
|
+
const y1 = bandBottom(i);
|
|
4925
|
+
const row = tokens.filter((t) => t.cy >= y0 && t.cy < y1);
|
|
4926
|
+
if (!row.length)
|
|
4927
|
+
continue;
|
|
4928
|
+
const answerTokens = row
|
|
4929
|
+
.filter((t) => t.bbox.x0 >= answerRegionX0)
|
|
4930
|
+
.sort((a, b) => a.bbox.x0 - b.bbox.x0);
|
|
4931
|
+
const answerText = answerTokens.map((t) => t.text).join(" ").trim();
|
|
4932
|
+
const studentMixed = parseMixedNumberLoose(answerText);
|
|
4933
|
+
const studentFrac = parseFractionLoose(answerText);
|
|
4934
|
+
const left = row.filter((t) => t.bbox.x1 < answerRegionX0);
|
|
4935
|
+
const numeric = left
|
|
4936
|
+
.map((t) => ({ t, v: toIntegerOrNull(t.text) }))
|
|
4937
|
+
.filter((x) => typeof x.v === "number" && Number.isFinite(x.v))
|
|
4938
|
+
.filter((x) => !(x.v === idx && x.t.bbox.x1 <= leftMaxX));
|
|
4939
|
+
const heights = numeric
|
|
4940
|
+
.map((x) => x.t.bbox.y1 - x.t.bbox.y0)
|
|
4941
|
+
.filter((n) => Number.isFinite(n) && n > 0)
|
|
4942
|
+
.sort((a, b) => a - b);
|
|
4943
|
+
const medianH = heights.length ? heights[Math.floor(heights.length / 2)] : 20;
|
|
4944
|
+
const maxDy = Math.max(20, Math.round(medianH * 1.8));
|
|
4945
|
+
const xTol = Math.max(18, Math.round(width * 0.015));
|
|
4946
|
+
const cols = [];
|
|
4947
|
+
for (const it of numeric) {
|
|
4948
|
+
const cx = it.t.cx;
|
|
4949
|
+
const col = cols.find((c) => Math.abs(c.x - cx) <= xTol);
|
|
4950
|
+
if (col)
|
|
4951
|
+
col.items.push({ v: it.v, bbox: it.t.bbox, tok: it.t });
|
|
4952
|
+
else
|
|
4953
|
+
cols.push({ x: cx, items: [{ v: it.v, bbox: it.t.bbox, tok: it.t }] });
|
|
4954
|
+
}
|
|
4955
|
+
const used = new Set();
|
|
4956
|
+
const fracPairs = [];
|
|
4957
|
+
for (const col of cols) {
|
|
4958
|
+
const items = col.items.sort((a, b) => a.bbox.y0 - b.bbox.y0);
|
|
4959
|
+
for (let j = 0; j < items.length - 1; j++) {
|
|
4960
|
+
const top = items[j];
|
|
4961
|
+
const bot = items[j + 1];
|
|
4962
|
+
const dy = bot.bbox.y0 - top.bbox.y1;
|
|
4963
|
+
if (dy < -2 || dy > maxDy)
|
|
4964
|
+
continue;
|
|
4965
|
+
if (bot.v === 0)
|
|
4966
|
+
continue;
|
|
4967
|
+
fracPairs.push({ x: col.x, frac: normalizeFraction({ n: top.v, d: bot.v }) });
|
|
4968
|
+
used.add(top.tok);
|
|
4969
|
+
used.add(bot.tok);
|
|
4970
|
+
}
|
|
4971
|
+
}
|
|
4972
|
+
fracPairs.sort((a, b) => a.x - b.x);
|
|
4973
|
+
const fracs = fracPairs.length > 2 ? fracPairs.slice(0, 2) : fracPairs;
|
|
4974
|
+
const opCandidates = left
|
|
4975
|
+
.map((t) => ({ t, s: t.text.trim() }))
|
|
4976
|
+
.filter((x) => x.s.length && x.s.length <= 2)
|
|
4977
|
+
.map((x) => ({ ...x, ch: x.s.replace(/[^\+\-\*xX/]/g, "")[0] ?? "" }))
|
|
4978
|
+
.filter((x) => !!x.ch);
|
|
4979
|
+
const pickOp = (aX, bX) => {
|
|
4980
|
+
const mid = (aX + bX) / 2;
|
|
4981
|
+
const between = opCandidates
|
|
4982
|
+
.filter((o) => o.t.cx >= Math.min(aX, bX) && o.t.cx <= Math.max(aX, bX))
|
|
4983
|
+
.sort((p, q) => Math.abs(p.t.cx - mid) - Math.abs(q.t.cx - mid));
|
|
4984
|
+
return between.length ? between[0].ch : "+";
|
|
4985
|
+
};
|
|
4986
|
+
if (fracs.length >= 2) {
|
|
4987
|
+
const a = fracs[0].frac;
|
|
4988
|
+
const bFrac = fracs[1].frac;
|
|
4989
|
+
const student = studentFrac;
|
|
4990
|
+
if (!student)
|
|
4991
|
+
continue;
|
|
4992
|
+
const opChar = pickOp(fracs[0].x, fracs[1].x);
|
|
4993
|
+
let correct;
|
|
4994
|
+
let kind;
|
|
4995
|
+
if (opChar === "+" || opChar === "-") {
|
|
4996
|
+
correct = opChar === "+" ? addFractions(a, bFrac) : subFractions(a, bFrac);
|
|
4997
|
+
kind = "add_subtract";
|
|
4998
|
+
}
|
|
4999
|
+
else if (opChar === "x" || opChar === "X" || opChar === "*") {
|
|
5000
|
+
correct = mulFractions(a, bFrac);
|
|
5001
|
+
kind = "multiply_divide";
|
|
5002
|
+
}
|
|
5003
|
+
else {
|
|
5004
|
+
correct = divFractions(a, bFrac);
|
|
5005
|
+
kind = "multiply_divide";
|
|
5006
|
+
}
|
|
5007
|
+
const ok = fractionsEqual(student, correct);
|
|
5008
|
+
const pts = ok ? (kind === "add_subtract" ? pointsAddSubtract : pointsMultiplyDivide) : 0;
|
|
5009
|
+
total += pts;
|
|
5010
|
+
perQuestion.push({
|
|
5011
|
+
index: idx,
|
|
5012
|
+
type: kind,
|
|
5013
|
+
correct: ok,
|
|
5014
|
+
points: pts,
|
|
5015
|
+
studentAnswer: fractionToString(student),
|
|
5016
|
+
correctAnswer: fractionToString(correct),
|
|
5017
|
+
});
|
|
5018
|
+
continue;
|
|
5019
|
+
}
|
|
5020
|
+
if (fracs.length === 1) {
|
|
5021
|
+
const given = fracs[0].frac;
|
|
5022
|
+
const wholeCandidates = numeric
|
|
5023
|
+
.filter((n) => !used.has(n.t))
|
|
5024
|
+
.map((n) => ({ v: n.v, x: n.t.bbox.x0 }))
|
|
5025
|
+
.filter((n) => n.x > leftMaxX && n.x < fracs[0].x - xTol * 0.5);
|
|
5026
|
+
const whole = wholeCandidates.length ? wholeCandidates[wholeCandidates.length - 1].v : null;
|
|
5027
|
+
if (whole !== null) {
|
|
5028
|
+
const student = studentFrac;
|
|
5029
|
+
if (!student)
|
|
5030
|
+
continue;
|
|
5031
|
+
const correct = normalizeFraction({ n: whole * given.d + Math.abs(given.n), d: given.d });
|
|
5032
|
+
const ok = fractionsEqual(student, correct);
|
|
5033
|
+
const pts = ok ? pointsImproperFraction : 0;
|
|
5034
|
+
total += pts;
|
|
5035
|
+
perQuestion.push({
|
|
5036
|
+
index: idx,
|
|
5037
|
+
type: "improper_fraction",
|
|
5038
|
+
correct: ok,
|
|
5039
|
+
points: pts,
|
|
5040
|
+
studentAnswer: fractionToString(student),
|
|
5041
|
+
correctAnswer: fractionToString(correct),
|
|
5042
|
+
});
|
|
5043
|
+
continue;
|
|
5044
|
+
}
|
|
5045
|
+
const wholeOut = Math.trunc(given.n / given.d);
|
|
5046
|
+
const rem = Math.abs(given.n % given.d);
|
|
5047
|
+
const correctFrac = normalizeFraction({ n: rem, d: given.d });
|
|
5048
|
+
const ok = studentMixed !== null &&
|
|
5049
|
+
studentMixed.whole === wholeOut &&
|
|
5050
|
+
fractionsEqual(studentMixed.frac, correctFrac);
|
|
5051
|
+
const pts = ok ? pointsMixedNumber : 0;
|
|
5052
|
+
total += pts;
|
|
5053
|
+
perQuestion.push({
|
|
5054
|
+
index: idx,
|
|
5055
|
+
type: "mixed_number",
|
|
5056
|
+
correct: ok,
|
|
5057
|
+
points: pts,
|
|
5058
|
+
studentAnswer: studentMixed ? mixedNumberToString(studentMixed.whole, studentMixed.frac) : null,
|
|
5059
|
+
correctAnswer: mixedNumberToString(wholeOut, correctFrac),
|
|
5060
|
+
});
|
|
5061
|
+
}
|
|
5062
|
+
}
|
|
5063
|
+
total += bonusPoints;
|
|
5064
|
+
// If we graded at least one question, return the geometry-based score.
|
|
5065
|
+
if (perQuestion.length) {
|
|
5066
|
+
return {
|
|
5067
|
+
path: filePath,
|
|
5068
|
+
bonusPoints,
|
|
5069
|
+
extractedQuestionCount: perQuestion.length,
|
|
5070
|
+
score: total,
|
|
5071
|
+
answer: String(total),
|
|
5072
|
+
perQuestion,
|
|
5073
|
+
ocr: {
|
|
5074
|
+
lang,
|
|
5075
|
+
langPath: langPathEffective,
|
|
5076
|
+
preprocess,
|
|
5077
|
+
usedSharp: true,
|
|
5078
|
+
confidence: ocrGeomBase.confidence,
|
|
5079
|
+
},
|
|
5080
|
+
};
|
|
5081
|
+
}
|
|
5082
|
+
}
|
|
5083
|
+
}
|
|
5084
|
+
}
|
|
5085
|
+
const ocr = await ocrRecognizeImageFile({ filePath, lang, langPathEffective, preprocess });
|
|
5086
|
+
const ocrText = ocr.text.slice(0, maxChars);
|
|
5087
|
+
const rawLines = ocr.lines.map((l) => l.text).filter((t) => t && t.trim());
|
|
5088
|
+
const lines = rawLines.length
|
|
5089
|
+
? rawLines
|
|
5090
|
+
: ocrText
|
|
5091
|
+
.split(/\r?\n/)
|
|
5092
|
+
.map((l) => l.trim())
|
|
5093
|
+
.filter(Boolean);
|
|
5094
|
+
const blocks = [];
|
|
5095
|
+
for (let i = 0; i < lines.length; i++) {
|
|
5096
|
+
const m = lines[i].match(/^\s*(\d{1,3})\b\s*(.*)$/);
|
|
5097
|
+
if (!m)
|
|
5098
|
+
continue;
|
|
5099
|
+
const idx = Number.parseInt(m[1], 10);
|
|
5100
|
+
if (!Number.isFinite(idx) || idx <= 0 || idx > maxQuestions)
|
|
5101
|
+
continue;
|
|
5102
|
+
let text = (m[2] ?? "").trim();
|
|
5103
|
+
// Include subsequent non-index lines as part of the same question.
|
|
5104
|
+
for (let j = i + 1; j < lines.length; j++) {
|
|
5105
|
+
if (/^\s*\d{1,3}\b/.test(lines[j]))
|
|
5106
|
+
break;
|
|
5107
|
+
if (lines[j].trim())
|
|
5108
|
+
text += ` ${lines[j].trim()}`;
|
|
5109
|
+
i = j;
|
|
5110
|
+
}
|
|
5111
|
+
blocks.push({ index: idx, text: text.trim() });
|
|
5112
|
+
}
|
|
5113
|
+
// If we couldn't detect blocks, fall back to the whole OCR text as a single block.
|
|
5114
|
+
if (!blocks.length)
|
|
5115
|
+
blocks.push({ index: 1, text: ocrText });
|
|
5116
|
+
blocks.sort((a, b) => a.index - b.index);
|
|
5117
|
+
const perQuestion = [];
|
|
5118
|
+
let total = 0;
|
|
5119
|
+
for (const b of blocks) {
|
|
5120
|
+
const t = b.text;
|
|
5121
|
+
const lower = t.toLowerCase();
|
|
5122
|
+
// Mixed number conversion: Turn a/b into a mixed number
|
|
5123
|
+
if (lower.includes("turn") && lower.includes("mixed")) {
|
|
5124
|
+
const given = parseFractionLoose(t);
|
|
5125
|
+
const student = parseMixedNumberLoose(t);
|
|
5126
|
+
if (!given)
|
|
5127
|
+
continue;
|
|
5128
|
+
const whole = Math.trunc(given.n / given.d);
|
|
5129
|
+
const rem = Math.abs(given.n % given.d);
|
|
5130
|
+
const correctFrac = normalizeFraction({ n: rem, d: given.d });
|
|
5131
|
+
const correctAnswer = mixedNumberToString(whole, correctFrac);
|
|
5132
|
+
const ok = student !== null &&
|
|
5133
|
+
student.whole === whole &&
|
|
5134
|
+
fractionsEqual(student.frac, correctFrac);
|
|
5135
|
+
const pts = ok ? pointsMixedNumber : 0;
|
|
5136
|
+
total += pts;
|
|
5137
|
+
perQuestion.push({
|
|
5138
|
+
index: b.index,
|
|
5139
|
+
type: "mixed_number",
|
|
5140
|
+
correct: ok,
|
|
5141
|
+
points: pts,
|
|
5142
|
+
studentAnswer: student ? mixedNumberToString(student.whole, student.frac) : null,
|
|
5143
|
+
correctAnswer,
|
|
5144
|
+
});
|
|
5145
|
+
continue;
|
|
5146
|
+
}
|
|
5147
|
+
// Improper fraction conversion: Turn W N/D into an improper fraction
|
|
5148
|
+
if (lower.includes("turn") && lower.includes("improper")) {
|
|
5149
|
+
const given = parseMixedNumberLoose(t);
|
|
5150
|
+
const allFracMatches = Array.from(t.matchAll(/-?\d+\s*\/\s*\d+/g)).map((m) => m[0]);
|
|
5151
|
+
const studentRaw = allFracMatches.length ? allFracMatches[allFracMatches.length - 1] : "";
|
|
5152
|
+
const student = parseFractionLoose(studentRaw);
|
|
5153
|
+
if (!given)
|
|
5154
|
+
continue;
|
|
5155
|
+
const sgn = given.whole < 0 ? -1 : 1;
|
|
5156
|
+
const correct = normalizeFraction({
|
|
5157
|
+
n: given.whole * given.frac.d + sgn * Math.abs(given.frac.n),
|
|
5158
|
+
d: given.frac.d,
|
|
5159
|
+
});
|
|
5160
|
+
const ok = student !== null && fractionsEqual(student, correct);
|
|
5161
|
+
const pts = ok ? pointsImproperFraction : 0;
|
|
5162
|
+
total += pts;
|
|
5163
|
+
perQuestion.push({
|
|
5164
|
+
index: b.index,
|
|
5165
|
+
type: "improper_fraction",
|
|
5166
|
+
correct: ok,
|
|
5167
|
+
points: pts,
|
|
5168
|
+
studentAnswer: student ? fractionToString(student) : null,
|
|
5169
|
+
correctAnswer: fractionToString(correct),
|
|
5170
|
+
});
|
|
5171
|
+
continue;
|
|
5172
|
+
}
|
|
5173
|
+
// Operation with two fractions + a student answer fraction.
|
|
5174
|
+
const matches = Array.from(t.matchAll(/-?\d+\s*\/\s*\d+/g)).map((m) => m[0]);
|
|
5175
|
+
const parsed = matches.map((m) => parseFractionLoose(m)).filter((f) => !!f);
|
|
5176
|
+
if (parsed.length < 3)
|
|
5177
|
+
continue;
|
|
5178
|
+
const a = parsed[0];
|
|
5179
|
+
const bFrac = parsed[1];
|
|
5180
|
+
const student = parsed[parsed.length - 1];
|
|
5181
|
+
// Prefer operator between first and second fraction if possible.
|
|
5182
|
+
const opM = t.match(/-?\d+\s*\/\s*\d+\s*([+\-x×*÷/])\s*-?\d+\s*\/\s*\d+/i);
|
|
5183
|
+
const op = opM ? opM[1] : "+";
|
|
5184
|
+
let correct;
|
|
5185
|
+
let kind;
|
|
5186
|
+
if (op === "+" || op === "-") {
|
|
5187
|
+
correct = op === "+" ? addFractions(a, bFrac) : subFractions(a, bFrac);
|
|
5188
|
+
kind = "add_subtract";
|
|
5189
|
+
}
|
|
5190
|
+
else if (op === "x" || op === "×" || op === "*") {
|
|
5191
|
+
correct = mulFractions(a, bFrac);
|
|
5192
|
+
kind = "multiply_divide";
|
|
5193
|
+
}
|
|
5194
|
+
else {
|
|
5195
|
+
correct = divFractions(a, bFrac);
|
|
5196
|
+
kind = "multiply_divide";
|
|
5197
|
+
}
|
|
5198
|
+
const ok = fractionsEqual(student, correct);
|
|
5199
|
+
const pts = ok ? (kind === "add_subtract" ? pointsAddSubtract : pointsMultiplyDivide) : 0;
|
|
5200
|
+
total += pts;
|
|
5201
|
+
perQuestion.push({
|
|
5202
|
+
index: b.index,
|
|
5203
|
+
type: kind,
|
|
5204
|
+
correct: ok,
|
|
5205
|
+
points: pts,
|
|
5206
|
+
studentAnswer: fractionToString(student),
|
|
5207
|
+
correctAnswer: fractionToString(correct),
|
|
5208
|
+
});
|
|
5209
|
+
}
|
|
5210
|
+
total += bonusPoints;
|
|
5211
|
+
return {
|
|
5212
|
+
path: filePath,
|
|
5213
|
+
bonusPoints,
|
|
5214
|
+
extractedQuestionCount: perQuestion.length,
|
|
5215
|
+
score: total,
|
|
5216
|
+
answer: String(total),
|
|
5217
|
+
perQuestion,
|
|
5218
|
+
ocr: {
|
|
5219
|
+
lang,
|
|
5220
|
+
langPath: langPathEffective,
|
|
5221
|
+
preprocess,
|
|
5222
|
+
usedSharp: ocr.usedSharp,
|
|
5223
|
+
confidence: ocr.confidence,
|
|
5224
|
+
},
|
|
5225
|
+
};
|
|
5226
|
+
},
|
|
5227
|
+
},
|
|
5228
|
+
{
|
|
5229
|
+
name: "extract_fractions_and_simplify_from_image",
|
|
5230
|
+
description: "Extract slash-style fractions (e.g. 3/4) from body text in an image and also detect stacked numerator/denominator fractions in a worksheet-style region, returning the simplified answers. Deterministic, no network.",
|
|
5231
|
+
inputSchema: {
|
|
5232
|
+
type: "object",
|
|
5233
|
+
properties: {
|
|
5234
|
+
path: {
|
|
5235
|
+
type: "string",
|
|
5236
|
+
description: "Path to a local image file (absolute or relative to current working directory).",
|
|
5237
|
+
},
|
|
5238
|
+
lang: {
|
|
5239
|
+
type: "string",
|
|
5240
|
+
description: "Tesseract language code (default: eng).",
|
|
5241
|
+
default: "eng",
|
|
5242
|
+
},
|
|
5243
|
+
langPath: {
|
|
5244
|
+
type: "string",
|
|
5245
|
+
description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
|
|
5246
|
+
},
|
|
5247
|
+
preprocess: {
|
|
5248
|
+
type: "boolean",
|
|
5249
|
+
description: "If true (default), basic sharp preprocessing is applied before OCR.",
|
|
5250
|
+
default: true,
|
|
5251
|
+
},
|
|
5252
|
+
bodyBottomFrac: {
|
|
5253
|
+
type: "number",
|
|
5254
|
+
description: "Body cutoff as fraction of image height (words below are treated as worksheet region). Default: 0.7.",
|
|
5255
|
+
default: 0.7,
|
|
5256
|
+
},
|
|
5257
|
+
maxChars: {
|
|
5258
|
+
type: "number",
|
|
5259
|
+
description: "Maximum OCR text characters to consider.",
|
|
5260
|
+
default: 120000,
|
|
5261
|
+
},
|
|
5262
|
+
},
|
|
5263
|
+
required: ["path"],
|
|
5264
|
+
},
|
|
5265
|
+
handler: async (args) => {
|
|
5266
|
+
const filePath = resolveLocalPath(args?.path);
|
|
5267
|
+
if (!existsSync(filePath))
|
|
5268
|
+
throw new Error(`File not found: ${filePath}`);
|
|
5269
|
+
const sharp = await getSharpOptional();
|
|
5270
|
+
if (!sharp) {
|
|
5271
|
+
throw new Error("Missing optional dependency: sharp. Install it to use image fraction extraction.");
|
|
5272
|
+
}
|
|
5273
|
+
const meta = await sharp(filePath).metadata();
|
|
5274
|
+
const imgH = meta.height ?? 0;
|
|
5275
|
+
if (!imgH)
|
|
5276
|
+
throw new Error("Unable to read image height");
|
|
5277
|
+
const lang = String(args?.lang ?? "eng").trim() || "eng";
|
|
5278
|
+
const preprocess = args?.preprocess !== false;
|
|
5279
|
+
const maxChars = clampInt(args?.maxChars, 120000, 5000, 200000);
|
|
5280
|
+
const bodyBottomFrac = clampNumber(Number(args?.bodyBottomFrac ?? 0.7), 0.3, 0.95);
|
|
5281
|
+
const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
|
|
5282
|
+
const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
|
|
5283
|
+
const langPathEffective = langPathArg
|
|
5284
|
+
? resolveLocalPath(langPathArg)
|
|
5285
|
+
: existsSync(defaultLangPath)
|
|
5286
|
+
? defaultLangPath
|
|
5287
|
+
: null;
|
|
5288
|
+
const ocr = await ocrRecognizeImageFile({ filePath, lang, langPathEffective, preprocess });
|
|
5289
|
+
const bodyYMax = imgH * bodyBottomFrac;
|
|
5290
|
+
// Use OCR text (not per-word) since OCR may split "3/4" into multiple tokens (e.g. "3", "/", "4").
|
|
5291
|
+
const ocrText = ocr.text.slice(0, maxChars);
|
|
5292
|
+
const bodyFractions = Array.from(ocrText.matchAll(/(\d+)\s*\/\s*(\d+)/g)).map((m) => `${m[1]}/${m[2]}`);
|
|
5293
|
+
// Detect stacked fractions (numerator over denominator) below cutoff by looking for fraction bars,
|
|
5294
|
+
// then OCR numerator and denominator from sub-crops. This is more robust than relying on whole-page
|
|
5295
|
+
// OCR word boxes, which often miss one half of stacked fractions.
|
|
5296
|
+
const imgW = meta.width ?? 0;
|
|
5297
|
+
if (!imgW)
|
|
5298
|
+
throw new Error("Unable to read image width");
|
|
5299
|
+
const scale = imgW < 1200 ? 3 : imgW < 2000 ? 2 : 1;
|
|
5300
|
+
const baseW = imgW * scale;
|
|
5301
|
+
const baseH = imgH * scale;
|
|
5302
|
+
const base = await sharp(filePath)
|
|
5303
|
+
.grayscale()
|
|
5304
|
+
.resize({ width: baseW, height: baseH, kernel: "lanczos3" })
|
|
5305
|
+
.normalize()
|
|
5306
|
+
.png()
|
|
5307
|
+
.toBuffer();
|
|
5308
|
+
// Looks for stacked (numerator-over-denominator) fractions in the part of the upscaled
// page that lies below `cutoffFrac` of the image height. Thin horizontal bars are treated
// as fraction-bar candidates; a bar is kept only when there is ink both above and below it,
// and the integers OCR'd from those two regions become the fraction.
// Resolves to diagnostics (cutoff, crop offset, bar/candidate counts) plus `simplified`,
// the fraction strings ordered top-to-bottom with near-identical rows collapsed.
const extractStackedFromCrop = async (cutoffFrac) => {
    const cropTop = clampInt(Math.floor(imgH * cutoffFrac) * scale, 0, 0, baseH - 1);
    const cropHeight = Math.max(1, baseH - cropTop);
    const worksheetGray = await sharp(base)
        .extract({ left: 0, top: cropTop, width: baseW, height: cropHeight })
        .png()
        .toBuffer();
    const binarized = await sharp(worksheetGray)
        .grayscale()
        .threshold(210)
        .raw()
        .toBuffer({ resolveWithObject: true });
    const wsBw = binarized.data;
    const wsW = binarized.info.width;
    const wsH = binarized.info.height;
    const lastCol = wsW - 1;
    const lastRow = wsH - 1;
    // Worksheet stacked-fraction bars are much shorter than the page width; keep minRun small and
    // filter by ink above/below to avoid picking input-box borders.
    const bars = detectThinHorizontalBarsFromBw(wsBw, wsW, wsH, {
        minRun: Math.max(12, Math.round(wsW * 0.004)),
        maxThickness: Math.max(2, Math.round(wsH * 0.08)),
    });
    // Values that depend only on the crop size, not on the individual bar.
    const inkProbeH = Math.max(10, Math.round(wsH * 0.12));
    const padY = Math.max(1, Math.round(wsH * 0.01));
    const stacked = [];
    for (const bar of bars) {
        // Skip far-right bars (fraction bars live left of the input boxes in this layout)
        // and long horizontal lines such as answer-box borders.
        const tooFarRight = bar.cx > wsW * 0.6;
        if (tooFarRight || bar.len > wsW * 0.28) {
            continue;
        }
        // Horizontal band shared by the ink probes and the OCR crops.
        const padX = Math.max(2, Math.round(bar.len * 0.25));
        const bandX0 = clampInt(bar.x0 - padX, 0, 0, lastCol);
        const bandX1 = clampInt(bar.x1 + padX, lastCol, 0, lastCol);
        // Require ink above AND below the bar in the same X band (numerator/denominator).
        const aboveRect = {
            x0: bandX0,
            y0: clampInt(bar.y0 - inkProbeH, 0, 0, lastRow),
            x1: bandX1,
            y1: clampInt(bar.y0 - 1, lastRow, 0, lastRow),
        };
        const belowRect = {
            x0: bandX0,
            y0: clampInt(bar.y1 + 1, lastRow, 0, lastRow),
            x1: bandX1,
            y1: clampInt(bar.y1 + inkProbeH, lastRow, 0, lastRow),
        };
        const minInk = Math.max(5, Math.round(bar.len * 0.05));
        if (!rectHasInk(wsBw, wsW, wsH, aboveRect, minInk)) {
            continue;
        }
        if (!rectHasInk(wsBw, wsW, wsH, belowRect, minInk)) {
            continue;
        }
        // OCR boxes: height scales with the bar length, bounded relative to the crop height.
        const boxH = clampInt(Math.round(bar.len * 1.2), Math.round(wsH * 0.14), 14, Math.max(14, Math.round(wsH * 0.35)));
        const cropW = bandX1 - bandX0 + 1;
        const numBottom = clampInt(bar.y0 - padY, wsH, 0, wsH);
        const numTop = clampInt(numBottom - boxH, 0, 0, numBottom);
        const numH = numBottom - numTop;
        const denTop = clampInt(bar.y1 + padY, lastRow, 0, lastRow);
        const denBottom = clampInt(denTop + boxH, wsH, denTop, wsH);
        const denH = denBottom - denTop;
        if (numH < 6 || denH < 6) {
            continue;
        }
        // Both halves are read from the same X band of the gray crop; only the vertical
        // window, page-segmentation modes and lower bound differ.
        const readInteger = (top, height, psms, minValue) => ocrIntegerFromImageRegion({
            sharp,
            source: worksheetGray,
            left: bandX0,
            top,
            width: cropW,
            height,
            threshold: 200,
            thresholds: [150, 170, 190, 210],
            lang,
            langPathEffective,
            psms,
            minValue,
            maxValue: 10000,
        });
        const n = await readInteger(numTop, numH, [11, 7], 0);
        const d = await readInteger(denTop, denH, [7, 6], 1);
        if (n === null || d === null || d === 0) {
            continue;
        }
        try {
            stacked.push({ y: bar.cy, x: bar.cx, frac: normalizeFraction({ n, d }) });
        }
        catch {
            // ignore
        }
    }
    stacked.sort((p, q) => p.y - q.y || p.x - q.x);
    // Collapse candidates whose bar centre is within a small vertical tolerance of the last kept one.
    const dedupeTolY = Math.max(6, Math.round(wsH * 0.015));
    const simplified = [];
    let keptY = -1e9;
    for (const cand of stacked) {
        if (Math.abs(cand.y - keptY) >= dedupeTolY) {
            simplified.push(fractionToString(cand.frac));
            keptY = cand.y;
        }
    }
    return {
        cutoffFrac,
        cropTop,
        barCount: bars.length,
        stackedCandidateCount: stacked.length,
        simplified,
    };
};
|
|
5421
|
+
// GAIA image layouts vary; callers often provide a body cutoff, but it can be too low/high.
|
|
5422
|
+
// Start with the requested cutoff, then try a few additional cutoffs if we didn't recover
|
|
5423
|
+
// enough stacked fractions.
|
|
5424
|
+
const cutoffCandidatesRaw = [
|
|
5425
|
+
bodyBottomFrac,
|
|
5426
|
+
bodyBottomFrac - 0.1,
|
|
5427
|
+
bodyBottomFrac - 0.2,
|
|
5428
|
+
0.7,
|
|
5429
|
+
0.65,
|
|
5430
|
+
0.6,
|
|
5431
|
+
0.55,
|
|
5432
|
+
0.5,
|
|
5433
|
+
0.45,
|
|
5434
|
+
0.4,
|
|
5435
|
+
0.35,
|
|
5436
|
+
]
|
|
5437
|
+
.map((n) => clampNumber(n, 0.3, 0.95))
|
|
5438
|
+
.map((n) => Number(n.toFixed(3)));
|
|
5439
|
+
const seen = new Set();
|
|
5440
|
+
const cutoffCandidates = [];
|
|
5441
|
+
for (const c of cutoffCandidatesRaw) {
|
|
5442
|
+
if (seen.has(c))
|
|
5443
|
+
continue;
|
|
5444
|
+
seen.add(c);
|
|
5445
|
+
cutoffCandidates.push(c);
|
|
5446
|
+
}
|
|
5447
|
+
let best = await extractStackedFromCrop(cutoffCandidates[0] ?? bodyBottomFrac);
|
|
5448
|
+
if (best.simplified.length < 5) {
|
|
5449
|
+
for (const c of cutoffCandidates.slice(1)) {
|
|
5450
|
+
const cand = await extractStackedFromCrop(c);
|
|
5451
|
+
if (cand.simplified.length > best.simplified.length)
|
|
5452
|
+
best = cand;
|
|
5453
|
+
// Early stop: once we hit a "healthy" count, avoid extra OCR passes.
|
|
5454
|
+
if (best.simplified.length >= 7)
|
|
5455
|
+
break;
|
|
5456
|
+
}
|
|
5457
|
+
}
|
|
5458
|
+
const simplified = best.simplified;
|
|
5459
|
+
const all = [...bodyFractions, ...simplified];
|
|
5460
|
+
const answer = all.join(",");
|
|
5461
|
+
return {
|
|
5462
|
+
path: filePath,
|
|
5463
|
+
bodyBottomFrac,
|
|
5464
|
+
worksheetBottomFracUsed: best.cutoffFrac,
|
|
5465
|
+
bodyFractionCount: bodyFractions.length,
|
|
5466
|
+
worksheetFractionCount: simplified.length,
|
|
5467
|
+
answer,
|
|
5468
|
+
fractions: all,
|
|
5469
|
+
debug: {
|
|
5470
|
+
barCount: best.barCount,
|
|
5471
|
+
stackedCandidateCount: best.stackedCandidateCount,
|
|
5472
|
+
cutoffsTried: cutoffCandidates,
|
|
5473
|
+
},
|
|
5474
|
+
ocr: {
|
|
5475
|
+
lang,
|
|
5476
|
+
langPath: langPathEffective,
|
|
5477
|
+
preprocess,
|
|
5478
|
+
usedSharp: ocr.usedSharp,
|
|
5479
|
+
confidence: ocr.confidence,
|
|
5480
|
+
maxChars,
|
|
5481
|
+
truncated: ocr.text.length > maxChars,
|
|
5482
|
+
},
|
|
5483
|
+
};
|
|
5484
|
+
},
|
|
5485
|
+
},
|
|
5486
|
+
{
|
|
5487
|
+
name: "solve_bass_clef_age_from_image",
|
|
5488
|
+
description: "Extract bass-clef note letters from a simple staff image and compute the derived 'age' for time-words like DECADE/CENTURY. Deterministic, no network.",
|
|
5489
|
+
inputSchema: {
|
|
5490
|
+
type: "object",
|
|
5491
|
+
properties: {
|
|
5492
|
+
path: {
|
|
5493
|
+
type: "string",
|
|
5494
|
+
description: "Path to a local image file (absolute or relative to current working directory).",
|
|
5495
|
+
},
|
|
5496
|
+
maxPixels: {
|
|
5497
|
+
type: "number",
|
|
5498
|
+
description: "Safety cap on pixels to process (default: 1,000,000).",
|
|
5499
|
+
default: 1000000,
|
|
5500
|
+
},
|
|
5501
|
+
threshold: {
|
|
5502
|
+
type: "number",
|
|
5503
|
+
description: "Binarization threshold (0-255). Default 160.",
|
|
5504
|
+
default: 160,
|
|
5505
|
+
},
|
|
5506
|
+
},
|
|
5507
|
+
required: ["path"],
|
|
5508
|
+
},
|
|
5509
|
+
handler: async (args) => {
|
|
5510
|
+
const filePath = resolveLocalPath(args?.path);
|
|
5511
|
+
if (!existsSync(filePath))
|
|
5512
|
+
throw new Error(`File not found: ${filePath}`);
|
|
5513
|
+
const sharp = await getSharpOptional();
|
|
5514
|
+
if (!sharp)
|
|
5515
|
+
throw new Error("Missing optional dependency: sharp. Install it to use music staff parsing.");
|
|
5516
|
+
const maxPixels = clampInt(args?.maxPixels, 1000000, 10000, 100_000_000);
|
|
5517
|
+
const threshold = clampInt(args?.threshold, 160, 1, 254);
|
|
5518
|
+
const meta = await sharp(filePath).metadata();
|
|
5519
|
+
const w = meta.width ?? 0;
|
|
5520
|
+
const h = meta.height ?? 0;
|
|
5521
|
+
if (!w || !h)
|
|
5522
|
+
throw new Error("Unable to read image dimensions");
|
|
5523
|
+
// These staff images can be tiny (e.g. ~300px wide). Upscale to make line/note detection robust.
|
|
5524
|
+
let scale = 1;
|
|
5525
|
+
if (w < 600 || h < 140) {
|
|
5526
|
+
scale = Math.max(2, Math.min(10, Math.ceil(1800 / w)));
|
|
5527
|
+
}
|
|
5528
|
+
while (w * scale * h * scale > maxPixels && scale > 1)
|
|
5529
|
+
scale--;
|
|
5530
|
+
if (w * scale * h * scale > maxPixels) {
|
|
5531
|
+
throw new Error(`Refusing huge image (${w}x${h}) even after scaling checks (scale=${scale}) (maxPixels=${maxPixels})`);
|
|
5532
|
+
}
|
|
5533
|
+
let pipeline = sharp(filePath);
|
|
5534
|
+
if (scale > 1) {
|
|
5535
|
+
pipeline = pipeline.resize({
|
|
5536
|
+
width: w * scale,
|
|
5537
|
+
height: h * scale,
|
|
5538
|
+
kernel: "nearest",
|
|
5539
|
+
});
|
|
5540
|
+
}
|
|
5541
|
+
const { data, info } = await pipeline
|
|
5542
|
+
.grayscale()
|
|
5543
|
+
.threshold(threshold)
|
|
5544
|
+
.raw()
|
|
5545
|
+
.toBuffer({ resolveWithObject: true });
|
|
5546
|
+
const width = info.width;
|
|
5547
|
+
const height = info.height;
|
|
5548
|
+
const idxOf = (x, y) => y * width + x;
|
|
5549
|
+
const isBlack = (x, y) => data[idxOf(x, y)] < 128;
|
|
5550
|
+
// Horizontal projection to find staff lines.
|
|
5551
|
+
const rowCounts = new Array(height).fill(0);
|
|
5552
|
+
let maxRow = 0;
|
|
5553
|
+
for (let y = 0; y < height; y++) {
|
|
5554
|
+
let c = 0;
|
|
5555
|
+
for (let x = 0; x < width; x++)
|
|
5556
|
+
if (isBlack(x, y))
|
|
5557
|
+
c++;
|
|
5558
|
+
rowCounts[y] = c;
|
|
5559
|
+
if (c > maxRow)
|
|
5560
|
+
maxRow = c;
|
|
5561
|
+
}
|
|
5562
|
+
const lineThresh = Math.max(5, Math.floor(maxRow * 0.55));
|
|
5563
|
+
const lineYs = [];
|
|
5564
|
+
for (let y = 0; y < height; y++) {
|
|
5565
|
+
if (rowCounts[y] < lineThresh)
|
|
5566
|
+
continue;
|
|
5567
|
+
let y2 = y;
|
|
5568
|
+
while (y2 + 1 < height && rowCounts[y2 + 1] >= lineThresh)
|
|
5569
|
+
y2++;
|
|
5570
|
+
lineYs.push((y + y2) / 2);
|
|
5571
|
+
y = y2;
|
|
5572
|
+
}
|
|
5573
|
+
if (lineYs.length < 5)
|
|
5574
|
+
throw new Error(`Failed to detect 5 staff lines (found ${lineYs.length})`);
|
|
5575
|
+
lineYs.sort((a, b) => a - b);
|
|
5576
|
+
// Choose the best contiguous group of 5 lines by spacing consistency.
|
|
5577
|
+
let staff = lineYs.slice(0, 5);
|
|
5578
|
+
if (lineYs.length > 5) {
|
|
5579
|
+
let bestScore = Number.POSITIVE_INFINITY;
|
|
5580
|
+
let best = null;
|
|
5581
|
+
for (let i = 0; i + 4 < lineYs.length; i++) {
|
|
5582
|
+
const cand = lineYs.slice(i, i + 5);
|
|
5583
|
+
const spacings = cand.slice(1).map((y, j) => y - cand[j]);
|
|
5584
|
+
const avg = spacings.reduce((s, n) => s + n, 0) / spacings.length;
|
|
5585
|
+
const variance = spacings.reduce((s, n) => s + (n - avg) * (n - avg), 0) / spacings.length;
|
|
5586
|
+
// Penalize implausible spacing (too tight or too large).
|
|
5587
|
+
const spacingPenalty = avg < 2 ? 1e6 : avg > height / 4 ? 1e6 : 0;
|
|
5588
|
+
const score = variance + spacingPenalty;
|
|
5589
|
+
if (score < bestScore) {
|
|
5590
|
+
bestScore = score;
|
|
5591
|
+
best = cand;
|
|
5592
|
+
}
|
|
5593
|
+
}
|
|
5594
|
+
if (best)
|
|
5595
|
+
staff = best;
|
|
5596
|
+
}
|
|
5597
|
+
const spacings = staff.slice(1).map((y, i) => y - staff[i]);
|
|
5598
|
+
const lineSpacing = spacings.length ? spacings.reduce((s, n) => s + n, 0) / spacings.length : 1;
|
|
5599
|
+
// Remove staff lines into a mutable buffer. Keep pixels that belong to note heads
|
|
5600
|
+
// (detected by having black pixels both above or below the line).
|
|
5601
|
+
const buf = Buffer.from(data);
|
|
5602
|
+
const band = Math.max(1, Math.round(lineSpacing * 0.12));
|
|
5603
|
+
const probe = Math.max(1, Math.round(lineSpacing * 0.35));
|
|
5604
|
+
for (const ly of staff) {
|
|
5605
|
+
const yCenter = Math.round(ly);
|
|
5606
|
+
const y0 = Math.max(0, yCenter - band);
|
|
5607
|
+
const y1 = Math.min(height - 1, yCenter + band);
|
|
5608
|
+
for (let yy = y0; yy <= y1; yy++) {
|
|
5609
|
+
for (let x = 0; x < width; x++) {
|
|
5610
|
+
const idx = idxOf(x, yy);
|
|
5611
|
+
if (buf[idx] >= 128)
|
|
5612
|
+
continue;
|
|
5613
|
+
const aboveY = yy - probe;
|
|
5614
|
+
const belowY = yy + probe;
|
|
5615
|
+
const above = aboveY >= 0 ? buf[idxOf(x, aboveY)] : 255;
|
|
5616
|
+
const below = belowY < height ? buf[idxOf(x, belowY)] : 255;
|
|
5617
|
+
if (above >= 128 && below >= 128) {
|
|
5618
|
+
buf[idx] = 255;
|
|
5619
|
+
}
|
|
5620
|
+
}
|
|
5621
|
+
}
|
|
5622
|
+
}
|
|
5623
|
+
const visited = new Uint8Array(width * height);
|
|
5624
|
+
const inBounds = (x, y) => x >= 0 && x < width && y >= 0 && y < height;
|
|
5625
|
+
const neighbors = [
|
|
5626
|
+
[1, 0],
|
|
5627
|
+
[-1, 0],
|
|
5628
|
+
[0, 1],
|
|
5629
|
+
[0, -1],
|
|
5630
|
+
[1, 1],
|
|
5631
|
+
[-1, -1],
|
|
5632
|
+
[1, -1],
|
|
5633
|
+
[-1, 1],
|
|
5634
|
+
];
|
|
5635
|
+
const components = [];
|
|
5636
|
+
for (let y = 0; y < height; y++) {
|
|
5637
|
+
for (let x = 0; x < width; x++) {
|
|
5638
|
+
const startIdx = idxOf(x, y);
|
|
5639
|
+
if (visited[startIdx])
|
|
5640
|
+
continue;
|
|
5641
|
+
visited[startIdx] = 1;
|
|
5642
|
+
if (buf[startIdx] >= 128)
|
|
5643
|
+
continue;
|
|
5644
|
+
let area = 0;
|
|
5645
|
+
let sx = 0;
|
|
5646
|
+
let sy = 0;
|
|
5647
|
+
let x0 = x, x1 = x, y0 = y, y1 = y;
|
|
5648
|
+
const qx = [x];
|
|
5649
|
+
const qy = [y];
|
|
5650
|
+
for (let qi = 0; qi < qx.length; qi++) {
|
|
5651
|
+
const px = qx[qi];
|
|
5652
|
+
const py = qy[qi];
|
|
5653
|
+
const pidx = idxOf(px, py);
|
|
5654
|
+
if (buf[pidx] >= 128)
|
|
5655
|
+
continue;
|
|
5656
|
+
area++;
|
|
5657
|
+
sx += px;
|
|
5658
|
+
sy += py;
|
|
5659
|
+
if (px < x0)
|
|
5660
|
+
x0 = px;
|
|
5661
|
+
if (px > x1)
|
|
5662
|
+
x1 = px;
|
|
5663
|
+
if (py < y0)
|
|
5664
|
+
y0 = py;
|
|
5665
|
+
if (py > y1)
|
|
5666
|
+
y1 = py;
|
|
5667
|
+
for (const [dx, dy] of neighbors) {
|
|
5668
|
+
const nx = px + dx;
|
|
5669
|
+
const ny = py + dy;
|
|
5670
|
+
if (!inBounds(nx, ny))
|
|
5671
|
+
continue;
|
|
5672
|
+
const nidx = idxOf(nx, ny);
|
|
5673
|
+
if (visited[nidx])
|
|
5674
|
+
continue;
|
|
5675
|
+
visited[nidx] = 1;
|
|
5676
|
+
if (buf[nidx] < 128) {
|
|
5677
|
+
qx.push(nx);
|
|
5678
|
+
qy.push(ny);
|
|
5679
|
+
}
|
|
5680
|
+
}
|
|
5681
|
+
}
|
|
5682
|
+
const bw = x1 - x0 + 1;
|
|
5683
|
+
const bh = y1 - y0 + 1;
|
|
5684
|
+
// Keep note-head-ish blobs. Scale thresholds by staff spacing.
|
|
5685
|
+
const minArea = Math.max(20, Math.floor(lineSpacing * lineSpacing * 0.35));
|
|
5686
|
+
const maxArea = Math.max(minArea, Math.floor(lineSpacing * lineSpacing * 30));
|
|
5687
|
+
if (area < minArea || area > maxArea)
|
|
5688
|
+
continue;
|
|
5689
|
+
const minDim = Math.max(6, Math.floor(lineSpacing * 0.6));
|
|
5690
|
+
const maxDim = Math.max(minDim, Math.floor(lineSpacing * 4.5));
|
|
5691
|
+
if (bw < minDim || bh < minDim)
|
|
5692
|
+
continue;
|
|
5693
|
+
if (bw > maxDim || bh > maxDim)
|
|
5694
|
+
continue;
|
|
5695
|
+
const cx = sx / area;
|
|
5696
|
+
const cy = sy / area;
|
|
5697
|
+
components.push({ area, cx, cy, x0, y0, x1, y1 });
|
|
5698
|
+
}
|
|
5699
|
+
}
|
|
5700
|
+
let notes = components;
|
|
5701
|
+
if (notes.length > 12) {
|
|
5702
|
+
// If noise produced extra small blobs, keep the most prominent ones.
|
|
5703
|
+
notes = [...notes].sort((a, b) => b.area - a.area).slice(0, 12);
|
|
5704
|
+
}
|
|
5705
|
+
notes = notes.sort((a, b) => a.cx - b.cx);
|
|
5706
|
+
if (!notes.length)
|
|
5707
|
+
throw new Error("No note-like blobs detected");
|
|
5708
|
+
const staffLineCount = staff.length;
|
|
5709
|
+
const noteCount = notes.length;
|
|
5710
|
+
const notesOnLines = notes.filter((n) => {
|
|
5711
|
+
const closest = staff.reduce((best, yy) => Math.abs(yy - n.cy) < Math.abs(best - n.cy) ? yy : best, staff[0]);
|
|
5712
|
+
return Math.abs(closest - n.cy) <= lineSpacing * 0.18;
|
|
5713
|
+
}).length;
|
|
5714
|
+
const bottomLineY = Math.max(...staff);
|
|
5715
|
+
const step = lineSpacing / 2;
|
|
5716
|
+
const lettersSeq = ["G", "A", "B", "C", "D", "E", "F"];
|
|
5717
|
+
const noteLetters = notes.map((n) => {
|
|
5718
|
+
const pos = Math.round((bottomLineY - n.cy) / step);
|
|
5719
|
+
const idx = ((pos % 7) + 7) % 7;
|
|
5720
|
+
return lettersSeq[idx];
|
|
5721
|
+
});
|
|
5722
|
+
const word = noteLetters.join("");
|
|
5723
|
+
const wordLower = word.toLowerCase();
|
|
5724
|
+
const timeWordValue = {
|
|
5725
|
+
decade: 10,
|
|
5726
|
+
score: 20,
|
|
5727
|
+
century: 100,
|
|
5728
|
+
millennium: 1000,
|
|
5729
|
+
year: 1,
|
|
5730
|
+
};
|
|
5731
|
+
const value = timeWordValue[wordLower];
|
|
5732
|
+
if (typeof value !== "number")
|
|
5733
|
+
throw new Error(`Unrecognized time-word from notes: ${word}`);
|
|
5734
|
+
const derived = staffLineCount + noteCount - notesOnLines;
|
|
5735
|
+
const age = value * derived;
|
|
5736
|
+
return {
|
|
5737
|
+
path: filePath,
|
|
5738
|
+
staffLineCount,
|
|
5739
|
+
noteCount,
|
|
5740
|
+
notesOnLines,
|
|
5741
|
+
word,
|
|
5742
|
+
wordValue: value,
|
|
5743
|
+
derived,
|
|
5744
|
+
age,
|
|
5745
|
+
answer: String(age),
|
|
5746
|
+
};
|
|
5747
|
+
},
|
|
5748
|
+
},
|
|
5749
|
+
{
|
|
5750
|
+
name: "solve_storage_upgrade_cost_per_file_from_image",
|
|
5751
|
+
description: "OCR plan tiers from an image, compute required storage from equally-sized file counts, and return average incremental $/file beyond the current plan limit. Deterministic, no network.",
|
|
5752
|
+
inputSchema: {
|
|
5753
|
+
type: "object",
|
|
5754
|
+
properties: {
|
|
5755
|
+
path: {
|
|
5756
|
+
type: "string",
|
|
5757
|
+
description: "Path to a local image file (absolute or relative to current working directory).",
|
|
5758
|
+
},
|
|
5759
|
+
currentPlanName: {
|
|
5760
|
+
type: "string",
|
|
5761
|
+
description: "Name of the current plan (e.g. 'Standard').",
|
|
5762
|
+
},
|
|
5763
|
+
filesUploaded: {
|
|
5764
|
+
type: "number",
|
|
5765
|
+
description: "Number of equally-sized files already uploaded.",
|
|
5766
|
+
},
|
|
5767
|
+
overLimitGb: {
|
|
5768
|
+
type: "number",
|
|
5769
|
+
description: "How many GB over the current plan limit after uploading filesUploaded.",
|
|
5770
|
+
},
|
|
5771
|
+
additionalFiles: {
|
|
5772
|
+
type: "number",
|
|
5773
|
+
description: "Additional equally-sized files to upload.",
|
|
5774
|
+
},
|
|
5775
|
+
decimals: {
|
|
5776
|
+
type: "number",
|
|
5777
|
+
description: "Decimal places to round to (default: 2).",
|
|
5778
|
+
default: 2,
|
|
5779
|
+
},
|
|
5780
|
+
lang: {
|
|
5781
|
+
type: "string",
|
|
5782
|
+
description: "Tesseract language code (default: eng).",
|
|
5783
|
+
default: "eng",
|
|
5784
|
+
},
|
|
5785
|
+
langPath: {
|
|
5786
|
+
type: "string",
|
|
5787
|
+
description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
|
|
5788
|
+
},
|
|
5789
|
+
preprocess: {
|
|
5790
|
+
type: "boolean",
|
|
5791
|
+
description: "If true (default), basic sharp preprocessing is applied before OCR.",
|
|
5792
|
+
default: true,
|
|
5793
|
+
},
|
|
5794
|
+
maxChars: {
|
|
5795
|
+
type: "number",
|
|
5796
|
+
description: "Maximum OCR text characters to consider.",
|
|
5797
|
+
default: 60000,
|
|
5798
|
+
},
|
|
5799
|
+
},
|
|
5800
|
+
required: ["path", "currentPlanName", "filesUploaded", "overLimitGb", "additionalFiles"],
|
|
5801
|
+
},
|
|
5802
|
+
handler: async (args) => {
|
|
5803
|
+
const filePath = resolveLocalPath(args?.path);
|
|
5804
|
+
if (!existsSync(filePath))
|
|
5805
|
+
throw new Error(`File not found: ${filePath}`);
|
|
5806
|
+
const currentPlanName = String(args?.currentPlanName ?? "").trim();
|
|
5807
|
+
if (!currentPlanName)
|
|
5808
|
+
throw new Error("currentPlanName is required");
|
|
5809
|
+
const filesUploaded = toNumberOrNull(args?.filesUploaded);
|
|
5810
|
+
const overLimitGb = toNumberOrNull(args?.overLimitGb);
|
|
5811
|
+
const additionalFiles = toNumberOrNull(args?.additionalFiles);
|
|
5812
|
+
if (filesUploaded === null || overLimitGb === null || additionalFiles === null) {
|
|
5813
|
+
throw new Error("filesUploaded, overLimitGb, and additionalFiles must be numbers");
|
|
5814
|
+
}
|
|
5815
|
+
if (filesUploaded <= 0)
|
|
5816
|
+
throw new Error("filesUploaded must be > 0");
|
|
5817
|
+
if (overLimitGb < 0)
|
|
5818
|
+
throw new Error("overLimitGb must be >= 0");
|
|
5819
|
+
if (additionalFiles < 0)
|
|
5820
|
+
throw new Error("additionalFiles must be >= 0");
|
|
5821
|
+
const decimals = clampInt(args?.decimals, 2, 0, 6);
|
|
5822
|
+
const lang = String(args?.lang ?? "eng").trim() || "eng";
|
|
5823
|
+
const preprocess = args?.preprocess !== false;
|
|
5824
|
+
const maxChars = clampInt(args?.maxChars, 60000, 5000, 200000);
|
|
5825
|
+
const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
|
|
5826
|
+
const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
|
|
5827
|
+
const langPathEffective = langPathArg
|
|
5828
|
+
? resolveLocalPath(langPathArg)
|
|
5829
|
+
: existsSync(defaultLangPath)
|
|
5830
|
+
? defaultLangPath
|
|
5831
|
+
: null;
|
|
5832
|
+
const ocr = await ocrRecognizeImageFile({ filePath, lang, langPathEffective, preprocess });
|
|
5833
|
+
const text = ocr.text.slice(0, maxChars);
|
|
5834
|
+
const lower = text.toLowerCase();
|
|
5835
|
+
const orderedKeys = ["standard", "plus", "premium"].filter((k) => lower.includes(k));
|
|
5836
|
+
const plans = [];
|
|
5837
|
+
for (let i = 0; i < orderedKeys.length; i++) {
|
|
5838
|
+
const key = orderedKeys[i];
|
|
5839
|
+
const start = lower.indexOf(key);
|
|
5840
|
+
const end = i + 1 < orderedKeys.length ? lower.indexOf(orderedKeys[i + 1], start + 1) : lower.length;
|
|
5841
|
+
const block = text.slice(start, end);
|
|
5842
|
+
const priceM = block.match(/\$\s*(\d+(?:\.\d+)?)\s*\/?\s*month/i);
|
|
5843
|
+
const storageM = block.match(/\b(\d+(?:\.\d+)?)\s*tb\b/i);
|
|
5844
|
+
if (!priceM || !storageM)
|
|
5845
|
+
continue;
|
|
5846
|
+
const pricePerMonth = Number.parseFloat(priceM[1]);
|
|
5847
|
+
const storageTb = Number.parseFloat(storageM[1]);
|
|
5848
|
+
if (!Number.isFinite(pricePerMonth) || !Number.isFinite(storageTb))
|
|
5849
|
+
continue;
|
|
5850
|
+
plans.push({
|
|
5851
|
+
name: key[0].toUpperCase() + key.slice(1),
|
|
5852
|
+
pricePerMonth,
|
|
5853
|
+
storageTb,
|
|
5854
|
+
});
|
|
5855
|
+
}
|
|
5856
|
+
if (!plans.length)
|
|
5857
|
+
throw new Error("Failed to parse plans from OCR");
|
|
5858
|
+
const current = plans.find((p) => p.name.toLowerCase() === currentPlanName.toLowerCase());
|
|
5859
|
+
if (!current)
|
|
5860
|
+
throw new Error(`Current plan not found in OCR plans: ${currentPlanName}`);
|
|
5861
|
+
const currentLimitGb = current.storageTb * 1000;
|
|
5862
|
+
const usedGb = currentLimitGb + overLimitGb;
|
|
5863
|
+
const fileSizeGb = usedGb / filesUploaded;
|
|
5864
|
+
const totalFiles = filesUploaded + additionalFiles;
|
|
5865
|
+
const requiredGb = totalFiles * fileSizeGb;
|
|
5866
|
+
const needed = [...plans].sort((a, b) => a.storageTb - b.storageTb).find((p) => p.storageTb * 1000 >= requiredGb);
|
|
5867
|
+
if (!needed)
|
|
5868
|
+
throw new Error("No plan tier can satisfy required storage");
|
|
5869
|
+
const upgradeCost = needed.pricePerMonth - current.pricePerMonth;
|
|
5870
|
+
const includedFilesCapacity = currentLimitGb / fileSizeGb;
|
|
5871
|
+
const filesOverLimit = Math.max(0, totalFiles - includedFilesCapacity);
|
|
5872
|
+
const costPerFile = filesOverLimit > 0 ? upgradeCost / filesOverLimit : 0;
|
|
5873
|
+
const rounded = Number(costPerFile.toFixed(decimals));
|
|
5874
|
+
return {
|
|
5875
|
+
path: filePath,
|
|
5876
|
+
plans,
|
|
5877
|
+
current,
|
|
5878
|
+
needed,
|
|
5879
|
+
fileSizeGb: Number(fileSizeGb.toFixed(4)),
|
|
5880
|
+
requiredGb: Number(requiredGb.toFixed(2)),
|
|
5881
|
+
filesOverLimit: Number(filesOverLimit.toFixed(4)),
|
|
5882
|
+
upgradeCost: Number(upgradeCost.toFixed(2)),
|
|
5883
|
+
costPerFile: rounded,
|
|
5884
|
+
answer: rounded.toFixed(decimals),
|
|
5885
|
+
};
|
|
5886
|
+
},
|
|
5887
|
+
},
|
|
2853
5888
|
{
|
|
2854
5889
|
name: "transcribe_audio_file",
|
|
2855
5890
|
description: "Transcribe a local audio file (MP3/WAV/etc) to text using faster-whisper via Python. Deterministic, no network.",
|
|
@@ -2968,4 +6003,8 @@ export const localFileTools = [
|
|
|
2968
6003
|
},
|
|
2969
6004
|
},
|
|
2970
6005
|
];
|
|
6006
|
+
/**
 * General-purpose local file parsing tools (19 tools, per the original note):
 * every entry of _ALL_LOCAL_FILE_TOOLS whose name is not in GAIA_SOLVER_NAMES.
 */
export const localFileTools = _ALL_LOCAL_FILE_TOOLS.filter(({ name }) => !GAIA_SOLVER_NAMES.has(name));
/**
 * Specialized GAIA media image solver tools (6 tools, per the original note):
 * every entry of _ALL_LOCAL_FILE_TOOLS whose name is in GAIA_SOLVER_NAMES.
 */
export const gaiaMediaSolvers = _ALL_LOCAL_FILE_TOOLS.filter(({ name }) => GAIA_SOLVER_NAMES.has(name));
|
|
2971
6010
|
//# sourceMappingURL=localFileTools.js.map
|