nodebench-mcp 2.11.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/NODEBENCH_AGENTS.md +809 -809
  2. package/README.md +443 -431
  3. package/STYLE_GUIDE.md +477 -477
  4. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +153 -5
  5. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
  6. package/dist/__tests__/helpers/textLlm.d.ts +1 -1
  7. package/dist/__tests__/presetRealWorldBench.test.d.ts +1 -0
  8. package/dist/__tests__/presetRealWorldBench.test.js +839 -0
  9. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -0
  10. package/dist/__tests__/tools.test.js +8 -5
  11. package/dist/__tests__/tools.test.js.map +1 -1
  12. package/dist/__tests__/toolsetGatingEval.test.js +11 -11
  13. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  14. package/dist/index.js +397 -327
  15. package/dist/index.js.map +1 -1
  16. package/dist/tools/agentBootstrapTools.js +258 -258
  17. package/dist/tools/boilerplateTools.js +144 -144
  18. package/dist/tools/cCompilerBenchmarkTools.js +33 -33
  19. package/dist/tools/documentationTools.js +59 -59
  20. package/dist/tools/flywheelTools.js +6 -6
  21. package/dist/tools/learningTools.js +26 -26
  22. package/dist/tools/localFileTools.d.ts +3 -0
  23. package/dist/tools/localFileTools.js +3164 -125
  24. package/dist/tools/localFileTools.js.map +1 -1
  25. package/dist/tools/reconTools.js +31 -31
  26. package/dist/tools/selfEvalTools.js +44 -44
  27. package/dist/tools/sessionMemoryTools.d.ts +15 -0
  28. package/dist/tools/sessionMemoryTools.js +348 -0
  29. package/dist/tools/sessionMemoryTools.js.map +1 -0
  30. package/dist/tools/toolRegistry.d.ts +4 -0
  31. package/dist/tools/toolRegistry.js +229 -0
  32. package/dist/tools/toolRegistry.js.map +1 -1
  33. package/dist/tools/verificationTools.js +41 -41
  34. package/dist/tools/visionTools.js +17 -17
  35. package/dist/tools/webTools.js +18 -18
  36. package/package.json +101 -101
@@ -91,6 +91,67 @@ function toIntegerOrNull(value) {
91
91
  const n = Number.parseInt(m[0], 10);
92
92
  return Number.isFinite(n) ? n : null;
93
93
  }
94
/**
 * Extract integers from OCR text by splitting every digit run into fixed-size chunks.
 *
 * Runs no longer than `chunkSize` are parsed as a single number. Longer runs are cut
 * into `chunkSize`-digit pieces from the left; trailing digits that do not fill a
 * whole chunk are dropped. For the default 2-digit chunking, 3-digit runs with a
 * leading or trailing zero are first reduced to two digits ("074" -> "74",
 * "580" -> "58"), and odd-length runs are chunked from offset 0 or offset 1,
 * whichever yields more in-range values (offset 0 wins ties).
 *
 * @param {unknown} text - Text to scan; null/undefined is treated as "".
 * @param {{chunkSize?: number, min?: number, max?: number}} [opts]
 *   chunkSize: digits per chunk (default 2; values that truncate below 1 fall back to 2).
 *   min / max: inclusive range of accepted values (defaults 0 and 200).
 * @returns {number[]} Accepted integers in reading order.
 */
function extractChunkedIntsFromText(text, opts) {
    // Truncate BEFORE validating: a fractional size such as 0.5 would otherwise become a
    // zero-width chunk and the generic chunking loop below would never advance.
    const requestedChunk = typeof opts?.chunkSize === "number" ? Math.trunc(opts.chunkSize) : Number.NaN;
    const chunkSize = requestedChunk >= 1 ? requestedChunk : 2;
    const min = typeof opts?.min === "number" && Number.isFinite(opts.min) ? opts.min : 0;
    const max = typeof opts?.max === "number" && Number.isFinite(opts.max) ? opts.max : 200;
    const inRange = (n) => Number.isFinite(n) && n >= min && n <= max;
    const countInRange = (nums) => nums.filter(inRange).length;
    // Cut `digits` into `size`-wide pieces starting at `start`; an incomplete tail is ignored.
    const chunk = (digits, start, size) => {
        const nums = [];
        for (let i = start; i + size <= digits.length; i += size) {
            nums.push(Number.parseInt(digits.slice(i, i + size), 10));
        }
        return nums;
    };
    const out = [];
    const pushIfOk = (n) => {
        if (inRange(n))
            out.push(n);
    };
    const runs = String(text ?? "").match(/\d+/g) ?? [];
    for (const run of runs) {
        let s = run;
        // Short or exact-width runs are a single token (isolated digits are common OCR noise).
        if (s.length <= chunkSize) {
            pushIfOk(Number.parseInt(s, 10));
            continue;
        }
        if (chunkSize === 2) {
            // Fix common leading/trailing zero artifacts: "074" -> "74", "580" -> "58".
            if (s.length === 3 && s.startsWith("0"))
                s = s.slice(1);
            if (s.length === 3 && s.endsWith("0"))
                s = s.slice(0, 2);
            const aligned = chunk(s, 0, 2);
            let best = aligned;
            if (s.length % 2 === 1) {
                // Odd-length run: prefer the alignment that yields more in-range values.
                const shifted = chunk(s, 1, 2);
                if (countInRange(shifted) > countInRange(aligned))
                    best = shifted;
            }
            for (const n of best)
                pushIfOk(n);
            continue;
        }
        // OCR sometimes concatenates adjacent numbers (e.g. "247428"); split into fixed-size chunks.
        for (const n of chunk(s, 0, chunkSize))
            pushIfOk(n);
    }
    return out;
}
94
155
  function gcdInt(a, b) {
95
156
  let x = Math.abs(Math.trunc(a));
96
157
  let y = Math.abs(Math.trunc(b));
@@ -405,35 +466,189 @@ function toOcrBbox(raw) {
405
466
  return null;
406
467
  return { x0, y0, x1, y1 };
407
468
  }
408
- async function ocrRecognizeBuffer(args) {
409
- const tesseract = await getTesseract();
410
- const recognize = tesseract?.recognize;
411
- if (typeof recognize !== "function") {
412
- throw new Error("tesseract.js missing recognize() export (unsupported version)");
469
/**
 * Parse Tesseract TSV output into word and line records.
 *
 * Only level-5 (word) rows with numeric geometry and non-empty text are used. Lines
 * are rebuilt by grouping words that share the same page/block/paragraph/line
 * columns; each line's text is its words ordered by left edge and joined with
 * spaces, its bbox is the union of the word boxes, and its confidence is the mean
 * of the word confidences (null when none is numeric).
 *
 * A header row is NOT assumed: any row whose first column is not an integer (such
 * as a "level\tpage_num..." header) is skipped by the level check, so headerless
 * TSV no longer loses its first data row.
 *
 * @param {unknown} tsv - Raw TSV text; null/undefined/blank yields empty results.
 * @returns {{words: Array<{text: string, confidence: number|null, bbox: object}>,
 *            lines: Array<{text: string, confidence: number|null, bbox: object}>}}
 */
function parseTesseractTsv(tsv) {
    const text = String(tsv ?? "").trim();
    if (!text)
        return { words: [], lines: [] };
    const wordsRaw = [];
    for (const row of text.split(/\r?\n/)) {
        if (!row)
            continue;
        const cols = row.split("\t");
        if (cols.length < 12)
            continue;
        // Non-numeric levels (header row) and non-word levels are both ignored here.
        if (Number.parseInt(cols[0], 10) !== 5)
            continue;
        const [left, top, width, height] = cols.slice(6, 10).map((c) => Number.parseInt(c, 10));
        if (![left, top, width, height].every(Number.isFinite))
            continue;
        // The text column may itself contain tabs; re-join everything after column 11.
        const wordText = cols.slice(11).join("\t").trim();
        if (!wordText)
            continue;
        const confN = Number.parseFloat(cols[10]);
        wordsRaw.push({
            key: `${cols[1]}:${cols[2]}:${cols[3]}:${cols[4]}`,
            left,
            top,
            width,
            height,
            conf: Number.isFinite(confN) ? confN : null,
            text: wordText,
        });
    }
    const words = wordsRaw.map((w) => ({
        text: w.text,
        confidence: w.conf,
        bbox: { x0: w.left, y0: w.top, x1: w.left + w.width, y1: w.top + w.height },
    }));
    // Build lines by grouping words with the same (page,block,par,line) key.
    const byLine = new Map();
    for (const w of wordsRaw) {
        const group = byLine.get(w.key);
        if (group)
            group.push(w);
        else
            byLine.set(w.key, [w]);
    }
    const lines = [];
    for (const group of byLine.values()) {
        const sorted = [...group].sort((a, b) => a.left - b.left);
        const lineText = sorted.map((x) => x.text).join(" ").trim();
        if (!lineText)
            continue;
        const bbox = { x0: Infinity, y0: Infinity, x1: -Infinity, y1: -Infinity };
        let confSum = 0;
        let confCount = 0;
        for (const w of sorted) {
            bbox.x0 = Math.min(bbox.x0, w.left);
            bbox.y0 = Math.min(bbox.y0, w.top);
            bbox.x1 = Math.max(bbox.x1, w.left + w.width);
            bbox.y1 = Math.max(bbox.y1, w.top + w.height);
            if (typeof w.conf === "number" && Number.isFinite(w.conf)) {
                confSum += w.conf;
                confCount += 1;
            }
        }
        lines.push({ text: lineText, confidence: confCount ? confSum / confCount : null, bbox });
    }
    // Keep a stable reading order: top-to-bottom, then left-to-right.
    lines.sort((a, b) => a.bbox.y0 - b.bbox.y0 || a.bbox.x0 - b.bbox.x0);
    return { words, lines };
}
546
// Cache of OCR worker entries, keyed by the string produced by ocrWorkerKey().
const OCR_WORKER_POOL = new Map();
// Delay in milliseconds passed to setTimeout before an idle pooled worker is torn down.
const OCR_WORKER_IDLE_TERMINATE_MS = 3_000;
548
/**
 * Build the pool key for a worker configuration.
 *
 * @param {string} lang - Language identifier handed to the worker.
 * @param {string|null|undefined} langPathEffective - Language data path; nullish becomes "".
 * @returns {string} "<lang>::<langPath>".
 */
function ocrWorkerKey(lang, langPathEffective) {
    const langPath = langPathEffective ?? "";
    return `${lang}::${langPath}`;
}
551
/**
 * Return the pooled worker entry for `args.lang` / `args.langPathEffective`,
 * creating and registering a new one when the pool has none.
 *
 * A new entry starts worker creation immediately (`workerPromise`) and carries the
 * bookkeeping fields used by the pool: `chain` (a resolved promise), `activeCount`
 * (0) and `idleTimer` (null).
 *
 * NOTE(review): the entry is stored before worker creation settles, so a rejected
 * `workerPromise` stays in the pool until something deletes the key.
 *
 * @param {{lang: string, langPathEffective?: string}} args
 * @returns {Promise<{key: string, entry: object}>}
 */
async function getOrCreateOcrWorkerEntry(args) {
    const key = ocrWorkerKey(args.lang, args.langPathEffective);
    const cached = OCR_WORKER_POOL.get(key);
    if (cached) {
        return { key, entry: cached };
    }
    const startWorker = async () => {
        const tesseract = await getTesseract();
        const createWorker = tesseract?.createWorker;
        if (typeof createWorker !== "function") {
            throw new Error("tesseract.js missing createWorker() export (unsupported version)");
        }
        // Logging is silenced with a no-op logger.
        const logger = () => { };
        const workerOptions = args.langPathEffective
            ? { langPath: args.langPathEffective, logger }
            : { logger };
        // createWorker() returns a wrapper that manages a worker_threads Worker under the hood.
        return await createWorker(args.lang, undefined, workerOptions);
    };
    const entry = {
        workerPromise: startWorker(),
        chain: Promise.resolve(),
        activeCount: 0,
        idleTimer: null,
    };
    OCR_WORKER_POOL.set(key, entry);
    return { key, entry };
}
579
/**
 * (Re)arm the idle timer for a pooled worker entry.
 *
 * Any timer already stored on the entry is cancelled. When the new timer fires,
 * the key is removed from the pool and the worker is terminated; failures while
 * awaiting or terminating the worker are deliberately ignored.
 *
 * @param {string} key - Pool key of the entry.
 * @param {object} entry - Pool entry carrying `workerPromise` and `idleTimer`.
 */
function scheduleOcrWorkerIdleTerminate(key, entry) {
    if (entry.idleTimer) {
        clearTimeout(entry.idleTimer);
    }
    const teardown = async () => {
        OCR_WORKER_POOL.delete(key);
        try {
            const worker = await entry.workerPromise;
            if (worker && typeof worker.terminate === "function") {
                await worker.terminate();
            }
        }
        catch {
            // ignore
        }
    };
    // Fire-and-forget; do not block the event loop on teardown.
    entry.idleTimer = setTimeout(() => {
        void teardown();
    }, OCR_WORKER_IDLE_TERMINATE_MS);
    // Keep the timer from preventing exit, while the worker thread still keeps the loop alive.
    entry.idleTimer?.unref?.();
}
599
/**
 * Run `fn(worker)` on the pooled worker for `args`, one job at a time per entry.
 *
 * Jobs are appended to the entry's promise chain so that each starts only after
 * the previous one has settled (successfully or not). While jobs are outstanding
 * the idle timer is cancelled; once the last one settles, idle termination is
 * scheduled again.
 *
 * @param {{lang: string, langPathEffective?: string}} args - Worker selection.
 * @param {(worker: object) => any} fn - Job to run with the resolved worker.
 * @returns {Promise<any>} Settles with the outcome of `fn`.
 */
async function withOcrWorker(args, fn) {
    const { key, entry } = await getOrCreateOcrWorkerEntry(args);
    // If we were about to tear down, keep the worker alive for this request burst.
    if (entry.idleTimer) {
        clearTimeout(entry.idleTimer);
        entry.idleTimer = null;
    }
    entry.activeCount += 1;
    const runJob = async () => {
        const worker = await entry.workerPromise;
        return await fn(worker);
    };
    // Start after the previous job regardless of how it ended.
    const pending = entry.chain.then(runJob, runJob);
    const settle = () => undefined;
    // The stored chain never rejects, so one failed job cannot poison the next.
    entry.chain = pending.then(settle, settle);
    const release = () => {
        entry.activeCount -= 1;
        if (entry.activeCount > 0) {
            return;
        }
        entry.activeCount = 0;
        scheduleOcrWorkerIdleTerminate(key, entry);
    };
    return pending.finally(release);
}
621
/**
 * Recognize text in an image buffer using a pooled OCR worker.
 *
 * Layout data is taken from `data.words` / `data.lines` when either is an array;
 * otherwise it is parsed from `data.tsv` when that is a non-blank string.
 *
 * @param {object} args
 * @param {*} args.buffer - Image data passed to `worker.recognize`.
 * @param {string} args.lang - Language used to select the worker.
 * @param {string} [args.langPathEffective] - Language data path used to select the worker.
 * @param {object} [args.tessOptions] - Recognition options (default `{}`).
 * @param {object} [args.output] - Requested outputs (default `{ text: true, tsv: true }`).
 * @returns {Promise<{text: string, confidence: number|null, words: object[], lines: object[]}>}
 */
async function ocrRecognizeBuffer(args) {
    // NOTE: Tesseract.recognize() (top-level) does not accept OutputFormats like TSV.
    // We must use a worker's recognize() and request output.tsv explicitly.
    const workerArgs = { lang: args.lang, langPathEffective: args.langPathEffective };
    const recognizeWith = (worker) => worker.recognize(args.buffer, args.tessOptions ?? {}, args.output ?? { text: true, tsv: true });
    const result = await withOcrWorker(workerArgs, recognizeWith);
    const data = result?.data ?? {};
    // Normalize one legacy word/line record into the shape returned to callers.
    const normalize = (item) => ({
        text: String(item?.text ?? ""),
        confidence: typeof item?.confidence === "number" ? item.confidence : null,
        bbox: toOcrBbox(item?.bbox),
    });
    const hasWordArray = Array.isArray(data.words);
    const hasLineArray = Array.isArray(data.lines);
    let words = [];
    let lines = [];
    // tesseract.js v7 returns layout data primarily via TSV/HOCR; earlier versions may populate data.words/lines.
    if (hasWordArray || hasLineArray) {
        if (hasWordArray) {
            words = data.words.map((w) => normalize(w));
        }
        if (hasLineArray) {
            lines = data.lines.map((l) => normalize(l));
        }
    }
    else if (typeof data.tsv === "string" && data.tsv.trim()) {
        ({ words, lines } = parseTesseractTsv(String(data.tsv)));
    }
    return {
        text: String(data.text ?? "").trim(),
        confidence: typeof data.confidence === "number" ? data.confidence : null,
        words,
        lines,
    };
}
439
654
  async function ocrRecognizeImageFile(args) {
@@ -473,6 +688,8 @@ async function ocrRecognizeImageFileWithColorMask(args) {
473
688
  const { data, info } = await image.ensureAlpha().raw().toBuffer({ resolveWithObject: true });
474
689
  const out = Buffer.alloc(info.width * info.height);
475
690
  // Convert matching colored pixels to black ink on a white background.
691
+ const minPrimary = typeof args.minPrimary === "number" && Number.isFinite(args.minPrimary) ? args.minPrimary : 80;
692
+ const minDelta = typeof args.minDelta === "number" && Number.isFinite(args.minDelta) ? args.minDelta : 25;
476
693
  for (let i = 0, j = 0; i < data.length; i += 4, j++) {
477
694
  const r = data[i];
478
695
  const g = data[i + 1];
@@ -480,117 +697,1108 @@ async function ocrRecognizeImageFileWithColorMask(args) {
480
697
  const a = data[i + 3];
481
698
  let match = false;
482
699
  if (a >= 40) {
483
- if (args.color === "red") {
484
- match = r >= 90 && r - g >= 35 && r - b >= 35;
485
- }
486
- else {
487
- match = g >= 90 && g - r >= 35 && g - b >= 35;
488
- }
700
+ const primary = args.color === "red" ? r : g;
701
+ const other = args.color === "red" ? g : r;
702
+ match = primary >= minPrimary && primary - other >= minDelta && primary - b >= minDelta;
489
703
  }
490
704
  out[j] = match ? 0 : 255;
491
705
  }
492
- // Upscale a bit for OCR; keep it deterministic.
493
- const masked = await sharp(out, { raw: { width: info.width, height: info.height, channels: 1 } })
494
- .resize({ width: info.width * 2, height: info.height * 2, kernel: "nearest" })
495
- .threshold(180)
496
- .png()
497
- .toBuffer();
706
+ // Upscale for OCR; keep it deterministic.
707
+ const requestedUpscale = typeof args.upscale === "number" && Number.isFinite(args.upscale) && args.upscale >= 1
708
+ ? Math.trunc(args.upscale)
709
+ : info.width < 900
710
+ ? 4
711
+ : info.width < 1600
712
+ ? 3
713
+ : 2;
714
+ // Don't allow the upscaled image to explode in size.
715
+ const scaledMaxPixels = Math.max(maxPixels, Math.floor(maxPixels * 4));
716
+ let upscale = Math.max(1, Math.min(10, requestedUpscale));
717
+ while (upscale > 1 && info.width * upscale * info.height * upscale > scaledMaxPixels)
718
+ upscale--;
719
+ const blurSigma = typeof args.blurSigma === "number" && Number.isFinite(args.blurSigma) ? clampNumber(args.blurSigma, 0, 10) : 0.3;
720
+ const threshold = typeof args.threshold === "number" && Number.isFinite(args.threshold) ? clampInt(args.threshold, 180, 1, 254) : 180;
721
+ let pipeline = sharp(out, { raw: { width: info.width, height: info.height, channels: 1 } }).resize({
722
+ width: info.width * upscale,
723
+ height: info.height * upscale,
724
+ kernel: "nearest",
725
+ });
726
+ if (blurSigma >= 0.3)
727
+ pipeline = pipeline.blur(blurSigma);
728
+ const masked = await pipeline.threshold(threshold).png().toBuffer();
498
729
  const result = await ocrRecognizeBuffer({
499
730
  buffer: masked,
500
731
  lang: args.lang,
501
732
  langPathEffective: args.langPathEffective,
733
+ tessOptions: {
734
+ tessedit_char_whitelist: "0123456789",
735
+ // Sparse text works better for number grids (keeps tokens separate, reduces concatenation).
736
+ tessedit_pageseg_mode: "11",
737
+ user_defined_dpi: "300",
738
+ },
739
+ output: { text: true, tsv: false },
502
740
  });
503
741
  return { text: result.text, confidence: result.confidence, usedSharp: true };
504
742
  }
505
- const FASTER_WHISPER_PY_SCRIPT_V1 = `# NodeBench MCP audio transcription helper (faster-whisper)
506
- # This file is written to a temp directory at runtime.
507
- import argparse
508
- import json
509
- import sys
510
-
511
-
512
- def main() -> None:
513
- p = argparse.ArgumentParser()
514
- p.add_argument("--path", required=True)
515
- p.add_argument("--model", default="tiny.en")
516
- p.add_argument("--language", default="")
517
- p.add_argument("--task", default="transcribe")
518
- p.add_argument("--beam-size", type=int, default=5)
519
- p.add_argument("--vad-filter", type=int, default=0)
520
- p.add_argument("--max-chars", type=int, default=12000)
521
- p.add_argument("--include-segments", type=int, default=0)
522
- args = p.parse_args()
523
-
524
- try:
525
- from faster_whisper import WhisperModel
526
- except Exception:
527
- sys.stderr.write(
528
- "Missing python dependency: faster-whisper. Install with: pip install faster-whisper\\n"
529
- )
530
- raise
531
-
532
- model = WhisperModel(args.model, device="cpu", compute_type="int8")
533
- segments, info = model.transcribe(
534
- args.path,
535
- beam_size=max(1, int(args.beam_size)),
536
- language=(args.language or None),
537
- task=(args.task or "transcribe"),
538
- vad_filter=bool(int(args.vad_filter)),
539
- word_timestamps=False,
540
- temperature=0.0,
541
- )
542
-
543
- include_segments = bool(int(args.include_segments))
544
- max_chars = max(200, int(args.max_chars))
545
-
546
- parts = []
547
- segs = []
548
- char_budget = 0
549
- truncated = False
550
-
551
- for seg in segments:
552
- t = str(getattr(seg, "text", "") or "")
553
- if not t:
554
- continue
555
- parts.append(t)
556
- if include_segments:
557
- segs.append(
558
- {
559
- "start": float(getattr(seg, "start", 0.0) or 0.0),
560
- "end": float(getattr(seg, "end", 0.0) or 0.0),
561
- "text": t,
562
- }
563
- )
564
- char_budget += len(t)
565
- if char_budget >= max_chars:
566
- truncated = True
567
- break
568
-
569
- text = "".join(parts).strip()
570
- if len(text) > max_chars:
571
- text = text[:max_chars]
572
- truncated = True
573
-
574
- out = {
575
- "path": args.path,
576
- "model": args.model,
577
- "task": args.task,
578
- "language": getattr(info, "language", None),
579
- "languageProbability": getattr(info, "language_probability", None),
580
- "durationSeconds": getattr(info, "duration", None),
581
- "beamSize": int(args.beam_size),
582
- "vadFilter": bool(int(args.vad_filter)),
583
- "maxChars": max_chars,
584
- "truncated": truncated,
585
- "text": text,
743
/**
 * OCR an image file after isolating purple pixels.
 *
 * Pixels that pass the purple test (sufficient alpha, high R and B, low G) become
 * black ink on a white background. The mask is upscaled with nearest-neighbour
 * resizing, optionally blurred, thresholded, encoded as PNG and recognized with a
 * digits-and-dot whitelist in sparse-text mode.
 *
 * @param {object} args
 * @param {string} args.filePath - Image to read.
 * @param {string} args.lang - OCR language.
 * @param {string} [args.langPathEffective] - OCR language data path.
 * @param {number} [args.maxPixels] - Source pixel limit (default 6,000,000).
 * @param {number} [args.minPrimary] - Minimum R and B value (default 90).
 * @param {number} [args.maxGreen] - Maximum G value (default 170).
 * @param {number} [args.minDelta] - Minimum R-G and B-G difference (default 25).
 * @param {number} [args.upscale] - Requested integer upscale (>= 1); otherwise chosen from image width.
 * @param {number} [args.blurSigma] - Blur sigma (default 0.3); blur is skipped below 0.3.
 * @param {number} [args.threshold] - Binarization threshold (default 180).
 * @returns {Promise<object>} The OCR result plus `usedSharp: true` and the applied `upscale`.
 * @throws {Error} When sharp is unavailable, dimensions are unreadable, or the image exceeds `maxPixels`.
 */
async function ocrRecognizeImageFileWithPurpleMask(args) {
    const sharp = await getSharpOptional();
    if (!sharp) {
        throw new Error("Missing optional dependency: sharp. Install it to use color-masked OCR.");
    }
    // Use `value` when it is a finite number, otherwise `fallback`.
    const finiteOr = (value, fallback) => (typeof value === "number" && Number.isFinite(value) ? value : fallback);
    const fileBytes = await readFile(args.filePath);
    const image = sharp(fileBytes);
    const meta = await image.metadata();
    const w = meta.width ?? 0;
    const h = meta.height ?? 0;
    if (!w || !h) {
        throw new Error("Unable to read image dimensions");
    }
    const maxPixels = typeof args.maxPixels === "number" && args.maxPixels > 0 ? args.maxPixels : 6_000_000;
    if (w * h > maxPixels) {
        throw new Error(`Refusing huge image (${w}x${h}) for masked OCR (maxPixels=${maxPixels})`);
    }
    const { data, info } = await image.ensureAlpha().raw().toBuffer({ resolveWithObject: true });
    // Purple labels: high R and B, relatively low G.
    const minPrimary = finiteOr(args.minPrimary, 90);
    const maxGreen = finiteOr(args.maxGreen, 170);
    const minDelta = finiteOr(args.minDelta, 25);
    const out = Buffer.alloc(info.width * info.height);
    // The buffer is read four bytes per pixel (R, G, B, A).
    for (let px = 0; px * 4 < data.length; px++) {
        const base = px * 4;
        const r = data[base];
        const g = data[base + 1];
        const b = data[base + 2];
        const a = data[base + 3];
        const isPurple = a >= 40 &&
            r >= minPrimary &&
            b >= minPrimary &&
            g <= maxGreen &&
            r - g >= minDelta &&
            b - g >= minDelta;
        out[px] = isPurple ? 0 : 255;
    }
    // Upscale for OCR; keep it deterministic.
    let requestedUpscale;
    if (typeof args.upscale === "number" && Number.isFinite(args.upscale) && args.upscale >= 1) {
        requestedUpscale = Math.trunc(args.upscale);
    }
    else if (info.width < 900) {
        requestedUpscale = 4;
    }
    else if (info.width < 1600) {
        requestedUpscale = 3;
    }
    else {
        requestedUpscale = 2;
    }
    // Don't allow the upscaled image to explode in size.
    const scaledMaxPixels = Math.max(maxPixels, Math.floor(maxPixels * 4));
    let upscale = Math.max(1, Math.min(10, requestedUpscale));
    while (upscale > 1 && info.width * upscale * info.height * upscale > scaledMaxPixels) {
        upscale -= 1;
    }
    const hasBlurSigma = typeof args.blurSigma === "number" && Number.isFinite(args.blurSigma);
    const blurSigma = hasBlurSigma ? clampNumber(args.blurSigma, 0, 10) : 0.3;
    const hasThreshold = typeof args.threshold === "number" && Number.isFinite(args.threshold);
    const threshold = hasThreshold ? clampInt(args.threshold, 180, 1, 254) : 180;
    const rawMask = { raw: { width: info.width, height: info.height, channels: 1 } };
    let pipeline = sharp(out, rawMask).resize({
        width: info.width * upscale,
        height: info.height * upscale,
        kernel: "nearest",
    });
    if (blurSigma >= 0.3) {
        pipeline = pipeline.blur(blurSigma);
    }
    const masked = await pipeline.threshold(threshold).png().toBuffer();
    const result = await ocrRecognizeBuffer({
        buffer: masked,
        lang: args.lang,
        langPathEffective: args.langPathEffective,
        tessOptions: {
            tessedit_char_whitelist: "0123456789.",
            // Sparse text works better for isolated numeric labels.
            tessedit_pageseg_mode: "11",
            user_defined_dpi: "300",
        },
        output: { text: true, tsv: true },
    });
    return { ...result, usedSharp: true, upscale };
}
810
/**
 * Report whether a rectangle of a single-channel bitmap contains at least
 * `minCount` ink pixels (samples below 128).
 *
 * The rectangle is truncated to integers and clamped to the bitmap; both corners
 * are inclusive. Scanning stops as soon as enough ink has been seen.
 *
 * @param {ArrayLike<number>} bw - Row-major samples, one per pixel.
 * @param {number} width - Bitmap width in pixels.
 * @param {number} height - Bitmap height in pixels.
 * @param {{x0: number, y0: number, x1: number, y1: number}} rect - Region to test.
 * @param {number} minCount - Required number of ink pixels (at least 1 is always required).
 * @returns {boolean}
 */
function rectHasInk(bw, width, height, rect, minCount) {
    if (!width || !height) {
        return false;
    }
    const need = Math.max(1, Math.trunc(minCount));
    const lastCol = width - 1;
    const lastRow = height - 1;
    const x0 = clampInt(Math.trunc(rect.x0), 0, 0, lastCol);
    const x1 = clampInt(Math.trunc(rect.x1), lastCol, 0, lastCol);
    const y0 = clampInt(Math.trunc(rect.y0), 0, 0, lastRow);
    const y1 = clampInt(Math.trunc(rect.y1), lastRow, 0, lastRow);
    if (x1 < x0 || y1 < y0) {
        return false;
    }
    let inkSeen = 0;
    for (let row = y0; row <= y1; row++) {
        const rowStart = row * width;
        for (let col = x0; col <= x1; col++) {
            if (!(bw[rowStart + col] < 128)) {
                continue;
            }
            inkSeen += 1;
            if (inkSeen >= need) {
                return true;
            }
        }
    }
    return false;
}
833
/**
 * Compute the tight bounding box of ink pixels (samples below 128) inside a
 * rectangle of a single-channel bitmap.
 *
 * The rectangle is truncated to integers and clamped to the bitmap; both corners
 * are inclusive.
 *
 * @param {ArrayLike<number>} bw - Row-major samples, one per pixel.
 * @param {number} width - Bitmap width in pixels.
 * @param {number} height - Bitmap height in pixels.
 * @param {{x0: number, y0: number, x1: number, y1: number}} rect - Region to scan.
 * @param {number} minCount - Minimum ink pixels required for a result.
 * @returns {{x0: number, y0: number, x1: number, y1: number, count: number}|null}
 *   Inclusive ink bounds plus the ink pixel count, or null when the region is
 *   empty/invalid or holds fewer than `minCount` ink pixels.
 */
function rectInkBounds(bw, width, height, rect, minCount) {
    if (!width || !height) {
        return null;
    }
    const need = Math.max(1, Math.trunc(minCount));
    const lastCol = width - 1;
    const lastRow = height - 1;
    const x0 = clampInt(Math.trunc(rect.x0), 0, 0, lastCol);
    const x1 = clampInt(Math.trunc(rect.x1), lastCol, 0, lastCol);
    const y0 = clampInt(Math.trunc(rect.y0), 0, 0, lastRow);
    const y1 = clampInt(Math.trunc(rect.y1), lastRow, 0, lastRow);
    if (x1 < x0 || y1 < y0) {
        return null;
    }
    const bounds = {
        x0: Number.POSITIVE_INFINITY,
        y0: Number.POSITIVE_INFINITY,
        x1: Number.NEGATIVE_INFINITY,
        y1: Number.NEGATIVE_INFINITY,
    };
    let count = 0;
    for (let row = y0; row <= y1; row++) {
        const rowStart = row * width;
        for (let col = x0; col <= x1; col++) {
            if (!(bw[rowStart + col] < 128)) {
                continue;
            }
            count += 1;
            bounds.x0 = Math.min(bounds.x0, col);
            bounds.x1 = Math.max(bounds.x1, col);
            bounds.y0 = Math.min(bounds.y0, row);
            bounds.y1 = Math.max(bounds.y1, row);
        }
    }
    if (count < need) {
        return null;
    }
    // Guards the case where no ink was found and the bounds are still infinite.
    const allFinite = [bounds.x0, bounds.y0, bounds.x1, bounds.y1].every((v) => Number.isFinite(v));
    if (!allFinite) {
        return null;
    }
    return { x0: bounds.x0, y0: bounds.y0, x1: bounds.x1, y1: bounds.y1, count };
}
870
+ function detectThinHorizontalBarsFromBw(bw, width, height, opts) {
871
+ const minRun = Math.max(1, Math.trunc(opts.minRun));
872
+ const maxThickness = typeof opts.maxThickness === "number" ? Math.max(1, Math.trunc(opts.maxThickness)) : 12;
873
+ const mergeY = typeof opts.mergeY === "number" ? Math.max(0, Math.trunc(opts.mergeY)) : 2;
874
+ const overlapRatio = typeof opts.overlapRatio === "number" ? clampNumber(opts.overlapRatio, 0.1, 1) : 0.6;
875
+ if (!width || !height)
876
+ return [];
877
+ const segments = [];
878
+ for (let y = 0; y < height; y++) {
879
+ const rowOff = y * width;
880
+ let x = 0;
881
+ while (x < width) {
882
+ while (x < width && bw[rowOff + x] >= 128)
883
+ x++;
884
+ const x0 = x;
885
+ while (x < width && bw[rowOff + x] < 128)
886
+ x++;
887
+ const x1 = x - 1;
888
+ if (x1 >= x0) {
889
+ const len = x1 - x0 + 1;
890
+ if (len >= minRun)
891
+ segments.push({ x0, x1, y });
892
+ }
893
+ }
894
+ }
895
+ // Merge segments across adjacent rows if they overlap significantly in X.
896
+ const bars = [];
897
+ for (const s of segments) {
898
+ let merged = false;
899
+ for (const b of bars) {
900
+ if (s.y > b.y1 + mergeY)
901
+ continue;
902
+ if (s.y < b.y0 - mergeY)
903
+ continue;
904
+ const overlap = Math.max(0, Math.min(b.x1, s.x1) - Math.max(b.x0, s.x0) + 1);
905
+ const minLen = Math.max(1, Math.min(b.x1 - b.x0 + 1, s.x1 - s.x0 + 1));
906
+ if (overlap >= minLen * overlapRatio) {
907
+ b.x0 = Math.min(b.x0, s.x0);
908
+ b.x1 = Math.max(b.x1, s.x1);
909
+ b.y0 = Math.min(b.y0, s.y);
910
+ b.y1 = Math.max(b.y1, s.y);
911
+ merged = true;
912
+ break;
913
+ }
914
+ }
915
+ if (!merged)
916
+ bars.push({ x0: s.x0, x1: s.x1, y0: s.y, y1: s.y });
917
+ }
918
+ return bars
919
+ .filter((b) => b.x1 - b.x0 + 1 >= minRun)
920
+ .filter((b) => b.y1 - b.y0 + 1 <= maxThickness)
921
+ .map((b) => {
922
+ const x0 = clampInt(b.x0, 0, 0, width - 1);
923
+ const x1 = clampInt(b.x1, width - 1, 0, width - 1);
924
+ const y0 = clampInt(b.y0, 0, 0, height - 1);
925
+ const y1 = clampInt(b.y1, height - 1, 0, height - 1);
926
+ const len = x1 - x0 + 1;
927
+ return { x0, x1, y0, y1, cx: (x0 + x1) / 2, cy: (y0 + y1) / 2, len };
928
+ })
929
+ .sort((a, b) => a.cy - b.cy || a.cx - b.cx);
930
+ }
931
+ function detectThinVerticalBarsFromBw(bw, width, height, opts) {
932
+ const minRun = Math.max(1, Math.trunc(opts.minRun));
933
+ const maxThickness = typeof opts.maxThickness === "number" ? Math.max(1, Math.trunc(opts.maxThickness)) : 12;
934
+ const mergeX = typeof opts.mergeX === "number" ? Math.max(0, Math.trunc(opts.mergeX)) : 2;
935
+ const overlapRatio = typeof opts.overlapRatio === "number" ? clampNumber(opts.overlapRatio, 0.1, 1) : 0.6;
936
+ if (!width || !height)
937
+ return [];
938
+ const segments = [];
939
+ for (let x = 0; x < width; x++) {
940
+ let y = 0;
941
+ while (y < height) {
942
+ while (y < height && bw[y * width + x] >= 128)
943
+ y++;
944
+ const y0 = y;
945
+ while (y < height && bw[y * width + x] < 128)
946
+ y++;
947
+ const y1 = y - 1;
948
+ if (y1 >= y0) {
949
+ const len = y1 - y0 + 1;
950
+ if (len >= minRun)
951
+ segments.push({ y0, y1, x });
952
+ }
953
+ }
954
+ }
955
+ // Merge segments across adjacent columns if they overlap significantly in Y.
956
+ const bars = [];
957
+ for (const s of segments) {
958
+ let merged = false;
959
+ for (const b of bars) {
960
+ if (s.x > b.x1 + mergeX)
961
+ continue;
962
+ if (s.x < b.x0 - mergeX)
963
+ continue;
964
+ const overlap = Math.max(0, Math.min(b.y1, s.y1) - Math.max(b.y0, s.y0) + 1);
965
+ const minLen = Math.max(1, Math.min(b.y1 - b.y0 + 1, s.y1 - s.y0 + 1));
966
+ if (overlap >= minLen * overlapRatio) {
967
+ b.x0 = Math.min(b.x0, s.x);
968
+ b.x1 = Math.max(b.x1, s.x);
969
+ b.y0 = Math.min(b.y0, s.y0);
970
+ b.y1 = Math.max(b.y1, s.y1);
971
+ merged = true;
972
+ break;
973
+ }
974
+ }
975
+ if (!merged)
976
+ bars.push({ x0: s.x, x1: s.x, y0: s.y0, y1: s.y1 });
977
+ }
978
+ return bars
979
+ .filter((b) => b.y1 - b.y0 + 1 >= minRun)
980
+ .filter((b) => b.x1 - b.x0 + 1 <= maxThickness)
981
+ .map((b) => {
982
+ const x0 = clampInt(b.x0, 0, 0, width - 1);
983
+ const x1 = clampInt(b.x1, width - 1, 0, width - 1);
984
+ const y0 = clampInt(b.y0, 0, 0, height - 1);
985
+ const y1 = clampInt(b.y1, height - 1, 0, height - 1);
986
+ const len = y1 - y0 + 1;
987
+ return { x0, x1, y0, y1, cx: (x0 + x1) / 2, cy: (y0 + y1) / 2, len };
988
+ })
989
+ .sort((a, b) => a.cx - b.cx || a.cy - b.cy);
990
+ }
991
/**
 * OCR a single integer from a rectangular region of an image.
 *
 * The region is extracted, converted to grayscale, normalized, upscaled when
 * narrower than 320 px, and binarized at each requested threshold. Every
 * threshold is recognized with each page segmentation mode in `args.psms` using a
 * digits-only whitelist. From each attempt the candidate with the most digits
 * (then the largest value) inside [minValue, maxValue] is picked; across attempts
 * more digits wins, with OCR confidence as the tie-breaker.
 *
 * @param {object} args
 * @param {Function} args.sharp - sharp factory used to build the image pipeline.
 * @param {*} args.source - Image input passed to `args.sharp`.
 * @param {number} args.left
 * @param {number} args.top
 * @param {number} args.width
 * @param {number} args.height
 * @param {number} [args.threshold] - Single threshold, used when `thresholds` is absent or empty.
 * @param {number[]} [args.thresholds] - Thresholds to try, in order.
 * @param {Iterable<number>} args.psms - Page segmentation modes to try.
 * @param {string} args.lang
 * @param {string} [args.langPathEffective]
 * @param {number} [args.minValue] - Inclusive lower bound for accepted values.
 * @param {number} [args.maxValue] - Inclusive upper bound for accepted values.
 * @returns {Promise<number|null>} The best integer found, or null.
 */
async function ocrIntegerFromImageRegion(args) {
    const left = Math.trunc(args.left);
    const top = Math.trunc(args.top);
    const width = Math.trunc(args.width);
    const height = Math.trunc(args.height);
    if (width <= 0 || height <= 0) {
        return null;
    }
    const finiteOrNull = (v) => (typeof v === "number" && Number.isFinite(v) ? v : null);
    const minValue = finiteOrNull(args.minValue);
    const maxValue = finiteOrNull(args.maxValue);
    const withinBounds = (n) => Number.isFinite(n) &&
        (minValue === null || n >= minValue) &&
        (maxValue === null || n <= maxValue);
    const rawThresholds = Array.isArray(args.thresholds) && args.thresholds.length ? args.thresholds : [args.threshold];
    const thresholdList = rawThresholds
        .map((t) => clampInt(t, 200, 1, 254))
        .filter((t, i, arr) => arr.indexOf(t) === i);
    // Many GAIA image tasks contain tiny digits; upscaling materially improves OCR recall.
    const targetW = 320;
    const scale = width > 0 && width < targetW ? Math.max(1, Math.min(8, Math.ceil(targetW / width))) : 1;
    let best = null;
    for (const thr of thresholdList) {
        let buf;
        try {
            let pipeline = args.sharp(args.source).extract({ left, top, width, height }).grayscale().normalize();
            if (scale > 1) {
                pipeline = pipeline.resize({ width: width * scale, height: height * scale, kernel: "nearest" });
            }
            buf = await pipeline.threshold(thr).png().toBuffer();
        }
        catch {
            // Best effort: skip thresholds whose image pipeline fails.
            continue;
        }
        for (const psm of args.psms) {
            const out = await ocrRecognizeBuffer({
                buffer: buf,
                lang: args.lang,
                langPathEffective: args.langPathEffective,
                tessOptions: {
                    tessedit_char_whitelist: "0123456789",
                    tessedit_pageseg_mode: String(Math.trunc(psm)),
                    user_defined_dpi: "300",
                },
                output: { text: true, tsv: false },
            });
            const runs = String(out.text ?? "").trim().match(/-?\d+/g) ?? [];
            // Prefer longer digit runs (avoids picking the row index "1" when the crop also contains "29").
            // Also add suffix candidates to repair common OCR concatenation like "129" (index + number).
            const candidates = [];
            for (const run of runs) {
                const unsigned = run.replace(/^-/, "");
                candidates.push({ n: Number.parseInt(run, 10), digits: unsigned.length });
                if (unsigned.length >= 3) {
                    candidates.push({ n: Number.parseInt(unsigned.slice(-2), 10), digits: 2 });
                    candidates.push({ n: Number.parseInt(unsigned.slice(-1), 10), digits: 1 });
                }
            }
            // Most digits first, then the largest value.
            let picked = null;
            for (const c of candidates) {
                if (!withinBounds(c.n)) {
                    continue;
                }
                if (!picked || c.digits > picked.digits || (c.digits === picked.digits && c.n > picked.n)) {
                    picked = c;
                }
            }
            if (!picked) {
                continue;
            }
            const conf = typeof out.confidence === "number" && Number.isFinite(out.confidence) ? out.confidence : -1;
            // Primary key: digits; secondary: OCR confidence, to break ties across threshold/PSM attempts.
            const improves = !best ||
                picked.digits > best.digits ||
                (picked.digits === best.digits && conf > best.confidence);
            if (improves) {
                best = { n: picked.n, confidence: conf, digits: picked.digits };
            }
        }
    }
    return best ? best.n : null;
}
1074
/**
 * Grade a fraction-quiz screenshot by slicing it into per-question row bands.
 *
 * Steps performed here:
 *   1. Upscale small images (3x below 1200px wide, 2x below 2000px), grayscale, normalize.
 *   2. Binarize at 210 and count dark pixels per scanline in a narrow left strip to locate
 *      the question-number rows; merge nearby peaks and keep at most 10 evenly spaced centers.
 *   3. Per row: OCR the prompt text (to spot "Turn ... mixed/improper" conversion prompts),
 *      locate the answer box and the stacked-fraction bars, OCR numerator/denominator digits,
 *      OCR the student's answer, and grade it.
 *
 * Relies on helpers defined elsewhere in this file (clampInt, ocrRecognizeBuffer,
 * ocrIntegerFromImageRegion, detectThinHorizontalBarsFromBw, rectHasInk, rectInkBounds,
 * normalizeFraction, fractionsEqual, fractionToString, mixedNumberToString, toIntegerOrNull,
 * add/sub/mul/divFractions).
 *
 * @param {object} args
 * @param {Function} args.sharp - The sharp factory; called as `args.sharp(input)`.
 * @param {string} args.filePath - Image path handed to sharp.
 * @param {string} args.lang - Language passed through to the OCR helpers.
 * @param {string} [args.langPathEffective] - Traineddata location passed through to the OCR helpers.
 * @param {number} args.pointsAddSubtract - Points for a correct add/subtract row.
 * @param {number} args.pointsMultiplyDivide - Points for a correct multiply/divide row.
 * @param {number} args.pointsImproperFraction - Points for a correct "turn into improper fraction" row.
 * @param {number} args.pointsMixedNumber - Points for a correct "turn into mixed number" row.
 * @param {number} args.bonusPoints - Added once to the total.
 * @returns {Promise<null | {extractedQuestionCount: number, score: number, perQuestion: object[]}>}
 *   `null` when the image has no usable dimensions, fewer than 6 rows are detected, or fewer than
 *   `min(detectedRows, 6)` rows could be graded.
 */
async function gradeFractionQuizFromImageRowBands(args) {
    const debugEnabled = process.env.NODEBENCH_DEBUG_FRACTION_QUIZ === "1";
    const meta = await args.sharp(args.filePath).metadata();
    const w0 = meta.width ?? 0;
    const h0 = meta.height ?? 0;
    if (!w0 || !h0)
        return null;
    // Upscale small screenshots so thin strokes survive thresholding and OCR.
    const scale = w0 < 1200 ? 3 : w0 < 2000 ? 2 : 1;
    const width = w0 * scale;
    const height = h0 * scale;
    const base = await args.sharp(args.filePath)
        .grayscale()
        .resize({ width, height, kernel: "lanczos3" })
        .normalize()
        .png()
        .toBuffer();
    // Detect question row centers from a narrow left strip so math digits don't dominate.
    const { data: bw, info } = await args.sharp(base)
        .grayscale()
        .threshold(210)
        .raw()
        .toBuffer({ resolveWithObject: true });
    const bwW = info.width;
    const bwH = info.height;
    // Clamp to the image width: on very narrow images the 20px floor would otherwise make the
    // scan below run past the end of the scanline and count pixels from the next row.
    const leftW = Math.min(bwW, Math.max(20, Math.floor(bwW * 0.04)));
    // Ignore the top and bottom 8% of the page.
    const yMin = Math.floor(bwH * 0.08);
    const yMax = Math.floor(bwH * 0.92);
    const rowCounts = new Array(bwH).fill(0);
    let maxRow = 0;
    for (let y = yMin; y < yMax; y++) {
        let c = 0;
        const off = y * bwW;
        for (let x = 0; x < leftW; x++) {
            if (bw[off + x] < 128)
                c++;
        }
        rowCounts[y] = c;
        if (c > maxRow)
            maxRow = c;
    }
    if (maxRow <= 0)
        return null;
    // A scanline belongs to a question-number glyph when it holds at least 45% of the densest line's ink.
    const peakThresh = Math.max(2, Math.floor(maxRow * 0.45));
    const segs = [];
    for (let y = yMin; y < yMax; y++) {
        if (rowCounts[y] < peakThresh)
            continue;
        let y2 = y;
        let peak = rowCounts[y];
        while (y2 + 1 < yMax && rowCounts[y2 + 1] >= peakThresh) {
            y2++;
            peak = Math.max(peak, rowCounts[y2]);
        }
        segs.push({ cy: (y + y2) / 2, peak });
        y = y2;
    }
    if (segs.length < 6)
        return null;
    segs.sort((a, b) => a.cy - b.cy);
    // Merge nearby segments (digits like "10" can produce multiple peaks).
    const merged = [];
    const mergeTol = Math.max(20, Math.round(bwH * 0.018));
    for (const s of segs) {
        const last = merged[merged.length - 1];
        if (!last || Math.abs(s.cy - last.cy) > mergeTol) {
            merged.push({ ...s });
            continue;
        }
        // Peak-weighted average keeps the merged center near the denser glyph.
        const wA = Math.max(1, last.peak);
        const wB = Math.max(1, s.peak);
        last.cy = (last.cy * wA + s.cy * wB) / (wA + wB);
        last.peak = Math.max(last.peak, s.peak);
    }
    // Among all runs of k consecutive centers, pick the one with the most uniform spacing.
    const pickBestWindow = (centers, k) => {
        if (centers.length <= k)
            return centers;
        let best = centers.slice(0, k);
        let bestScore = Number.POSITIVE_INFINITY;
        for (let i = 0; i + k - 1 < centers.length; i++) {
            const cand = centers.slice(i, i + k);
            const spacings = cand.slice(1).map((y, j) => y - cand[j]);
            const avg = spacings.reduce((s, n) => s + n, 0) / spacings.length;
            const variance = spacings.reduce((s, n) => s + (n - avg) * (n - avg), 0) / spacings.length;
            if (variance < bestScore) {
                bestScore = variance;
                best = cand;
            }
        }
        return best;
    };
    const centersAll = merged.map((m) => m.cy).filter((n) => Number.isFinite(n));
    const targetK = centersAll.length >= 10 ? 10 : centersAll.length;
    const centers = pickBestWindow(centersAll, targetK).sort((a, b) => a - b);
    if (centers.length < 6)
        return null;
    // Use padded bands around each detected row-center. Midpoint-only bands were too tight on
    // GAIA's fraction quiz screenshots and could clip stacked numerators/denominators or the
    // answer box (conversion rows often place the box on the next line).
    const spacingAt = (i) => {
        const prev = i > 0 ? centers[i] - centers[i - 1] : centers.length > 1 ? centers[1] - centers[0] : height;
        const next = i + 1 < centers.length ? centers[i + 1] - centers[i] : prev;
        return Math.max(1, Math.min(prev, next));
    };
    const rowBand = (i) => {
        const spacing = spacingAt(i);
        const padTop = Math.max(24, Math.round(spacing * 0.45));
        const padBot = Math.max(24, Math.round(spacing * 0.75));
        const y0 = clampInt(Math.floor(centers[i] - padTop), 0, 0, height - 1);
        const y1 = clampInt(Math.ceil(centers[i] + padBot), height, y0 + 1, height);
        return { y0, y1, spacing };
    };
    // These ratios are tuned to GAIA's fraction quiz screenshot layout.
    // Expression region must be wide enough to include stacked fractions.
    const exprX1 = Math.min(width, Math.floor(width * 0.55));
    const answerX0 = Math.floor(width * 0.12);
    const answerX1 = Math.min(width, Math.floor(width * 0.46));
    const perQuestion = [];
    let total = 0;
    // Collect every "n/d" in the text (zero denominators and un-normalizable pairs are dropped).
    const parseAllFractionsLoose = (text) => {
        const out = [];
        for (const m of String(text ?? "").matchAll(/(-?\d+)\s*\/\s*(\d+)/g)) {
            const n = Number.parseInt(m[1], 10);
            const d = Number.parseInt(m[2], 10);
            if (!Number.isFinite(n) || !Number.isFinite(d) || d === 0)
                continue;
            try {
                out.push(normalizeFraction({ n, d }));
            }
            catch {
                // normalizeFraction rejected this pair; treat it as OCR noise.
            }
        }
        return out;
    };
    // Collect every "w n/d" mixed number in the text.
    const parseAllMixedNumbersLoose = (text) => {
        const out = [];
        for (const m of String(text ?? "").matchAll(/(-?\d+)\s+(-?\d+)\s*\/\s*(\d+)/g)) {
            const whole = Number.parseInt(m[1], 10);
            const n = Number.parseInt(m[2], 10);
            const d = Number.parseInt(m[3], 10);
            if (!Number.isFinite(whole) || !Number.isFinite(n) || !Number.isFinite(d) || d === 0)
                continue;
            try {
                out.push({ whole, frac: normalizeFraction({ n, d }) });
            }
            catch {
                // normalizeFraction rejected this pair; treat it as OCR noise.
            }
        }
        return out;
    };
    for (let i = 0; i < centers.length; i++) {
        const idx = i + 1;
        const band = rowBand(i);
        const y0 = band.y0;
        const y1 = band.y1;
        const rowH = Math.max(1, y1 - y0);
        // Detect the answer box X-range for this row. We use it to:
        // 1) Exclude answer-box fraction bars from operand extraction (critical for conversion rows)
        // 2) Narrow answer OCR to the box region for cleaner parsing.
        const rowGrayFull = await args.sharp(base)
            .extract({ left: 0, top: y0, width, height: rowH })
            .png()
            .toBuffer();
        const { data: rowBwFull, info: rowInfo } = await args.sharp(rowGrayFull)
            .grayscale()
            .threshold(210)
            .raw()
            .toBuffer({ resolveWithObject: true });
        // Light OCR on the row text to detect conversion prompts ("Turn ... into ...").
        // We keep this cheap (no TSV) and deterministic.
        const rowTextBuf = await args.sharp(base)
            .extract({ left: 0, top: y0, width: Math.max(1, Math.floor(width * 0.62)), height: rowH })
            .grayscale()
            .normalize()
            .threshold(200)
            .png()
            .toBuffer();
        const rowOcr = await ocrRecognizeBuffer({
            buffer: rowTextBuf,
            lang: args.lang,
            langPathEffective: args.langPathEffective,
            tessOptions: { tessedit_pageseg_mode: "6", user_defined_dpi: "300" },
            output: { text: true, tsv: false },
        });
        // OCR may return no text at all; treat that as an empty row instead of throwing.
        const rowLower = String(rowOcr.text ?? "").toLowerCase();
        const rowHasTurn = rowLower.includes("turn");
        const rowHintMixed = rowLower.includes("mixed");
        const rowHintImproper = rowLower.includes("improper");
        const detectAnswerBoxRect = () => {
            // Answer boxes are drawn as rectangles with long-ish horizontal borders.
            // Detect the longest horizontal bar that isn't the page border, then expand to all overlapping bars.
            const minRun = Math.max(30, Math.round(rowInfo.width * 0.06));
            const maxThickness = Math.max(2, Math.round(rowInfo.height * 0.18));
            const bars = detectThinHorizontalBarsFromBw(rowBwFull, rowInfo.width, rowInfo.height, {
                minRun,
                maxThickness,
                mergeY: 1,
                overlapRatio: 0.65,
            })
                // Exclude anything that starts in the left margin (question numbers).
                .filter((b) => b.x0 >= Math.round(rowInfo.width * 0.08))
                // Exclude extremely long lines (page borders / separators).
                .filter((b) => b.len <= Math.round(rowInfo.width * 0.5));
            if (!bars.length)
                return null;
            const best = [...bars].sort((a, b) => b.len - a.len)[0];
            const overlap = (a, b) => Math.max(0, Math.min(a.x1, b.x1) - Math.max(a.x0, b.x0) + 1);
            const overlapsBest = bars.filter((b) => {
                const ov = overlap(best, b);
                const minLen = Math.max(1, Math.min(best.len, b.len));
                return ov >= minLen * 0.72;
            });
            // Bounding box of the longest bar plus every bar that mostly overlaps it horizontally.
            let x0 = best.x0;
            let x1 = best.x1;
            let y0 = best.y0;
            let y1 = best.y1;
            for (const b of overlapsBest) {
                if (b.x0 < x0)
                    x0 = b.x0;
                if (b.x1 > x1)
                    x1 = b.x1;
                if (b.y0 < y0)
                    y0 = b.y0;
                if (b.y1 > y1)
                    y1 = b.y1;
            }
            // Sanity: require a reasonable box width.
            const boxW = x1 - x0 + 1;
            if (boxW < Math.round(rowInfo.width * 0.06) || boxW > Math.round(rowInfo.width * 0.45))
                return null;
            const boxH = y1 - y0 + 1;
            if (boxH < Math.max(10, Math.round(rowInfo.height * 0.12)) || boxH > Math.round(rowInfo.height * 0.9)) {
                return null;
            }
            return { x0, x1, y0, y1 };
        };
        const answerBox = detectAnswerBoxRect();
        // Pixel-run fraction-bar detection is more reliable than OCR token geometry for stacked fractions.
        // OCR struggles to emit numerator/denominator tokens consistently for tiny stacked fractions.
        // Prefer excluding the answer box so we don't treat student-answer fractions as operands.
        const exprRightLimit = answerBox
            ? Math.max(1, Math.round(answerBox.x0 - Math.max(10, rowInfo.width * 0.01)))
            : exprX1;
        const exprGray = await args.sharp(base)
            .extract({
                left: 0,
                top: y0,
                width: Math.max(1, Math.min(exprX1, exprRightLimit)),
                height: rowH,
            })
            .png()
            .toBuffer();
        const { data: exprBw, info: exprInfo } = await args.sharp(exprGray)
            .grayscale()
            .threshold(210)
            .raw()
            .toBuffer({ resolveWithObject: true });
        // Fraction bars are short relative to the row width; keep minRun small and filter by ink above/below.
        // Raise the minimum run length to avoid misclassifying operator glyphs (notably the "÷" bar)
        // as fraction bars. Stacked fraction bars in this screenshot are materially longer.
        const minBarRun = Math.max(18, Math.round(exprInfo.width * 0.02));
        const bars = detectThinHorizontalBarsFromBw(exprBw, exprInfo.width, exprInfo.height, {
            minRun: minBarRun,
            maxThickness: Math.max(2, Math.round(exprInfo.height * 0.14)),
        });
        const rowCenterLocal = centers[i] - y0;
        // Use row spacing (not row crop height) to keep parsing stable even if bands overlap slightly.
        const yWindow = Math.max(30, Math.round(band.spacing * 0.42));
        const aboveH = Math.max(8, Math.round(exprInfo.height * 0.18));
        const belowH = aboveH;
        const candidateBars = bars
            .filter((b) => Math.abs(b.cy - rowCenterLocal) <= yWindow)
            .filter((b) => b.len >= minBarRun)
            // Avoid giant horizontal lines like answer-box borders.
            .filter((b) => b.len <= Math.max(minBarRun, Math.round(exprInfo.width * 0.28)))
            // A real fraction bar has ink (numerator / denominator) both above and below it.
            .filter((b) => {
                const padX = Math.max(2, Math.round(b.len * 0.25));
                const x0 = clampInt(b.x0 - padX, 0, 0, exprInfo.width - 1);
                const x1 = clampInt(b.x1 + padX, exprInfo.width - 1, 0, exprInfo.width - 1);
                const aboveY0 = clampInt(b.y0 - aboveH, 0, 0, exprInfo.height - 1);
                const aboveY1 = clampInt(b.y0 - 1, exprInfo.height - 1, 0, exprInfo.height - 1);
                const belowY0 = clampInt(b.y1 + 1, exprInfo.height - 1, 0, exprInfo.height - 1);
                const belowY1 = clampInt(b.y1 + belowH, exprInfo.height - 1, 0, exprInfo.height - 1);
                const minInk = Math.max(4, Math.round(b.len * 0.05));
                return (rectHasInk(exprBw, exprInfo.width, exprInfo.height, { x0, y0: aboveY0, x1, y1: aboveY1 }, minInk) &&
                    rectHasInk(exprBw, exprInfo.width, exprInfo.height, { x0, y0: belowY0, x1, y1: belowY1 }, minInk));
            })
            .sort((a, b) => Math.abs(a.cy - rowCenterLocal) - Math.abs(b.cy - rowCenterLocal) || b.len - a.len || a.cx - b.cx);
        const fracPairs = [];
        for (const bar of candidateBars) {
            // Adaptive crop sizes: longer fraction bars generally mean larger numerators/denominators (more digits).
            const padY = Math.max(1, Math.round(exprInfo.height * 0.012));
            const boxH = clampInt(Math.round(Math.max(exprInfo.height * 0.18, bar.len * 0.85)), 44, 14, 80);
            // Bar detection sometimes yields a short segment of the true fraction bar (especially under thin fonts).
            // Use a wider, center-based crop so 2-digit numerators/denominators aren't clipped.
            const desiredW = clampInt(Math.round(bar.len * 2.4), 80, Math.max(44, Math.round(exprInfo.width * 0.035)), Math.max(80, Math.round(exprInfo.width * 0.22)));
            const cropX0 = clampInt(Math.round(bar.cx - desiredW / 2), 0, 0, exprInfo.width - 1);
            const cropX1 = clampInt(cropX0 + desiredW - 1, exprInfo.width - 1, 0, exprInfo.width - 1);
            const cropW = Math.max(1, cropX1 - cropX0 + 1);
            const numY1 = clampInt(bar.y0 - padY, exprInfo.height, 0, exprInfo.height);
            const numY0 = clampInt(numY1 - boxH, 0, 0, numY1);
            const numH = numY1 - numY0;
            const denY0 = clampInt(bar.y1 + padY, exprInfo.height - 1, 0, exprInfo.height - 1);
            const denY1 = clampInt(denY0 + boxH, exprInfo.height, denY0, exprInfo.height);
            const denH = denY1 - denY0;
            if (numH < 6 || denH < 6)
                continue;
            // Shrink a crop rectangle to the ink it contains (plus padding); fall back to the
            // untouched rectangle when no ink bounds are found.
            const tightenForOcr = (rect) => {
                const minInk = Math.max(2, Math.round(bar.len * 0.03));
                const bounds = rectInkBounds(exprBw, exprInfo.width, exprInfo.height, rect, minInk);
                if (!bounds) {
                    return {
                        left: rect.x0,
                        top: rect.y0,
                        width: Math.max(1, rect.x1 - rect.x0 + 1),
                        height: Math.max(1, rect.y1 - rect.y0 + 1),
                    };
                }
                const pad = Math.max(2, Math.round(bar.len * 0.08));
                const x0 = clampInt(bounds.x0 - pad, 0, 0, exprInfo.width - 1);
                const x1 = clampInt(bounds.x1 + pad, exprInfo.width - 1, 0, exprInfo.width - 1);
                const y0 = clampInt(bounds.y0 - pad, 0, 0, exprInfo.height - 1);
                const y1 = clampInt(bounds.y1 + pad, exprInfo.height - 1, 0, exprInfo.height - 1);
                return {
                    left: x0,
                    top: y0,
                    width: Math.max(1, x1 - x0 + 1),
                    height: Math.max(1, y1 - y0 + 1),
                };
            };
            const numCrop = tightenForOcr({ x0: cropX0, y0: numY0, x1: cropX0 + cropW - 1, y1: numY0 + numH - 1 });
            const denCrop = tightenForOcr({ x0: cropX0, y0: denY0, x1: cropX0 + cropW - 1, y1: denY0 + denH - 1 });
            const n = await ocrIntegerFromImageRegion({
                sharp: args.sharp,
                source: exprGray,
                left: numCrop.left,
                top: numCrop.top,
                width: numCrop.width,
                height: numCrop.height,
                threshold: 200,
                thresholds: [150, 170, 190, 210],
                lang: args.lang,
                langPathEffective: args.langPathEffective,
                psms: [11, 7],
                minValue: 0,
                maxValue: 99,
            });
            const d = await ocrIntegerFromImageRegion({
                sharp: args.sharp,
                source: exprGray,
                left: denCrop.left,
                top: denCrop.top,
                width: denCrop.width,
                height: denCrop.height,
                threshold: 200,
                thresholds: [150, 170, 190, 210],
                lang: args.lang,
                langPathEffective: args.langPathEffective,
                psms: [7, 6],
                minValue: 1,
                maxValue: 99,
            });
            if (n === null || d === null || d === 0)
                continue;
            try {
                fracPairs.push({ x: bar.cx, frac: normalizeFraction({ n, d }), bar });
            }
            catch {
                // normalizeFraction rejected this pair; skip the bar.
            }
            // Keep scanning a bit: OCR can fail for one bar; grabbing multiple candidates improves recall.
            if (fracPairs.length >= 4)
                break;
        }
        // Keep at most two operands, left to right, dropping near-duplicate bars at the same x.
        fracPairs.sort((a, b) => a.x - b.x);
        const fracs = [];
        const minDx = Math.max(10, Math.round(exprInfo.width * 0.035));
        for (const fp of fracPairs) {
            const last = fracs[fracs.length - 1];
            if (last && Math.abs(fp.x - last.x) < minDx)
                continue;
            fracs.push(fp);
            if (fracs.length >= 2)
                break;
        }
        // OCR answer box (digits + slash). Some rows place the answer box on the next line and slightly more
        // to the left; if the primary crop yields nothing parseable, do a second pass on the lower portion.
        const ocrAnswerFromCrop = async (crop) => {
            const buf = await args.sharp(base).extract(crop).png().toBuffer();
            const out = await ocrRecognizeBuffer({
                buffer: buf,
                lang: args.lang,
                langPathEffective: args.langPathEffective,
                tessOptions: {
                    tessedit_char_whitelist: "0123456789/- ",
                    tessedit_pageseg_mode: "7",
                    user_defined_dpi: "300",
                },
                output: { text: true, tsv: false },
            });
            // A missing `text` means "nothing recognized", not an error.
            return String(out.text ?? "").replace(/\r/g, "\n").trim();
        };
        const defaultAnswerCrop = {
            left: answerX0,
            top: y0,
            width: Math.max(1, answerX1 - answerX0),
            height: rowH,
        };
        const answerCrop = (() => {
            if (!answerBox)
                return defaultAnswerCrop;
            // Crop tightly to the answer box's vertical range to avoid including operand fractions
            // (which can prepend extra digits like the row index).
            const padX = Math.max(6, Math.round(rowInfo.width * 0.01));
            const padY = Math.max(6, Math.round(rowInfo.height * 0.06));
            const left = clampInt(answerBox.x0 - padX, 0, 0, width - 1);
            const right = clampInt(answerBox.x1 + padX, width - 1, 0, width - 1);
            const w = Math.max(1, right - left + 1);
            const top = clampInt(y0 + answerBox.y0 - padY, 0, 0, height - 1);
            const bottom = clampInt(y0 + answerBox.y1 + padY, height, top + 1, height);
            return { left, top, width: w, height: Math.max(1, bottom - top) };
        })();
        const primaryText = await ocrAnswerFromCrop({
            left: answerCrop.left,
            top: answerCrop.top,
            width: answerCrop.width,
            height: answerCrop.height,
        });
        let studentMixedAll = parseAllMixedNumbersLoose(primaryText);
        let studentFracAll = parseAllFractionsLoose(primaryText);
        if (!studentMixedAll.length && !studentFracAll.length) {
            // Second pass: lower 65% of the band, starting further left.
            const altLeft = Math.floor(width * 0.05);
            const altRight = Math.min(width, Math.floor(width * 0.45));
            const altTop = y0 + Math.floor(rowH * 0.35);
            if (altRight > altLeft && altTop < y1) {
                const altText = await ocrAnswerFromCrop({
                    left: altLeft,
                    top: altTop,
                    width: Math.max(1, altRight - altLeft),
                    height: Math.max(1, y1 - altTop),
                });
                const mixedAlt = parseAllMixedNumbersLoose(altText);
                const fracAlt = parseAllFractionsLoose(altText);
                if (mixedAlt.length || fracAlt.length) {
                    studentMixedAll = mixedAlt;
                    studentFracAll = fracAlt;
                }
            }
        }
        // Conversion rows can place the answer box below the row midpoint band.
        // If we suspect a conversion prompt and still found nothing, scan lower (but stop before the next row center).
        if (rowHasTurn && !studentMixedAll.length && !studentFracAll.length && i + 1 < centers.length) {
            const spacing = centers[i + 1] - centers[i];
            const y1Ext = clampInt(Math.ceil(centers[i] + spacing * 0.9), height, y1, height);
            const extLeft = Math.floor(width * 0.05);
            const extRight = Math.min(width, Math.floor(width * 0.55));
            if (y1Ext > y0 + 8 && extRight > extLeft + 20) {
                const extText = await ocrAnswerFromCrop({
                    left: extLeft,
                    top: y0,
                    width: Math.max(1, extRight - extLeft),
                    height: Math.max(1, y1Ext - y0),
                });
                const mixedExt = parseAllMixedNumbersLoose(extText);
                const fracExt = parseAllFractionsLoose(extText);
                if (mixedExt.length || fracExt.length) {
                    studentMixedAll = mixedExt;
                    studentFracAll = fracExt;
                }
            }
        }
        // Decide if this is a conversion row ("Turn ..."). Prefer explicit row text; a row that
        // mentions both or neither keyword gives no hint.
        let conversionMode = null;
        if (rowHasTurn) {
            if (rowHintImproper && !rowHintMixed)
                conversionMode = "improper";
            else if (rowHintMixed && !rowHintImproper)
                conversionMode = "mixed";
            // If OCR missed the keywords, infer from the student's answer shape.
            else if (studentMixedAll.length > 0)
                conversionMode = "mixed";
            else if (studentFracAll.length > 0)
                conversionMode = "improper";
        }
        // Grade conversion tasks first. Even if bar detection accidentally yields 2 fractions, "Turn" rows are conversions.
        if (rowHasTurn && conversionMode && fracs.length >= 1) {
            // The given operand is the fraction with the longest bar (leftmost on ties).
            const givenEntry = [...fracs].sort((a, b) => b.bar.len - a.bar.len || a.x - b.x)[0];
            const given = givenEntry.frac;
            const givenBar = givenEntry.bar;
            if (conversionMode === "improper") {
                // Turn W N/D into an improper fraction.
                let whole = null;
                const m = rowLower.match(/turn\s+(-?\d+)/);
                const wholeFromText = m ? toIntegerOrNull(m[1]) : null;
                if (typeof wholeFromText === "number" && Number.isFinite(wholeFromText)) {
                    whole = wholeFromText;
                }
                // OCR the number just to the left of the fraction (avoid the question index column). Prefer pixel OCR if it succeeds.
                const regionX1 = givenBar.x0 - 1;
                const regionW = Math.max(24, Math.round(exprInfo.width * 0.26));
                const idxColX1 = Math.round(exprInfo.width * 0.12);
                const regionX0 = Math.max(idxColX1, Math.max(0, regionX1 - regionW));
                const cropW = regionX1 - regionX0 + 1;
                if (regionX1 >= regionX0 && cropW >= 10) {
                    const boxH = clampInt(Math.round(givenBar.len * 1.2), Math.round(exprInfo.height * 0.28), 14, Math.max(14, Math.round(exprInfo.height * 0.55)));
                    const y0w = clampInt(Math.round(givenBar.cy - boxH), 0, 0, exprInfo.height - 1);
                    const y1w = clampInt(Math.round(givenBar.cy + boxH), exprInfo.height, y0w + 1, exprInfo.height);
                    const h = y1w - y0w;
                    if (h >= 8) {
                        const wholeFromPixels = await ocrIntegerFromImageRegion({
                            sharp: args.sharp,
                            source: exprGray,
                            left: regionX0,
                            top: y0w,
                            width: cropW,
                            height: h,
                            threshold: 200,
                            thresholds: [150, 170, 190, 210],
                            lang: args.lang,
                            langPathEffective: args.langPathEffective,
                            psms: [7, 6, 11],
                            minValue: 0,
                            maxValue: 200,
                        });
                        if (wholeFromPixels !== null)
                            whole = wholeFromPixels;
                    }
                }
                // Without the whole part the row cannot be graded; leave it out of perQuestion.
                if (whole === null)
                    continue;
                const correct = normalizeFraction({ n: whole * given.d + Math.abs(given.n), d: given.d });
                const picked = studentFracAll.find((f) => fractionsEqual(f, correct)) ?? null;
                const ok = !!picked;
                const pts = ok ? args.pointsImproperFraction : 0;
                total += pts;
                perQuestion.push({
                    index: idx,
                    type: "improper_fraction",
                    correct: ok,
                    points: pts,
                    studentAnswer: picked
                        ? fractionToString(picked)
                        : studentFracAll[0]
                            ? fractionToString(studentFracAll[0])
                            : null,
                    correctAnswer: fractionToString(correct),
                });
                continue;
            }
            // conversionMode === "mixed"
            const wholeOut = Math.trunc(given.n / given.d);
            const rem = Math.abs(given.n % given.d);
            const correctFrac = normalizeFraction({ n: rem, d: given.d });
            const picked = studentMixedAll.find((m) => m.whole === wholeOut && fractionsEqual(m.frac, correctFrac)) ?? null;
            const ok = !!picked;
            const pts = ok ? args.pointsMixedNumber : 0;
            total += pts;
            perQuestion.push({
                index: idx,
                type: "mixed_number",
                correct: ok,
                points: pts,
                studentAnswer: picked
                    ? mixedNumberToString(picked.whole, picked.frac)
                    : studentMixedAll[0]
                        ? mixedNumberToString(studentMixedAll[0].whole, studentMixedAll[0].frac)
                        : null,
                correctAnswer: mixedNumberToString(wholeOut, correctFrac),
            });
            continue;
        }
        // Grade arithmetic by matching the student's answer against all possible ops.
        if (fracs.length >= 2) {
            const a = fracs[0].frac;
            const b = fracs[1].frac;
            const candidates = [];
            // Remember the sum: it doubles as the reported "correct answer" when nothing matches.
            let sumFrac = null;
            try {
                sumFrac = addFractions(a, b);
                candidates.push({ kind: "add_subtract", frac: sumFrac, pts: args.pointsAddSubtract });
            }
            catch {
                // The helper threw for these operands; the other operations may still match.
            }
            try {
                candidates.push({ kind: "add_subtract", frac: subFractions(a, b), pts: args.pointsAddSubtract });
            }
            catch {
                // Same as above: skip this candidate only.
            }
            try {
                candidates.push({ kind: "multiply_divide", frac: mulFractions(a, b), pts: args.pointsMultiplyDivide });
            }
            catch {
                // Same as above: skip this candidate only.
            }
            try {
                candidates.push({ kind: "multiply_divide", frac: divFractions(a, b), pts: args.pointsMultiplyDivide });
            }
            catch {
                // Same as above: skip this candidate only (e.g. a divisor the helper rejects).
            }
            let match = null;
            let studentPicked = null;
            for (const sf of studentFracAll) {
                const m = candidates.find((c) => fractionsEqual(sf, c.frac));
                if (m) {
                    match = m;
                    studentPicked = sf;
                    break;
                }
            }
            const ok = !!match;
            const pts = match ? match.pts : 0;
            total += pts;
            perQuestion.push({
                index: idx,
                type: match?.kind ?? "arithmetic",
                correct: ok,
                points: pts,
                studentAnswer: studentPicked
                    ? fractionToString(studentPicked)
                    : studentFracAll[0]
                        ? fractionToString(studentFracAll[0])
                        : null,
                // Previously this re-ran addFractions outside any try/catch, so an operand pair the
                // helper rejects aborted the whole grading run. Report null in that case instead.
                correctAnswer: match
                    ? fractionToString(match.frac)
                    : sumFrac
                        ? fractionToString(sumFrac)
                        : null,
                debug: debugEnabled
                    ? {
                        operands: [fractionToString(a), fractionToString(b)],
                        studentCandidates: studentFracAll.map(fractionToString).slice(0, 8),
                    }
                    : undefined,
            });
            continue;
        }
    }
    total += args.bonusPoints;
    // Return only if we graded a meaningful number of rows (avoids false positives on unrelated images).
    if (perQuestion.length < Math.min(centers.length, 6))
        return null;
    return { extractedQuestionCount: perQuestion.length, score: total, perQuestion };
}
1713
// Source text of the Python audio-transcription helper (faster-whisper), kept as a template literal.
// Per the script's own header it is written to a temp directory at runtime.
// What the script does, as visible below:
//   - parses --path (required), --model, --language, --task, --beam-size, --vad-filter,
//     --max-chars and --include-segments;
//   - if `faster_whisper` cannot be imported, writes an install hint to stderr and re-raises;
//   - runs WhisperModel on CPU with compute_type "int8" and temperature 0.0;
//   - stops collecting segments once the accumulated text reaches max-chars (minimum 200) and
//     flags the result as truncated;
//   - writes one JSON object to stdout (path, model, task, language, languageProbability,
//     durationSeconds, beamSize, vadFilter, maxChars, truncated, text, and optionally segments).
// NOTE(review): everything between the backticks is runtime text, including the Python comments
// and whitespace. This view shows the script without leading indentation; do not reflow it.
// NOTE(review): the "_V1" suffix presumably versions the on-disk script — confirm at the use site.
+ const FASTER_WHISPER_PY_SCRIPT_V1 = `# NodeBench MCP audio transcription helper (faster-whisper)
1714
+ # This file is written to a temp directory at runtime.
1715
+ import argparse
1716
+ import json
1717
+ import sys
1718
+
1719
+
1720
+ def main() -> None:
1721
+ p = argparse.ArgumentParser()
1722
+ p.add_argument("--path", required=True)
1723
+ p.add_argument("--model", default="tiny.en")
1724
+ p.add_argument("--language", default="")
1725
+ p.add_argument("--task", default="transcribe")
1726
+ p.add_argument("--beam-size", type=int, default=5)
1727
+ p.add_argument("--vad-filter", type=int, default=0)
1728
+ p.add_argument("--max-chars", type=int, default=12000)
1729
+ p.add_argument("--include-segments", type=int, default=0)
1730
+ args = p.parse_args()
1731
+
1732
+ try:
1733
+ from faster_whisper import WhisperModel
1734
+ except Exception:
1735
+ sys.stderr.write(
1736
+ "Missing python dependency: faster-whisper. Install with: pip install faster-whisper\\n"
1737
+ )
1738
+ raise
1739
+
1740
+ model = WhisperModel(args.model, device="cpu", compute_type="int8")
1741
+ segments, info = model.transcribe(
1742
+ args.path,
1743
+ beam_size=max(1, int(args.beam_size)),
1744
+ language=(args.language or None),
1745
+ task=(args.task or "transcribe"),
1746
+ vad_filter=bool(int(args.vad_filter)),
1747
+ word_timestamps=False,
1748
+ temperature=0.0,
1749
+ )
1750
+
1751
+ include_segments = bool(int(args.include_segments))
1752
+ max_chars = max(200, int(args.max_chars))
1753
+
1754
+ parts = []
1755
+ segs = []
1756
+ char_budget = 0
1757
+ truncated = False
1758
+
1759
+ for seg in segments:
1760
+ t = str(getattr(seg, "text", "") or "")
1761
+ if not t:
1762
+ continue
1763
+ parts.append(t)
1764
+ if include_segments:
1765
+ segs.append(
1766
+ {
1767
+ "start": float(getattr(seg, "start", 0.0) or 0.0),
1768
+ "end": float(getattr(seg, "end", 0.0) or 0.0),
1769
+ "text": t,
1770
+ }
1771
+ )
1772
+ char_budget += len(t)
1773
+ if char_budget >= max_chars:
1774
+ truncated = True
1775
+ break
1776
+
1777
+ text = "".join(parts).strip()
1778
+ if len(text) > max_chars:
1779
+ text = text[:max_chars]
1780
+ truncated = True
1781
+
1782
+ out = {
1783
+ "path": args.path,
1784
+ "model": args.model,
1785
+ "task": args.task,
1786
+ "language": getattr(info, "language", None),
1787
+ "languageProbability": getattr(info, "language_probability", None),
1788
+ "durationSeconds": getattr(info, "duration", None),
1789
+ "beamSize": int(args.beam_size),
1790
+ "vadFilter": bool(int(args.vad_filter)),
1791
+ "maxChars": max_chars,
1792
+ "truncated": truncated,
1793
+ "text": text,
1794
+ }
1795
+ if include_segments:
1796
+ out["segments"] = segs
1797
+ sys.stdout.write(json.dumps(out, ensure_ascii=False))
1798
+
1799
+
1800
+ if __name__ == "__main__":
1801
+ main()
594
1802
  `;
595
1803
  function findPythonExecutable() {
596
1804
  const override = process.env.NODEBENCH_PYTHON ||
@@ -998,7 +2206,15 @@ async function loadXlsxTable(args, opts) {
998
2206
  dataRows,
999
2207
  };
1000
2208
  }
1001
- export const localFileTools = [
2209
// Names of the image-solver tools tied to specific GAIA benchmark tasks.
// Each string is expected to equal the `name` of an entry in the tool list declared below
// (e.g. "solve_red_green_deviation_average_from_image").
// NOTE(review): presumably used to separate these benchmark-specific solvers from the general
// local-file tools when the public tool list is built — confirm at the use site.
+ const GAIA_SOLVER_NAMES = new Set([
2210
+ "solve_red_green_deviation_average_from_image",
2211
+ "solve_green_polygon_area_from_image",
2212
+ "grade_fraction_quiz_from_image",
2213
+ "extract_fractions_and_simplify_from_image",
2214
+ "solve_bass_clef_age_from_image",
2215
+ "solve_storage_upgrade_cost_per_file_from_image",
2216
+ ]);
2217
+ const _ALL_LOCAL_FILE_TOOLS = [
1002
2218
  {
1003
2219
  name: "read_csv_file",
1004
2220
  description: "Read a local CSV file and return a bounded table preview (headers + rows). Deterministic, no network.",
@@ -2850,6 +4066,1825 @@ export const localFileTools = [
2850
4066
  };
2851
4067
  },
2852
4068
  },
4069
+ {
4070
+ name: "solve_red_green_deviation_average_from_image",
4071
+ description: "Extract red and green numbers from an image, compute population stdev(red) and sample stdev(green), then return their average. Deterministic, no network.",
4072
+ inputSchema: {
4073
+ type: "object",
4074
+ properties: {
4075
+ path: {
4076
+ type: "string",
4077
+ description: "Path to a local image file (absolute or relative to current working directory).",
4078
+ },
4079
+ decimals: {
4080
+ type: "number",
4081
+ description: "Decimal places to round to (default: 3).",
4082
+ default: 3,
4083
+ },
4084
+ lang: {
4085
+ type: "string",
4086
+ description: "Tesseract language code (default: eng).",
4087
+ default: "eng",
4088
+ },
4089
+ langPath: {
4090
+ type: "string",
4091
+ description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
4092
+ },
4093
+ maxPixels: {
4094
+ type: "number",
4095
+ description: "Safety cap on pixels to process (default: 6,000,000).",
4096
+ default: 6000000,
4097
+ },
4098
+ debug: {
4099
+ type: "boolean",
4100
+ description: "If true, include detailed debug info (labels + segment assignments).",
4101
+ default: false,
4102
+ },
4103
+ },
4104
+ required: ["path"],
4105
+ },
4106
+ handler: async (args) => {
4107
+ const filePath = resolveLocalPath(args?.path);
4108
+ if (!existsSync(filePath))
4109
+ throw new Error(`File not found: ${filePath}`);
4110
+ const decimals = clampInt(args?.decimals, 3, 0, 8);
4111
+ const lang = String(args?.lang ?? "eng").trim() || "eng";
4112
+ const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
4113
+ const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
4114
+ const langPathEffective = langPathArg
4115
+ ? resolveLocalPath(langPathArg)
4116
+ : existsSync(defaultLangPath)
4117
+ ? defaultLangPath
4118
+ : null;
4119
+ const maxPixels = clampInt(args?.maxPixels, 6000000, 10000, 100_000_000);
4120
+ const redOcr = await ocrRecognizeImageFileWithColorMask({
4121
+ filePath,
4122
+ color: "red",
4123
+ lang,
4124
+ langPathEffective,
4125
+ maxPixels,
4126
+ });
4127
+ const greenOcr = await ocrRecognizeImageFileWithColorMask({
4128
+ filePath,
4129
+ color: "green",
4130
+ lang,
4131
+ langPathEffective,
4132
+ maxPixels,
4133
+ });
4134
+ // OCR on masked grids can concatenate adjacent numbers. Recover by chunking digit runs.
4135
/**
 * Rebuild a list of two-digit grid values from OCR output in which a handful of tokens
 * lost their leading digit.
 *
 * Values in 10..99 are kept as-is. Values in 0..9 are treated as damaged tokens: each one
 * is replaced by the candidate `tens * 10 + digit` (tens 1..9) closest to the middle
 * element of the sorted two-digit values; on ties the smaller tens digit wins.
 * The repair only runs when there are at least 10 two-digit values and between 1 and 6
 * single-digit values; otherwise the single-digit values are simply dropped.
 *
 * @param {number[]} nums - Raw OCR integers.
 * @returns {number[]} Two-digit values in input order, followed by any repaired values.
 */
const repairTwoDigitGrid = (nums) => {
    const singleDigits = [];
    const doubleDigits = [];
    for (const value of nums) {
        if (!Number.isFinite(value)) {
            continue;
        }
        if (value >= 0 && value <= 9) {
            singleDigits.push(value);
        }
        else if (value >= 10 && value <= 99) {
            doubleDigits.push(value);
        }
    }
    const repairable = doubleDigits.length >= 10 && singleDigits.length >= 1 && singleDigits.length <= 6;
    if (!repairable) {
        return doubleDigits;
    }
    // Reference point: the (upper-)middle element of the sorted two-digit values.
    const ascending = doubleDigits.slice().sort((lo, hi) => lo - hi);
    const pivot = ascending[Math.floor(ascending.length / 2)];
    const restored = singleDigits.map((digit) => {
        let choice = 10 + digit;
        let choiceGap = Math.abs(choice - pivot);
        for (let tens = 2; tens <= 9; tens++) {
            const candidate = tens * 10 + digit;
            const gap = Math.abs(candidate - pivot);
            // Strict comparison keeps the earliest (smallest) candidate on ties.
            if (gap < choiceGap) {
                choice = candidate;
                choiceGap = gap;
            }
        }
        return choice;
    });
    return doubleDigits.concat(restored);
};
4160
+ const redRaw = extractChunkedIntsFromText(redOcr.text, { chunkSize: 2, min: 0, max: 99 });
4161
+ const greenRaw = extractChunkedIntsFromText(greenOcr.text, { chunkSize: 2, min: 0, max: 99 });
4162
+ const redNums = repairTwoDigitGrid(redRaw);
4163
+ const greenNums = repairTwoDigitGrid(greenRaw);
4164
+ if (!redNums.length)
4165
+ throw new Error("No red numbers found via OCR");
4166
+ if (greenNums.length < 2)
4167
+ throw new Error("Need at least 2 green numbers via OCR");
4168
+ const redDev = pstdev(redNums);
4169
+ const greenDev = stdev(greenNums);
4170
+ const avg = (redDev + greenDev) / 2;
4171
+ const rounded = Number(avg.toFixed(decimals));
4172
+ return {
4173
+ path: filePath,
4174
+ decimals,
4175
+ redCount: redNums.length,
4176
+ greenCount: greenNums.length,
4177
+ redPstdev: Number(redDev.toFixed(decimals + 3)),
4178
+ greenStdev: Number(greenDev.toFixed(decimals + 3)),
4179
+ average: rounded,
4180
+ answer: rounded.toFixed(decimals),
4181
+ };
4182
+ },
4183
+ },
4184
+ {
4185
+ name: "solve_green_polygon_area_from_image",
4186
+ description: "Compute the area of a green filled polygon in an image by pixel segmentation, calibrating pixel-to-unit scale from nearby purple length labels. Deterministic, no network.",
4187
+ inputSchema: {
4188
+ type: "object",
4189
+ properties: {
4190
+ path: {
4191
+ type: "string",
4192
+ description: "Path to a local image file (absolute or relative to current working directory).",
4193
+ },
4194
+ lang: {
4195
+ type: "string",
4196
+ description: "Tesseract language code for reading purple numeric labels (default: eng).",
4197
+ default: "eng",
4198
+ },
4199
+ langPath: {
4200
+ type: "string",
4201
+ description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
4202
+ },
4203
+ maxPixels: {
4204
+ type: "number",
4205
+ description: "Safety cap on pixels to process (default: 6,000,000).",
4206
+ default: 6000000,
4207
+ },
4208
+ },
4209
+ required: ["path"],
4210
+ },
4211
+ handler: async (args) => {
4212
+ const filePath = resolveLocalPath(args?.path);
4213
+ if (!existsSync(filePath))
4214
+ throw new Error(`File not found: ${filePath}`);
4215
+ const sharp = await getSharpOptional();
4216
+ if (!sharp)
4217
+ throw new Error("Missing optional dependency: sharp. Install it to use polygon area parsing.");
4218
+ const lang = String(args?.lang ?? "eng").trim() || "eng";
4219
+ const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
4220
+ const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
4221
+ const langPathEffective = langPathArg
4222
+ ? resolveLocalPath(langPathArg)
4223
+ : existsSync(defaultLangPath)
4224
+ ? defaultLangPath
4225
+ : null;
4226
+ const maxPixels = clampInt(args?.maxPixels, 6000000, 10000, 100_000_000);
4227
+ const debug = args?.debug === true || process.env.NODEBENCH_DEBUG_GREEN_POLYGON === "1";
4228
+ // Segment green pixels (filled polygon).
4229
+ const image = sharp(await readFile(filePath));
4230
+ const meta = await image.metadata();
4231
+ const w = meta.width ?? 0;
4232
+ const h = meta.height ?? 0;
4233
+ if (!w || !h)
4234
+ throw new Error("Unable to read image dimensions");
4235
+ if (w * h > maxPixels) {
4236
+ throw new Error(`Refusing huge image (${w}x${h}) for polygon parsing (maxPixels=${maxPixels})`);
4237
+ }
4238
+ const { data, info } = await image.ensureAlpha().raw().toBuffer({ resolveWithObject: true });
4239
+ const width = info.width;
4240
+ const height = info.height;
4241
+ const green = new Uint8Array(width * height);
4242
+ let areaPx = 0;
4243
+ for (let i = 0, j = 0; i < data.length; i += 4, j++) {
4244
+ const r = data[i];
4245
+ const g = data[i + 1];
4246
+ const b = data[i + 2];
4247
+ const a = data[i + 3];
4248
+ // Conservative "green" heuristic: G is high and dominates R/B.
4249
+ const isGreen = a >= 40 && g >= 110 && g - r >= 25 && g - b >= 25;
4250
+ if (isGreen) {
4251
+ green[j] = 1;
4252
+ areaPx++;
4253
+ }
4254
+ }
4255
+ if (!areaPx)
4256
+ throw new Error("No green region detected");
4257
+ // Trace the polygon boundary on a grid, then assign labeled lengths to segments and compute area in units.
4258
+ const vertW = width + 1;
4259
+ const adj = new Map();
4260
+ const pushNeighbor = (a, b) => {
4261
+ const arr = adj.get(a);
4262
+ if (arr) {
4263
+ if (!arr.includes(b))
4264
+ arr.push(b);
4265
+ }
4266
+ else {
4267
+ adj.set(a, [b]);
4268
+ }
4269
+ };
4270
+ const addEdge = (ax, ay, bx, by) => {
4271
+ const aId = ay * vertW + ax;
4272
+ const bId = by * vertW + bx;
4273
+ pushNeighbor(aId, bId);
4274
+ pushNeighbor(bId, aId);
4275
+ };
4276
+ const pix = (x, y) => green[y * width + x] === 1;
4277
+ for (let y = 0; y < height; y++) {
4278
+ for (let x = 0; x < width; x++) {
4279
+ if (!pix(x, y))
4280
+ continue;
4281
+ // Add boundary edges where the neighbor pixel is empty/out-of-bounds.
4282
+ if (y === 0 || !pix(x, y - 1))
4283
+ addEdge(x, y, x + 1, y); // top edge
4284
+ if (y === height - 1 || !pix(x, y + 1))
4285
+ addEdge(x, y + 1, x + 1, y + 1); // bottom edge
4286
+ if (x === 0 || !pix(x - 1, y))
4287
+ addEdge(x, y, x, y + 1); // left edge
4288
+ if (x === width - 1 || !pix(x + 1, y))
4289
+ addEdge(x + 1, y, x + 1, y + 1); // right edge
4290
+ }
4291
+ }
4292
+ if (!adj.size)
4293
+ throw new Error("Failed to build polygon boundary graph");
4294
+ const idToXY = (id) => ({ x: id % vertW, y: Math.floor(id / vertW) });
4295
+ // Extract all boundary cycles (outer boundary + holes). Filled pixel regions can contain holes,
4296
+ // and we must subtract them from the outer boundary area.
4297
/**
 * Build an orientation-independent key for the undirected edge between two vertex ids:
 * the smaller id always comes first, so (a, b) and (b, a) map to the same string.
 */
const edgeKey = (a, b) => {
    if (a < b) {
        return `${a}-${b}`;
    }
    return `${b}-${a}`;
};
4298
+ const visitedEdge = new Set();
4299
+ const maxSteps = Math.max(10_000, width * height);
4300
+ const cycleIds = [];
4301
+ for (const [u, ns] of adj.entries()) {
4302
+ for (const v of ns) {
4303
+ const k0 = edgeKey(u, v);
4304
+ if (visitedEdge.has(k0))
4305
+ continue;
4306
+ const pathIds = [u];
4307
+ let prev = u;
4308
+ let curr = v;
4309
+ visitedEdge.add(k0);
4310
+ for (let step = 0; step < maxSteps; step++) {
4311
+ pathIds.push(curr);
4312
+ if (curr === u)
4313
+ break;
4314
+ const nbrs = adj.get(curr) ?? [];
4315
+ if (!nbrs.length)
4316
+ break;
4317
+ // Prefer an unvisited edge continuing forward, else fall back to "not prev".
4318
+ let next = nbrs.find((n) => n !== prev && !visitedEdge.has(edgeKey(curr, n))) ??
4319
+ nbrs.find((n) => n !== prev) ??
4320
+ nbrs[0];
4321
+ if (typeof next !== "number")
4322
+ break;
4323
+ visitedEdge.add(edgeKey(curr, next));
4324
+ prev = curr;
4325
+ curr = next;
4326
+ }
4327
+ if (pathIds.length >= 4 && pathIds[pathIds.length - 1] === u) {
4328
+ cycleIds.push(pathIds);
4329
+ }
4330
+ }
4331
+ }
4332
+ if (!cycleIds.length)
4333
+ throw new Error("Failed to extract boundary cycles");
4334
+ const cycleData = [];
4335
+ for (const ids of cycleIds) {
4336
+ const pts = ids.map(idToXY);
4337
+ if (pts.length < 4)
4338
+ continue;
4339
+ let x0 = pts[0].x, x1 = pts[0].x, y0 = pts[0].y, y1 = pts[0].y;
4340
+ for (const p of pts) {
4341
+ if (p.x < x0)
4342
+ x0 = p.x;
4343
+ if (p.x > x1)
4344
+ x1 = p.x;
4345
+ if (p.y < y0)
4346
+ y0 = p.y;
4347
+ if (p.y > y1)
4348
+ y1 = p.y;
4349
+ }
4350
+ let twiceArea = 0;
4351
+ for (let i = 0; i < pts.length - 1; i++) {
4352
+ twiceArea += pts[i].x * pts[i + 1].y - pts[i + 1].x * pts[i].y;
4353
+ }
4354
+ const areaPx2 = Math.abs(twiceArea) / 2;
4355
+ if (!Number.isFinite(areaPx2) || areaPx2 < 10)
4356
+ continue;
4357
+ cycleData.push({ pts, areaPx2, box: { x0, y0, x1, y1 } });
4358
+ }
4359
+ if (!cycleData.length)
4360
+ throw new Error("No valid boundary cycles after filtering");
4361
// Collapse every traced boundary cycle into maximal straight runs ("segments").
// Each cycle is a closed list of grid vertices whose consecutive points differ by one
// unit step along x or y, so a segment is a run of steps sharing one direction
// (R/L/D/U). `cycleSegs[i]` holds the segments of cycle i; `allSegs` is the flat list
// across all cycles. `unitLen`/`labelRaw` start as null and are filled in later.
const allSegs = [];
const cycleSegs = [];
for (let cycleIndex = 0; cycleIndex < cycleData.length; cycleIndex++) {
    const pts = cycleData[cycleIndex].pts;
    const segs = [];
    let runDir = null;
    let runLen = 0;
    let sx = pts[0].x;
    let sy = pts[0].y;
    // Emit the current run as a segment ending at `end`.
    const emitRun = (end) => {
        segs.push({
            dir: runDir,
            pxLen: runLen,
            x0: sx,
            y0: sy,
            x1: end.x,
            y1: end.y,
            unitLen: null,
            labelRaw: null,
            cycleIndex,
        });
    };
    for (let i = 1; i < pts.length; i++) {
        const a = pts[i - 1];
        const b = pts[i];
        const dx = b.x - a.x;
        const dy = b.y - a.y;
        const dir = dx === 1 ? "R" : dx === -1 ? "L" : dy === 1 ? "D" : "U";
        if (runDir === null) {
            runDir = dir;
            runLen = 1;
            sx = a.x;
            sy = a.y;
            continue;
        }
        if (dir === runDir) {
            runLen++;
            continue;
        }
        // Direction changed at `a`: the previous run ends there and the new one starts there.
        emitRun(a);
        runDir = dir;
        runLen = 1;
        sx = a.x;
        sy = a.y;
    }
    if (runDir !== null && runLen > 0) {
        // The last run includes the final step of the cycle, so it ends at the last
        // point (not the one before it); otherwise the segment is one pixel short and
        // degenerate when the run is a single step.
        emitRun(pts[pts.length - 1]);
    }
    cycleSegs.push(segs);
    // Push one by one instead of spreading, so a very long boundary cannot exceed the
    // engine's limit on the number of call arguments.
    for (const seg of segs) {
        allSegs.push(seg);
    }
}
4420
+ if (allSegs.length < 4)
4421
+ throw new Error("Failed to simplify boundary into segments");
4422
+ // Build a purple-only binary mask for labels.
4423
+ const purpleBw = new Uint8Array(width * height);
4424
+ for (let i = 0, j = 0; i < data.length; i += 4, j++) {
4425
+ const r = data[i];
4426
+ const g = data[i + 1];
4427
+ const b = data[i + 2];
4428
+ const a = data[i + 3];
4429
+ // Purple length labels: relatively high R and B, lower G. Keep this somewhat permissive to
4430
+ // avoid dropping thin digits (e.g. "4") due to anti-aliasing.
4431
+ const isPurple = a >= 40 && r >= 120 && b >= 120 && g <= 220 && r - g >= 10 && b - g >= 10;
4432
+ purpleBw[j] = isPurple ? 0 : 255;
4433
+ }
4434
/**
 * Parse the numeric value of an OCR'd length label.
 *
 * Commas are read as decimal points and every character other than digits and dots is
 * discarded. If more than one dot remains, only the first is kept as the decimal point.
 *
 * @param {unknown} raw - OCR text (null/undefined are treated as empty).
 * @returns {number|null} The parsed finite number, or null when nothing numeric remains.
 */
const parseLabelValue = (raw) => {
    const text = String(raw ?? "").trim();
    const cleaned = text.replace(/,/g, ".").replace(/[^0-9.]/g, "");
    if (cleaned === "") {
        return null;
    }
    const [head, ...tail] = cleaned.split(".");
    // Two or more dots: merge everything after the first dot into the fractional part.
    const normalized = tail.length > 1 ? `${head}.${tail.join("")}` : cleaned;
    const value = Number.parseFloat(normalized);
    if (!Number.isFinite(value)) {
        return null;
    }
    return value;
};
4446
+ const purpleBuf = Buffer.from(purpleBw);
4447
+ const visited = new Uint8Array(width * height);
4448
+ const inBounds = (x, y) => x >= 0 && x < width && y >= 0 && y < height;
4449
+ const neighbors = [
4450
+ [1, 0],
4451
+ [-1, 0],
4452
+ [0, 1],
4453
+ [0, -1],
4454
+ [1, 1],
4455
+ [-1, -1],
4456
+ [1, -1],
4457
+ [-1, 1],
4458
+ ];
4459
+ const comps = [];
4460
+ const idx2 = (x, y) => y * width + x;
4461
+ for (let y = 0; y < height; y++) {
4462
+ for (let x = 0; x < width; x++) {
4463
+ const startIdx = idx2(x, y);
4464
+ if (visited[startIdx])
4465
+ continue;
4466
+ visited[startIdx] = 1;
4467
+ if (purpleBw[startIdx] >= 128)
4468
+ continue;
4469
+ let area = 0;
4470
+ let sx2 = 0;
4471
+ let sy2 = 0;
4472
+ let x0 = x, x1 = x, y0 = y, y1 = y;
4473
+ const qx = [x];
4474
+ const qy = [y];
4475
+ for (let qi = 0; qi < qx.length; qi++) {
4476
+ const px = qx[qi];
4477
+ const py = qy[qi];
4478
+ const pidx = idx2(px, py);
4479
+ if (purpleBw[pidx] >= 128)
4480
+ continue;
4481
+ area++;
4482
+ sx2 += px;
4483
+ sy2 += py;
4484
+ if (px < x0)
4485
+ x0 = px;
4486
+ if (px > x1)
4487
+ x1 = px;
4488
+ if (py < y0)
4489
+ y0 = py;
4490
+ if (py > y1)
4491
+ y1 = py;
4492
+ for (const [dx, dy] of neighbors) {
4493
+ const nx = px + dx;
4494
+ const ny = py + dy;
4495
+ if (!inBounds(nx, ny))
4496
+ continue;
4497
+ const nidx = idx2(nx, ny);
4498
+ if (visited[nidx])
4499
+ continue;
4500
+ visited[nidx] = 1;
4501
+ if (purpleBw[nidx] < 128) {
4502
+ qx.push(nx);
4503
+ qy.push(ny);
4504
+ }
4505
+ }
4506
+ }
4507
+ const bw = x1 - x0 + 1;
4508
+ const bh = y1 - y0 + 1;
4509
+ if (area < 6)
4510
+ continue;
4511
+ if (bw < 2 || bh < 2)
4512
+ continue;
4513
+ if (bw > Math.round(width * 0.25) || bh > Math.round(height * 0.25))
4514
+ continue;
4515
+ const cx = sx2 / area;
4516
+ const cy = sy2 / area;
4517
+ comps.push({ area, x0, y0, x1, y1, cx, cy });
4518
+ }
4519
+ }
4520
+ // Group digit components into label boxes.
4521
+ comps.sort((a, b) => a.cy - b.cy || a.cx - b.cx);
4522
+ const rowTol = Math.max(8, Math.round(height * 0.03));
4523
+ const rows = [];
4524
+ for (const c of comps) {
4525
+ const row = rows.find((r) => Math.abs(r.cy - c.cy) <= rowTol);
4526
+ if (!row) {
4527
+ rows.push({ cy: c.cy, comps: [c] });
4528
+ continue;
4529
+ }
4530
+ row.comps.push(c);
4531
+ row.cy = (row.cy * (row.comps.length - 1) + c.cy) / row.comps.length;
4532
+ }
4533
+ const labelBoxes = [];
4534
+ const xGapTol = Math.max(6, Math.round(width * 0.015));
4535
+ for (const r of rows) {
4536
+ const cs = [...r.comps].sort((a, b) => a.cx - b.cx);
4537
+ let group = [];
4538
+ const flush = () => {
4539
+ if (!group.length)
4540
+ return;
4541
+ let x0 = group[0].x0, y0 = group[0].y0, x1 = group[0].x1, y1 = group[0].y1;
4542
+ for (const c of group) {
4543
+ if (c.x0 < x0)
4544
+ x0 = c.x0;
4545
+ if (c.y0 < y0)
4546
+ y0 = c.y0;
4547
+ if (c.x1 > x1)
4548
+ x1 = c.x1;
4549
+ if (c.y1 > y1)
4550
+ y1 = c.y1;
4551
+ }
4552
+ labelBoxes.push({ x0, y0, x1, y1, cx: (x0 + x1) / 2, cy: (y0 + y1) / 2 });
4553
+ group = [];
4554
+ };
4555
+ for (const c of cs) {
4556
+ const last = group[group.length - 1];
4557
+ if (!last) {
4558
+ group.push(c);
4559
+ continue;
4560
+ }
4561
+ const gap = c.x0 - last.x1;
4562
+ if (gap <= xGapTol) {
4563
+ group.push(c);
4564
+ }
4565
+ else {
4566
+ flush();
4567
+ group.push(c);
4568
+ }
4569
+ }
4570
+ flush();
4571
+ }
4572
/**
 * OCR a single label box from the purple mask and return its numeric value.
 *
 * The box is padded by 2px, clamped to the image, cropped out of the single-channel
 * mask, upscaled (up to 8x) when narrower than 140px, lightly blurred and re-thresholded,
 * then recognised with page-segmentation modes 7, 8 and 11. The reading with the highest
 * reported confidence wins (a missing confidence counts as -1; earlier modes win ties).
 *
 * @param {{x0:number,y0:number,x1:number,y1:number}} box - Label bounding box in pixels.
 * @returns {Promise<{value:number, raw:string}|null>} Best numeric reading, or null when
 *   the crop is smaller than 4px in either dimension or no mode yields a number.
 */
const ocrLabelBox = async (box) => {
    const margin = 2;
    const maxX = width - 1;
    const maxY = height - 1;
    const cropLeft = clampInt(box.x0 - margin, 0, 0, maxX);
    const cropTop = clampInt(box.y0 - margin, 0, 0, maxY);
    const cropRight = clampInt(box.x1 + margin, maxX, 0, maxX);
    const cropBottom = clampInt(box.y1 + margin, maxY, 0, maxY);
    const cropW = cropRight - cropLeft + 1;
    const cropH = cropBottom - cropTop + 1;
    if (cropW < 4 || cropH < 4) {
        return null;
    }
    const wantedWidth = 140;
    let zoom = 1;
    if (cropW < wantedWidth) {
        zoom = Math.max(1, Math.min(8, Math.ceil(wantedWidth / cropW)));
    }
    const pipeline = sharp(purpleBuf, { raw: { width, height, channels: 1 } })
        .extract({ left: cropLeft, top: cropTop, width: cropW, height: cropH })
        .resize({ width: cropW * zoom, height: cropH * zoom, kernel: "nearest" })
        // sharp blur() requires sigma >= 0.3. A tiny blur suppresses mask speckle without destroying digits.
        .blur(0.3)
        .threshold(180)
        .png();
    const pngBytes = await pipeline.toBuffer();
    let winner = null;
    for (const pageSegMode of [7, 8, 11]) {
        const result = await ocrRecognizeBuffer({
            buffer: pngBytes,
            lang,
            langPathEffective,
            tessOptions: {
                tessedit_char_whitelist: "0123456789.",
                tessedit_pageseg_mode: String(pageSegMode),
                user_defined_dpi: "300",
            },
            output: { text: true, tsv: false },
        });
        const rawText = String(result.text ?? "").trim();
        const parsed = parseLabelValue(rawText);
        if (parsed === null) {
            continue;
        }
        const hasConfidence = typeof result.confidence === "number" && Number.isFinite(result.confidence);
        const score = hasConfidence ? result.confidence : -1;
        if (winner === null || score > winner.conf) {
            winner = { value: parsed, raw: rawText, conf: score };
        }
    }
    if (winner === null) {
        return null;
    }
    return { value: winner.value, raw: winner.raw };
};
4616
+ const labels = [];
4617
+ for (const b of labelBoxes) {
4618
+ const o = await ocrLabelBox(b);
4619
+ if (!o)
4620
+ continue;
4621
+ // Basic sanity filter (avoid spurious large numbers).
4622
+ if (o.value <= 0 || o.value > 1000)
4623
+ continue;
4624
+ labels.push({ value: o.value, cx: b.cx, cy: b.cy, raw: o.raw });
4625
+ }
4626
+ if (!labels.length)
4627
+ throw new Error("Failed to extract any purple labels");
4628
/**
 * Squared distance from a point to the axis-aligned bounding box of a segment.
 * For the horizontal/vertical segments used here the box is the segment itself.
 *
 * @param {number} px - Point x.
 * @param {number} py - Point y.
 * @param {{x0:number,y0:number,x1:number,y1:number}} s - Segment endpoints (any order).
 * @returns {number} Squared distance; 0 when the point lies within the box.
 */
const dist2PointToSeg = (px, py, s) => {
    // Distance from v to the interval [lo, hi]; 0 when v is inside it.
    const gapTo = (v, lo, hi) => {
        if (v < lo) {
            return lo - v;
        }
        if (v > hi) {
            return v - hi;
        }
        return 0;
    };
    const gapX = gapTo(px, Math.min(s.x0, s.x1), Math.max(s.x0, s.x1));
    const gapY = gapTo(py, Math.min(s.y0, s.y1), Math.max(s.y0, s.y1));
    return gapX * gapX + gapY * gapY;
};
4637
+ const segBestDist = new Array(allSegs.length).fill(Number.POSITIVE_INFINITY);
4638
+ for (const lab of labels) {
4639
+ let bestIdx = -1;
4640
+ let bestD = Number.POSITIVE_INFINITY;
4641
+ for (let i = 0; i < allSegs.length; i++) {
4642
+ const d2 = dist2PointToSeg(lab.cx, lab.cy, allSegs[i]);
4643
+ if (d2 < bestD) {
4644
+ bestD = d2;
4645
+ bestIdx = i;
4646
+ }
4647
+ }
4648
+ if (bestIdx < 0)
4649
+ continue;
4650
+ if (bestD < segBestDist[bestIdx]) {
4651
+ segBestDist[bestIdx] = bestD;
4652
+ allSegs[bestIdx].unitLen = lab.value;
4653
+ allSegs[bestIdx].labelRaw = lab.raw;
4654
+ }
4655
+ }
4656
+ const labeled = allSegs.filter((s) => s.unitLen !== null && s.unitLen > 0);
4657
+ if (!labeled.length)
4658
+ throw new Error("No segments received labels; cannot compute area");
4659
/**
 * Median of a list of numbers, computed on a sorted copy (the input is not modified).
 * For an even count the two middle values are averaged; an empty list yields NaN.
 *
 * @param {number[]} xs - Values to summarise.
 * @returns {number} The median value.
 */
const median = (xs) => {
    const ordered = [...xs];
    ordered.sort((lo, hi) => lo - hi);
    const half = Math.floor(ordered.length / 2);
    if (ordered.length % 2 === 1) {
        return ordered[half];
    }
    return (ordered[half - 1] + ordered[half]) / 2;
};
4664
+ // Calibrate px-per-unit from labeled segments (best effort). Use per-orientation medians so
4665
+ // one bad label doesn't poison all inferred segments.
4666
+ const pxPerUnitAll = median(labeled.map((s) => s.pxLen / s.unitLen));
4667
+ const labeledH = labeled.filter((s) => s.dir === "R" || s.dir === "L");
4668
+ const labeledV = labeled.filter((s) => s.dir === "U" || s.dir === "D");
4669
+ const pxPerUnitH = labeledH.length ? median(labeledH.map((s) => s.pxLen / s.unitLen)) : pxPerUnitAll;
4670
+ const pxPerUnitV = labeledV.length ? median(labeledV.map((s) => s.pxLen / s.unitLen)) : pxPerUnitAll;
4671
+ // Infer missing segment lengths in *units* for unlabeled segments (fallback only).
4672
+ for (const s of allSegs) {
4673
+ if (s.unitLen !== null)
4674
+ continue;
4675
+ const ppu = s.dir === "R" || s.dir === "L" ? pxPerUnitH : pxPerUnitV;
4676
+ const safe = ppu > 0 ? ppu : 1;
4677
+ s.unitLen = s.pxLen / safe;
4678
+ }
4679
+ // Compute area in *unit* coordinates via shoelace on each cycle.
4680
+ const cycleAreasUnits = [];
4681
+ for (let cycleIndex = 0; cycleIndex < cycleSegs.length; cycleIndex++) {
4682
+ const segs = cycleSegs[cycleIndex] ?? [];
4683
+ if (!segs.length) {
4684
+ cycleAreasUnits.push(0);
4685
+ continue;
4686
+ }
4687
+ let ux = 0;
4688
+ let uy = 0;
4689
+ const verts = [{ x: ux, y: uy }];
4690
+ for (const s of segs) {
4691
+ const len = s.unitLen ?? 0;
4692
+ if (s.dir === "R")
4693
+ ux += len;
4694
+ else if (s.dir === "L")
4695
+ ux -= len;
4696
+ else if (s.dir === "D")
4697
+ uy += len;
4698
+ else
4699
+ uy -= len;
4700
+ verts.push({ x: ux, y: uy });
4701
+ }
4702
+ let twiceArea = 0;
4703
+ for (let i = 0; i < verts.length - 1; i++) {
4704
+ twiceArea += verts[i].x * verts[i + 1].y - verts[i + 1].x * verts[i].y;
4705
+ }
4706
+ cycleAreasUnits.push(Math.abs(twiceArea) / 2);
4707
+ }
4708
+ // Subtract holes that are strictly inside the outer boundary bbox (same heuristic as px-space).
4709
+ const outerIdx = cycleData.reduce((bestIdx, c, i) => (c.areaPx2 > cycleData[bestIdx].areaPx2 ? i : bestIdx), 0);
4710
+ const outerBox = cycleData[outerIdx].box;
4711
+ let totalAreaUnits = 0;
4712
+ for (let i = 0; i < cycleData.length; i++) {
4713
+ const c = cycleData[i];
4714
+ const isInsideOuter = i !== outerIdx &&
4715
+ c.box.x0 > outerBox.x0 &&
4716
+ c.box.x1 < outerBox.x1 &&
4717
+ c.box.y0 > outerBox.y0 &&
4718
+ c.box.y1 < outerBox.y1;
4719
+ totalAreaUnits += i === outerIdx ? (cycleAreasUnits[i] ?? 0) : isInsideOuter ? -(cycleAreasUnits[i] ?? 0) : (cycleAreasUnits[i] ?? 0);
4720
+ }
4721
+ const areaUnits = Math.abs(totalAreaUnits);
4722
+ const rounded = Math.round(areaUnits);
4723
+ return {
4724
+ path: filePath,
4725
+ width,
4726
+ height,
4727
+ areaPx,
4728
+ cyclesDetected: cycleData.length,
4729
+ segments: allSegs.length,
4730
+ labeledSegments: labeled.length,
4731
+ labelsDetected: labels.length,
4732
+ pxPerUnit: Number((pxPerUnitAll > 0 ? pxPerUnitAll : 1).toFixed(4)),
4733
+ areaUnits: Number(areaUnits.toFixed(4)),
4734
+ answer: String(rounded),
4735
+ ...(debug
4736
+ ? {
4737
+ labels: labels
4738
+ .map((l) => ({
4739
+ value: l.value,
4740
+ raw: l.raw,
4741
+ cx: Math.round(l.cx),
4742
+ cy: Math.round(l.cy),
4743
+ }))
4744
+ .sort((a, b) => a.cy - b.cy || a.cx - b.cx),
4745
+ outerCycleIndex: outerIdx,
4746
+ outerCycleSegments: (cycleSegs[outerIdx] ?? []).map((s) => ({
4747
+ dir: s.dir,
4748
+ pxLen: s.pxLen,
4749
+ unitLen: s.unitLen,
4750
+ labelRaw: s.labelRaw,
4751
+ })),
4752
+ }
4753
+ : {}),
4754
+ };
4755
+ },
4756
+ },
4757
+ {
4758
+ name: "grade_fraction_quiz_from_image",
4759
+ description: "Grade a fraction quiz shown in an image by OCRing the problems + student answers, computing correct answers, and scoring by problem type. Deterministic, no network.",
4760
+ inputSchema: {
4761
+ type: "object",
4762
+ properties: {
4763
+ path: {
4764
+ type: "string",
4765
+ description: "Path to a local image file (absolute or relative to current working directory).",
4766
+ },
4767
+ bonusPoints: {
4768
+ type: "number",
4769
+ description: "Bonus points added to the final total (default: 0).",
4770
+ default: 0,
4771
+ },
4772
+ pointsAddSubtract: {
4773
+ type: "number",
4774
+ description: "Points for add/subtract fraction problems (default: 5).",
4775
+ default: 5,
4776
+ },
4777
+ pointsMultiplyDivide: {
4778
+ type: "number",
4779
+ description: "Points for multiply/divide fraction problems (default: 10).",
4780
+ default: 10,
4781
+ },
4782
+ pointsImproperFraction: {
4783
+ type: "number",
4784
+ description: "Points for forming an improper fraction (default: 15).",
4785
+ default: 15,
4786
+ },
4787
+ pointsMixedNumber: {
4788
+ type: "number",
4789
+ description: "Points for forming a mixed number (default: 20).",
4790
+ default: 20,
4791
+ },
4792
+ lang: {
4793
+ type: "string",
4794
+ description: "Tesseract language code (default: eng).",
4795
+ default: "eng",
4796
+ },
4797
+ langPath: {
4798
+ type: "string",
4799
+ description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
4800
+ },
4801
+ preprocess: {
4802
+ type: "boolean",
4803
+ description: "If true (default), basic sharp preprocessing is applied before OCR.",
4804
+ default: true,
4805
+ },
4806
+ maxChars: {
4807
+ type: "number",
4808
+ description: "Maximum OCR text characters to consider.",
4809
+ default: 80000,
4810
+ },
4811
+ maxQuestions: {
4812
+ type: "number",
4813
+ description: "Maximum question count to scan for (default: 30).",
4814
+ default: 30,
4815
+ },
4816
+ },
4817
+ required: ["path"],
4818
+ },
4819
+ handler: async (args) => {
4820
+ const filePath = resolveLocalPath(args?.path);
4821
+ if (!existsSync(filePath))
4822
+ throw new Error(`File not found: ${filePath}`);
4823
+ const bonusPoints = clampInt(args?.bonusPoints, 0, -100000, 100000);
4824
+ const pointsAddSubtract = clampInt(args?.pointsAddSubtract, 5, 0, 1000);
4825
+ const pointsMultiplyDivide = clampInt(args?.pointsMultiplyDivide, 10, 0, 1000);
4826
+ const pointsImproperFraction = clampInt(args?.pointsImproperFraction, 15, 0, 1000);
4827
+ const pointsMixedNumber = clampInt(args?.pointsMixedNumber, 20, 0, 1000);
4828
+ const maxQuestions = clampInt(args?.maxQuestions, 30, 1, 200);
4829
+ const lang = String(args?.lang ?? "eng").trim() || "eng";
4830
+ const preprocess = args?.preprocess !== false;
4831
+ const maxChars = clampInt(args?.maxChars, 80000, 5000, 200000);
4832
+ const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
4833
+ const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
4834
+ const langPathEffective = langPathArg
4835
+ ? resolveLocalPath(langPathArg)
4836
+ : existsSync(defaultLangPath)
4837
+ ? defaultLangPath
4838
+ : null;
4839
+ // Preferred path (deterministic + robust): detect row bands from pixels, then OCR each row separately.
4840
+ const sharp = await getSharpOptional();
4841
+ if (sharp && preprocess) {
4842
+ const rowRes = await gradeFractionQuizFromImageRowBands({
4843
+ sharp,
4844
+ filePath,
4845
+ lang,
4846
+ langPathEffective,
4847
+ bonusPoints,
4848
+ pointsAddSubtract,
4849
+ pointsMultiplyDivide,
4850
+ pointsImproperFraction,
4851
+ pointsMixedNumber,
4852
+ });
4853
+ if (rowRes) {
4854
+ return {
4855
+ path: filePath,
4856
+ bonusPoints,
4857
+ extractedQuestionCount: rowRes.extractedQuestionCount,
4858
+ score: rowRes.score,
4859
+ answer: String(rowRes.score),
4860
+ perQuestion: rowRes.perQuestion,
4861
+ ocr: {
4862
+ lang,
4863
+ langPath: langPathEffective,
4864
+ preprocess,
4865
+ usedSharp: true,
4866
+ confidence: null,
4867
+ },
4868
+ };
4869
+ }
4870
+ // Fallback: older whole-page OCR geometry approach.
4871
+ const meta = await sharp(filePath).metadata();
4872
+ const w0 = meta.width ?? 0;
4873
+ const h0 = meta.height ?? 0;
4874
+ if (w0 && h0) {
4875
+ const scale = w0 < 1200 ? 3 : w0 < 2000 ? 2 : 1;
4876
+ const processed = await sharp(filePath)
4877
+ .grayscale()
4878
+ .resize({ width: w0 * scale, height: h0 * scale, kernel: "lanczos3" })
4879
+ .normalize()
4880
+ .threshold(180)
4881
+ .png()
4882
+ .toBuffer();
4883
+ const ocrGeomBase = await ocrRecognizeBuffer({ buffer: processed, lang, langPathEffective });
4884
+ const width = w0 * scale;
4885
+ const height = h0 * scale;
4886
+ const tokens = ocrGeomBase.words
4887
+ .filter((w) => !!w.bbox && !!w.text && w.text.trim())
4888
+ .map((w) => {
4889
+ const bbox = w.bbox;
4890
+ return {
4891
+ text: w.text.trim(),
4892
+ bbox,
4893
+ cx: (bbox.x0 + bbox.x1) / 2,
4894
+ cy: (bbox.y0 + bbox.y1) / 2,
4895
+ };
4896
+ });
4897
+ const leftMaxX = width * 0.08;
4898
+ const answerRegionX0 = width * 0.32;
4899
+ const candidates = tokens
4900
+ .map((t) => ({ t, v: toIntegerOrNull(t.text) }))
4901
+ .filter((x) => typeof x.v === "number" && Number.isFinite(x.v))
4902
+ .filter((x) => x.v >= 1 && x.v <= maxQuestions)
4903
+ .filter((x) => x.t.bbox.x1 <= leftMaxX)
4904
+ .sort((a, b) => a.t.cy - b.t.cy);
4905
+ const candidates10 = candidates.filter((c) => c.v <= 10);
4906
+ const anchorsRaw = candidates10.length >= 6 ? candidates10 : candidates;
4907
+ const anchors = [];
4908
+ const yTol = Math.max(12, Math.round(height * 0.01));
4909
+ for (const a of anchorsRaw) {
4910
+ const last = anchors[anchors.length - 1];
4911
+ if (last && Math.abs(a.t.cy - last.cy) <= yTol)
4912
+ continue;
4913
+ anchors.push({ index: a.v, cy: a.t.cy });
4914
+ }
4915
+ anchors.sort((a, b) => a.cy - b.cy);
4916
+ if (anchors.length >= 2) {
4917
+ const bandTop = (i) => (i === 0 ? 0 : (anchors[i - 1].cy + anchors[i].cy) / 2);
4918
+ const bandBottom = (i) => i === anchors.length - 1 ? height : (anchors[i].cy + anchors[i + 1].cy) / 2;
4919
+ const perQuestion = [];
4920
+ let total = 0;
4921
+ for (let i = 0; i < anchors.length; i++) {
4922
+ const idx = anchors[i].index;
4923
+ const y0 = bandTop(i);
4924
+ const y1 = bandBottom(i);
4925
+ const row = tokens.filter((t) => t.cy >= y0 && t.cy < y1);
4926
+ if (!row.length)
4927
+ continue;
4928
+ const answerTokens = row
4929
+ .filter((t) => t.bbox.x0 >= answerRegionX0)
4930
+ .sort((a, b) => a.bbox.x0 - b.bbox.x0);
4931
+ const answerText = answerTokens.map((t) => t.text).join(" ").trim();
4932
+ const studentMixed = parseMixedNumberLoose(answerText);
4933
+ const studentFrac = parseFractionLoose(answerText);
4934
+ const left = row.filter((t) => t.bbox.x1 < answerRegionX0);
4935
+ const numeric = left
4936
+ .map((t) => ({ t, v: toIntegerOrNull(t.text) }))
4937
+ .filter((x) => typeof x.v === "number" && Number.isFinite(x.v))
4938
+ .filter((x) => !(x.v === idx && x.t.bbox.x1 <= leftMaxX));
4939
+ const heights = numeric
4940
+ .map((x) => x.t.bbox.y1 - x.t.bbox.y0)
4941
+ .filter((n) => Number.isFinite(n) && n > 0)
4942
+ .sort((a, b) => a - b);
4943
+ const medianH = heights.length ? heights[Math.floor(heights.length / 2)] : 20;
4944
+ const maxDy = Math.max(20, Math.round(medianH * 1.8));
4945
+ const xTol = Math.max(18, Math.round(width * 0.015));
4946
+ const cols = [];
4947
+ for (const it of numeric) {
4948
+ const cx = it.t.cx;
4949
+ const col = cols.find((c) => Math.abs(c.x - cx) <= xTol);
4950
+ if (col)
4951
+ col.items.push({ v: it.v, bbox: it.t.bbox, tok: it.t });
4952
+ else
4953
+ cols.push({ x: cx, items: [{ v: it.v, bbox: it.t.bbox, tok: it.t }] });
4954
+ }
4955
+ const used = new Set();
4956
+ const fracPairs = [];
4957
+ for (const col of cols) {
4958
+ const items = col.items.sort((a, b) => a.bbox.y0 - b.bbox.y0);
4959
+ for (let j = 0; j < items.length - 1; j++) {
4960
+ const top = items[j];
4961
+ const bot = items[j + 1];
4962
+ const dy = bot.bbox.y0 - top.bbox.y1;
4963
+ if (dy < -2 || dy > maxDy)
4964
+ continue;
4965
+ if (bot.v === 0)
4966
+ continue;
4967
+ fracPairs.push({ x: col.x, frac: normalizeFraction({ n: top.v, d: bot.v }) });
4968
+ used.add(top.tok);
4969
+ used.add(bot.tok);
4970
+ }
4971
+ }
4972
+ fracPairs.sort((a, b) => a.x - b.x);
4973
+ const fracs = fracPairs.length > 2 ? fracPairs.slice(0, 2) : fracPairs;
4974
+ const opCandidates = left
4975
+ .map((t) => ({ t, s: t.text.trim() }))
4976
+ .filter((x) => x.s.length && x.s.length <= 2)
4977
+ .map((x) => ({ ...x, ch: x.s.replace(/[^\+\-\*xX/]/g, "")[0] ?? "" }))
4978
+ .filter((x) => !!x.ch);
4979
+ const pickOp = (aX, bX) => {
4980
+ const mid = (aX + bX) / 2;
4981
+ const between = opCandidates
4982
+ .filter((o) => o.t.cx >= Math.min(aX, bX) && o.t.cx <= Math.max(aX, bX))
4983
+ .sort((p, q) => Math.abs(p.t.cx - mid) - Math.abs(q.t.cx - mid));
4984
+ return between.length ? between[0].ch : "+";
4985
+ };
4986
+ if (fracs.length >= 2) {
4987
+ const a = fracs[0].frac;
4988
+ const bFrac = fracs[1].frac;
4989
+ const student = studentFrac;
4990
+ if (!student)
4991
+ continue;
4992
+ const opChar = pickOp(fracs[0].x, fracs[1].x);
4993
+ let correct;
4994
+ let kind;
4995
+ if (opChar === "+" || opChar === "-") {
4996
+ correct = opChar === "+" ? addFractions(a, bFrac) : subFractions(a, bFrac);
4997
+ kind = "add_subtract";
4998
+ }
4999
+ else if (opChar === "x" || opChar === "X" || opChar === "*") {
5000
+ correct = mulFractions(a, bFrac);
5001
+ kind = "multiply_divide";
5002
+ }
5003
+ else {
5004
+ correct = divFractions(a, bFrac);
5005
+ kind = "multiply_divide";
5006
+ }
5007
+ const ok = fractionsEqual(student, correct);
5008
+ const pts = ok ? (kind === "add_subtract" ? pointsAddSubtract : pointsMultiplyDivide) : 0;
5009
+ total += pts;
5010
+ perQuestion.push({
5011
+ index: idx,
5012
+ type: kind,
5013
+ correct: ok,
5014
+ points: pts,
5015
+ studentAnswer: fractionToString(student),
5016
+ correctAnswer: fractionToString(correct),
5017
+ });
5018
+ continue;
5019
+ }
5020
+ if (fracs.length === 1) {
5021
+ const given = fracs[0].frac;
5022
+ const wholeCandidates = numeric
5023
+ .filter((n) => !used.has(n.t))
5024
+ .map((n) => ({ v: n.v, x: n.t.bbox.x0 }))
5025
+ .filter((n) => n.x > leftMaxX && n.x < fracs[0].x - xTol * 0.5);
5026
+ const whole = wholeCandidates.length ? wholeCandidates[wholeCandidates.length - 1].v : null;
5027
+ if (whole !== null) {
5028
+ const student = studentFrac;
5029
+ if (!student)
5030
+ continue;
5031
+ const correct = normalizeFraction({ n: whole * given.d + Math.abs(given.n), d: given.d });
5032
+ const ok = fractionsEqual(student, correct);
5033
+ const pts = ok ? pointsImproperFraction : 0;
5034
+ total += pts;
5035
+ perQuestion.push({
5036
+ index: idx,
5037
+ type: "improper_fraction",
5038
+ correct: ok,
5039
+ points: pts,
5040
+ studentAnswer: fractionToString(student),
5041
+ correctAnswer: fractionToString(correct),
5042
+ });
5043
+ continue;
5044
+ }
5045
+ const wholeOut = Math.trunc(given.n / given.d);
5046
+ const rem = Math.abs(given.n % given.d);
5047
+ const correctFrac = normalizeFraction({ n: rem, d: given.d });
5048
+ const ok = studentMixed !== null &&
5049
+ studentMixed.whole === wholeOut &&
5050
+ fractionsEqual(studentMixed.frac, correctFrac);
5051
+ const pts = ok ? pointsMixedNumber : 0;
5052
+ total += pts;
5053
+ perQuestion.push({
5054
+ index: idx,
5055
+ type: "mixed_number",
5056
+ correct: ok,
5057
+ points: pts,
5058
+ studentAnswer: studentMixed ? mixedNumberToString(studentMixed.whole, studentMixed.frac) : null,
5059
+ correctAnswer: mixedNumberToString(wholeOut, correctFrac),
5060
+ });
5061
+ }
5062
+ }
5063
+ total += bonusPoints;
5064
+ // If we graded at least one question, return the geometry-based score.
5065
+ if (perQuestion.length) {
5066
+ return {
5067
+ path: filePath,
5068
+ bonusPoints,
5069
+ extractedQuestionCount: perQuestion.length,
5070
+ score: total,
5071
+ answer: String(total),
5072
+ perQuestion,
5073
+ ocr: {
5074
+ lang,
5075
+ langPath: langPathEffective,
5076
+ preprocess,
5077
+ usedSharp: true,
5078
+ confidence: ocrGeomBase.confidence,
5079
+ },
5080
+ };
5081
+ }
5082
+ }
5083
+ }
5084
+ }
5085
+ const ocr = await ocrRecognizeImageFile({ filePath, lang, langPathEffective, preprocess });
5086
+ const ocrText = ocr.text.slice(0, maxChars);
5087
+ const rawLines = ocr.lines.map((l) => l.text).filter((t) => t && t.trim());
5088
+ const lines = rawLines.length
5089
+ ? rawLines
5090
+ : ocrText
5091
+ .split(/\r?\n/)
5092
+ .map((l) => l.trim())
5093
+ .filter(Boolean);
5094
+ const blocks = [];
5095
+ for (let i = 0; i < lines.length; i++) {
5096
+ const m = lines[i].match(/^\s*(\d{1,3})\b\s*(.*)$/);
5097
+ if (!m)
5098
+ continue;
5099
+ const idx = Number.parseInt(m[1], 10);
5100
+ if (!Number.isFinite(idx) || idx <= 0 || idx > maxQuestions)
5101
+ continue;
5102
+ let text = (m[2] ?? "").trim();
5103
+ // Include subsequent non-index lines as part of the same question.
5104
+ for (let j = i + 1; j < lines.length; j++) {
5105
+ if (/^\s*\d{1,3}\b/.test(lines[j]))
5106
+ break;
5107
+ if (lines[j].trim())
5108
+ text += ` ${lines[j].trim()}`;
5109
+ i = j;
5110
+ }
5111
+ blocks.push({ index: idx, text: text.trim() });
5112
+ }
5113
+ // If we couldn't detect blocks, fall back to the whole OCR text as a single block.
5114
+ if (!blocks.length)
5115
+ blocks.push({ index: 1, text: ocrText });
5116
+ blocks.sort((a, b) => a.index - b.index);
5117
+ const perQuestion = [];
5118
+ let total = 0;
5119
+ for (const b of blocks) {
5120
+ const t = b.text;
5121
+ const lower = t.toLowerCase();
5122
+ // Mixed number conversion: Turn a/b into a mixed number
5123
+ if (lower.includes("turn") && lower.includes("mixed")) {
5124
+ const given = parseFractionLoose(t);
5125
+ const student = parseMixedNumberLoose(t);
5126
+ if (!given)
5127
+ continue;
5128
+ const whole = Math.trunc(given.n / given.d);
5129
+ const rem = Math.abs(given.n % given.d);
5130
+ const correctFrac = normalizeFraction({ n: rem, d: given.d });
5131
+ const correctAnswer = mixedNumberToString(whole, correctFrac);
5132
+ const ok = student !== null &&
5133
+ student.whole === whole &&
5134
+ fractionsEqual(student.frac, correctFrac);
5135
+ const pts = ok ? pointsMixedNumber : 0;
5136
+ total += pts;
5137
+ perQuestion.push({
5138
+ index: b.index,
5139
+ type: "mixed_number",
5140
+ correct: ok,
5141
+ points: pts,
5142
+ studentAnswer: student ? mixedNumberToString(student.whole, student.frac) : null,
5143
+ correctAnswer,
5144
+ });
5145
+ continue;
5146
+ }
5147
+ // Improper fraction conversion: Turn W N/D into an improper fraction
5148
+ if (lower.includes("turn") && lower.includes("improper")) {
5149
+ const given = parseMixedNumberLoose(t);
5150
+ const allFracMatches = Array.from(t.matchAll(/-?\d+\s*\/\s*\d+/g)).map((m) => m[0]);
5151
+ const studentRaw = allFracMatches.length ? allFracMatches[allFracMatches.length - 1] : "";
5152
+ const student = parseFractionLoose(studentRaw);
5153
+ if (!given)
5154
+ continue;
5155
+ const sgn = given.whole < 0 ? -1 : 1;
5156
+ const correct = normalizeFraction({
5157
+ n: given.whole * given.frac.d + sgn * Math.abs(given.frac.n),
5158
+ d: given.frac.d,
5159
+ });
5160
+ const ok = student !== null && fractionsEqual(student, correct);
5161
+ const pts = ok ? pointsImproperFraction : 0;
5162
+ total += pts;
5163
+ perQuestion.push({
5164
+ index: b.index,
5165
+ type: "improper_fraction",
5166
+ correct: ok,
5167
+ points: pts,
5168
+ studentAnswer: student ? fractionToString(student) : null,
5169
+ correctAnswer: fractionToString(correct),
5170
+ });
5171
+ continue;
5172
+ }
5173
+ // Operation with two fractions + a student answer fraction.
5174
+ const matches = Array.from(t.matchAll(/-?\d+\s*\/\s*\d+/g)).map((m) => m[0]);
5175
+ const parsed = matches.map((m) => parseFractionLoose(m)).filter((f) => !!f);
5176
+ if (parsed.length < 3)
5177
+ continue;
5178
+ const a = parsed[0];
5179
+ const bFrac = parsed[1];
5180
+ const student = parsed[parsed.length - 1];
5181
+ // Prefer operator between first and second fraction if possible.
5182
+ const opM = t.match(/-?\d+\s*\/\s*\d+\s*([+\-x×*÷/])\s*-?\d+\s*\/\s*\d+/i);
5183
+ const op = opM ? opM[1] : "+";
5184
+ let correct;
5185
+ let kind;
5186
+ if (op === "+" || op === "-") {
5187
+ correct = op === "+" ? addFractions(a, bFrac) : subFractions(a, bFrac);
5188
+ kind = "add_subtract";
5189
+ }
5190
+ else if (op === "x" || op === "×" || op === "*") {
5191
+ correct = mulFractions(a, bFrac);
5192
+ kind = "multiply_divide";
5193
+ }
5194
+ else {
5195
+ correct = divFractions(a, bFrac);
5196
+ kind = "multiply_divide";
5197
+ }
5198
+ const ok = fractionsEqual(student, correct);
5199
+ const pts = ok ? (kind === "add_subtract" ? pointsAddSubtract : pointsMultiplyDivide) : 0;
5200
+ total += pts;
5201
+ perQuestion.push({
5202
+ index: b.index,
5203
+ type: kind,
5204
+ correct: ok,
5205
+ points: pts,
5206
+ studentAnswer: fractionToString(student),
5207
+ correctAnswer: fractionToString(correct),
5208
+ });
5209
+ }
5210
+ total += bonusPoints;
5211
+ return {
5212
+ path: filePath,
5213
+ bonusPoints,
5214
+ extractedQuestionCount: perQuestion.length,
5215
+ score: total,
5216
+ answer: String(total),
5217
+ perQuestion,
5218
+ ocr: {
5219
+ lang,
5220
+ langPath: langPathEffective,
5221
+ preprocess,
5222
+ usedSharp: ocr.usedSharp,
5223
+ confidence: ocr.confidence,
5224
+ },
5225
+ };
5226
+ },
5227
+ },
5228
+ {
5229
+ name: "extract_fractions_and_simplify_from_image",
5230
+ description: "Extract slash-style fractions (e.g. 3/4) from body text in an image and also detect stacked numerator/denominator fractions in a worksheet-style region, returning the simplified answers. Deterministic, no network.",
5231
+ inputSchema: {
5232
+ type: "object",
5233
+ properties: {
5234
+ path: {
5235
+ type: "string",
5236
+ description: "Path to a local image file (absolute or relative to current working directory).",
5237
+ },
5238
+ lang: {
5239
+ type: "string",
5240
+ description: "Tesseract language code (default: eng).",
5241
+ default: "eng",
5242
+ },
5243
+ langPath: {
5244
+ type: "string",
5245
+ description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
5246
+ },
5247
+ preprocess: {
5248
+ type: "boolean",
5249
+ description: "If true (default), basic sharp preprocessing is applied before OCR.",
5250
+ default: true,
5251
+ },
5252
+ bodyBottomFrac: {
5253
+ type: "number",
5254
+ description: "Body cutoff as fraction of image height (words below are treated as worksheet region). Default: 0.7.",
5255
+ default: 0.7,
5256
+ },
5257
+ maxChars: {
5258
+ type: "number",
5259
+ description: "Maximum OCR text characters to consider.",
5260
+ default: 120000,
5261
+ },
5262
+ },
5263
+ required: ["path"],
5264
+ },
5265
+ handler: async (args) => {
5266
+ const filePath = resolveLocalPath(args?.path);
5267
+ if (!existsSync(filePath))
5268
+ throw new Error(`File not found: ${filePath}`);
5269
+ const sharp = await getSharpOptional();
5270
+ if (!sharp) {
5271
+ throw new Error("Missing optional dependency: sharp. Install it to use image fraction extraction.");
5272
+ }
5273
+ const meta = await sharp(filePath).metadata();
5274
+ const imgH = meta.height ?? 0;
5275
+ if (!imgH)
5276
+ throw new Error("Unable to read image height");
5277
+ const lang = String(args?.lang ?? "eng").trim() || "eng";
5278
+ const preprocess = args?.preprocess !== false;
5279
+ const maxChars = clampInt(args?.maxChars, 120000, 5000, 200000);
5280
+ const bodyBottomFrac = clampNumber(Number(args?.bodyBottomFrac ?? 0.7), 0.3, 0.95);
5281
+ const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
5282
+ const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
5283
+ const langPathEffective = langPathArg
5284
+ ? resolveLocalPath(langPathArg)
5285
+ : existsSync(defaultLangPath)
5286
+ ? defaultLangPath
5287
+ : null;
5288
+ const ocr = await ocrRecognizeImageFile({ filePath, lang, langPathEffective, preprocess });
5289
+ const bodyYMax = imgH * bodyBottomFrac;
5290
+ // Use OCR text (not per-word) since OCR may split "3/4" into multiple tokens (e.g. "3", "/", "4").
5291
+ const ocrText = ocr.text.slice(0, maxChars);
5292
+ const bodyFractions = Array.from(ocrText.matchAll(/(\d+)\s*\/\s*(\d+)/g)).map((m) => `${m[1]}/${m[2]}`);
5293
+ // Detect stacked fractions (numerator over denominator) below cutoff by looking for fraction bars,
5294
+ // then OCR numerator and denominator from sub-crops. This is more robust than relying on whole-page
5295
+ // OCR word boxes, which often miss one half of stacked fractions.
5296
+ const imgW = meta.width ?? 0;
5297
+ if (!imgW)
5298
+ throw new Error("Unable to read image width");
5299
+ const scale = imgW < 1200 ? 3 : imgW < 2000 ? 2 : 1;
5300
+ const baseW = imgW * scale;
5301
+ const baseH = imgH * scale;
5302
+ const base = await sharp(filePath)
5303
+ .grayscale()
5304
+ .resize({ width: baseW, height: baseH, kernel: "lanczos3" })
5305
+ .normalize()
5306
+ .png()
5307
+ .toBuffer();
5308
+ const extractStackedFromCrop = async (cutoffFrac) => {
5309
+ const cropTop = clampInt(Math.floor(imgH * cutoffFrac) * scale, 0, 0, baseH - 1);
5310
+ const cropH = Math.max(1, baseH - cropTop);
5311
+ const worksheetGray = await sharp(base)
5312
+ .extract({ left: 0, top: cropTop, width: baseW, height: cropH })
5313
+ .png()
5314
+ .toBuffer();
5315
+ const { data: wsBw, info: wsInfo } = await sharp(worksheetGray)
5316
+ .grayscale()
5317
+ .threshold(210)
5318
+ .raw()
5319
+ .toBuffer({ resolveWithObject: true });
5320
+ // Worksheet stacked-fraction bars are much shorter than the page width; keep minRun small and
5321
+ // filter by ink above/below to avoid picking input-box borders.
5322
+ const minBarRun = Math.max(12, Math.round(wsInfo.width * 0.004));
5323
+ const bars = detectThinHorizontalBarsFromBw(wsBw, wsInfo.width, wsInfo.height, {
5324
+ minRun: minBarRun,
5325
+ maxThickness: Math.max(2, Math.round(wsInfo.height * 0.08)),
5326
+ });
5327
+ const stacked = [];
5328
+ for (const bar of bars) {
5329
+ // Skip far-right bars; in this worksheet layout, fraction bars live left of the input boxes.
5330
+ if (bar.cx > wsInfo.width * 0.6)
5331
+ continue;
5332
+ // Avoid long horizontal lines like answer-box borders.
5333
+ if (bar.len > wsInfo.width * 0.28)
5334
+ continue;
5335
+ // Require ink above AND below the bar in the same X band (numerator/denominator).
5336
+ const padXInk = Math.max(2, Math.round(bar.len * 0.25));
5337
+ const xInk0 = clampInt(bar.x0 - padXInk, 0, 0, wsInfo.width - 1);
5338
+ const xInk1 = clampInt(bar.x1 + padXInk, wsInfo.width - 1, 0, wsInfo.width - 1);
5339
+ const aboveH = Math.max(10, Math.round(wsInfo.height * 0.12));
5340
+ const belowH = aboveH;
5341
+ const aboveY0 = clampInt(bar.y0 - aboveH, 0, 0, wsInfo.height - 1);
5342
+ const aboveY1 = clampInt(bar.y0 - 1, wsInfo.height - 1, 0, wsInfo.height - 1);
5343
+ const belowY0 = clampInt(bar.y1 + 1, wsInfo.height - 1, 0, wsInfo.height - 1);
5344
+ const belowY1 = clampInt(bar.y1 + belowH, wsInfo.height - 1, 0, wsInfo.height - 1);
5345
+ const minInk = Math.max(5, Math.round(bar.len * 0.05));
5346
+ if (!rectHasInk(wsBw, wsInfo.width, wsInfo.height, { x0: xInk0, y0: aboveY0, x1: xInk1, y1: aboveY1 }, minInk) ||
5347
+ !rectHasInk(wsBw, wsInfo.width, wsInfo.height, { x0: xInk0, y0: belowY0, x1: xInk1, y1: belowY1 }, minInk)) {
5348
+ continue;
5349
+ }
5350
+ const padX = Math.max(2, Math.round(bar.len * 0.25));
5351
+ const padY = Math.max(1, Math.round(wsInfo.height * 0.01));
5352
+ const boxH = clampInt(Math.round(bar.len * 1.2), Math.round(wsInfo.height * 0.14), 14, Math.max(14, Math.round(wsInfo.height * 0.35)));
5353
+ const cropX0 = clampInt(bar.x0 - padX, 0, 0, wsInfo.width - 1);
5354
+ const cropX1 = clampInt(bar.x1 + padX, wsInfo.width - 1, 0, wsInfo.width - 1);
5355
+ const cropW = cropX1 - cropX0 + 1;
5356
+ const numY1 = clampInt(bar.y0 - padY, wsInfo.height, 0, wsInfo.height);
5357
+ const numY0 = clampInt(numY1 - boxH, 0, 0, numY1);
5358
+ const numH = numY1 - numY0;
5359
+ const denY0 = clampInt(bar.y1 + padY, wsInfo.height - 1, 0, wsInfo.height - 1);
5360
+ const denY1 = clampInt(denY0 + boxH, wsInfo.height, denY0, wsInfo.height);
5361
+ const denH = denY1 - denY0;
5362
+ if (numH < 6 || denH < 6)
5363
+ continue;
5364
+ const n = await ocrIntegerFromImageRegion({
5365
+ sharp,
5366
+ source: worksheetGray,
5367
+ left: cropX0,
5368
+ top: numY0,
5369
+ width: cropW,
5370
+ height: numH,
5371
+ threshold: 200,
5372
+ thresholds: [150, 170, 190, 210],
5373
+ lang,
5374
+ langPathEffective,
5375
+ psms: [11, 7],
5376
+ minValue: 0,
5377
+ maxValue: 10000,
5378
+ });
5379
+ const d = await ocrIntegerFromImageRegion({
5380
+ sharp,
5381
+ source: worksheetGray,
5382
+ left: cropX0,
5383
+ top: denY0,
5384
+ width: cropW,
5385
+ height: denH,
5386
+ threshold: 200,
5387
+ thresholds: [150, 170, 190, 210],
5388
+ lang,
5389
+ langPathEffective,
5390
+ psms: [7, 6],
5391
+ minValue: 1,
5392
+ maxValue: 10000,
5393
+ });
5394
+ if (n === null || d === null || d === 0)
5395
+ continue;
5396
+ try {
5397
+ stacked.push({ y: bar.cy, x: bar.cx, frac: normalizeFraction({ n, d }) });
5398
+ }
5399
+ catch {
5400
+ // ignore
5401
+ }
5402
+ }
5403
+ stacked.sort((a, b) => a.y - b.y || a.x - b.x);
5404
+ const simplified = [];
5405
+ const dedupeTolY = Math.max(6, Math.round(wsInfo.height * 0.015));
5406
+ let lastY = -1e9;
5407
+ for (const s of stacked) {
5408
+ if (Math.abs(s.y - lastY) < dedupeTolY)
5409
+ continue;
5410
+ simplified.push(fractionToString(s.frac));
5411
+ lastY = s.y;
5412
+ }
5413
+ return {
5414
+ cutoffFrac,
5415
+ cropTop,
5416
+ barCount: bars.length,
5417
+ stackedCandidateCount: stacked.length,
5418
+ simplified,
5419
+ };
5420
+ };
5421
+ // GAIA image layouts vary; callers often provide a body cutoff, but it can be too low/high.
5422
+ // Start with the requested cutoff, then try a few additional cutoffs if we didn't recover
5423
+ // enough stacked fractions.
5424
+ const cutoffCandidatesRaw = [
5425
+ bodyBottomFrac,
5426
+ bodyBottomFrac - 0.1,
5427
+ bodyBottomFrac - 0.2,
5428
+ 0.7,
5429
+ 0.65,
5430
+ 0.6,
5431
+ 0.55,
5432
+ 0.5,
5433
+ 0.45,
5434
+ 0.4,
5435
+ 0.35,
5436
+ ]
5437
+ .map((n) => clampNumber(n, 0.3, 0.95))
5438
+ .map((n) => Number(n.toFixed(3)));
5439
+ const seen = new Set();
5440
+ const cutoffCandidates = [];
5441
+ for (const c of cutoffCandidatesRaw) {
5442
+ if (seen.has(c))
5443
+ continue;
5444
+ seen.add(c);
5445
+ cutoffCandidates.push(c);
5446
+ }
5447
+ let best = await extractStackedFromCrop(cutoffCandidates[0] ?? bodyBottomFrac);
5448
+ if (best.simplified.length < 5) {
5449
+ for (const c of cutoffCandidates.slice(1)) {
5450
+ const cand = await extractStackedFromCrop(c);
5451
+ if (cand.simplified.length > best.simplified.length)
5452
+ best = cand;
5453
+ // Early stop: once we hit a "healthy" count, avoid extra OCR passes.
5454
+ if (best.simplified.length >= 7)
5455
+ break;
5456
+ }
5457
+ }
5458
+ const simplified = best.simplified;
5459
+ const all = [...bodyFractions, ...simplified];
5460
+ const answer = all.join(",");
5461
+ return {
5462
+ path: filePath,
5463
+ bodyBottomFrac,
5464
+ worksheetBottomFracUsed: best.cutoffFrac,
5465
+ bodyFractionCount: bodyFractions.length,
5466
+ worksheetFractionCount: simplified.length,
5467
+ answer,
5468
+ fractions: all,
5469
+ debug: {
5470
+ barCount: best.barCount,
5471
+ stackedCandidateCount: best.stackedCandidateCount,
5472
+ cutoffsTried: cutoffCandidates,
5473
+ },
5474
+ ocr: {
5475
+ lang,
5476
+ langPath: langPathEffective,
5477
+ preprocess,
5478
+ usedSharp: ocr.usedSharp,
5479
+ confidence: ocr.confidence,
5480
+ maxChars,
5481
+ truncated: ocr.text.length > maxChars,
5482
+ },
5483
+ };
5484
+ },
5485
+ },
5486
+ {
5487
+ name: "solve_bass_clef_age_from_image",
5488
+ description: "Extract bass-clef note letters from a simple staff image and compute the derived 'age' for time-words like DECADE/CENTURY. Deterministic, no network.",
5489
+ inputSchema: {
5490
+ type: "object",
5491
+ properties: {
5492
+ path: {
5493
+ type: "string",
5494
+ description: "Path to a local image file (absolute or relative to current working directory).",
5495
+ },
5496
+ maxPixels: {
5497
+ type: "number",
5498
+ description: "Safety cap on pixels to process (default: 1,000,000).",
5499
+ default: 1000000,
5500
+ },
5501
+ threshold: {
5502
+ type: "number",
5503
+ description: "Binarization threshold (0-255). Default 160.",
5504
+ default: 160,
5505
+ },
5506
+ },
5507
+ required: ["path"],
5508
+ },
5509
+ handler: async (args) => {
5510
+ const filePath = resolveLocalPath(args?.path);
5511
+ if (!existsSync(filePath))
5512
+ throw new Error(`File not found: ${filePath}`);
5513
+ const sharp = await getSharpOptional();
5514
+ if (!sharp)
5515
+ throw new Error("Missing optional dependency: sharp. Install it to use music staff parsing.");
5516
+ const maxPixels = clampInt(args?.maxPixels, 1000000, 10000, 100_000_000);
5517
+ const threshold = clampInt(args?.threshold, 160, 1, 254);
5518
+ const meta = await sharp(filePath).metadata();
5519
+ const w = meta.width ?? 0;
5520
+ const h = meta.height ?? 0;
5521
+ if (!w || !h)
5522
+ throw new Error("Unable to read image dimensions");
5523
+ // These staff images can be tiny (e.g. ~300px wide). Upscale to make line/note detection robust.
5524
+ let scale = 1;
5525
+ if (w < 600 || h < 140) {
5526
+ scale = Math.max(2, Math.min(10, Math.ceil(1800 / w)));
5527
+ }
5528
+ while (w * scale * h * scale > maxPixels && scale > 1)
5529
+ scale--;
5530
+ if (w * scale * h * scale > maxPixels) {
5531
+ throw new Error(`Refusing huge image (${w}x${h}) even after scaling checks (scale=${scale}) (maxPixels=${maxPixels})`);
5532
+ }
5533
+ let pipeline = sharp(filePath);
5534
+ if (scale > 1) {
5535
+ pipeline = pipeline.resize({
5536
+ width: w * scale,
5537
+ height: h * scale,
5538
+ kernel: "nearest",
5539
+ });
5540
+ }
5541
+ const { data, info } = await pipeline
5542
+ .grayscale()
5543
+ .threshold(threshold)
5544
+ .raw()
5545
+ .toBuffer({ resolveWithObject: true });
5546
+ const width = info.width;
5547
+ const height = info.height;
5548
+ const idxOf = (x, y) => y * width + x;
5549
+ const isBlack = (x, y) => data[idxOf(x, y)] < 128;
5550
+ // Horizontal projection to find staff lines.
5551
+ const rowCounts = new Array(height).fill(0);
5552
+ let maxRow = 0;
5553
+ for (let y = 0; y < height; y++) {
5554
+ let c = 0;
5555
+ for (let x = 0; x < width; x++)
5556
+ if (isBlack(x, y))
5557
+ c++;
5558
+ rowCounts[y] = c;
5559
+ if (c > maxRow)
5560
+ maxRow = c;
5561
+ }
5562
+ const lineThresh = Math.max(5, Math.floor(maxRow * 0.55));
5563
+ const lineYs = [];
5564
+ for (let y = 0; y < height; y++) {
5565
+ if (rowCounts[y] < lineThresh)
5566
+ continue;
5567
+ let y2 = y;
5568
+ while (y2 + 1 < height && rowCounts[y2 + 1] >= lineThresh)
5569
+ y2++;
5570
+ lineYs.push((y + y2) / 2);
5571
+ y = y2;
5572
+ }
5573
+ if (lineYs.length < 5)
5574
+ throw new Error(`Failed to detect 5 staff lines (found ${lineYs.length})`);
5575
+ lineYs.sort((a, b) => a - b);
5576
+ // Choose the best contiguous group of 5 lines by spacing consistency.
5577
+ let staff = lineYs.slice(0, 5);
5578
+ if (lineYs.length > 5) {
5579
+ let bestScore = Number.POSITIVE_INFINITY;
5580
+ let best = null;
5581
+ for (let i = 0; i + 4 < lineYs.length; i++) {
5582
+ const cand = lineYs.slice(i, i + 5);
5583
+ const spacings = cand.slice(1).map((y, j) => y - cand[j]);
5584
+ const avg = spacings.reduce((s, n) => s + n, 0) / spacings.length;
5585
+ const variance = spacings.reduce((s, n) => s + (n - avg) * (n - avg), 0) / spacings.length;
5586
+ // Penalize implausible spacing (too tight or too large).
5587
+ const spacingPenalty = avg < 2 ? 1e6 : avg > height / 4 ? 1e6 : 0;
5588
+ const score = variance + spacingPenalty;
5589
+ if (score < bestScore) {
5590
+ bestScore = score;
5591
+ best = cand;
5592
+ }
5593
+ }
5594
+ if (best)
5595
+ staff = best;
5596
+ }
5597
+ const spacings = staff.slice(1).map((y, i) => y - staff[i]);
5598
+ const lineSpacing = spacings.length ? spacings.reduce((s, n) => s + n, 0) / spacings.length : 1;
5599
+ // Remove staff lines into a mutable buffer. Keep pixels that belong to note heads
5600
+ // (detected by having black pixels both above or below the line).
5601
+ const buf = Buffer.from(data);
5602
+ const band = Math.max(1, Math.round(lineSpacing * 0.12));
5603
+ const probe = Math.max(1, Math.round(lineSpacing * 0.35));
5604
+ for (const ly of staff) {
5605
+ const yCenter = Math.round(ly);
5606
+ const y0 = Math.max(0, yCenter - band);
5607
+ const y1 = Math.min(height - 1, yCenter + band);
5608
+ for (let yy = y0; yy <= y1; yy++) {
5609
+ for (let x = 0; x < width; x++) {
5610
+ const idx = idxOf(x, yy);
5611
+ if (buf[idx] >= 128)
5612
+ continue;
5613
+ const aboveY = yy - probe;
5614
+ const belowY = yy + probe;
5615
+ const above = aboveY >= 0 ? buf[idxOf(x, aboveY)] : 255;
5616
+ const below = belowY < height ? buf[idxOf(x, belowY)] : 255;
5617
+ if (above >= 128 && below >= 128) {
5618
+ buf[idx] = 255;
5619
+ }
5620
+ }
5621
+ }
5622
+ }
5623
+ const visited = new Uint8Array(width * height);
5624
+ const inBounds = (x, y) => x >= 0 && x < width && y >= 0 && y < height;
5625
+ const neighbors = [
5626
+ [1, 0],
5627
+ [-1, 0],
5628
+ [0, 1],
5629
+ [0, -1],
5630
+ [1, 1],
5631
+ [-1, -1],
5632
+ [1, -1],
5633
+ [-1, 1],
5634
+ ];
5635
+ const components = [];
5636
+ for (let y = 0; y < height; y++) {
5637
+ for (let x = 0; x < width; x++) {
5638
+ const startIdx = idxOf(x, y);
5639
+ if (visited[startIdx])
5640
+ continue;
5641
+ visited[startIdx] = 1;
5642
+ if (buf[startIdx] >= 128)
5643
+ continue;
5644
+ let area = 0;
5645
+ let sx = 0;
5646
+ let sy = 0;
5647
+ let x0 = x, x1 = x, y0 = y, y1 = y;
5648
+ const qx = [x];
5649
+ const qy = [y];
5650
+ for (let qi = 0; qi < qx.length; qi++) {
5651
+ const px = qx[qi];
5652
+ const py = qy[qi];
5653
+ const pidx = idxOf(px, py);
5654
+ if (buf[pidx] >= 128)
5655
+ continue;
5656
+ area++;
5657
+ sx += px;
5658
+ sy += py;
5659
+ if (px < x0)
5660
+ x0 = px;
5661
+ if (px > x1)
5662
+ x1 = px;
5663
+ if (py < y0)
5664
+ y0 = py;
5665
+ if (py > y1)
5666
+ y1 = py;
5667
+ for (const [dx, dy] of neighbors) {
5668
+ const nx = px + dx;
5669
+ const ny = py + dy;
5670
+ if (!inBounds(nx, ny))
5671
+ continue;
5672
+ const nidx = idxOf(nx, ny);
5673
+ if (visited[nidx])
5674
+ continue;
5675
+ visited[nidx] = 1;
5676
+ if (buf[nidx] < 128) {
5677
+ qx.push(nx);
5678
+ qy.push(ny);
5679
+ }
5680
+ }
5681
+ }
5682
+ const bw = x1 - x0 + 1;
5683
+ const bh = y1 - y0 + 1;
5684
+ // Keep note-head-ish blobs. Scale thresholds by staff spacing.
5685
+ const minArea = Math.max(20, Math.floor(lineSpacing * lineSpacing * 0.35));
5686
+ const maxArea = Math.max(minArea, Math.floor(lineSpacing * lineSpacing * 30));
5687
+ if (area < minArea || area > maxArea)
5688
+ continue;
5689
+ const minDim = Math.max(6, Math.floor(lineSpacing * 0.6));
5690
+ const maxDim = Math.max(minDim, Math.floor(lineSpacing * 4.5));
5691
+ if (bw < minDim || bh < minDim)
5692
+ continue;
5693
+ if (bw > maxDim || bh > maxDim)
5694
+ continue;
5695
+ const cx = sx / area;
5696
+ const cy = sy / area;
5697
+ components.push({ area, cx, cy, x0, y0, x1, y1 });
5698
+ }
5699
+ }
5700
+ let notes = components;
5701
+ if (notes.length > 12) {
5702
+ // If noise produced extra small blobs, keep the most prominent ones.
5703
+ notes = [...notes].sort((a, b) => b.area - a.area).slice(0, 12);
5704
+ }
5705
+ notes = notes.sort((a, b) => a.cx - b.cx);
5706
+ if (!notes.length)
5707
+ throw new Error("No note-like blobs detected");
5708
+ const staffLineCount = staff.length;
5709
+ const noteCount = notes.length;
5710
+ const notesOnLines = notes.filter((n) => {
5711
+ const closest = staff.reduce((best, yy) => Math.abs(yy - n.cy) < Math.abs(best - n.cy) ? yy : best, staff[0]);
5712
+ return Math.abs(closest - n.cy) <= lineSpacing * 0.18;
5713
+ }).length;
5714
+ const bottomLineY = Math.max(...staff);
5715
+ const step = lineSpacing / 2;
5716
+ const lettersSeq = ["G", "A", "B", "C", "D", "E", "F"];
5717
+ const noteLetters = notes.map((n) => {
5718
+ const pos = Math.round((bottomLineY - n.cy) / step);
5719
+ const idx = ((pos % 7) + 7) % 7;
5720
+ return lettersSeq[idx];
5721
+ });
5722
+ const word = noteLetters.join("");
5723
+ const wordLower = word.toLowerCase();
5724
+ const timeWordValue = {
5725
+ decade: 10,
5726
+ score: 20,
5727
+ century: 100,
5728
+ millennium: 1000,
5729
+ year: 1,
5730
+ };
5731
+ const value = timeWordValue[wordLower];
5732
+ if (typeof value !== "number")
5733
+ throw new Error(`Unrecognized time-word from notes: ${word}`);
5734
+ const derived = staffLineCount + noteCount - notesOnLines;
5735
+ const age = value * derived;
5736
+ return {
5737
+ path: filePath,
5738
+ staffLineCount,
5739
+ noteCount,
5740
+ notesOnLines,
5741
+ word,
5742
+ wordValue: value,
5743
+ derived,
5744
+ age,
5745
+ answer: String(age),
5746
+ };
5747
+ },
5748
+ },
5749
+ {
5750
+ name: "solve_storage_upgrade_cost_per_file_from_image",
5751
+ description: "OCR plan tiers from an image, compute required storage from equally-sized file counts, and return average incremental $/file beyond the current plan limit. Deterministic, no network.",
5752
+ inputSchema: {
5753
+ type: "object",
5754
+ properties: {
5755
+ path: {
5756
+ type: "string",
5757
+ description: "Path to a local image file (absolute or relative to current working directory).",
5758
+ },
5759
+ currentPlanName: {
5760
+ type: "string",
5761
+ description: "Name of the current plan (e.g. 'Standard').",
5762
+ },
5763
+ filesUploaded: {
5764
+ type: "number",
5765
+ description: "Number of equally-sized files already uploaded.",
5766
+ },
5767
+ overLimitGb: {
5768
+ type: "number",
5769
+ description: "How many GB over the current plan limit after uploading filesUploaded.",
5770
+ },
5771
+ additionalFiles: {
5772
+ type: "number",
5773
+ description: "Additional equally-sized files to upload.",
5774
+ },
5775
+ decimals: {
5776
+ type: "number",
5777
+ description: "Decimal places to round to (default: 2).",
5778
+ default: 2,
5779
+ },
5780
+ lang: {
5781
+ type: "string",
5782
+ description: "Tesseract language code (default: eng).",
5783
+ default: "eng",
5784
+ },
5785
+ langPath: {
5786
+ type: "string",
5787
+ description: "Optional directory containing traineddata files. If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
5788
+ },
5789
+ preprocess: {
5790
+ type: "boolean",
5791
+ description: "If true (default), basic sharp preprocessing is applied before OCR.",
5792
+ default: true,
5793
+ },
5794
+ maxChars: {
5795
+ type: "number",
5796
+ description: "Maximum OCR text characters to consider.",
5797
+ default: 60000,
5798
+ },
5799
+ },
5800
+ required: ["path", "currentPlanName", "filesUploaded", "overLimitGb", "additionalFiles"],
5801
+ },
5802
+ handler: async (args) => {
5803
+ const filePath = resolveLocalPath(args?.path);
5804
+ if (!existsSync(filePath))
5805
+ throw new Error(`File not found: ${filePath}`);
5806
+ const currentPlanName = String(args?.currentPlanName ?? "").trim();
5807
+ if (!currentPlanName)
5808
+ throw new Error("currentPlanName is required");
5809
+ const filesUploaded = toNumberOrNull(args?.filesUploaded);
5810
+ const overLimitGb = toNumberOrNull(args?.overLimitGb);
5811
+ const additionalFiles = toNumberOrNull(args?.additionalFiles);
5812
+ if (filesUploaded === null || overLimitGb === null || additionalFiles === null) {
5813
+ throw new Error("filesUploaded, overLimitGb, and additionalFiles must be numbers");
5814
+ }
5815
+ if (filesUploaded <= 0)
5816
+ throw new Error("filesUploaded must be > 0");
5817
+ if (overLimitGb < 0)
5818
+ throw new Error("overLimitGb must be >= 0");
5819
+ if (additionalFiles < 0)
5820
+ throw new Error("additionalFiles must be >= 0");
5821
+ const decimals = clampInt(args?.decimals, 2, 0, 6);
5822
+ const lang = String(args?.lang ?? "eng").trim() || "eng";
5823
+ const preprocess = args?.preprocess !== false;
5824
+ const maxChars = clampInt(args?.maxChars, 60000, 5000, 200000);
5825
+ const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
5826
+ const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
5827
+ const langPathEffective = langPathArg
5828
+ ? resolveLocalPath(langPathArg)
5829
+ : existsSync(defaultLangPath)
5830
+ ? defaultLangPath
5831
+ : null;
5832
+ const ocr = await ocrRecognizeImageFile({ filePath, lang, langPathEffective, preprocess });
5833
+ const text = ocr.text.slice(0, maxChars);
5834
+ const lower = text.toLowerCase();
5835
+ const orderedKeys = ["standard", "plus", "premium"].filter((k) => lower.includes(k));
5836
+ const plans = [];
5837
+ for (let i = 0; i < orderedKeys.length; i++) {
5838
+ const key = orderedKeys[i];
5839
+ const start = lower.indexOf(key);
5840
+ const end = i + 1 < orderedKeys.length ? lower.indexOf(orderedKeys[i + 1], start + 1) : lower.length;
5841
+ const block = text.slice(start, end);
5842
+ const priceM = block.match(/\$\s*(\d+(?:\.\d+)?)\s*\/?\s*month/i);
5843
+ const storageM = block.match(/\b(\d+(?:\.\d+)?)\s*tb\b/i);
5844
+ if (!priceM || !storageM)
5845
+ continue;
5846
+ const pricePerMonth = Number.parseFloat(priceM[1]);
5847
+ const storageTb = Number.parseFloat(storageM[1]);
5848
+ if (!Number.isFinite(pricePerMonth) || !Number.isFinite(storageTb))
5849
+ continue;
5850
+ plans.push({
5851
+ name: key[0].toUpperCase() + key.slice(1),
5852
+ pricePerMonth,
5853
+ storageTb,
5854
+ });
5855
+ }
5856
+ if (!plans.length)
5857
+ throw new Error("Failed to parse plans from OCR");
5858
+ const current = plans.find((p) => p.name.toLowerCase() === currentPlanName.toLowerCase());
5859
+ if (!current)
5860
+ throw new Error(`Current plan not found in OCR plans: ${currentPlanName}`);
5861
+ const currentLimitGb = current.storageTb * 1000;
5862
+ const usedGb = currentLimitGb + overLimitGb;
5863
+ const fileSizeGb = usedGb / filesUploaded;
5864
+ const totalFiles = filesUploaded + additionalFiles;
5865
+ const requiredGb = totalFiles * fileSizeGb;
5866
+ const needed = [...plans].sort((a, b) => a.storageTb - b.storageTb).find((p) => p.storageTb * 1000 >= requiredGb);
5867
+ if (!needed)
5868
+ throw new Error("No plan tier can satisfy required storage");
5869
+ const upgradeCost = needed.pricePerMonth - current.pricePerMonth;
5870
+ const includedFilesCapacity = currentLimitGb / fileSizeGb;
5871
+ const filesOverLimit = Math.max(0, totalFiles - includedFilesCapacity);
5872
+ const costPerFile = filesOverLimit > 0 ? upgradeCost / filesOverLimit : 0;
5873
+ const rounded = Number(costPerFile.toFixed(decimals));
5874
+ return {
5875
+ path: filePath,
5876
+ plans,
5877
+ current,
5878
+ needed,
5879
+ fileSizeGb: Number(fileSizeGb.toFixed(4)),
5880
+ requiredGb: Number(requiredGb.toFixed(2)),
5881
+ filesOverLimit: Number(filesOverLimit.toFixed(4)),
5882
+ upgradeCost: Number(upgradeCost.toFixed(2)),
5883
+ costPerFile: rounded,
5884
+ answer: rounded.toFixed(decimals),
5885
+ };
5886
+ },
5887
+ },
2853
5888
  {
2854
5889
  name: "transcribe_audio_file",
2855
5890
  description: "Transcribe a local audio file (MP3/WAV/etc) to text using faster-whisper via Python. Deterministic, no network.",
@@ -2968,4 +6003,8 @@ export const localFileTools = [
2968
6003
  },
2969
6004
  },
2970
6005
  ];
6006
/**
 * General-purpose local file parsing tools: every entry of
 * _ALL_LOCAL_FILE_TOOLS whose name is NOT listed in GAIA_SOLVER_NAMES.
 */
export const localFileTools = _ALL_LOCAL_FILE_TOOLS.filter(({ name }) => !GAIA_SOLVER_NAMES.has(name));
/**
 * Specialized GAIA media image solver tools: every entry of
 * _ALL_LOCAL_FILE_TOOLS whose name IS listed in GAIA_SOLVER_NAMES.
 */
export const gaiaMediaSolvers = _ALL_LOCAL_FILE_TOOLS.filter(({ name }) => GAIA_SOLVER_NAMES.has(name));
2971
6010
  //# sourceMappingURL=localFileTools.js.map