nodebench-mcp 2.8.0 → 2.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@
11
11
  * - papaparse: CSV parsing
12
12
  * - pdf-parse: PDF text extraction (page-aware)
13
13
  * - yauzl: ZIP/DOCX/PPTX parsing
14
+ * - tesseract.js: Image OCR (PNG/JPG/etc)
14
15
  */
15
16
  import { readFile } from "node:fs/promises";
16
17
  import { existsSync } from "node:fs";
@@ -197,6 +198,149 @@ async function getYauzl() {
197
198
  throw new Error("Missing optional dependency: yauzl. Install it (or run npm install in packages/mcp-local) to use ZIP/DOCX/PPTX parsing.");
198
199
  }
199
200
  }
201
/**
 * Try to load the optional `sharp` image-processing library.
 *
 * @returns {Promise<object|null>} The sharp module (default export when
 *   present), or null when sharp is not installed.
 */
async function getSharpOptional() {
    let mod;
    try {
        mod = await import("sharp");
    }
    catch {
        // sharp is optional; callers degrade gracefully without preprocessing.
        return null;
    }
    return mod.default ?? mod;
}
210
/**
 * Load the optional `tesseract.js` OCR library.
 *
 * @returns {Promise<object>} The tesseract.js module (default export when present).
 * @throws {Error} When tesseract.js is not installed.
 */
async function getTesseract() {
    // Use a non-literal dynamic import so TypeScript doesn't require the optional
    // dependency to be installed at build time.
    const pkg = "tesseract.js";
    let mod = null;
    try {
        mod = await import(pkg);
    }
    catch {
        mod = null;
    }
    if (mod === null) {
        throw new Error("Missing optional dependency: tesseract.js. Install it (or run npm install in packages/mcp-local) to use image OCR.");
    }
    return mod.default ?? mod;
}
222
/**
 * Source of the Python helper used by the `transcribe_audio_file` tool.
 *
 * The script is written verbatim to a temp file by
 * ensureFasterWhisperHelperScript() and run as a child process. It parses CLI
 * flags (--path / --model / --language / --task / --beam-size / --vad-filter /
 * --max-chars / --include-segments), runs faster-whisper on CPU with int8
 * compute, enforces the max-chars budget both per-segment and on the joined
 * text, and prints a single JSON object to stdout. A missing faster-whisper
 * install is reported on stderr before re-raising; the Node caller matches
 * that stderr text to surface a pip install hint.
 *
 * NOTE(review): the "_V1" suffix versions the cached temp-file name; if this
 * script's content ever changes, bump the suffix so stale copies are rewritten.
 */
const FASTER_WHISPER_PY_SCRIPT_V1 = `# NodeBench MCP audio transcription helper (faster-whisper)
# This file is written to a temp directory at runtime.
import argparse
import json
import sys


def main() -> None:
    p = argparse.ArgumentParser()
    p.add_argument("--path", required=True)
    p.add_argument("--model", default="tiny.en")
    p.add_argument("--language", default="")
    p.add_argument("--task", default="transcribe")
    p.add_argument("--beam-size", type=int, default=5)
    p.add_argument("--vad-filter", type=int, default=0)
    p.add_argument("--max-chars", type=int, default=12000)
    p.add_argument("--include-segments", type=int, default=0)
    args = p.parse_args()

    try:
        from faster_whisper import WhisperModel
    except Exception:
        sys.stderr.write(
            "Missing python dependency: faster-whisper. Install with: pip install faster-whisper\\n"
        )
        raise

    model = WhisperModel(args.model, device="cpu", compute_type="int8")
    segments, info = model.transcribe(
        args.path,
        beam_size=max(1, int(args.beam_size)),
        language=(args.language or None),
        task=(args.task or "transcribe"),
        vad_filter=bool(int(args.vad_filter)),
        word_timestamps=False,
        temperature=0.0,
    )

    include_segments = bool(int(args.include_segments))
    max_chars = max(200, int(args.max_chars))

    parts = []
    segs = []
    char_budget = 0
    truncated = False

    for seg in segments:
        t = str(getattr(seg, "text", "") or "")
        if not t:
            continue
        parts.append(t)
        if include_segments:
            segs.append(
                {
                    "start": float(getattr(seg, "start", 0.0) or 0.0),
                    "end": float(getattr(seg, "end", 0.0) or 0.0),
                    "text": t,
                }
            )
        char_budget += len(t)
        if char_budget >= max_chars:
            truncated = True
            break

    text = "".join(parts).strip()
    if len(text) > max_chars:
        text = text[:max_chars]
        truncated = True

    out = {
        "path": args.path,
        "model": args.model,
        "task": args.task,
        "language": getattr(info, "language", None),
        "languageProbability": getattr(info, "language_probability", None),
        "durationSeconds": getattr(info, "duration", None),
        "beamSize": int(args.beam_size),
        "vadFilter": bool(int(args.vad_filter)),
        "maxChars": max_chars,
        "truncated": truncated,
        "text": text,
    }
    if include_segments:
        out["segments"] = segs
    sys.stdout.write(json.dumps(out, ensure_ascii=False))


if __name__ == "__main__":
    main()
`;
312
/**
 * Locate a Python executable for audio transcription.
 *
 * Resolution order:
 *   1. Environment overrides: NODEBENCH_PYTHON, NODEBENCH_AUDIO_PYTHON,
 *      PYTHON, PYTHON_EXE (first non-empty wins).
 *   2. A python-mcp-servers/.venv found under cwd, cwd/.., or cwd/../..
 *      (Windows Scripts/python.exe probed before POSIX bin/python).
 *   3. The bare "python" command from PATH.
 *
 * @returns {string} Path or command name of the Python interpreter.
 */
function findPythonExecutable() {
    const override = process.env.NODEBENCH_PYTHON ||
        process.env.NODEBENCH_AUDIO_PYTHON ||
        process.env.PYTHON ||
        process.env.PYTHON_EXE ||
        "";
    if (override)
        return String(override);
    // Probe for a project virtualenv at increasing distances from cwd.
    const roots = [
        process.cwd(),
        path.join(process.cwd(), ".."),
        path.join(process.cwd(), "..", ".."),
    ];
    const candidates = roots.flatMap((root) => [
        path.join(root, "python-mcp-servers", ".venv", "Scripts", "python.exe"),
        path.join(root, "python-mcp-servers", ".venv", "bin", "python"),
    ]);
    const found = candidates.find((candidate) => existsSync(candidate));
    return found ?? "python";
}
334
/**
 * Ensure the faster-whisper helper script exists on disk and return its path.
 *
 * Writes FASTER_WHISPER_PY_SCRIPT_V1 to a versioned file under the OS temp
 * directory the first time it is needed; later calls reuse the cached file.
 *
 * @returns {Promise<string>} Absolute path of the helper script.
 */
async function ensureFasterWhisperHelperScript() {
    const scriptDir = path.join(os.tmpdir(), "nodebench-mcp", "audio");
    const scriptPath = path.join(scriptDir, "transcribe_faster_whisper_v1.py");
    if (existsSync(scriptPath))
        return scriptPath;
    const fs = await import("node:fs/promises");
    await fs.mkdir(scriptDir, { recursive: true });
    await fs.writeFile(scriptPath, FASTER_WHISPER_PY_SCRIPT_V1, "utf8");
    return scriptPath;
}
200
344
  function decodeXmlEntities(text) {
201
345
  return text
202
346
  .replace(/"/g, "\"")
@@ -2281,5 +2425,214 @@ export const localFileTools = [
2281
2425
  };
2282
2426
  },
2283
2427
  },
2428
+ {
2429
+ name: "read_image_ocr_text",
2430
+ description: "Extract text from a local image (PNG/JPG/etc) using OCR (tesseract.js). Deterministic, no network.",
2431
+ inputSchema: {
2432
+ type: "object",
2433
+ properties: {
2434
+ path: {
2435
+ type: "string",
2436
+ description: "Path to a local image file (absolute or relative to current working directory).",
2437
+ },
2438
+ lang: {
2439
+ type: "string",
2440
+ description: "Tesseract language code (default: eng).",
2441
+ default: "eng",
2442
+ },
2443
+ langPath: {
2444
+ type: "string",
2445
+ description: "Optional directory containing traineddata files (e.g. eng.traineddata). If omitted, tesseract.js defaults apply. If .cache/tesseract exists under the current working directory, it is used by default.",
2446
+ },
2447
+ preprocess: {
2448
+ type: "boolean",
2449
+ description: "If true (default), attempts basic preprocessing with sharp (grayscale + normalize + PNG conversion) to improve OCR.",
2450
+ default: true,
2451
+ },
2452
+ maxChars: {
2453
+ type: "number",
2454
+ description: "Maximum characters to return (text is truncated).",
2455
+ default: 12000,
2456
+ },
2457
+ },
2458
+ required: ["path"],
2459
+ },
2460
+ handler: async (args) => {
2461
+ const filePath = resolveLocalPath(args?.path);
2462
+ if (!existsSync(filePath))
2463
+ throw new Error(`File not found: ${filePath}`);
2464
+ const lang = String(args?.lang ?? "eng").trim() || "eng";
2465
+ const maxChars = clampInt(args?.maxChars, 12000, 200, 200000);
2466
+ const preprocess = args?.preprocess !== false;
2467
+ let buffer = await readFile(filePath);
2468
+ let usedSharp = false;
2469
+ if (preprocess) {
2470
+ const sharp = await getSharpOptional();
2471
+ if (sharp) {
2472
+ try {
2473
+ // Normalize to PNG and improve contrast for OCR.
2474
+ buffer = await sharp(buffer).grayscale().normalize().png().toBuffer();
2475
+ usedSharp = true;
2476
+ }
2477
+ catch {
2478
+ // If preprocessing fails, fall back to the original buffer.
2479
+ }
2480
+ }
2481
+ }
2482
+ const langPathArg = typeof args?.langPath === "string" ? args.langPath.trim() : "";
2483
+ const defaultLangPath = path.join(process.cwd(), ".cache", "tesseract");
2484
+ const langPathEffective = langPathArg
2485
+ ? resolveLocalPath(langPathArg)
2486
+ : existsSync(defaultLangPath)
2487
+ ? defaultLangPath
2488
+ : null;
2489
+ const tesseract = await getTesseract();
2490
+ const recognize = tesseract?.recognize;
2491
+ if (typeof recognize !== "function") {
2492
+ throw new Error("tesseract.js missing recognize() export (unsupported version)");
2493
+ }
2494
+ const result = await recognize(buffer, lang, {
2495
+ ...(langPathEffective ? { langPath: langPathEffective } : {}),
2496
+ logger: () => {
2497
+ // silence
2498
+ },
2499
+ });
2500
+ let text = String(result?.data?.text ?? "").trim();
2501
+ const confidence = typeof result?.data?.confidence === "number" ? result.data.confidence : null;
2502
+ let truncated = false;
2503
+ if (text.length > maxChars) {
2504
+ text = text.slice(0, maxChars);
2505
+ truncated = true;
2506
+ }
2507
+ return {
2508
+ path: filePath,
2509
+ lang,
2510
+ langPath: langPathEffective,
2511
+ preprocess,
2512
+ usedSharp,
2513
+ confidence,
2514
+ maxChars,
2515
+ truncated,
2516
+ text,
2517
+ };
2518
+ },
2519
+ },
2520
    {
        name: "transcribe_audio_file",
        description: "Transcribe a local audio file (MP3/WAV/etc) to text using faster-whisper via Python. Deterministic, no network.",
        inputSchema: {
            type: "object",
            properties: {
                path: {
                    type: "string",
                    description: "Path to a local audio file (absolute or relative to current working directory).",
                },
                model: {
                    type: "string",
                    description: "Whisper model name (default: tiny.en).",
                    default: "tiny.en",
                },
                language: {
                    type: "string",
                    description: "Optional language hint (e.g. 'en'). If omitted, model auto-detects.",
                },
                task: {
                    type: "string",
                    description: "Task mode: transcribe or translate.",
                    default: "transcribe",
                    enum: ["transcribe", "translate"],
                },
                beamSize: {
                    type: "number",
                    description: "Beam size (higher = potentially better, slower).",
                    default: 5,
                },
                vadFilter: {
                    type: "boolean",
                    description: "If true, enables VAD filtering (can help noisy audio). Default false for determinism.",
                    default: false,
                },
                includeSegments: {
                    type: "boolean",
                    description: "If true, returns per-segment timestamps (can be verbose).",
                    default: false,
                },
                maxChars: {
                    type: "number",
                    description: "Maximum characters to return (text is truncated).",
                    default: 12000,
                },
                timeoutMs: {
                    type: "number",
                    description: "Maximum transcription time before aborting (ms).",
                    default: 300000,
                },
            },
            required: ["path"],
        },
        // Runs the faster-whisper helper (see FASTER_WHISPER_PY_SCRIPT_V1) in a
        // Python child process and returns its JSON output, parsed.
        handler: async (args) => {
            const filePath = resolveLocalPath(args?.path);
            if (!existsSync(filePath))
                throw new Error(`File not found: ${filePath}`);
            // Sanitize/clamp all inputs before handing them to the child process.
            const model = String(args?.model ?? "tiny.en").trim() || "tiny.en";
            const language = typeof args?.language === "string" ? args.language.trim() : "";
            const task = args?.task === "translate" ? "translate" : "transcribe";
            const beamSize = clampInt(args?.beamSize, 5, 1, 10);
            const vadFilter = args?.vadFilter === true;
            const includeSegments = args?.includeSegments === true;
            const maxChars = clampInt(args?.maxChars, 12000, 200, 200000);
            const timeoutMs = clampInt(args?.timeoutMs, 300000, 1000, 1800000);
            const pythonExe = findPythonExecutable();
            // Write the helper script to the temp dir if not already present.
            const scriptPath = await ensureFasterWhisperHelperScript();
            const child = await import("node:child_process");
            const util = await import("node:util");
            const execFileAsync = util.promisify(child.execFile);
            // Flag names must match the argparse definitions in the helper script.
            const argv = [
                scriptPath,
                "--path",
                filePath,
                "--model",
                model,
                "--task",
                task,
                "--beam-size",
                String(beamSize),
                "--vad-filter",
                vadFilter ? "1" : "0",
                "--max-chars",
                String(maxChars),
                "--include-segments",
                includeSegments ? "1" : "0",
            ];
            if (language) {
                // Only pass --language when a hint was supplied; the helper
                // auto-detects when the flag is absent (its default is "").
                argv.push("--language", language);
            }
            try {
                const { stdout, stderr } = (await execFileAsync(pythonExe, argv, {
                    timeout: timeoutMs,
                    // Transcripts can be large; allow up to 32 MiB of stdout.
                    maxBuffer: 32 * 1024 * 1024,
                    env: {
                        ...process.env,
                        // Avoid unicode surprises on Windows consoles.
                        PYTHONUTF8: "1",
                    },
                }));
                const raw = String(stdout ?? "").trim();
                if (!raw) {
                    throw new Error(`No output from transcription helper. Stderr: ${String(stderr ?? "").trim() || "(empty)"}`);
                }
                // The helper writes exactly one JSON object to stdout.
                const parsed = JSON.parse(raw);
                return parsed;
            }
            catch (err) {
                const msg = err?.message ?? String(err);
                const stderr = String(err?.stderr ?? "").trim();
                // Match the helper's missing-dependency stderr message (or Python's
                // generic import error) to surface a pip install hint.
                const hint = stderr.includes("Missing python dependency: faster-whisper") || msg.includes("No module named")
                    ? "Install the python dependency first: pip install faster-whisper"
                    : "";
                throw new Error(`Audio transcription failed (python=\"${pythonExe}\", model=\"${model}\"). ${hint}\n${stderr || msg}`);
            }
        },
    },
2284
2637
  ];
2285
2638
  //# sourceMappingURL=localFileTools.js.map