omnius 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/README.md +4959 -0
  2. package/dist/index.d.ts +6 -0
  3. package/dist/index.js +630665 -0
  4. package/dist/launcher.cjs +78 -0
  5. package/dist/postinstall-daemon.cjs +776 -0
  6. package/dist/preinstall.cjs +92 -0
  7. package/dist/scripts/autoresearch-prepare.py +459 -0
  8. package/dist/scripts/autoresearch-train.py +661 -0
  9. package/dist/scripts/crawlee-scraper.py +358 -0
  10. package/dist/scripts/live-nemotron.py +478 -0
  11. package/dist/scripts/live-whisper.py +242 -0
  12. package/dist/scripts/ocr-advanced.py +571 -0
  13. package/dist/scripts/start-moondream.py +112 -0
  14. package/dist/scripts/tor/UPSTREAM-README.md +148 -0
  15. package/dist/scripts/tor/destroy_tor.sh +29 -0
  16. package/dist/scripts/tor/tor_setup.sh +163 -0
  17. package/dist/scripts/transcribe-file.py +63 -0
  18. package/dist/scripts/web_scrape.py +1295 -0
  19. package/npm-shrinkwrap.json +7412 -0
  20. package/package.json +142 -0
  21. package/prompts/agentic/system-large.md +569 -0
  22. package/prompts/agentic/system-medium.md +211 -0
  23. package/prompts/agentic/system-small.md +114 -0
  24. package/prompts/compaction/context-compaction.md +44 -0
  25. package/prompts/personality/level-1-minimal.md +3 -0
  26. package/prompts/personality/level-2-concise.md +3 -0
  27. package/prompts/personality/level-4-explanatory.md +3 -0
  28. package/prompts/personality/level-5-thorough.md +3 -0
  29. package/prompts/personality/level-autist.md +3 -0
  30. package/prompts/personality/level-stark.md +3 -0
  31. package/prompts/runners/dispatcher.md +24 -0
  32. package/prompts/runners/editor.md +44 -0
  33. package/prompts/runners/evaluator.md +30 -0
  34. package/prompts/runners/merge-summary.md +9 -0
  35. package/prompts/runners/normalizer.md +23 -0
  36. package/prompts/runners/planner.md +33 -0
  37. package/prompts/runners/scout.md +39 -0
  38. package/prompts/runners/verifier.md +36 -0
  39. package/prompts/skill-builder/seed-analysis.md +30 -0
  40. package/prompts/skill-builder/skill-expansion.md +76 -0
  41. package/prompts/skill-builder/skill-validation.md +31 -0
  42. package/prompts/templates/analysis.md +14 -0
  43. package/prompts/templates/code-review.md +16 -0
  44. package/prompts/templates/code.md +13 -0
  45. package/prompts/templates/document.md +13 -0
  46. package/prompts/templates/error-diagnosis.md +14 -0
  47. package/prompts/templates/general.md +9 -0
  48. package/prompts/templates/plan.md +15 -0
  49. package/prompts/templates/system.md +16 -0
  50. package/prompts/tui/dmn-gather.md +128 -0
  51. package/prompts/tui/dream-consolidate.md +48 -0
  52. package/prompts/tui/dream-lucid-eval.md +17 -0
  53. package/prompts/tui/dream-lucid-implement.md +14 -0
  54. package/prompts/tui/dream-stages.md +19 -0
  55. package/prompts/tui/emotion-behavioral.md +2 -0
  56. package/prompts/tui/emotion-center.md +12 -0
  57. package/voices/personaplex/OverBarn.pt +0 -0
  58. package/voices/personaplex/clone-voice.py +384 -0
  59. package/voices/personaplex/dequant-loader.py +174 -0
  60. package/voices/personaplex/quantize-weights.py +167 -0
@@ -0,0 +1,571 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ocr-advanced.py — Multi-variant, multi-PSM OCR pipeline for open-agents.
4
+
5
+ Implements a full preprocessing + OCR + cross-reference pipeline:
6
+ 1. Load image → grayscale → 2x upscale
7
+ 2. Generate 8 preprocessing variants (two adaptive windows, OTSU, two fixed
8
+ thresholds, two sharpen kernels, denoise)
9
+ 3. Run Tesseract with PSM 4, 6, 11 on each variant (up to 24 passes)
10
+ 4. Score results using combined heuristic (confidence * coverage + line bonus)
11
+ 5. Optionally extract regions (header/body/footer) with cross-reference
12
+ 6. Output as JSON, text, CSV, or write all formats to an output directory
13
+
14
+ Usage:
15
+ python3 ocr-advanced.py <image_or_dir> [options]
16
+
17
+ Single image:
18
+ python3 ocr-advanced.py photo.jpg --output json
19
+ python3 ocr-advanced.py scan.png --output-dir ./ocr_out --regions
20
+
21
+ Batch directory:
22
+ python3 ocr-advanced.py ./images/ --output-dir ./ocr_out --batch
23
+
24
+ Output (JSON to stdout):
25
+ {
26
+ "text": "best extracted text",
27
+ "confidence": 85.2,
28
+ "variant": "otsu_psm6",
29
+ "lines": 42,
30
+ "all_variants": { ... },
31
+ "regions": { ... }
32
+ }
33
+ """
34
+
35
+ import sys
36
+ import os
37
+ import json
38
+ import csv
39
+ import argparse
40
+ from pathlib import Path
41
+
42
def check_deps():
    """Exit with a JSON error on stdout if a required package cannot be imported.

    Each dependency is probed by importing it.  When at least one import
    fails, a JSON object with an ``error`` message and the ``missing`` pip
    package names is printed and the process exits with status 1.
    """
    # (importable module, pip distribution name), probed in this order.
    required = (
        ("cv2", "opencv-python-headless"),
        ("numpy", "numpy"),
        ("pytesseract", "pytesseract"),
        ("PIL.Image", "Pillow"),
    )

    missing = []
    for module_name, pip_name in required:
        try:
            __import__(module_name)
        except ImportError:
            missing.append(pip_name)

    if not missing:
        return

    report = {
        "error": f"Missing Python packages: {', '.join(missing)}. "
                 f"Install with: pip install {' '.join(missing)}",
        "missing": missing,
    }
    print(json.dumps(report))
    sys.exit(1)
69
+
70
+ check_deps()
71
+
72
+ import cv2
73
+ import numpy as np
74
+ import pytesseract
75
+ from PIL import Image
76
+
77
# Lower-case file suffixes that batch mode accepts as images; directory
# entries are matched against this set after lower-casing their suffix.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".webp"}
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Image preprocessing variants
82
+ # ---------------------------------------------------------------------------
83
+
84
def to_grayscale(img):
    """Return ``img`` as a single-channel image.

    Arrays with three dimensions are converted with ``COLOR_BGR2GRAY``;
    anything else is handed back untouched.
    """
    is_multichannel = img.ndim == 3
    if not is_multichannel:
        return img
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
89
+
90
+
91
def upscale_2x(gray):
    """Double both dimensions of a grayscale image using bicubic interpolation."""
    rows, cols = gray.shape
    # cv2.resize takes the target size as (width, height).
    doubled_size = (cols * 2, rows * 2)
    return cv2.resize(gray, doubled_size, interpolation=cv2.INTER_CUBIC)
95
+
96
+
97
def variant_adaptive_wide(gray):
    """Binarise with a Gaussian adaptive threshold over a wide 31 px window.

    The wide neighbourhood (constant 10) is aimed at gradual lighting changes.
    """
    block_size = 31
    offset = 10
    return cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
103
+
104
+
105
def variant_adaptive_fine(gray):
    """Binarise with a Gaussian adaptive threshold over a fine 11 px window.

    The small neighbourhood (constant 2) is aimed at small text detail.
    """
    block_size = 11
    offset = 2
    return cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
111
+
112
+
113
def variant_otsu(gray):
    """Binarise with a single global threshold chosen by Otsu's method."""
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    return cv2.threshold(gray, 0, 255, mode)[1]
117
+
118
+
119
def variant_fixed_140(gray):
    """Binarise with a fixed global cutoff of 140 (dark text on light paper)."""
    cutoff = 140
    return cv2.threshold(gray, cutoff, 255, cv2.THRESH_BINARY)[1]
123
+
124
+
125
def variant_fixed_150(gray):
    """Binarise with a fixed global cutoff of 150 (slightly lighter scans)."""
    cutoff = 150
    return cv2.threshold(gray, cutoff, 255, cv2.THRESH_BINARY)[1]
129
+
130
+
131
def variant_sharpen_laplacian_otsu(gray):
    """Sharpen with an 8-neighbour kernel, then binarise with Otsu.

    The kernel weights the centre pixel 9 and every neighbour -1, which is
    the more aggressive of the two sharpening variants.
    """
    sharpen_kernel = np.array([
        [-1, -1, -1],
        [-1, 9, -1],
        [-1, -1, -1],
    ])
    edges_boosted = cv2.filter2D(gray, -1, sharpen_kernel)
    return cv2.threshold(
        edges_boosted, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]
137
+
138
+
139
def variant_sharpen_unsharp_otsu(gray):
    """Sharpen with a 4-neighbour kernel, then binarise with Otsu.

    The kernel weights the centre pixel 5 and the four edge-adjacent
    neighbours -1 (corners 0), a gentler enhancement than the 8-neighbour
    variant.  Despite the function name, no blur-and-subtract unsharp mask
    is computed here.
    """
    sharpen_kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    edges_boosted = cv2.filter2D(gray, -1, sharpen_kernel)
    return cv2.threshold(
        edges_boosted, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]
145
+
146
+
147
def variant_denoise_otsu(gray):
    """Apply non-local-means denoising, then binarise with Otsu."""
    nlm_params = {"h": 10, "templateWindowSize": 7, "searchWindowSize": 21}
    cleaned = cv2.fastNlMeansDenoising(gray, **nlm_params)
    return cv2.threshold(
        cleaned, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]
152
+
153
+
154
# Registry of preprocessing variants: short name -> function taking a
# grayscale image and returning a binarised one.  Insertion order is the
# order in which the pipeline tries them.
ALL_VARIANTS = dict(
    adaptive_wide=variant_adaptive_wide,
    adaptive_fine=variant_adaptive_fine,
    otsu=variant_otsu,
    fixed_140=variant_fixed_140,
    fixed_150=variant_fixed_150,
    sharpen_lap=variant_sharpen_laplacian_otsu,
    sharpen_unsharp=variant_sharpen_unsharp_otsu,
    denoise=variant_denoise_otsu,
)
164
+
165
# Tesseract page-segmentation modes exercised by the pipeline, keyed by the
# numeric value passed to ``--psm``.  Labels follow the Tesseract manual:
# 4 = single column of text of variable sizes, 6 = single uniform block of
# text, 11 = sparse text.  (Tesseract's own default is PSM 3, which is not
# used here.)
PSM_MODES = {
    4: "single_column",
    6: "single_block",
    11: "sparse",
}
170
+
171
+
172
+ # ---------------------------------------------------------------------------
173
+ # OCR execution
174
+ # ---------------------------------------------------------------------------
175
+
176
def run_tesseract(binary_img, language="eng", psm=6):
    """Run Tesseract on a preprocessed image and summarise the result.

    Args:
        binary_img: Image array accepted by ``PIL.Image.fromarray``.
        language: Tesseract language code passed as ``lang``.
        psm: Page-segmentation mode passed as ``--psm``.

    Returns:
        ``(text, confidence, line_count)`` where ``text`` is the stripped OCR
        output, ``confidence`` is the mean of all non-negative confidence
        values reported by ``image_to_data`` (0.0 when none are available)
        and ``line_count`` is the number of non-blank lines in ``text``.
        Any failure of the text pass yields ``("", 0.0, 0)``.
    """
    pil_img = Image.fromarray(binary_img)
    config = f"--psm {psm}"

    try:
        text = pytesseract.image_to_string(pil_img, lang=language, config=config).strip()
    except Exception:
        # Best effort: one failing pass must not abort the whole pipeline.
        return "", 0.0, 0

    line_count = sum(1 for line in text.split("\n") if line.strip())

    # Confidence comes from a second pass via image_to_data.
    try:
        data = pytesseract.image_to_data(
            pil_img, lang=language, config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return text, 0.0, line_count

    confs = []
    for raw in data.get("conf", []):
        # Values may arrive as ints, floats or numeric strings such as
        # "96.5".  int("96.5") raises, which previously zeroed the
        # confidence of the entire pass, so parse as float and skip only
        # the entries that are not numeric at all.
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        # Negative confidences are excluded from the average.
        if value >= 0:
            confs.append(value)

    avg_conf = sum(confs) / len(confs) if confs else 0.0
    return text, avg_conf, line_count
201
+
202
+
203
def compute_score(text, confidence, line_count):
    """Rank an OCR pass; higher is better.

    The score is ``confidence * sqrt(len(text))`` plus a bonus of 10 per
    non-blank line, so both recognition quality and the amount of
    structured output are rewarded.  Empty text always scores 0.
    """
    if not text:
        return 0
    coverage = len(text) ** 0.5
    structure_bonus = line_count * 10
    return confidence * coverage + structure_bonus
213
+
214
+
215
def extract_region(gray, y_start_pct, y_end_pct, x_start_pct=0, x_end_pct=100):
    """Crop ``gray`` to a rectangle given as percentages of its height and width.

    Each percentage is converted to a pixel index by truncation; the end
    indices are exclusive.
    """
    rows, cols = gray.shape
    top, bottom = (int(rows * pct / 100) for pct in (y_start_pct, y_end_pct))
    left, right = (int(cols * pct / 100) for pct in (x_start_pct, x_end_pct))
    return gray[top:bottom, left:right]
223
+
224
+
225
def extract_pixel_region(gray, x, y, w, h):
    """Crop the ``w`` x ``h`` rectangle whose top-left corner is ``(x, y)``.

    The rectangle is intersected with the image: parts that fall outside
    are dropped, and a rectangle entirely outside yields an empty array.
    Coordinates are clamped at zero because a negative slice bound would
    otherwise be interpreted as an offset from the opposite edge and return
    an unrelated (or empty) area.
    """
    x0, y0 = max(x, 0), max(y, 0)
    # Clamp the far edges too so a rectangle left of / above the image is
    # empty rather than wrapping around.
    x1, y1 = max(x + w, 0), max(y + h, 0)
    return gray[y0:y1, x0:x1]
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+ # Output writers
232
+ # ---------------------------------------------------------------------------
233
+
234
def write_txt(text, output_path):
    """Save ``text`` to ``output_path`` as UTF-8, replacing any existing file."""
    with Path(output_path).open("w", encoding="utf-8") as sink:
        sink.write(text)
238
+
239
+
240
def write_csv(text, output_path):
    """Save the non-blank lines of ``text`` as a two-column CSV.

    The header row is ``Line_Number, Extracted_Text``; numbering starts at 1
    and counts only lines that are kept.
    """
    kept = (row for row in text.split("\n") if row.strip())
    numbered = [[number, row] for number, row in enumerate(kept, 1)]

    with open(output_path, "w", newline="", encoding="utf-8") as sink:
        out = csv.writer(sink)
        out.writerow(["Line_Number", "Extracted_Text"])
        out.writerows(numbered)
248
+
249
+
250
def write_pdf(text, output_path):
    """Render the non-blank lines of ``text`` into a letter-size PDF.

    Requires the optional ``reportlab`` package.  Lines are drawn one per
    row, 14 points apart, inside 1-inch margins; a new page is started when
    the cursor drops below the bottom margin.

    Returns:
        ``True`` when the PDF was written, ``False`` when ``reportlab`` is
        not importable (nothing is written in that case).
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas as pdf_canvas
        from reportlab.lib.units import inch
    except ImportError:
        return False

    rows = filter(str.strip, text.split("\n"))
    pdf = pdf_canvas.Canvas(output_path, pagesize=letter)
    _page_width, page_height = letter
    cursor = page_height - 1 * inch

    for row in rows:
        if cursor < 1 * inch:
            # Out of vertical space: flush this page and restart at the top.
            pdf.showPage()
            cursor = page_height - 1 * inch
        pdf.drawString(1 * inch, cursor, row)
        cursor -= 14

    pdf.save()
    return True
273
+
274
+
275
def write_all_outputs(text, base_name, output_dir):
    """Write ``text`` as ``<base_name>.txt``, ``.csv`` and ``.pdf`` in ``output_dir``.

    The directory is created if needed.

    Returns:
        Dict with keys ``txt``, ``csv`` and ``pdf`` mapping to the written
        paths; ``pdf`` is ``None`` when the PDF writer reported failure.
    """
    os.makedirs(output_dir, exist_ok=True)

    def target(extension):
        return os.path.join(output_dir, f"{base_name}.{extension}")

    txt_path, csv_path, pdf_path = target("txt"), target("csv"), target("pdf")

    write_txt(text, txt_path)
    write_csv(text, csv_path)
    pdf_written = write_pdf(text, pdf_path)

    written = {"txt": txt_path, "csv": csv_path}
    written["pdf"] = pdf_path if pdf_written else None
    return written
293
+
294
+
295
+ # ---------------------------------------------------------------------------
296
+ # Main pipeline
297
+ # ---------------------------------------------------------------------------
298
+
299
def run_pipeline(image_path, language="eng", do_regions=False, debug_dir=None,
                 single_psm=None, pixel_region=None, output_dir=None):
    """Run the multi-variant, multi-PSM OCR pipeline on one image.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        language: Tesseract language code.
        do_regions: Also OCR the header/body/footer bands and report them
            under ``regions``.
        debug_dir: If set, preprocessed images are written there.
        single_psm: If set, only this page-segmentation mode is tried;
            otherwise every mode in ``PSM_MODES`` is.
        pixel_region: Optional ``(x, y, w, h)`` crop in original-image
            pixels, applied before OCR.
        output_dir: If set, TXT/CSV/PDF files for the best text are written
            there and listed under ``output_files``.

    Returns:
        On success a dict with the best pass (``text``, ``confidence``,
        ``variant``, ``chars``, ``lines``, ``score``), ``image_size``,
        ``variants_tested`` and ``all_variants``, plus the optional
        ``regions``, ``debug_dir`` and ``output_files`` entries.  On failure
        a dict with an ``error`` message.
    """
    img = cv2.imread(image_path)
    if img is None:
        return {"error": f"Could not load image: {image_path}"}

    h_orig, w_orig = img.shape[:2]
    gray_2x = upscale_2x(to_grayscale(img))

    # If a pixel region is specified, crop before processing.
    if pixel_region:
        rx, ry, rw, rh = pixel_region
        # Region coordinates refer to the original image; scale to the 2x copy.
        gray_2x = extract_pixel_region(gray_2x, rx * 2, ry * 2, rw * 2, rh * 2)
        if gray_2x.size == 0:
            # Without this check every variant would fail on the empty crop
            # and the caller would only see a generic "all variants failed".
            return {
                "error": (
                    f"Region {rx},{ry},{rw},{rh} selects no pixels of the "
                    f"{w_orig}x{h_orig} image"
                )
            }

    psm_modes = [single_psm] if single_psm else list(PSM_MODES)

    if debug_dir:
        os.makedirs(debug_dir, exist_ok=True)

    all_results = {}
    variant_errors = {}
    best_key = None
    best_score = -1

    for vname, vfunc in ALL_VARIANTS.items():
        try:
            binary = vfunc(gray_2x)
        except Exception as exc:
            # Best effort: skip a variant that cannot process this image,
            # but remember why so a total failure can be diagnosed.
            variant_errors[vname] = str(exc)
            continue

        if debug_dir:
            cv2.imwrite(os.path.join(debug_dir, f"full_{vname}.png"), binary)

        for psm in psm_modes:
            key = f"{vname}_psm{psm}"
            text, confidence, line_count = run_tesseract(binary, language, psm)
            score = compute_score(text, confidence, line_count)

            all_results[key] = {
                "text": text,
                "chars": len(text),
                "lines": line_count,
                "confidence": round(confidence, 1),
                "score": round(score, 1),
            }

            # Strict comparison: on ties the earliest pass is kept.
            if score > best_score:
                best_score = score
                best_key = key

    if not best_key:
        failure = {"error": "All OCR variants failed to produce output"}
        if variant_errors:
            failure["variant_errors"] = variant_errors
        return failure

    best = all_results[best_key]
    result = {
        "text": best["text"],
        "confidence": best["confidence"],
        "variant": best_key,
        "chars": best["chars"],
        "lines": best["lines"],
        "score": best["score"],
        "image_size": f"{w_orig}x{h_orig}",
        "variants_tested": len(all_results),
        "all_variants": all_results,
    }

    # Region-based OCR
    if do_regions:
        # Vertical bands as (start %, end %) of the height; neighbouring
        # bands overlap by 5 percentage points.
        region_defs = {
            "header": (0, 35),
            "body": (30, 80),
            "footer": (75, 100),
        }
        # Only this subset of variants is tried per region, always with PSM 6.
        region_variants = ["otsu", "denoise", "adaptive_fine", "sharpen_unsharp"]

        regions = {}
        for rname, (y_start, y_end) in region_defs.items():
            region_gray = extract_region(gray_2x, y_start, y_end)

            if debug_dir:
                cv2.imwrite(os.path.join(debug_dir, f"region_{rname}.png"), region_gray)

            region_best = ""
            region_best_score = -1

            for vname in region_variants:
                if vname not in ALL_VARIANTS:
                    continue
                try:
                    binary = ALL_VARIANTS[vname](region_gray)
                except Exception:
                    # Best effort, as in the full-image pass.
                    continue

                if debug_dir:
                    cv2.imwrite(
                        os.path.join(debug_dir, f"region_{rname}_{vname}.png"),
                        binary,
                    )

                text, conf, lc = run_tesseract(binary, language, 6)
                score = compute_score(text, conf, lc)
                if score > region_best_score:
                    region_best_score = score
                    region_best = text

            regions[rname] = region_best

        result["regions"] = regions

    if debug_dir:
        result["debug_dir"] = debug_dir

    # Write output files if output_dir specified
    if output_dir:
        base_name = Path(image_path).stem
        result["output_files"] = write_all_outputs(best["text"], base_name, output_dir)

    return result
423
+
424
+
425
def run_batch(images_dir, language="eng", do_regions=False, debug_dir=None,
              output_dir=None):
    """Run the OCR pipeline on every image file in a directory.

    Args:
        images_dir: Directory to scan (not recursive); entries whose
            lower-cased suffix is in ``IMAGE_EXTENSIONS`` are processed in
            sorted order.
        language: Tesseract language code.
        do_regions: Forwarded to ``run_pipeline``.
        debug_dir: If set, each image gets its own sub-directory there.
        output_dir: Where TXT/CSV/PDF files and the summary are written;
            defaults to ``<images_dir>/ocr_out``.

    Returns:
        A dict with ``batch``, ``images_processed``, ``output_dir``,
        ``summary`` (path of the Markdown report) and per-image ``results``,
        or a dict with ``error`` when the directory is missing or holds no
        images.
    """
    from collections import Counter

    images_dir = os.path.abspath(images_dir)
    if not os.path.isdir(images_dir):
        return {"error": f"Not a directory: {images_dir}"}

    out_dir = output_dir or os.path.join(images_dir, "ocr_out")
    os.makedirs(out_dir, exist_ok=True)

    image_files = sorted(
        f for f in os.listdir(images_dir)
        if Path(f).suffix.lower() in IMAGE_EXTENSIONS
    )

    if not image_files:
        return {"error": f"No image files found in {images_dir}"}

    # Output files and debug directories are named after the image stem, so
    # "scan.jpg" and "scan.png" would silently overwrite each other.  Only
    # stems that actually collide get the extension appended; every other
    # image keeps its plain stem.
    stem_counts = Counter(Path(f).stem for f in image_files)

    def unique_base(img_file):
        name = Path(img_file)
        if stem_counts[name.stem] > 1:
            return f"{name.stem}_{name.suffix.lstrip('.')}"
        return name.stem

    batch_results = {}
    for img_file in image_files:
        base_name = unique_base(img_file)
        img_path = os.path.join(images_dir, img_file)
        img_debug = os.path.join(debug_dir, base_name) if debug_dir else None
        result = run_pipeline(
            img_path,
            language=language,
            do_regions=do_regions,
            debug_dir=img_debug,
        )

        # Outputs are written here rather than inside run_pipeline so the
        # collision-free base name can be used.
        output_files = None
        if "error" not in result:
            output_files = write_all_outputs(result.get("text", ""), base_name, out_dir)

        # Compact per-image result (omit all_variants for batch summary)
        batch_results[img_file] = {
            "text": result.get("text", ""),
            "confidence": result.get("confidence", 0),
            "variant": result.get("variant", ""),
            "chars": result.get("chars", 0),
            "lines": result.get("lines", 0),
            "output_files": output_files,
            "error": result.get("error"),
        }

    def cell(value):
        # Keep a value inside one Markdown table cell.
        return str(value).replace("|", "\\|").replace("\n", " ")

    # Write summary
    summary_path = os.path.join(out_dir, "OCR_PROCESSING_SUMMARY.md")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write("# OCR Processing Summary Report\n\n")
        f.write(f"**Source:** `{images_dir}`\n\n")
        f.write("## Processed Documents\n\n")
        f.write("| Document | Lines | Chars | Confidence | Variant |\n")
        f.write("|----------|-------|-------|------------|----------|\n")
        for img, data in batch_results.items():
            if data.get("error"):
                f.write(f"| {cell(img)} | ERROR | - | - | {cell(data['error'])} |\n")
            else:
                f.write(
                    f"| {cell(img)} | {data['lines']} | {data['chars']} "
                    f"| {data['confidence']}% | {cell(data['variant'])} |\n"
                )

    return {
        "batch": True,
        "images_processed": len(batch_results),
        "output_dir": out_dir,
        "summary": summary_path,
        "results": batch_results,
    }
489
+
490
+
491
def main():
    """Command-line entry point for single-image and batch OCR.

    Batch mode is selected by ``--batch`` or by passing a directory.  The
    result is printed to stdout as JSON (default) or plain text.  The exit
    status is 1 whenever the result carries an ``error``, in both output
    formats, so callers can rely on the status as well as on the payload.
    """
    parser = argparse.ArgumentParser(
        description="Advanced multi-variant OCR pipeline for open-agents"
    )
    parser.add_argument(
        "image",
        help="Path to image file, or directory for --batch mode",
    )
    parser.add_argument("--language", "-l", default="eng",
                        help="OCR language (default: eng)")
    parser.add_argument("--regions", action="store_true",
                        help="Also OCR header/body/footer regions")
    parser.add_argument("--debug-dir",
                        help="Save preprocessed images to this directory")
    parser.add_argument("--psm", type=int, choices=[4, 6, 11],
                        help="Use single PSM mode instead of all 3")
    parser.add_argument("--region",
                        help="Crop region before OCR: x,y,w,h in pixels")
    parser.add_argument("--output", choices=["json", "text"], default="json",
                        help="Stdout output format (default: json)")
    parser.add_argument("--output-dir",
                        help="Write TXT + CSV + PDF outputs to this directory")
    parser.add_argument("--batch", action="store_true",
                        help="Process all images in a directory")

    args = parser.parse_args()

    # Batch mode
    if args.batch or os.path.isdir(args.image):
        # These options only affect single-image runs; say so on stderr
        # instead of dropping them silently (stdout stays clean for JSON).
        ignored = [
            flag
            for flag, value in (("--psm", args.psm), ("--region", args.region))
            if value is not None
        ]
        if ignored:
            print(
                f"WARNING: {', '.join(ignored)} ignored in batch mode",
                file=sys.stderr,
            )

        result = run_batch(
            args.image,
            language=args.language,
            do_regions=args.regions,
            debug_dir=args.debug_dir,
            output_dir=args.output_dir,
        )
        failed = "error" in result
        if args.output == "text":
            if failed:
                print(f"ERROR: {result['error']}", file=sys.stderr)
                sys.exit(1)
            print(f"Processed {result['images_processed']} images → {result['output_dir']}")
        else:
            print(json.dumps(result, indent=2))
        sys.exit(1 if failed else 0)

    # Single image mode
    if not os.path.isfile(args.image):
        print(json.dumps({"error": f"File not found: {args.image}"}))
        sys.exit(1)

    pixel_region = None
    if args.region:
        try:
            pixel_region = tuple(int(part) for part in args.region.split(","))
            if len(pixel_region) != 4:
                raise ValueError
            rx, ry, rw, rh = pixel_region
            # A negative origin or an empty size cannot describe a crop.
            if rx < 0 or ry < 0 or rw <= 0 or rh <= 0:
                raise ValueError
        except ValueError:
            print(json.dumps({
                "error": "Region must be x,y,w,h (4 integers; x,y >= 0 and w,h > 0)"
            }))
            sys.exit(1)

    result = run_pipeline(
        args.image,
        language=args.language,
        do_regions=args.regions,
        debug_dir=args.debug_dir,
        single_psm=args.psm,
        pixel_region=pixel_region,
        output_dir=args.output_dir,
    )

    failed = "error" in result
    if args.output == "text":
        if failed:
            print(f"ERROR: {result['error']}", file=sys.stderr)
            sys.exit(1)
        print(result["text"])
    else:
        print(json.dumps(result, indent=2))
        if failed:
            # Same status as the file-not-found path above.
            sys.exit(1)
568
+
569
+
570
+ if __name__ == "__main__":
571
+ main()
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Headless Moondream Station launcher for open-agents.
4
+
5
+ Starts the Moondream vision model REST API server on port 2020 without
6
+ the interactive REPL. Designed to be auto-launched by the VisionTool.
7
+
8
+ Usage:
9
+ python start-moondream.py [--port 2020] [--host 127.0.0.1] [--model NAME]
10
+
11
+ Environment:
12
+ HF_TOKEN - HuggingFace token (optional, for gated models)
13
+ """
14
+
15
+ import sys
16
+ import signal
17
+ import time
18
+ import argparse
19
+
20
def main():
    """Start the Moondream Station REST server and block until it stops.

    Parses ``--port``, ``--host`` and ``--model``, loads the production
    manifest, switches to the selected model and starts the service.
    Progress lines go to stdout, followed by a bare ``READY`` line once the
    server is up; errors go to stderr with exit status 1.  SIGTERM and
    SIGINT stop the service and exit with status 0.
    """
    import os  # local import: this script has no top-level ``import os``

    parser = argparse.ArgumentParser(description="Start Moondream Station REST server")
    parser.add_argument("--port", type=int, default=2020, help="Server port (default: 2020)")
    parser.add_argument("--host", default="127.0.0.1", help="Server host (default: 127.0.0.1)")
    parser.add_argument("--model", default=None, help="Model to use (default: auto-detect, prefers non-gated moondream-2)")
    args = parser.parse_args()

    try:
        from moondream_station.core.config import ConfigManager
        from moondream_station.core.manifest import ManifestManager
        from moondream_station.core.models import ModelManager
        from moondream_station.core.service import ServiceManager
        from moondream_station.core.analytics import Analytics
        from moondream_station.session import SessionState
        from moondream_station.ui.display import Display
    except ImportError:
        print("ERROR: moondream-station not installed. Install with: pip install moondream-station", file=sys.stderr)
        sys.exit(1)

    print("[moondream] Initializing...", flush=True)

    config = ConfigManager()
    config.set("service_host", args.host)
    config.set("service_port", args.port)

    manifest_manager = ManifestManager(config)
    analytics = Analytics(config, manifest_manager)
    display = Display()
    models = ModelManager(config, manifest_manager)
    session_state = SessionState()

    # Load manifest
    manifest_url = "https://m87-md-prod-assets.s3.us-west-2.amazonaws.com/station/mds2/production_manifest.json"
    print("[moondream] Loading manifest...", flush=True)
    try:
        manifest_manager.load_manifest(manifest_url, analytics, display)
    except Exception as e:
        print(f"ERROR: Failed to load manifest: {e}", file=sys.stderr)
        sys.exit(1)

    # Select model: an explicit --model wins.  Otherwise use the manifest's
    # default when an HF token is configured, else "moondream-2".
    model_name = args.model
    if not model_name:
        has_hf_token = bool(os.environ.get("HF_TOKEN") or config.get("hf_token"))
        if has_hf_token:
            model_name = manifest_manager.get_available_default_model()
        else:
            model_name = "moondream-2"
    if not model_name:
        # Last resort when the selection above produced nothing.
        model_name = manifest_manager.get_available_default_model()
    if not model_name:
        print("ERROR: No model available", file=sys.stderr)
        sys.exit(1)

    print(f"[moondream] Switching to model: {model_name}", flush=True)
    if not models.switch_model(model_name, display):
        print(f"ERROR: Failed to switch to model {model_name}", file=sys.stderr)
        sys.exit(1)

    service = ServiceManager(config, manifest_manager, session_state, analytics)

    def shutdown(signum, frame):
        print("\n[moondream] Shutting down...", flush=True)
        service.stop()
        sys.exit(0)

    # Install the handlers before starting the server so that a SIGTERM or
    # SIGINT delivered while it is starting (or right after READY) still
    # goes through service.stop() instead of the default disposition.
    # NOTE(review): assumes ServiceManager.stop() tolerates being called
    # before start() has completed - confirm against moondream-station.
    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    # Start REST server
    print(f"[moondream] Starting REST server on {args.host}:{args.port}...", flush=True)

    if not service.start(model_name, args.port):
        print("ERROR: Failed to start REST server", file=sys.stderr)
        sys.exit(1)

    print(f"[moondream] Server running at http://{args.host}:{args.port}/v1", flush=True)
    print("[moondream] Endpoints: /v1/caption, /v1/query, /v1/detect, /v1/point", flush=True)
    # Bare READY line, printed only after the service reported a successful start.
    print("READY", flush=True)

    # Keep alive
    try:
        while service.is_running():
            time.sleep(1)
    except KeyboardInterrupt:
        shutdown(None, None)

    print("[moondream] Server stopped", flush=True)
110
+
111
+ if __name__ == "__main__":
112
+ main()