omnius 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4959 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +630665 -0
- package/dist/launcher.cjs +78 -0
- package/dist/postinstall-daemon.cjs +776 -0
- package/dist/preinstall.cjs +92 -0
- package/dist/scripts/autoresearch-prepare.py +459 -0
- package/dist/scripts/autoresearch-train.py +661 -0
- package/dist/scripts/crawlee-scraper.py +358 -0
- package/dist/scripts/live-nemotron.py +478 -0
- package/dist/scripts/live-whisper.py +242 -0
- package/dist/scripts/ocr-advanced.py +571 -0
- package/dist/scripts/start-moondream.py +112 -0
- package/dist/scripts/tor/UPSTREAM-README.md +148 -0
- package/dist/scripts/tor/destroy_tor.sh +29 -0
- package/dist/scripts/tor/tor_setup.sh +163 -0
- package/dist/scripts/transcribe-file.py +63 -0
- package/dist/scripts/web_scrape.py +1295 -0
- package/npm-shrinkwrap.json +7412 -0
- package/package.json +142 -0
- package/prompts/agentic/system-large.md +569 -0
- package/prompts/agentic/system-medium.md +211 -0
- package/prompts/agentic/system-small.md +114 -0
- package/prompts/compaction/context-compaction.md +44 -0
- package/prompts/personality/level-1-minimal.md +3 -0
- package/prompts/personality/level-2-concise.md +3 -0
- package/prompts/personality/level-4-explanatory.md +3 -0
- package/prompts/personality/level-5-thorough.md +3 -0
- package/prompts/personality/level-autist.md +3 -0
- package/prompts/personality/level-stark.md +3 -0
- package/prompts/runners/dispatcher.md +24 -0
- package/prompts/runners/editor.md +44 -0
- package/prompts/runners/evaluator.md +30 -0
- package/prompts/runners/merge-summary.md +9 -0
- package/prompts/runners/normalizer.md +23 -0
- package/prompts/runners/planner.md +33 -0
- package/prompts/runners/scout.md +39 -0
- package/prompts/runners/verifier.md +36 -0
- package/prompts/skill-builder/seed-analysis.md +30 -0
- package/prompts/skill-builder/skill-expansion.md +76 -0
- package/prompts/skill-builder/skill-validation.md +31 -0
- package/prompts/templates/analysis.md +14 -0
- package/prompts/templates/code-review.md +16 -0
- package/prompts/templates/code.md +13 -0
- package/prompts/templates/document.md +13 -0
- package/prompts/templates/error-diagnosis.md +14 -0
- package/prompts/templates/general.md +9 -0
- package/prompts/templates/plan.md +15 -0
- package/prompts/templates/system.md +16 -0
- package/prompts/tui/dmn-gather.md +128 -0
- package/prompts/tui/dream-consolidate.md +48 -0
- package/prompts/tui/dream-lucid-eval.md +17 -0
- package/prompts/tui/dream-lucid-implement.md +14 -0
- package/prompts/tui/dream-stages.md +19 -0
- package/prompts/tui/emotion-behavioral.md +2 -0
- package/prompts/tui/emotion-center.md +12 -0
- package/voices/personaplex/OverBarn.pt +0 -0
- package/voices/personaplex/clone-voice.py +384 -0
- package/voices/personaplex/dequant-loader.py +174 -0
- package/voices/personaplex/quantize-weights.py +167 -0
|
@@ -0,0 +1,571 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
ocr-advanced.py — Multi-variant, multi-PSM OCR pipeline for open-agents.
|
|
4
|
+
|
|
5
|
+
Implements a full preprocessing + OCR + cross-reference pipeline:
|
|
6
|
+
1. Load image → grayscale → 2x upscale
|
|
7
|
+
2. Generate 8 preprocessing variants (two adaptive windows, OTSU, two fixed
|
|
8
|
+
thresholds, two sharpen kernels, denoise)
|
|
9
|
+
3. Run Tesseract with PSM 4, 6, 11 on each variant (up to 24 passes)
|
|
10
|
+
4. Score results using combined heuristic (confidence * coverage + line bonus)
|
|
11
|
+
5. Optionally extract regions (header/body/footer) with cross-reference
|
|
12
|
+
6. Output as JSON, text, CSV, or write all formats to an output directory
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
python3 ocr-advanced.py <image_or_dir> [options]
|
|
16
|
+
|
|
17
|
+
Single image:
|
|
18
|
+
python3 ocr-advanced.py photo.jpg --output json
|
|
19
|
+
python3 ocr-advanced.py scan.png --output-dir ./ocr_out --regions
|
|
20
|
+
|
|
21
|
+
Batch directory:
|
|
22
|
+
python3 ocr-advanced.py ./images/ --output-dir ./ocr_out --batch
|
|
23
|
+
|
|
24
|
+
Output (JSON to stdout):
|
|
25
|
+
{
|
|
26
|
+
"text": "best extracted text",
|
|
27
|
+
"confidence": 85.2,
|
|
28
|
+
"variant": "otsu_psm6",
|
|
29
|
+
"lines": 42,
|
|
30
|
+
"all_variants": { ... },
|
|
31
|
+
"regions": { ... }
|
|
32
|
+
}
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
import sys
|
|
36
|
+
import os
|
|
37
|
+
import json
|
|
38
|
+
import csv
|
|
39
|
+
import argparse
|
|
40
|
+
from pathlib import Path
|
|
41
|
+
|
|
42
|
+
def check_deps():
    """Verify that every third-party package the pipeline imports is installed.

    Each required module is imported in turn. If any import fails, a JSON
    object naming the missing pip packages is printed to stdout and the
    process exits with status 1; otherwise the function simply returns.
    """
    # (importable module, pip package name), probed in this order.
    required = (
        ("cv2", "opencv-python-headless"),
        ("numpy", "numpy"),
        ("pytesseract", "pytesseract"),
        ("PIL.Image", "Pillow"),
    )

    missing = []
    for module_name, package_name in required:
        try:
            __import__(module_name)
        except ImportError:
            missing.append(package_name)

    if not missing:
        return

    print(json.dumps({
        "error": f"Missing Python packages: {', '.join(missing)}. "
                 f"Install with: pip install {' '.join(missing)}",
        "missing": missing,
    }))
    sys.exit(1)
|
|
69
|
+
|
|
70
|
+
check_deps()
|
|
71
|
+
|
|
72
|
+
import cv2
|
|
73
|
+
import numpy as np
|
|
74
|
+
import pytesseract
|
|
75
|
+
from PIL import Image
|
|
76
|
+
|
|
77
|
+
# Lower-case file suffixes that batch mode accepts as OCR input.
IMAGE_EXTENSIONS = {
    ".jpg", ".jpeg",
    ".png",
    ".tiff", ".tif",
    ".bmp",
    ".webp",
}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
# Image preprocessing variants
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
def to_grayscale(img):
    """Return a single-channel version of *img*.

    Arrays with three dimensions go through OpenCV's BGR-to-gray conversion;
    anything else is handed back unchanged.
    """
    if len(img.shape) != 3:
        return img
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def upscale_2x(gray):
    """Double both dimensions of a 2-D image using bicubic interpolation."""
    height, width = gray.shape
    doubled_size = (width * 2, height * 2)  # cv2.resize takes (width, height)
    return cv2.resize(gray, doubled_size, interpolation=cv2.INTER_CUBIC)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def variant_adaptive_wide(gray):
    """Binarize with an adaptive Gaussian threshold over a wide 31px block.

    The wide neighbourhood (with a constant of 10) is meant for gradual
    lighting changes across the page.
    """
    block_size, offset = 31, 10
    return cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def variant_adaptive_fine(gray):
    """Binarize with an adaptive Gaussian threshold over a fine 11px block.

    The small neighbourhood (with a constant of 2) is meant to preserve
    small text detail.
    """
    block_size, offset = 11, 2
    return cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def variant_otsu(gray):
    """Binarize with one global threshold picked by Otsu's method."""
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    return cv2.threshold(gray, 0, 255, mode)[1]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def variant_fixed_140(gray):
    """Binarize at a fixed cutoff of 140 (dark text on light paper)."""
    cutoff = 140
    return cv2.threshold(gray, cutoff, 255, cv2.THRESH_BINARY)[1]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def variant_fixed_150(gray):
    """Binarize at a fixed cutoff of 150 (slightly brighter, for lighter scans)."""
    cutoff = 150
    return cv2.threshold(gray, cutoff, 255, cv2.THRESH_BINARY)[1]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def variant_sharpen_laplacian_otsu(gray):
    """Sharpen aggressively, then binarize with Otsu's method.

    The 3x3 kernel has 9 at the centre and -1 on all eight neighbours.
    """
    sharpen_kernel = np.array([
        [-1, -1, -1],
        [-1, 9, -1],
        [-1, -1, -1],
    ])
    edges_boosted = cv2.filter2D(gray, -1, sharpen_kernel)
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    return cv2.threshold(edges_boosted, 0, 255, mode)[1]
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def variant_sharpen_unsharp_otsu(gray):
    """Sharpen gently, then binarize with Otsu's method.

    The 3x3 kernel has 5 at the centre and -1 on the four direct neighbours,
    a milder enhancement than the eight-neighbour variant.
    """
    sharpen_kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    edges_boosted = cv2.filter2D(gray, -1, sharpen_kernel)
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    return cv2.threshold(edges_boosted, 0, 255, mode)[1]
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def variant_denoise_otsu(gray):
    """Apply non-local-means denoising, then binarize with Otsu's method.

    Intended for JPEG artifacts and photo noise.
    """
    cleaned = cv2.fastNlMeansDenoising(
        gray,
        h=10,
        templateWindowSize=7,
        searchWindowSize=21,
    )
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    return cv2.threshold(cleaned, 0, 255, mode)[1]
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# Registry of preprocessing variants. The key is the short name used in
# result keys ("<name>_psm<N>") and debug image file names; insertion order
# is the order in which the pipeline tries them.
ALL_VARIANTS = dict(
    adaptive_wide=variant_adaptive_wide,
    adaptive_fine=variant_adaptive_fine,
    otsu=variant_otsu,
    fixed_140=variant_fixed_140,
    fixed_150=variant_fixed_150,
    sharpen_lap=variant_sharpen_laplacian_otsu,
    sharpen_unsharp=variant_sharpen_unsharp_otsu,
    denoise=variant_denoise_otsu,
)
|
|
164
|
+
|
|
165
|
+
# Tesseract page-segmentation modes used by the pipeline, mapped to a label.
# NOTE(review): Tesseract describes PSM 4 as "single column of text of
# variable sizes" and PSM 6 as "single uniform block of text"; the labels
# below are kept as-is — confirm nothing depends on them before renaming.
PSM_MODES = {4: "single_block", 6: "default", 11: "sparse"}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# ---------------------------------------------------------------------------
|
|
173
|
+
# OCR execution
|
|
174
|
+
# ---------------------------------------------------------------------------
|
|
175
|
+
|
|
176
|
+
def run_tesseract(binary_img, language="eng", psm=6):
    """Run Tesseract on a preprocessed image and summarise the result.

    Args:
        binary_img: Image array accepted by ``PIL.Image.fromarray``.
        language: Tesseract language code passed as ``lang``.
        psm: Page-segmentation mode passed via ``--psm``.

    Returns:
        ``(text, confidence, line_count)``. *text* is the stripped OCR
        output, *confidence* is the mean of the non-negative per-box
        confidence values (0.0 when none are available) and *line_count* is
        the number of non-blank lines in *text*. If text extraction itself
        fails the result is ``("", 0.0, 0)``.
    """
    pil_img = Image.fromarray(binary_img)
    config = f"--psm {psm}"

    try:
        text = pytesseract.image_to_string(pil_img, lang=language, config=config).strip()
    except Exception:
        # Best effort: one failing variant must not abort the whole sweep.
        return "", 0.0, 0

    line_count = sum(1 for line in text.split("\n") if line.strip())

    # A second pass is needed because image_to_string carries no confidences.
    try:
        data = pytesseract.image_to_data(
            pil_img, lang=language, config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return text, 0.0, line_count

    confs = []
    for raw in data.get("conf", ()):
        # Confidence values may arrive as ints, floats or strings such as
        # "96.06"; int() on the latter raises, which previously zeroed the
        # whole confidence. Parse via float and skip unparsable entries.
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        if value >= 0:  # -1 marks boxes that carry no recognised word
            confs.append(value)

    avg_conf = sum(confs) / len(confs) if confs else 0.0
    return text, avg_conf, line_count
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def compute_score(text, confidence, line_count):
    """Rank an OCR result with a combined heuristic.

    The score is ``confidence * sqrt(len(text)) + line_count * 10``: the
    first term rewards quality and coverage, the second gives a bonus for
    structured output, since line count was found to be a strong proxy for a
    successful parse on documents such as invoices and forms. Empty text
    always scores 0.
    """
    n_chars = len(text)
    if not n_chars:
        return 0
    coverage = n_chars ** 0.5
    line_bonus = line_count * 10
    return confidence * coverage + line_bonus
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def extract_region(gray, y_start_pct, y_end_pct, x_start_pct=0, x_end_pct=100):
    """Crop *gray* to a rectangle given as percentages of its height and width.

    Percentages are converted to pixel indices by truncation; the horizontal
    range defaults to the full width.
    """
    height, width = gray.shape
    top, bottom = (int(height * pct / 100) for pct in (y_start_pct, y_end_pct))
    left, right = (int(width * pct / 100) for pct in (x_start_pct, x_end_pct))
    return gray[top:bottom, left:right]
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def extract_pixel_region(gray, x, y, w, h):
    """Crop the *w*-by-*h* rectangle whose top-left corner is (*x*, *y*)."""
    rows = slice(y, y + h)
    cols = slice(x, x + w)
    return gray[rows, cols]
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# ---------------------------------------------------------------------------
|
|
231
|
+
# Output writers
|
|
232
|
+
# ---------------------------------------------------------------------------
|
|
233
|
+
|
|
234
|
+
def write_txt(text, output_path):
    """Save *text* verbatim to *output_path* as a UTF-8 text file."""
    Path(output_path).write_text(text, encoding="utf-8")
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def write_csv(text, output_path):
    """Write the non-blank lines of *text* as a two-column CSV.

    The file starts with a ``Line_Number,Extracted_Text`` header, followed by
    one row per non-blank line, numbered from 1.
    """
    # str.strip as the predicate drops lines that are empty or whitespace-only.
    kept = list(filter(str.strip, text.split("\n")))
    with open(output_path, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(["Line_Number", "Extracted_Text"])
        out.writerows([number, content] for number, content in enumerate(kept, 1))
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def write_pdf(text, output_path):
    """Render the non-blank lines of *text* into a letter-size PDF.

    Returns:
        False, without writing anything, when reportlab cannot be imported;
        True once the PDF has been saved.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas as pdf_canvas
        from reportlab.lib.units import inch
    except ImportError:
        return False

    margin = 1 * inch
    top_of_page = letter[1] - margin  # letter is (width, height)
    line_step = 14

    page = pdf_canvas.Canvas(output_path, pagesize=letter)
    cursor_y = top_of_page
    for content in (ln for ln in text.split("\n") if ln.strip()):
        # Start a fresh page once the cursor drops below the bottom margin.
        if cursor_y < margin:
            page.showPage()
            cursor_y = top_of_page
        page.drawString(margin, cursor_y, content)
        cursor_y -= line_step

    page.save()
    return True
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def write_all_outputs(text, base_name, output_dir):
    """Write ``<base_name>.txt``, ``.csv`` and ``.pdf`` into *output_dir*.

    The directory is created if needed.

    Returns:
        Dict with keys ``txt``, ``csv`` and ``pdf`` holding the written
        paths; ``pdf`` is None when the PDF writer reported failure.
    """
    os.makedirs(output_dir, exist_ok=True)

    def target(extension):
        return os.path.join(output_dir, f"{base_name}.{extension}")

    written = {"txt": target("txt"), "csv": target("csv")}
    write_txt(text, written["txt"])
    write_csv(text, written["csv"])

    pdf_target = target("pdf")
    written["pdf"] = pdf_target if write_pdf(text, pdf_target) else None
    return written
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
# ---------------------------------------------------------------------------
|
|
296
|
+
# Main pipeline
|
|
297
|
+
# ---------------------------------------------------------------------------
|
|
298
|
+
|
|
299
|
+
def run_pipeline(image_path, language="eng", do_regions=False, debug_dir=None,
                 single_psm=None, pixel_region=None, output_dir=None):
    """Run the full multi-variant, multi-PSM OCR pipeline on one image.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        language: Tesseract language code.
        do_regions: Also OCR fixed header/body/footer bands of the image.
        debug_dir: If set, preprocessed images are written to this directory.
        single_psm: Use only this page-segmentation mode instead of 4, 6, 11.
        pixel_region: Optional ``(x, y, w, h)`` crop in original-image pixels;
            it is doubled internally to match the 2x upscale.
        output_dir: If set, TXT/CSV/PDF files for the best text are written
            to this directory.

    Returns:
        On success, a dict with the best result (``text``, ``confidence``,
        ``variant``, ``chars``, ``lines``, ``score``), ``image_size``,
        ``variants_tested`` and ``all_variants``, plus ``regions``,
        ``debug_dir`` and ``output_files`` when the matching option is set.
        On failure, a dict with a single ``error`` key.
    """
    img = cv2.imread(image_path)
    if img is None:
        return {"error": f"Could not load image: {image_path}"}

    h_orig, w_orig = img.shape[:2]
    gray_2x = upscale_2x(to_grayscale(img))

    if pixel_region:
        rx, ry, rw, rh = pixel_region
        # Negative offsets would silently wrap around through array slicing,
        # and a rectangle starting outside the image yields an empty crop
        # that later surfaces as a misleading "all variants failed" error.
        if rx < 0 or ry < 0 or rw <= 0 or rh <= 0 or rx >= w_orig or ry >= h_orig:
            return {
                "error": f"Region {rx},{ry},{rw},{rh} is outside the "
                         f"{w_orig}x{h_orig} image"
            }
        # Region coordinates are scaled to match the 2x upscale.
        gray_2x = extract_pixel_region(gray_2x, rx * 2, ry * 2, rw * 2, rh * 2)

    # Compare against None so an explicit PSM of 0 is not treated as "unset".
    psm_modes = [single_psm] if single_psm is not None else [4, 6, 11]

    if debug_dir:
        os.makedirs(debug_dir, exist_ok=True)

    all_results = {}
    best_key = None
    best_score = -1

    for vname, vfunc in ALL_VARIANTS.items():
        try:
            binary = vfunc(gray_2x)
        except Exception:
            # A variant that cannot process this image is skipped.
            continue

        if debug_dir:
            cv2.imwrite(os.path.join(debug_dir, f"full_{vname}.png"), binary)

        for psm in psm_modes:
            key = f"{vname}_psm{psm}"
            text, confidence, line_count = run_tesseract(binary, language, psm)
            score = compute_score(text, confidence, line_count)

            all_results[key] = {
                "text": text,
                "chars": len(text),
                "lines": line_count,
                "confidence": round(confidence, 1),
                "score": round(score, 1),
            }

            if score > best_score:
                best_score = score
                best_key = key

    if best_key is None:
        return {"error": "All OCR variants failed to produce output"}

    best = all_results[best_key]
    result = {
        "text": best["text"],
        "confidence": best["confidence"],
        "variant": best_key,
        "chars": best["chars"],
        "lines": best["lines"],
        "score": best["score"],
        "image_size": f"{w_orig}x{h_orig}",
        "variants_tested": len(all_results),
        "all_variants": all_results,
    }

    if do_regions:
        # Vertical bands as (start %, end %) of the image height; adjacent
        # bands overlap by 5%.
        region_defs = {
            "header": (0, 35),
            "body": (30, 80),
            "footer": (75, 100),
        }
        regions = {}

        for rname, (y_start, y_end) in region_defs.items():
            region_gray = extract_region(gray_2x, y_start, y_end)

            if debug_dir:
                cv2.imwrite(os.path.join(debug_dir, f"region_{rname}.png"), region_gray)

            region_best = ""
            region_best_score = -1

            # Only a subset of variants, at PSM 6, is tried per region.
            for vname in ["otsu", "denoise", "adaptive_fine", "sharpen_unsharp"]:
                if vname not in ALL_VARIANTS:
                    continue
                try:
                    binary = ALL_VARIANTS[vname](region_gray)
                except Exception:
                    continue

                if debug_dir:
                    cv2.imwrite(
                        os.path.join(debug_dir, f"region_{rname}_{vname}.png"),
                        binary,
                    )

                text, conf, lc = run_tesseract(binary, language, 6)
                score = compute_score(text, conf, lc)
                if score > region_best_score:
                    region_best_score = score
                    region_best = text

            regions[rname] = region_best

        result["regions"] = regions

    if debug_dir:
        result["debug_dir"] = debug_dir

    if output_dir:
        base_name = Path(image_path).stem
        result["output_files"] = write_all_outputs(best["text"], base_name, output_dir)

    return result
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def run_batch(images_dir, language="eng", do_regions=False, debug_dir=None,
              output_dir=None):
    """Run the OCR pipeline on every image file directly inside a directory.

    Args:
        images_dir: Directory to scan (not recursive) for files whose suffix
            is in ``IMAGE_EXTENSIONS``.
        language: Tesseract language code.
        do_regions: Forwarded to ``run_pipeline``.
        debug_dir: If set, each image gets a sub-directory named after its
            stem for debug images.
        output_dir: Where per-image outputs and the summary are written;
            defaults to ``<images_dir>/ocr_out``.

    Returns:
        A dict with ``batch``, ``images_processed``, ``output_dir``,
        ``summary`` (path of the Markdown report) and ``results`` (compact
        per-image results), or a dict with a single ``error`` key when the
        directory is invalid or holds no images.
    """
    images_dir = os.path.abspath(images_dir)
    if not os.path.isdir(images_dir):
        return {"error": f"Not a directory: {images_dir}"}

    # Skip sub-directories that merely carry an image-like suffix.
    image_files = sorted(
        name for name in os.listdir(images_dir)
        if Path(name).suffix.lower() in IMAGE_EXTENSIONS
        and os.path.isfile(os.path.join(images_dir, name))
    )
    if not image_files:
        return {"error": f"No image files found in {images_dir}"}

    # Created only once there is something to write, so a failed run does
    # not leave an empty output directory behind.
    out_dir = output_dir or os.path.join(images_dir, "ocr_out")
    os.makedirs(out_dir, exist_ok=True)

    batch_results = {}
    for img_file in image_files:
        img_path = os.path.join(images_dir, img_file)
        img_debug = os.path.join(debug_dir, Path(img_file).stem) if debug_dir else None
        result = run_pipeline(
            img_path,
            language=language,
            do_regions=do_regions,
            debug_dir=img_debug,
            output_dir=out_dir,
        )
        # Compact per-image result (all_variants is omitted from the summary).
        batch_results[img_file] = {
            "text": result.get("text", ""),
            "confidence": result.get("confidence", 0),
            "variant": result.get("variant", ""),
            "chars": result.get("chars", 0),
            "lines": result.get("lines", 0),
            "output_files": result.get("output_files"),
            "error": result.get("error"),
        }

    def cell(value):
        # Keep pipes and line breaks from breaking the Markdown table layout.
        return str(value).replace("|", "\\|").replace("\n", " ")

    summary_path = os.path.join(out_dir, "OCR_PROCESSING_SUMMARY.md")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write("# OCR Processing Summary Report\n\n")
        f.write(f"**Source:** `{images_dir}`\n\n")
        f.write("## Processed Documents\n\n")
        f.write("| Document | Lines | Chars | Confidence | Variant |\n")
        f.write("|----------|-------|-------|------------|----------|\n")
        for img, data in batch_results.items():
            if data.get("error"):
                # Failed images report the error text in the last column.
                f.write(f"| {cell(img)} | ERROR | - | - | {cell(data['error'])} |\n")
            else:
                f.write(
                    f"| {cell(img)} | {data['lines']} | {data['chars']} "
                    f"| {data['confidence']}% | {cell(data['variant'])} |\n"
                )

    return {
        "batch": True,
        "images_processed": len(batch_results),
        "output_dir": out_dir,
        "summary": summary_path,
        "results": batch_results,
    }
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def main():
    """Command-line entry point: parse arguments, run OCR and print the result.

    A directory argument (or ``--batch``) selects batch mode; otherwise a
    single image is processed. Results go to stdout as JSON or plain text.
    The process exits with status 1 whenever the result carries an ``error``,
    in both output formats, so callers can rely on the exit code.
    """
    parser = argparse.ArgumentParser(
        description="Advanced multi-variant OCR pipeline for open-agents"
    )
    parser.add_argument(
        "image",
        help="Path to image file, or directory for --batch mode",
    )
    parser.add_argument("--language", "-l", default="eng",
                        help="OCR language (default: eng)")
    parser.add_argument("--regions", action="store_true",
                        help="Also OCR header/body/footer regions")
    parser.add_argument("--debug-dir",
                        help="Save preprocessed images to this directory")
    parser.add_argument("--psm", type=int, choices=[4, 6, 11],
                        help="Use single PSM mode instead of all 3")
    parser.add_argument("--region",
                        help="Crop region before OCR: x,y,w,h in pixels")
    parser.add_argument("--output", choices=["json", "text"], default="json",
                        help="Stdout output format (default: json)")
    parser.add_argument("--output-dir",
                        help="Write TXT + CSV + PDF outputs to this directory")
    parser.add_argument("--batch", action="store_true",
                        help="Process all images in a directory")

    args = parser.parse_args()

    # Batch mode
    if args.batch or os.path.isdir(args.image):
        if args.psm is not None or args.region:
            # Batch processing takes neither option; say so instead of
            # dropping them silently. stderr keeps stdout parseable.
            print("WARNING: --psm and --region are ignored in batch mode",
                  file=sys.stderr)

        result = run_batch(
            args.image,
            language=args.language,
            do_regions=args.regions,
            debug_dir=args.debug_dir,
            output_dir=args.output_dir,
        )
        failed = "error" in result
        if args.output != "text":
            print(json.dumps(result, indent=2))
        elif failed:
            print(f"ERROR: {result['error']}", file=sys.stderr)
        else:
            print(f"Processed {result['images_processed']} images → {result['output_dir']}")
        sys.exit(1 if failed else 0)

    # Single image mode
    if not os.path.isfile(args.image):
        print(json.dumps({"error": f"File not found: {args.image}"}))
        sys.exit(1)

    pixel_region = None
    if args.region:
        try:
            pixel_region = tuple(int(x) for x in args.region.split(","))
            if len(pixel_region) != 4:
                raise ValueError
        except ValueError:
            print(json.dumps({"error": "Region must be x,y,w,h (4 integers)"}))
            sys.exit(1)

    result = run_pipeline(
        args.image,
        language=args.language,
        do_regions=args.regions,
        debug_dir=args.debug_dir,
        single_psm=args.psm,
        pixel_region=pixel_region,
        output_dir=args.output_dir,
    )

    failed = "error" in result
    if args.output != "text":
        print(json.dumps(result, indent=2))
    elif failed:
        print(f"ERROR: {result['error']}", file=sys.stderr)
    else:
        print(result["text"])

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Headless Moondream Station launcher for open-agents.
|
|
4
|
+
|
|
5
|
+
Starts the Moondream vision model REST API server on port 2020 without
|
|
6
|
+
the interactive REPL. Designed to be auto-launched by the VisionTool.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python start-moondream.py [--port 2020] [--host 127.0.0.1]
|
|
10
|
+
|
|
11
|
+
Environment:
|
|
12
|
+
HF_TOKEN - HuggingFace token (optional, for gated models)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import sys
|
|
16
|
+
import signal
|
|
17
|
+
import time
|
|
18
|
+
import argparse
|
|
19
|
+
|
|
20
|
+
def main():
    """Start the Moondream Station REST server without the interactive REPL.

    Parses ``--port``, ``--host`` and ``--model``, loads the production
    manifest, switches to the chosen model, starts the service, prints
    ``READY`` and then polls until the service stops or a SIGTERM/SIGINT
    arrives. Any setup failure is reported on stderr with exit status 1.
    """
    import os

    def fail(message):
        # Every fatal setup error shares the same reporting path.
        print(message, file=sys.stderr)
        sys.exit(1)

    parser = argparse.ArgumentParser(description="Start Moondream Station REST server")
    parser.add_argument("--port", type=int, default=2020, help="Server port (default: 2020)")
    parser.add_argument("--host", default="127.0.0.1", help="Server host (default: 127.0.0.1)")
    parser.add_argument("--model", default=None, help="Model to use (default: auto-detect, prefers non-gated moondream-2)")
    args = parser.parse_args()

    try:
        from moondream_station.core.config import ConfigManager
        from moondream_station.core.manifest import ManifestManager
        from moondream_station.core.models import ModelManager
        from moondream_station.core.service import ServiceManager
        from moondream_station.core.analytics import Analytics
        from moondream_station.session import SessionState
        from moondream_station.ui.display import Display
    except ImportError:
        fail("ERROR: moondream-station not installed. Install with: pip install moondream-station")

    print("[moondream] Initializing...", flush=True)

    config = ConfigManager()
    config.set("service_host", args.host)
    config.set("service_port", args.port)

    manifest_manager = ManifestManager(config)
    analytics = Analytics(config, manifest_manager)
    display = Display()
    models = ModelManager(config, manifest_manager)
    session_state = SessionState()

    manifest_url = "https://m87-md-prod-assets.s3.us-west-2.amazonaws.com/station/mds2/production_manifest.json"
    print("[moondream] Loading manifest...", flush=True)
    try:
        manifest_manager.load_manifest(manifest_url, analytics, display)
    except Exception as e:
        fail(f"ERROR: Failed to load manifest: {e}")

    # Model choice: an explicit --model wins; otherwise the manifest default
    # is used when an HF token is configured, and "moondream-2" (described
    # by the CLI help as non-gated) when it is not.
    model_name = args.model
    if not model_name:
        if os.environ.get("HF_TOKEN") or config.get("hf_token"):
            model_name = manifest_manager.get_available_default_model()
        else:
            model_name = "moondream-2"
    if not model_name:
        model_name = manifest_manager.get_available_default_model()
    if not model_name:
        fail("ERROR: No model available")

    print(f"[moondream] Switching to model: {model_name}", flush=True)
    if not models.switch_model(model_name, display):
        fail(f"ERROR: Failed to switch to model {model_name}")

    service = ServiceManager(config, manifest_manager, session_state, analytics)
    print(f"[moondream] Starting REST server on {args.host}:{args.port}...", flush=True)

    if not service.start(model_name, args.port):
        fail("ERROR: Failed to start REST server")

    print(f"[moondream] Server running at http://{args.host}:{args.port}/v1", flush=True)
    print("[moondream] Endpoints: /v1/caption, /v1/query, /v1/detect, /v1/point", flush=True)
    # NOTE(review): "READY" looks like a marker the launching process waits
    # for — confirm against the caller.
    print("READY", flush=True)

    def shutdown(signum, frame):
        print("\n[moondream] Shutting down...", flush=True)
        service.stop()
        sys.exit(0)

    for signal_number in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signal_number, shutdown)

    # Poll once per second until the service reports it has stopped.
    try:
        while service.is_running():
            time.sleep(1)
    except KeyboardInterrupt:
        shutdown(None, None)

    print("[moondream] Server stopped", flush=True)


if __name__ == "__main__":
    main()
|