preppergpt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +93 -0
  3. package/bin/preppergpt.js +8 -0
  4. package/compose/preppergpt.yaml +232 -0
  5. package/docs/hardware.md +15 -0
  6. package/docs/model-sources.md +12 -0
  7. package/docs/preppergpt-local-parity-map.md +16 -0
  8. package/docs/publishing.md +24 -0
  9. package/installer/cli.mjs +225 -0
  10. package/installer/install.sh +18 -0
  11. package/installer/lib/detect.mjs +128 -0
  12. package/installer/lib/paths.mjs +26 -0
  13. package/installer/lib/planner.mjs +175 -0
  14. package/installer/lib/render.mjs +76 -0
  15. package/installer/lib/util.mjs +84 -0
  16. package/package.json +48 -0
  17. package/profiles/models.json +277 -0
  18. package/services/comfyui/flux-kontext-edit-openwebui-nodes.json +46 -0
  19. package/services/comfyui/flux-kontext-edit-openwebui-workflow.json +245 -0
  20. package/services/comfyui/flux-kontext-mask-edit-openwebui-nodes.json +51 -0
  21. package/services/comfyui/flux-kontext-mask-edit-openwebui-workflow.json +322 -0
  22. package/services/comfyui/flux2-klein-9b-openwebui-nodes.json +58 -0
  23. package/services/comfyui/flux2-klein-9b-openwebui-workflow.json +141 -0
  24. package/services/comfyui/image-invert-edit-openwebui-nodes.json +23 -0
  25. package/services/comfyui/image-invert-edit-openwebui-workflow.json +52 -0
  26. package/services/deep-research/Dockerfile +7 -0
  27. package/services/deep-research/app.py +1913 -0
  28. package/services/local-agent/Dockerfile +17 -0
  29. package/services/local-agent/app.py +2311 -0
  30. package/services/local-scheduler/Dockerfile +8 -0
  31. package/services/local-scheduler/app.py +15774 -0
  32. package/services/local-vision/Dockerfile +11 -0
  33. package/services/local-vision/app.py +888 -0
  34. package/services/searxng/settings.yml +16 -0
  35. package/themes/preppergpt/custom.css +15 -0
  36. package/themes/preppergpt/static/favicon.svg +5 -0
  37. package/themes/preppergpt/static/logo.svg +6 -0
@@ -0,0 +1,888 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import base64
5
+ import json
6
+ import os
7
+ import re
8
+ import time
9
+ import uuid
10
+ from dataclasses import dataclass
11
+ from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
12
+ from io import BytesIO
13
+ from pathlib import Path
14
+ from threading import Lock
15
+ from urllib import parse, request
16
+
17
+ from PIL import Image
18
+
19
+
20
def _env_flag(name: str) -> bool:
    """Read an on-by-default switch; only "0", "false" or "no" (any case) turn it off."""
    return os.environ.get(name, "1").lower() not in {"0", "false", "no"}


_env = os.environ.get

# Moondream2 backend: public model id plus the Hugging Face repo and revision to load.
MODEL_ID = _env("LOCAL_VISION_MODEL_ID", "local-vision-moondream2")
HF_MODEL = _env("LOCAL_VISION_HF_MODEL", "vikhyatk/moondream2")
HF_REVISION = _env("LOCAL_VISION_HF_REVISION", "2025-01-09")

# Address this HTTP service binds to.
HOST = _env("LOCAL_VISION_HOST", "127.0.0.1")
PORT = int(_env("LOCAL_VISION_PORT", "18044"))

# Base URL prepended to image references that start with "/".
OPENWEBUI_BASE_URL = _env("LOCAL_VISION_OPENWEBUI_BASE_URL", "http://127.0.0.1:8080")

# Passed to from_pretrained as local_files_only.
LOCAL_FILES_ONLY = _env_flag("LOCAL_VISION_LOCAL_FILES_ONLY")

# Upper bound, in bytes, for any single image accepted by the service.
MAX_IMAGE_BYTES = int(_env("LOCAL_VISION_MAX_IMAGE_BYTES", str(12 * 1024 * 1024)))

# Only reported by the /health endpoint; nothing in this file enforces it.
MODEL_LOAD_TIMEOUT_HINT = int(_env("LOCAL_VISION_LOAD_TIMEOUT_HINT_SECONDS", "600"))

# Requested device: "cpu", "cuda" or "auto"; anything else ends up on the CPU.
DEVICE_SETTING = _env("LOCAL_VISION_DEVICE", "cpu").lower()

# OCR assist: switch, minimum accepted line confidence, and prompt text budget.
ENABLE_OCR = _env_flag("LOCAL_VISION_ENABLE_OCR")
OCR_MIN_CONFIDENCE = float(_env("LOCAL_VISION_OCR_MIN_CONFIDENCE", "0.45"))
OCR_MAX_CHARS = int(_env("LOCAL_VISION_OCR_MAX_CHARS", "2200"))

# Optional second backend served through an Ollama server.
OLLAMA_ENABLED = _env_flag("LOCAL_VISION_OLLAMA_ENABLED")
OLLAMA_MODEL_ID = _env("LOCAL_VISION_OLLAMA_MODEL_ID", "local-vision-gemma4-12b")
OLLAMA_MODEL = _env("LOCAL_VISION_OLLAMA_MODEL", "gemma4:12b")
OLLAMA_URL = _env("LOCAL_VISION_OLLAMA_URL", "http://127.0.0.1:11434").rstrip("/")
OLLAMA_TIMEOUT_SECONDS = int(_env("LOCAL_VISION_OLLAMA_TIMEOUT_SECONDS", "600"))

# Lazily populated process-wide state; each lock guards the matching loader and engine calls.
MODEL = None
DEVICE = "cpu"
MODEL_LOCK = Lock()
OCR_ENGINE = None
OCR_ERROR = None
OCR_LOCK = Lock()
45
+
46
+
47
@dataclass(frozen=True)
class OcrLine:
    """One line of text recognized by the OCR engine (immutable)."""

    # Recognized text with whitespace runs collapsed to single spaces.
    text: str
    # Recognition confidence, or None when the engine result carried no usable score.
    score: float | None
    # Bounding box as (min x, min y, max x, max y), or None when no box could be parsed.
    box: tuple[float, float, float, float] | None
52
+
53
+
54
def now() -> int:
    """Return the current Unix time truncated to whole seconds."""
    seconds = time.time()
    return int(seconds)
56
+
57
+
58
def read_json(handler: BaseHTTPRequestHandler) -> dict:
    """Read and decode the JSON object sent as the request body.

    Args:
        handler: The active request handler; its ``headers`` and ``rfile`` are read.

    Returns:
        The decoded JSON object, or ``{}`` when the body is absent or empty.

    Raises:
        ValueError: If Content-Length is not a non-negative integer, the body is
            not valid UTF-8 JSON, or the JSON value is not an object.
    """
    length = int(handler.headers.get("Content-Length", "0") or "0")
    if length < 0:
        # read(-1) would block until the client closes the connection.
        raise ValueError("invalid Content-Length")
    raw = handler.rfile.read(length) if length else b"{}"
    payload = json.loads(raw.decode("utf-8") or "{}")
    if not isinstance(payload, dict):
        # Callers use payload.get(); reject arrays and scalars with a clear message.
        raise ValueError("request body must be a JSON object")
    return payload
62
+
63
+
64
def send_json(handler: BaseHTTPRequestHandler, status: int, payload: dict):
    """Write *payload* as a complete UTF-8 JSON response with the given status.

    Non-ASCII characters are emitted as-is, and a wildcard
    Access-Control-Allow-Origin header is always included.
    """
    encoded = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    headers = (
        ("Content-Type", "application/json; charset=utf-8"),
        ("Content-Length", str(len(encoded))),
        ("Access-Control-Allow-Origin", "*"),
    )
    handler.send_response(status)
    for header_name, header_value in headers:
        handler.send_header(header_name, header_value)
    handler.end_headers()
    handler.wfile.write(encoded)
72
+
73
+
74
def model_card() -> dict:
    """Build the model-list entry for the Moondream2 backend."""
    capabilities = {
        "vision": True,
        "file_upload": True,
        "file_context": False,
        "web_search": False,
        "image_generation": False,
        "code_interpreter": False,
        "ocr": ENABLE_OCR,
    }
    meta = {
        "description": "Local Moondream2 image understanding model with OCR assist for OpenWebUI.",
        "capabilities": capabilities,
    }
    info = {
        "id": MODEL_ID,
        "name": "Local Vision - Moondream2 + OCR",
        "meta": meta,
    }
    return {
        "id": MODEL_ID,
        "name": MODEL_ID,
        "object": "model",
        "created": now(),
        "owned_by": "local",
        "connection_type": "local",
        "info": info,
    }
99
+
100
+
101
def ollama_model_card() -> dict:
    """Build the model-list entry for the Ollama-backed vision model."""
    description = (
        f"Local Ollama vision model backed by {OLLAMA_MODEL}, exposed additively for stronger visual reasoning."
    )
    capabilities = {
        "vision": True,
        "file_upload": True,
        "file_context": False,
        "web_search": False,
        "image_generation": False,
        "code_interpreter": False,
        "ocr": ENABLE_OCR,
    }
    meta = {
        "description": description,
        "capabilities": capabilities,
        "backend": "ollama",
        "backend_model": OLLAMA_MODEL,
    }
    info = {
        "id": OLLAMA_MODEL_ID,
        "name": f"Local Vision - {OLLAMA_MODEL}",
        "meta": meta,
    }
    return {
        "id": OLLAMA_MODEL_ID,
        "name": OLLAMA_MODEL_ID,
        "object": "model",
        "created": now(),
        "owned_by": "local",
        "connection_type": "local",
        "info": info,
    }
130
+
131
+
132
def model_cards() -> list[dict]:
    """Return the cards of every exposed model: Moondream2 first, then Ollama when enabled."""
    moondream = model_card()
    if not OLLAMA_ENABLED:
        return [moondream]
    return [moondream, ollama_model_card()]
137
+
138
+
139
def load_model():
    """Load the Hugging Face model on first use and cache it in ``MODEL``.

    The cached model is returned without locking once it exists; otherwise the
    load happens under ``MODEL_LOCK`` after re-checking the cache, so concurrent
    first requests load it only once. ``DEVICE`` is updated to the device chosen.
    """
    global MODEL, DEVICE
    if MODEL is None:
        with MODEL_LOCK:
            if MODEL is None:
                # Heavy imports are deferred until a model is actually needed.
                import torch
                from transformers import AutoModelForCausalLM
                from transformers import PreTrainedModel

                # NOTE(review): presumably a compatibility shim for remote model code
                # that expects this attribute on newer/older transformers — confirm.
                if not hasattr(PreTrainedModel, "all_tied_weights_keys"):
                    PreTrainedModel.all_tied_weights_keys = property(lambda self: {})

                # CUDA is used only when requested ("cuda" or "auto") and available;
                # every other setting, including unknown values, resolves to the CPU.
                cuda_requested = DEVICE_SETTING in {"auto", "cuda"}
                DEVICE = "cuda" if cuda_requested and torch.cuda.is_available() else "cpu"

                dtype = torch.float16 if DEVICE == "cuda" else torch.float32
                loaded = AutoModelForCausalLM.from_pretrained(
                    HF_MODEL,
                    revision=HF_REVISION,
                    trust_remote_code=True,
                    local_files_only=LOCAL_FILES_ONLY,
                    torch_dtype=dtype,
                )
                MODEL = loaded.to(DEVICE).eval()
    return MODEL
178
+
179
+
180
def load_ocr_engine():
    """Return the shared RapidOCR engine, creating it on first use.

    Returns None when OCR is disabled or the engine cannot be created; in both
    cases ``OCR_ERROR`` records why ("disabled" or the exception text). A failed
    creation is not cached, so a later call tries again.
    """
    global OCR_ENGINE, OCR_ERROR
    if not ENABLE_OCR:
        OCR_ERROR = "disabled"
        return None

    if OCR_ENGINE is None:
        with OCR_LOCK:
            # Re-check under the lock: another thread may have finished loading.
            if OCR_ENGINE is None:
                try:
                    from rapidocr_onnxruntime import RapidOCR

                    OCR_ENGINE = RapidOCR()
                except Exception as exc:
                    OCR_ERROR = str(exc)
                    return None
                OCR_ERROR = None
    return OCR_ENGINE
200
+
201
+
202
def box_bounds(raw_box) -> tuple[float, float, float, float] | None:
    """Reduce an OCR box to ``(min_x, min_y, max_x, max_y)``.

    Two shapes are understood: a sequence of ``(x, y, ...)`` points, or a flat
    sequence whose first four entries are ``x1, y1, x2, y2``. Anything else,
    including an empty or non-sequence value, yields None.
    """
    if not isinstance(raw_box, (list, tuple)) or len(raw_box) == 0:
        return None

    corners = []
    for entry in raw_box:
        if not isinstance(entry, (list, tuple)) or len(entry) < 2:
            continue
        try:
            corners.append((float(entry[0]), float(entry[1])))
        except (TypeError, ValueError):
            # Unparseable points are skipped; the remaining ones still count.
            pass

    if corners:
        xs, ys = zip(*corners)
        return min(xs), min(ys), max(xs), max(ys)

    # No usable points: fall back to the flat x1, y1, x2, y2 layout.
    if len(raw_box) < 4:
        return None
    try:
        x1, y1, x2, y2 = map(float, raw_box[:4])
    except (TypeError, ValueError):
        return None
    return min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)
225
+
226
+
227
def normalize_ocr_item(item) -> OcrLine | None:
    """Convert one raw OCR result entry into an ``OcrLine``.

    Accepted shapes are a dict (text under "text"/"rec_text"/"label", score under
    "score"/"confidence"/"rec_score", box under "box"/"bbox"/"points") or a
    sequence laid out as ``(box, text, score)``, ``(text, score)`` or
    ``(box, text)``. Returns None for any other shape, for empty or non-string
    text, and for entries scoring below ``OCR_MIN_CONFIDENCE``.
    """
    if isinstance(item, dict):
        text = item.get("text") or item.get("rec_text") or item.get("label")
        raw_score = item.get("score", item.get("confidence", item.get("rec_score")))
        raw_box = item.get("box") or item.get("bbox") or item.get("points")
    elif isinstance(item, (list, tuple)) and len(item) >= 2:
        raw_box = item[0]
        if len(item) >= 3:
            text, raw_score = item[1], item[2]
        elif isinstance(item[0], str):
            # Two entries starting with a string: (text, score), no box.
            text, raw_score = item[0], item[1]
        else:
            text, raw_score = item[1], None
    else:
        return None

    if not isinstance(text, str):
        return None
    cleaned = re.sub(r"\s+", " ", text).strip()
    if not cleaned:
        return None

    score = None
    if raw_score is not None:
        try:
            score = float(raw_score)
        except (TypeError, ValueError):
            score = None
    # Entries without a usable score are kept; only known-low scores are dropped.
    if score is not None and score < OCR_MIN_CONFIDENCE:
        return None

    return OcrLine(text=cleaned, score=score, box=box_bounds(raw_box))
268
+
269
+
270
def normalize_ocr_result(result) -> list[OcrLine]:
    """Turn a raw OCR engine result into ``OcrLine`` objects sorted top-to-bottom.

    A non-empty tuple is unwrapped to its first element; anything that is not a
    list after that yields an empty list. Lines are ordered by the box's minimum
    y, then minimum x, with box-less lines treated as sitting at the origin.
    """
    if isinstance(result, tuple) and result:
        result = result[0]
    if not isinstance(result, list):
        return []

    parsed_lines = []
    for item in result:
        parsed = normalize_ocr_item(item)
        if parsed is not None:
            parsed_lines.append(parsed)

    def reading_order(line):
        bounds = line.box or (0, 0, 0, 0)
        return bounds[1], bounds[0]

    return sorted(parsed_lines, key=reading_order)
278
+
279
+
280
def extract_ocr_lines(image: Image.Image) -> list[OcrLine]:
    """Run OCR on *image* and return the normalized lines.

    Returns an empty list when no OCR engine is available.
    """
    engine = load_ocr_engine()
    if engine is None:
        return []

    import numpy as np

    # Engine calls are serialized through OCR_LOCK.
    with OCR_LOCK:
        pixels = np.array(image.convert("RGB"))
        result, _elapsed = engine(pixels)
    return normalize_ocr_result(result)
290
+
291
+
292
def question_wants_ocr(question: str) -> bool:
    """Decide whether *question* looks like it needs the text visible in the image.

    Most keywords are matched as case-insensitive substrings so inflected forms
    ("reading", "labels", "maximum") still trigger. The short keywords "id" and
    "ocr" are matched as standalone words only, because as substrings they fire
    on unrelated words such as "did", "wide", "inside" or "mediocre".

    Returns:
        True when any keyword matches; False for an empty question.
    """
    if not question:
        return False
    lowered = question.lower()
    substring_keywords = (
        "barcode",
        "chart",
        "code",
        "diagram",
        "extract",
        "field",
        "graph",
        "highest",
        "incident",
        "invoice",
        "label",
        "largest",
        "max",
        "month",
        "number",
        "read",
        "receipt",
        "screenshot",
        "serial",
        "shown",
        "table",
        "target",
        "text",
        "ticket",
        "transcribe",
        "value",
        "visible",
    )
    if any(keyword in lowered for keyword in substring_keywords):
        return True
    # Letters and digits on either side disqualify a match; "_" and "-" do not,
    # so "user_id" and "order-id" still count.
    return re.search(r"(?<![a-z0-9])(?:ids?|ocr)(?![a-z0-9])", lowered) is not None
328
+
329
+
330
def question_wants_color_square(question: str) -> bool:
    """Return True when the question mentions both "color" and "square" (any case)."""
    text = question.lower()
    return all(word in text for word in ("color", "square"))
333
+
334
+
335
def named_color_from_rgb(red: int, green: int, blue: int) -> str | None:
    """Name the dominant channel of an RGB value as "Red", "Green" or "Blue".

    Returns None when the strongest channel is below 80 or leads the
    second-strongest channel by less than 35, i.e. the color is too dark or too
    mixed to name.
    """
    # sorted() is stable, so equal channels keep red/green/blue order.
    ranked = sorted(
        (("red", red), ("green", green), ("blue", blue)),
        key=lambda channel: channel[1],
        reverse=True,
    )
    (name, strongest), (_, runner_up) = ranked[0], ranked[1]
    if strongest < 80:
        return None
    if strongest - runner_up < 35:
        return None
    return name.capitalize()
344
+
345
+
346
def dominant_non_background_color(image: Image.Image) -> str | None:
    """Name the average color of the central part of *image*, ignoring near-white pixels.

    A margin of one fifth of the width and height is skipped on every side, and
    pixels whose three channels all exceed 240 are treated as background. The
    rounded per-channel mean of the remaining pixels is passed to
    ``named_color_from_rgb``.

    Returns:
        The color name, or None when every sampled pixel is background or the
        mean color cannot be named.
    """
    rgb = image.convert("RGB")
    width, height = rgb.size
    left = width // 5
    right = width - left
    top = height // 5
    bottom = height - top

    # The pixel-access object avoids a method call per pixel, and running sums
    # avoid holding one tuple per pixel in memory for large images.
    pixel_at = rgb.load()
    count = 0
    red_total = green_total = blue_total = 0
    for y in range(top, bottom):
        for x in range(left, right):
            red, green, blue = pixel_at[x, y]
            if red > 240 and green > 240 and blue > 240:
                continue
            red_total += red
            green_total += green
            blue_total += blue
            count += 1
    if not count:
        return None

    return named_color_from_rgb(
        round(red_total / count),
        round(green_total / count),
        round(blue_total / count),
    )
367
+
368
+
369
def answer_color_square_question(question: str, images: list[Image.Image]) -> str | None:
    """Answer "what color is the square" style questions directly from pixel data.

    Returns None when the question is not about a colored square, when there are
    no images, or when any image has no nameable dominant color. One image yields
    the bare color name; several yield one "Image N: color" line each.
    """
    if not question_wants_color_square(question):
        return None

    colors = []
    for image in images:
        colors.append(dominant_non_background_color(image))

    if not colors or None in colors:
        return None
    if len(colors) == 1:
        return colors[0] or None

    labeled = []
    for index, color in enumerate(colors, start=1):
        if color:
            labeled.append(f"Image {index}: {color}")
    return "\n".join(labeled)
379
+
380
+
381
def ocr_text(lines: list[OcrLine]) -> str:
    """Join OCR lines with newlines, limited to about ``OCR_MAX_CHARS`` characters.

    When the text is too long it is cut at the limit. A line cut in the middle is
    dropped so the result ends on a whole line, unless it is the only line, in
    which case the truncated line is kept. A cut that lands exactly on a line
    boundary keeps every complete line before it.
    """
    text = "\n".join(line.text for line in lines)
    limit = max(0, OCR_MAX_CHARS)  # a negative setting means "no text", not a negative slice
    if len(text) <= limit:
        return text

    clipped = text[:limit]
    if text[limit] != "\n":
        # The cut fell inside a line: discard the partial tail.
        clipped = clipped.rsplit("\n", 1)[0]
    return clipped.strip()
386
+
387
+
388
def normalized_text(value: str) -> str:
    """Lower-case *value* and replace every run of characters outside a-z/0-9 with one space."""
    collapsed = re.sub(r"[^a-z0-9]+", " ", value.lower())
    return collapsed.strip()
390
+
391
+
392
def field_value_after_label(lines: list[OcrLine], label: str) -> str | None:
    """Find the value that follows a label line in OCR output.

    The first line whose normalized text equals the normalized *label* and that
    has at least one candidate wins. Candidates come from the next seven lines:
    when both the label and a candidate have boxes, the candidate must start at
    or below the label's top edge and be horizontally centered within
    ``max(120, 1.3 * label width)`` of the label's center, and is ranked by its
    vertical offset; otherwise it is ranked by the number of candidates collected
    so far. The lowest-ranked candidate's text is returned, or None if no label
    line produces a candidate.
    """
    wanted = normalized_text(label)
    for position, label_line in enumerate(lines):
        if normalized_text(label_line.text) != wanted:
            continue

        anchor = label_line.box
        scored: list[tuple[float, str]] = []
        for follower in lines[position + 1 : position + 8]:
            value = follower.text.strip()
            if not value:
                continue
            if not (anchor and follower.box):
                # No geometry to compare: rank by order of appearance.
                scored.append((float(len(scored)), value))
                continue

            lx1, ly1, lx2, _ly2 = anchor
            nx1, ny1, nx2, _ny2 = follower.box
            center_gap = abs(((nx1 + nx2) / 2) - ((lx1 + lx2) / 2))
            tolerance = max(120, (lx2 - lx1) * 1.3)
            if center_gap <= tolerance and ny1 >= ly1:
                scored.append((ny1 - ly1, value))

        if scored:
            # min() returns the first of equally ranked candidates.
            return min(scored, key=lambda entry: entry[0])[1]
    return None
416
+
417
+
418
# Two or more capital letters, optional further capitals/digits, an optional
# "-", "_" or space separator, then at least two digits.
CODE_PATTERN = re.compile(r"\b[A-Z]{2,}[A-Z0-9]*[-_ ]?\d{2,}\b")


def code_candidates(text: str) -> list[str]:
    """Return the distinct code-like tokens in *text*, in order of first appearance.

    Matching is done on the upper-cased text; underscores become hyphens and
    whitespace inside a match is removed.
    """
    seen: dict[str, None] = {}
    for hit in CODE_PATTERN.finditer(text.upper()):
        hyphenated = hit.group(0).replace("_", "-")
        seen.setdefault(re.sub(r"\s+", "", hyphenated), None)
    return list(seen)
428
+
429
+
430
def answer_code_question(question: str, lines: list[OcrLine]) -> str | None:
    """Answer questions asking for a code, serial, ticket or ID from OCR lines.

    For "target code" questions, the value following a "Target code" label is
    returned (reduced to its first code-like token when it has one). Otherwise,
    when the question mentions a code, serial, ticket or ID, the last code-like
    token on a line starting with "code " wins, then the first code-like token
    anywhere in the OCR text.

    Returns:
        The extracted code, or None when the question is not about a code or no
        code-like token was found.
    """
    lowered = question.lower()
    text = ocr_text(lines)
    if "target code" in lowered:
        value = field_value_after_label(lines, "target code")
        if value:
            candidates = code_candidates(value)
            return candidates[0] if candidates else value

    # "id" must be a standalone word: as a substring it also matched "did",
    # "inside" or "video" and turned unrelated questions into code lookups.
    mentions_id = re.search(r"(?<![a-z0-9])ids?(?![a-z0-9])", lowered) is not None
    if mentions_id or any(word in lowered for word in ("code", "serial", "ticket")):
        for line in lines:
            if normalized_text(line.text).startswith("code "):
                candidates = code_candidates(line.text)
                if candidates:
                    return candidates[-1]
        candidates = code_candidates(text)
        if candidates:
            return candidates[0]
    return None
449
+
450
+
451
def center(box: tuple[float, float, float, float]) -> tuple[float, float]:
    """Return the midpoint ``(x, y)`` of an ``(x1, y1, x2, y2)`` box."""
    left, top, right, bottom = box
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2
    return mid_x, mid_y
454
+
455
+
456
def answer_highest_labeled_value(question: str, lines: list[OcrLine]) -> str | None:
    """Answer "which month/quarter is highest" questions from OCR lines with boxes.

    Labels are three-letter month abbreviations or Q1-Q4; numbers are plain
    integers or decimals. Each label is paired with the horizontally nearest
    number whose center has a smaller y than the label's center (presumably
    "above" in image coordinates) and lies within 52 units of it on the x axis.
    The label paired with the largest number is returned.

    Returns:
        The winning label text, or None when the question does not ask for a
        maximum or no label could be paired with a number.
    """
    lowered = question.lower()
    triggers = ("highest", "largest", "maximum", "max", "most")
    if not any(word in lowered for word in triggers):
        return None

    month_names = {"jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"}
    labels: list[tuple[str, tuple[float, float, float, float]]] = []
    numbers: list[tuple[float, tuple[float, float, float, float]]] = []

    for line in lines:
        if not line.box:
            continue
        cleaned = re.sub(r"[^A-Za-z0-9.]", "", line.text)
        is_month = cleaned.lower() in month_names
        is_quarter = re.fullmatch(r"Q[1-4]", cleaned.upper()) is not None
        if is_month or is_quarter:
            labels.append((cleaned, line.box))
        if re.fullmatch(r"\d+(?:\.\d+)?", cleaned):
            numbers.append((float(cleaned), line.box))

    if not labels or not numbers:
        return None

    best: tuple[float, str] | None = None
    for label, label_box in labels:
        label_x, label_y = center(label_box)
        nearest: tuple[float, float] | None = None
        for value, number_box in numbers:
            number_x, number_y = center(number_box)
            if number_y >= label_y:
                continue
            gap = abs(number_x - label_x)
            # Strict "<" keeps the first of equally distant numbers.
            if gap <= 52 and (nearest is None or gap < nearest[0]):
                nearest = (gap, value)
        if nearest is None:
            continue
        # Strict ">" keeps the first label among equal values.
        if best is None or nearest[1] > best[0]:
            best = (nearest[1], label)

    return best[1] if best is not None else None
498
+
499
+
500
def answer_from_ocr(question: str, lines: list[OcrLine]) -> str | None:
    """Try to answer *question* from OCR lines alone, without the vision model.

    The code answerer runs first, then the highest-labeled-value answerer; if
    neither answers and the question asks for a transcription, the OCR text
    itself is returned. None means the vision model is needed.
    """
    if not lines:
        return None

    direct = answer_code_question(question, lines) or answer_highest_labeled_value(question, lines)
    if direct:
        return direct

    lowered = question.lower()
    transcription_phrases = ("read all", "extract text", "transcribe", "what text")
    if not any(phrase in lowered for phrase in transcription_phrases):
        return None
    return ocr_text(lines) or None
514
+
515
+
516
def assisted_question(question: str, lines: list[OcrLine], image_index: int | None = None) -> str:
    """Append the detected OCR text to *question* as extra context for the model.

    Returns *question* unchanged when there is no OCR text. With an empty
    question, a describe-the-image prompt built around the OCR text is returned
    instead. *image_index*, when given, is mentioned in the OCR heading.
    """
    text = ocr_text(lines)
    if not text:
        return question

    if image_index is None:
        heading = "Detected OCR text"
    else:
        heading = f"Detected OCR text for image {image_index}"
    context = f"{heading}:\n{text}\n\n"

    if not question:
        return context + "Describe the image and mention relevant detected text."
    return f"{question}\n\n" + context + "Use the OCR text when it is relevant. Answer concisely."
526
+
527
+
528
def parse_data_url(url: str) -> bytes:
    """Decode the payload of a ``data:`` URL.

    Media-type parameters before the payload (for example
    ``data:image/png;charset=utf-8;base64,...``) are accepted; the media type
    itself is not inspected. The payload is base64-decoded when the ``;base64``
    marker directly precedes the comma and percent-decoded otherwise.

    Raises:
        ValueError: If *url* is not a data URL, the base64 payload is malformed,
            or the decoded payload exceeds ``MAX_IMAGE_BYTES``.
    """
    # Everything up to the first comma is the media type with optional
    # parameters; the lazy group lets a trailing ";base64" be captured apart.
    match = re.match(r"^data:([^,]*?)(;base64)?,(.*)$", url, re.DOTALL)
    if not match:
        raise ValueError("invalid data URL")
    is_base64 = bool(match.group(2))
    payload = match.group(3)
    if is_base64:
        raw = base64.b64decode(payload)
    else:
        raw = parse.unquote_to_bytes(payload)
    if len(raw) > MAX_IMAGE_BYTES:
        raise ValueError("image is too large")
    return raw
541
+
542
+
543
def read_limited(resp) -> bytes:
    """Read *resp* to the end in 64 KiB chunks, refusing oversized bodies.

    Raises:
        ValueError: As soon as more than ``MAX_IMAGE_BYTES`` bytes have been read.
    """
    received = bytearray()
    while True:
        piece = resp.read(65536)
        if not piece:
            return bytes(received)
        if len(received) + len(piece) > MAX_IMAGE_BYTES:
            raise ValueError("image is too large")
        received += piece
555
+
556
+
557
def _read_local_image(path: Path) -> bytes:
    """Read a regular file after checking its size against ``MAX_IMAGE_BYTES``."""
    size = path.stat().st_size  # raises FileNotFoundError for a missing file
    if not path.is_file():
        # Directories and special files (devices, FIFOs) are never images, and
        # some of them would never finish reading.
        raise ValueError("unsupported image URL")
    if size > MAX_IMAGE_BYTES:
        raise ValueError("image is too large")
    raw = path.read_bytes()
    if len(raw) > MAX_IMAGE_BYTES:  # the file may have grown since stat()
        raise ValueError("image is too large")
    return raw


def fetch_image_bytes(url: str) -> bytes:
    """Fetch the raw bytes of an image reference.

    Supported forms: ``data:`` URLs, paths starting with "/" (resolved against
    ``OPENWEBUI_BASE_URL``), ``file://`` URLs, ``http(s)://`` URLs, and paths of
    existing local files.

    NOTE(security): *url* comes from the request's message content. ``file://``
    URLs and bare paths let a caller make this process read local files, and
    ``http(s)://`` URLs make it issue requests to arbitrary hosts. Confirm the
    service is only reachable by trusted callers, or restrict these schemes.

    Raises:
        ValueError: If the reference is unsupported or the image exceeds
            ``MAX_IMAGE_BYTES``.
        OSError: If a local file or remote URL cannot be read.
    """
    if url.startswith("data:"):
        return parse_data_url(url)
    if url.startswith("/"):
        url = OPENWEBUI_BASE_URL.rstrip("/") + url
    if url.startswith("file://"):
        # The URL path is percent-encoded ("my%20image.png"); decode it first.
        return _read_local_image(Path(parse.unquote(parse.urlparse(url).path)))
    if url.startswith(("http://", "https://")):
        req = request.Request(url, headers={"User-Agent": "openwebui-local-vision/0.1"})
        with request.urlopen(req, timeout=60) as resp:
            return read_limited(resp)

    try:
        path = Path(url)
        found = path.exists()
    except (OSError, ValueError):
        # Over-long or otherwise invalid strings (e.g. raw base64) are not paths.
        found = False
    if found:
        return _read_local_image(path)
    raise ValueError("unsupported image URL")
579
+
580
+
581
def image_from_url(url: str) -> Image.Image:
    """Fetch the image referenced by *url* and return it converted to RGB."""
    source = BytesIO(fetch_image_bytes(url))
    return Image.open(source).convert("RGB")
585
+
586
+
587
def image_to_png_base64(image: Image.Image) -> str:
    """Encode *image* as PNG and return the bytes as an ASCII base64 string."""
    sink = BytesIO()
    image.save(sink, format="PNG")
    png_bytes = sink.getvalue()
    return base64.b64encode(png_bytes).decode("ascii")
591
+
592
+
593
def post_json(url: str, payload: dict, timeout: int) -> dict:
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    An empty response body decodes to ``{}``. *timeout* is in seconds and is
    passed to ``urlopen``.
    """
    headers = {"Content-Type": "application/json", "User-Agent": "openwebui-local-vision/0.3"}
    encoded = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    req = request.Request(url, data=encoded, headers=headers, method="POST")
    with request.urlopen(req, timeout=timeout) as resp:
        response_bytes = resp.read()
    response_text = response_bytes.decode("utf-8")
    return json.loads(response_text or "{}")
604
+
605
+
606
def content_to_text_and_images(messages: list[dict]) -> tuple[str, list[str]]:
    """Extract the question text and image references from the latest user message.

    Messages are scanned from newest to oldest; the first user message that
    yields any text or image ends the scan. Content may be a plain string or a
    list of parts, where "text"/"input_text" parts contribute text and
    "image_url"/"input_image" parts contribute an image reference (a string, or
    a dict with a "url" key). Entries that are not dicts are ignored.

    Returns:
        ``(question, image_urls)``: the non-empty text parts joined with
        newlines in the order they appear in the message, and the image
        references in order. Both are empty when nothing usable is found.
    """
    question_parts: list[str] = []
    image_urls: list[str] = []
    if not isinstance(messages, (list, tuple)):
        return "", image_urls

    for message in reversed(messages):
        # Malformed entries must not abort the whole request.
        if not isinstance(message, dict) or message.get("role") != "user":
            continue
        content = message.get("content", "")
        if isinstance(content, str):
            question_parts.append(content)
        elif isinstance(content, list):
            for part in content:
                if not isinstance(part, dict):
                    continue
                part_type = part.get("type")
                if part_type in {"text", "input_text"}:
                    text = part.get("text")
                    if isinstance(text, str):
                        question_parts.append(text)
                elif part_type in {"image_url", "input_image"}:
                    raw = part.get("image_url") or part.get("image")
                    if isinstance(raw, dict):
                        raw = raw.get("url")
                    if isinstance(raw, str):
                        image_urls.append(raw)
        if question_parts or image_urls:
            break

    # All collected parts come from one message and are already in reading
    # order, so they are joined as-is rather than reversed.
    stripped = (part.strip() for part in question_parts)
    question = "\n".join(part for part in stripped if part).strip()
    return question, image_urls
636
+
637
+
638
def answer_single_image_question(model, image: Image.Image, question: str, max_tokens: int) -> str:
    """Ask the loaded model about one image, or caption it when there is no question.

    The model call runs under ``MODEL_LOCK``. A fixed fallback sentence is
    returned when the model produces an empty answer.
    """
    settings = {"max_tokens": max_tokens}

    with MODEL_LOCK:
        if question:
            raw_answer = model.query(image, question, settings=settings).get("answer", "")
        else:
            raw_answer = model.caption(image, length="normal", settings=settings).get("caption", "")

    return str(raw_answer).strip() or "I could not produce an image answer."
653
+
654
+
655
def answer_image_question(question: str, image_urls: list[str], max_tokens: int) -> str:
    """Answer *question* about the given images using the Moondream2 backend.

    Shortcuts are tried first: the pixel-based color-square answer and, for a
    single image, an answer derived from OCR alone. Otherwise the model is
    loaded and queried, with OCR text added to the prompt when the question
    calls for it. Several images are answered one by one, each with a share of
    *max_tokens* clamped to 16-128, and reported as "Image N: answer" lines.
    """
    images = [image_from_url(url) for url in image_urls]

    color_answer = answer_color_square_question(question, images)
    if color_answer:
        return color_answer

    wants_ocr = question_wants_ocr(question)
    ocr_lines_by_image = [extract_ocr_lines(image) if wants_ocr else [] for image in images]
    single_image = len(images) == 1

    if single_image and wants_ocr:
        direct = answer_from_ocr(question, ocr_lines_by_image[0])
        if direct:
            return direct

    # Loading is deferred until here so the shortcuts never pay for it.
    model = load_model()

    if single_image:
        prompt = assisted_question(question, ocr_lines_by_image[0]) if wants_ocr else question
        return answer_single_image_question(model, images[0], prompt, max_tokens)

    per_image_tokens = max(16, min(128, max_tokens // max(1, len(images))))
    answers = []
    for index, (image, ocr_lines) in enumerate(zip(images, ocr_lines_by_image), start=1):
        prompt = f"{question}\nAnswer for image {index} only." if question else ""
        if wants_ocr:
            prompt = assisted_question(prompt, ocr_lines, index)
        reply = answer_single_image_question(model, image, prompt, per_image_tokens)
        answers.append(f"Image {index}: {reply}")
    return "\n".join(answers)
687
+
688
+
689
def answer_ollama_image_question(question: str, image_urls: list[str], max_tokens: int) -> str:
    """Answer *question* about the given images through the Ollama chat API.

    For a single image an answer derived from OCR alone is returned when
    available. Otherwise all images are sent in one non-streaming chat request
    to ``OLLAMA_URL`` with temperature 0 and ``num_predict`` clamped to 16-1024;
    detected OCR text is appended to the prompt when the question calls for it.
    A fixed fallback sentence is returned when the reply carries no text.
    """
    images = [image_from_url(url) for url in image_urls]
    wants_ocr = question_wants_ocr(question)
    ocr_lines_by_image = [extract_ocr_lines(image) if wants_ocr else [] for image in images]
    single_image = len(images) == 1

    if single_image and wants_ocr:
        direct = answer_from_ocr(question, ocr_lines_by_image[0])
        if direct:
            return direct

    prompt = question or "Describe the image."
    if wants_ocr and single_image:
        prompt = assisted_question(prompt, ocr_lines_by_image[0])
    elif wants_ocr:
        ocr_blocks = []
        for index, lines in enumerate(ocr_lines_by_image, start=1):
            text = ocr_text(lines)
            if text:
                ocr_blocks.append(f"Image {index}:\n{text}")
        if ocr_blocks:
            prompt = "".join(
                [
                    f"{prompt}\n\nDetected OCR text:\n",
                    "\n\n".join(ocr_blocks),
                    "\n\nUse the OCR text when it is relevant. Answer concisely.",
                ]
            )

    user_message = {
        "role": "user",
        "content": prompt,
        "images": [image_to_png_base64(image) for image in images],
    }
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [user_message],
        "stream": False,
        "think": False,
        "options": {
            "temperature": 0,
            "num_predict": max(16, min(max_tokens, 1024)),
        },
    }
    data = post_json(f"{OLLAMA_URL}/api/chat", payload, OLLAMA_TIMEOUT_SECONDS)

    answer = ""
    if isinstance(data, dict):
        message = data.get("message")
        if isinstance(message, dict):
            answer = str(message.get("content") or "").strip()
        if not answer:
            # Fall back to the top-level "response" field when the chat message is empty.
            answer = str(data.get("response") or "").strip()
    return answer or "I could not produce an image answer."
742
+
743
+
744
def backend_for_model(model_id: str) -> str | None:
    """Map a requested model id to "moondream" or "ollama"; None when it is unknown.

    The Ollama id is only recognized while the Ollama backend is enabled.
    """
    if model_id == MODEL_ID:
        return "moondream"
    is_ollama = OLLAMA_ENABLED and model_id == OLLAMA_MODEL_ID
    return "ollama" if is_ollama else None
750
+
751
+
752
def chunk_payload(content: str, model_id: str, finish_reason=None) -> dict:
    """Build one ``chat.completion.chunk`` object for a streamed response.

    Empty *content* produces an empty delta. Every call generates a fresh
    chunk id.
    """
    delta = {"content": content} if content else {}
    choice = {"index": 0, "delta": delta, "finish_reason": finish_reason}
    chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
    return {
        "id": chunk_id,
        "object": "chat.completion.chunk",
        "created": now(),
        "model": model_id,
        "choices": [choice],
    }
760
+
761
+
762
class Handler(BaseHTTPRequestHandler):
    """HTTP handler exposing health, model-list and chat-completion endpoints.

    Routes:
        GET  /health                                  service and backend status
        GET  /v1/models, /models                      available model cards
        POST /v1/chat/completions, /chat/completions  answer a question about images
        OPTIONS *                                     CORS preflight
    """

    server_version = "openwebui-local-vision/0.3"

    def log_message(self, fmt, *args):
        """Print access-log lines to stdout, flushed immediately."""
        print("%s - - [%s] %s" % (self.client_address[0], self.log_date_time_string(), fmt % args), flush=True)

    def _send_bad_request(self, message: str):
        """Send a 400 response describing a malformed client request."""
        return send_json(self, 400, {"error": {"message": message, "type": "invalid_request_error"}})

    def do_OPTIONS(self):
        """Answer CORS preflight requests, allowing any origin."""
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Headers", "Authorization, Content-Type")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.end_headers()

    def do_GET(self):
        """Serve the health report and the model list; anything else is a 404."""
        path = parse.urlparse(self.path).path
        if path == "/health":
            return send_json(
                self,
                200,
                {
                    "status": "ok",
                    "model": MODEL_ID,
                    "backend": HF_MODEL,
                    "revision": HF_REVISION,
                    "loaded": MODEL is not None,
                    "device": DEVICE,
                    "models": [card["id"] for card in model_cards()],
                    "ollama": {
                        "enabled": OLLAMA_ENABLED,
                        "model_id": OLLAMA_MODEL_ID if OLLAMA_ENABLED else None,
                        "backend_model": OLLAMA_MODEL if OLLAMA_ENABLED else None,
                        "url": OLLAMA_URL if OLLAMA_ENABLED else None,
                    },
                    "ocr": {
                        "enabled": ENABLE_OCR,
                        "loaded": OCR_ENGINE is not None,
                        "backend": "rapidocr-onnxruntime" if ENABLE_OCR else None,
                        "error": OCR_ERROR,
                    },
                    "load_timeout_hint_seconds": MODEL_LOAD_TIMEOUT_HINT,
                },
            )
        if path in {"/v1/models", "/models"}:
            cards = model_cards()
            return send_json(self, 200, {"object": "list", "data": cards, "models": cards})
        return send_json(self, 404, {"error": "not found"})

    def do_POST(self):
        """Handle a chat-completion request.

        Malformed request bodies and a non-integer ``max_tokens`` are answered
        with 400, an unknown model with 404, and any other failure with 500.
        When ``stream`` is set, the whole answer is sent as a single SSE chunk
        followed by a stop chunk and ``[DONE]``.
        """
        path = parse.urlparse(self.path).path
        if path not in {"/v1/chat/completions", "/chat/completions"}:
            return send_json(self, 404, {"error": "not found"})

        try:
            try:
                payload = read_json(self)
            except ValueError as exc:
                # Covers bad Content-Length, invalid UTF-8 and invalid JSON.
                return self._send_bad_request(f"invalid JSON request body: {exc}")
            if not isinstance(payload, dict):
                return self._send_bad_request("request body must be a JSON object")

            requested_model = str(payload.get("model") or MODEL_ID)
            backend = backend_for_model(requested_model)
            if backend is None:
                return send_json(
                    self,
                    404,
                    {
                        "error": {
                            "message": f"unknown local vision model {requested_model!r}",
                            "type": "model_not_found",
                        }
                    },
                )

            question, image_urls = content_to_text_and_images(payload.get("messages") or [])
            if not image_urls:
                return send_json(
                    self,
                    400,
                    {
                        "error": {
                            "message": f"{requested_model} requires at least one image_url or input_image content part"
                        }
                    },
                )

            try:
                max_tokens = int(payload.get("max_tokens") or 128)
            except (TypeError, ValueError):
                return self._send_bad_request("max_tokens must be an integer")
            capped_tokens = max(8, min(max_tokens, 1024))
            if backend == "ollama":
                answer = answer_ollama_image_question(question, image_urls, capped_tokens)
            else:
                answer = answer_image_question(question, image_urls, min(capped_tokens, 512))

            if payload.get("stream"):
                self.send_response(200)
                self.send_header("Content-Type", "text/event-stream; charset=utf-8")
                self.send_header("Cache-Control", "no-cache")
                self.send_header("Connection", "keep-alive")
                self.send_header("Access-Control-Allow-Origin", "*")
                self.end_headers()
                event = "data: " + json.dumps(chunk_payload(answer, requested_model), ensure_ascii=False) + "\n\n"
                self.wfile.write(event.encode("utf-8"))
                done = "data: " + json.dumps(chunk_payload("", requested_model, "stop")) + "\n\n" + "data: [DONE]\n\n"
                self.wfile.write(done.encode("utf-8"))
                self.wfile.flush()
                return

            return send_json(
                self,
                200,
                {
                    "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
                    "object": "chat.completion",
                    "created": now(),
                    "model": requested_model,
                    "choices": [{"index": 0, "message": {"role": "assistant", "content": answer}, "finish_reason": "stop"}],
                    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
                },
            )
        except ConnectionError:
            # The client went away (broken pipe, reset, abort): nobody to answer.
            return
        except Exception as exc:
            try:
                return send_json(self, 500, {"error": {"message": str(exc), "type": "server_error"}})
            except ConnectionError:
                # The error report itself could not be delivered.
                return
879
+
880
+
881
def main():
    """Start the threaded HTTP server on HOST:PORT and serve until interrupted."""
    # The context manager closes the listening socket however serve_forever()
    # ends, including on Ctrl-C.
    with ThreadingHTTPServer((HOST, PORT), Handler) as server:
        print(f"local vision listening on http://{HOST}:{PORT}", flush=True)
        server.serve_forever()


if __name__ == "__main__":
    main()