npm - @elizaos/plugin-vision - Versions diffs - 2.0.0-alpha.9 → 2.0.3-beta.5 - Mend

@elizaos/plugin-vision 2.0.0-alpha.9 → 2.0.3-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (129) hide show

package/LICENSE +21 -0
package/README.md +112 -0
package/auto-enable.ts +29 -0
package/dist/action.d.ts +3 -0
package/dist/action.d.ts.map +1 -0
package/dist/audio-capture-stream.d.ts +42 -0
package/dist/audio-capture-stream.d.ts.map +1 -0
package/dist/audio-capture.d.ts +25 -0
package/dist/audio-capture.d.ts.map +1 -0
package/dist/computeruse-ocr-bridge.d.ts +50 -0
package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
package/dist/config.d.ts +68 -0
package/dist/config.d.ts.map +1 -0
package/dist/describe-backpressure.d.ts +90 -0
package/dist/describe-backpressure.d.ts.map +1 -0
package/dist/dirty-tile-describer.d.ts +102 -0
package/dist/dirty-tile-describer.d.ts.map +1 -0
package/dist/dirty-tile-scene.d.ts +56 -0
package/dist/dirty-tile-scene.d.ts.map +1 -0
package/dist/entity-tracker.d.ts +33 -0
package/dist/entity-tracker.d.ts.map +1 -0
package/dist/face-detector-ggml.d.ts +60 -0
package/dist/face-detector-ggml.d.ts.map +1 -0
package/dist/face-detector-mediapipe.d.ts +25 -0
package/dist/face-detector-mediapipe.d.ts.map +1 -0
package/dist/face-recognition-ggml.d.ts +94 -0
package/dist/face-recognition-ggml.d.ts.map +1 -0
package/dist/get-screen-elements.d.ts +90 -0
package/dist/get-screen-elements.d.ts.map +1 -0
package/dist/get-screen.d.ts +60 -0
package/dist/get-screen.d.ts.map +1 -0
package/dist/image/sharp-compat.d.ts +89 -0
package/dist/image/sharp-compat.d.ts.map +1 -0
package/dist/image-input.d.ts +15 -0
package/dist/image-input.d.ts.map +1 -0
package/dist/index.d.ts +4 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +7992 -6026
package/dist/index.js.map +42 -26
package/dist/lifecycle.d.ts +94 -0
package/dist/lifecycle.d.ts.map +1 -0
package/dist/mobile/capacitor-camera.d.ts +85 -0
package/dist/mobile/capacitor-camera.d.ts.map +1 -0
package/dist/native/doctr-ffi.d.ts +40 -0
package/dist/native/doctr-ffi.d.ts.map +1 -0
package/dist/native/yolo-ffi.d.ts +21 -0
package/dist/native/yolo-ffi.d.ts.map +1 -0
package/dist/ocr-host-windows.d.ts +34 -0
package/dist/ocr-host-windows.d.ts.map +1 -0
package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
package/dist/ocr-service-doctr.d.ts +61 -0
package/dist/ocr-service-doctr.d.ts.map +1 -0
package/dist/ocr-service-linux-tesseract.d.ts +85 -0
package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
package/dist/ocr-service-paddleocr.d.ts +59 -0
package/dist/ocr-service-paddleocr.d.ts.map +1 -0
package/dist/ocr-service-windows.d.ts +41 -0
package/dist/ocr-service-windows.d.ts.map +1 -0
package/dist/ocr-service.d.ts +91 -0
package/dist/ocr-service.d.ts.map +1 -0
package/dist/ocr-with-coords.d.ts +103 -0
package/dist/ocr-with-coords.d.ts.map +1 -0
package/dist/person-detector.d.ts +17 -0
package/dist/person-detector.d.ts.map +1 -0
package/dist/provider.d.ts +3 -0
package/dist/provider.d.ts.map +1 -0
package/dist/routes.d.ts +7 -0
package/dist/routes.d.ts.map +1 -0
package/dist/screen-capture-bridge.d.ts +51 -0
package/dist/screen-capture-bridge.d.ts.map +1 -0
package/dist/screen-capture.d.ts +17 -0
package/dist/screen-capture.d.ts.map +1 -0
package/dist/screen-tiler.d.ts +75 -0
package/dist/screen-tiler.d.ts.map +1 -0
package/dist/service.d.ts +176 -0
package/dist/service.d.ts.map +1 -0
package/dist/set-of-marks-provider.d.ts +64 -0
package/dist/set-of-marks-provider.d.ts.map +1 -0
package/dist/som.d.ts +135 -0
package/dist/som.d.ts.map +1 -0
package/dist/som.js +184 -0
package/dist/som.js.map +11 -0
package/dist/test-input.d.ts +25 -0
package/dist/test-input.d.ts.map +1 -0
package/dist/types.d.ts +241 -0
package/dist/types.d.ts.map +1 -0
package/dist/vision-context-augmenter.d.ts +93 -0
package/dist/vision-context-augmenter.d.ts.map +1 -0
package/dist/vision-worker-manager.d.ts +51 -0
package/dist/vision-worker-manager.d.ts.map +1 -0
package/dist/workers/ocr-worker.d.ts +2 -0
package/dist/workers/ocr-worker.d.ts.map +1 -0
package/dist/workers/ocr-worker.js +1066 -121865
package/dist/workers/ocr-worker.js.map +10 -130
package/dist/workers/screen-capture-worker.d.ts +2 -0
package/dist/workers/screen-capture-worker.d.ts.map +1 -0
package/dist/workers/screen-capture-worker.js +371 -8
package/dist/workers/screen-capture-worker.js.map +5 -4
package/dist/workers/worker-logger.d.ts +10 -0
package/dist/workers/worker-logger.d.ts.map +1 -0
package/dist/yolo-detector.d.ts +37 -0
package/dist/yolo-detector.d.ts.map +1 -0
package/native/doctr.cpp/CMakeLists.txt +58 -0
package/native/doctr.cpp/README.md +62 -0
package/native/doctr.cpp/include/doctr.h +91 -0
package/native/doctr.cpp/scripts/convert.py +98 -0
package/native/doctr.cpp/src/doctr_det.cpp +112 -0
package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
package/native/macos-vision-ocr.swift +113 -0
package/native/mobilefacenet.cpp/README.md +13 -0
package/native/movenet.cpp/README.md +10 -0
package/native/retinaface.cpp/README.md +12 -0
package/native/yolo.cpp/CMakeLists.txt +57 -0
package/native/yolo.cpp/README.md +64 -0
package/native/yolo.cpp/build.mjs +76 -0
package/native/yolo.cpp/include/yolo.h +62 -0
package/native/yolo.cpp/scripts/convert.py +248 -0
package/native/yolo.cpp/src/yolo.cpp +425 -0
package/native/yolo.cpp/verify/compare.py +99 -0
package/native/yolo.cpp/verify/make_ref.py +75 -0
package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
package/native/yolo.cpp/verify/run_ts.mjs +26 -0
package/package.json +50 -24
package/registry-entry.json +43 -0
package/scripts/vendor-tesseract-linux.mjs +177 -0
package/build.config.ts +0 -70
package/dist/workers/florence2-worker.js +0 -114850
package/dist/workers/florence2-worker.js.map +0 -92

package/native/yolo.cpp/CMakeLists.txt ADDED Viewed

@@ -0,0 +1,57 @@
+# Build script for libyolo: a SHARED library (loaded via bun:ffi) with ggml
+# linked in statically when a ggml source tree is available.
+cmake_minimum_required(VERSION 3.20)
+project(yolo LANGUAGES C CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+option(YOLO_WITH_METAL "Use Metal backend on macOS" OFF)
+option(YOLO_WITH_CUDA  "Use CUDA backend"           OFF)
+# Reuse ggml from llama.cpp's vendored copy.
+set(YOLO_GGML_DIR "" CACHE PATH "Path to a ggml source tree")
+if(NOT YOLO_GGML_DIR)
+    set(YOLO_GGML_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../plugin-local-inference/native/llama.cpp/ggml")
+endif()
+set(_yolo_has_ggml FALSE)
+if(EXISTS "${YOLO_GGML_DIR}/include/ggml.h")
+    set(_yolo_has_ggml TRUE)
+    # Build ggml STATICALLY and link it into libyolo so the FFI artifact is a
+    # single self-contained shared library (no ggml.dll/.so co-location or
+    # PATH/RPATH dance at load time). yolo itself stays SHARED below.
+    set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
+    set(GGML_BACKEND_DL    OFF CACHE BOOL "" FORCE)
+    # Keep the ggml sub-build in step with the YOLO_WITH_* options: defining
+    # GGML_USE_METAL / GGML_USE_CUDA on yolo (below) only makes sense when the
+    # matching backend is actually compiled into ggml. Without this, a direct
+    # `cmake -DYOLO_WITH_CUDA=ON` would also need -DGGML_CUDA=ON by hand.
+    if(YOLO_WITH_METAL)
+        set(GGML_METAL ON CACHE BOOL "" FORCE)
+    endif()
+    if(YOLO_WITH_CUDA)
+        set(GGML_CUDA ON CACHE BOOL "" FORCE)
+    endif()
+    add_subdirectory("${YOLO_GGML_DIR}" ggml-build EXCLUDE_FROM_ALL)
+else()
+    # Do not fail silently: without ggml the library is compiled without
+    # YOLO_HAVE_GGML and is not linked against ggml at all.
+    message(WARNING
+        "yolo: no ggml source tree at '${YOLO_GGML_DIR}' (include/ggml.h not found). "
+        "Building libyolo WITHOUT YOLO_HAVE_GGML; set -DYOLO_GGML_DIR=<path> to enable it.")
+endif()
+# The FFI consumer (bun:ffi) dlopens this; it must be SHARED regardless of
+# BUILD_SHARED_LIBS (which we forced OFF for the ggml subbuild above).
+add_library(yolo SHARED src/yolo.cpp)
+target_include_directories(yolo PUBLIC include)
+if(_yolo_has_ggml)
+    target_compile_definitions(yolo PRIVATE YOLO_HAVE_GGML=1)
+    # Static link → GGML_API resolves to plain `extern` (no dllimport). Do NOT
+    # define GGML_SHARED/GGML_BUILD here.
+    target_link_libraries(yolo PRIVATE ggml)
+    if(YOLO_WITH_METAL)
+        target_compile_definitions(yolo PRIVATE GGML_USE_METAL=1)
+    endif()
+    if(YOLO_WITH_CUDA)
+        target_compile_definitions(yolo PRIVATE GGML_USE_CUDA=1)
+    endif()
+endif()
+# Emit as `libyolo.<ext>` in the build root on every platform/generator so the
+# FFI loader (src/native/yolo-ffi.ts) finds it at a single stable path,
+# regardless of the multi-config (Release/) subdir MSVC would otherwise use.
+set_target_properties(yolo PROPERTIES
+    PREFIX "lib"
+    OUTPUT_NAME "yolo"
+    RUNTIME_OUTPUT_DIRECTORY            "${CMAKE_CURRENT_SOURCE_DIR}/build"
+    RUNTIME_OUTPUT_DIRECTORY_RELEASE    "${CMAKE_CURRENT_SOURCE_DIR}/build"
+    RUNTIME_OUTPUT_DIRECTORY_DEBUG      "${CMAKE_CURRENT_SOURCE_DIR}/build"
+    LIBRARY_OUTPUT_DIRECTORY            "${CMAKE_CURRENT_SOURCE_DIR}/build"
+    LIBRARY_OUTPUT_DIRECTORY_RELEASE    "${CMAKE_CURRENT_SOURCE_DIR}/build"
+    LIBRARY_OUTPUT_DIRECTORY_DEBUG      "${CMAKE_CURRENT_SOURCE_DIR}/build")

package/native/yolo.cpp/README.md ADDED Viewed

@@ -0,0 +1,64 @@
+# yolo.cpp — ggml YOLOv8n object detector
+A self-contained C++ forward pass for **YOLOv8n** built directly on
+[ggml](https://github.com/ggml-org/ggml). The CNN (backbone `Conv`/`C2f`/`SPPF`
+→ PAN-FPN neck → decoupled head) runs in ggml; letterbox preprocessing and the
+final score threshold + NMS stay in TypeScript (`src/yolo-detector.ts`). The DFL
+distribution decode, anchor/stride decode, and class sigmoid run in C++ here.
+ggml is linked **statically**, so the build artifact `build/libyolo.<ext>` is a
+single self-contained shared library with no external `ggml.dll`/`.so`
+dependency — `bun:ffi` loads it directly.
+## Status: working & verified
+`src/yolo.cpp` produces detections that match the upstream Ultralytics PyTorch
+model to within fp32 rounding (box max |Δ| ≈ 0.001 px, class scores exact). See
+`verify/` for the numerical check against a PyTorch reference.
+## Build
+Requires CMake ≥ 3.20 and a C/C++ toolchain (MSVC Build Tools on Windows,
+clang/gcc elsewhere). From the plugin root:
+```bash
+bun run build:native              # → native/yolo.cpp/build/libyolo.{dll,dylib,so}
+# or directly:
+bun native/yolo.cpp/build.mjs            # CPU
+bun native/yolo.cpp/build.mjs --metal    # macOS GPU
+bun native/yolo.cpp/build.mjs --cuda     # NVIDIA GPU
+```
+## Convert weights → GGUF
+Ultralytics ships under AGPL-3.0; we ship **no weights**. Convert them locally
+(BatchNorm is folded into each conv at convert time):
+```bash
+pip install ultralytics gguf numpy torch
+bun run build:weights             # → ~/.eliza/models/vision/yolov8n.gguf
+# or directly:
+python native/yolo.cpp/scripts/convert.py --variant yolov8n
+```
+The runtime resolves the GGUF at `$ELIZA_STATE_DIR/models/vision/yolov8n.gguf`
+(default `~/.eliza/...`); override with `ELIZA_YOLO_GGUF`. Override the library
+path with `ELIZA_YOLO_LIB` and the CPU thread count with `ELIZA_YOLO_THREADS`
+(defaults to ≈ physical cores).
+## Verify (numerical parity with PyTorch)
+```bash
+python native/yolo.cpp/verify/make_ref.py        # input.bin + ultralytics ref.bin
+bun    native/yolo.cpp/verify/run_ggml.mjs build/libyolo.<ext> <gguf>   # → out.bin  (<ext> = dll | dylib | so)
+python native/yolo.cpp/verify/compare.py         # asserts PASS
+# full TS path (FFI → parseYoloV8 → NMS) on a real image:
+bun native/yolo.cpp/verify/run_ts.mjs
+```
+## License
+The runtime in this directory is a clean-room implementation built on ggml. It
+contains no Ultralytics code. YOLOv8 weights are AGPL-3.0 and are **not** bundled
+— end users convert them locally via the script above.

package/native/yolo.cpp/build.mjs ADDED Viewed

@@ -0,0 +1,76 @@
+#!/usr/bin/env node
+// Build the self-contained ggml YOLOv8 native library (libyolo.<ext>).
+//
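+//   bun native/yolo.cpp/build.mjs           # configure + build (Release)
+//   bun native/yolo.cpp/build.mjs --metal   # additionally passes -DYOLO_WITH_METAL=ON
+//   bun native/yolo.cpp/build.mjs --cuda    # additionally passes -DYOLO_WITH_CUDA=ON -DGGML_CUDA=ON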
+//
+// Produces  native/yolo.cpp/build/libyolo.{dll,dylib,so}  — the exact path the
+// FFI loader (src/native/yolo-ffi.ts) probes. ggml is linked statically, so the
+// artifact has no external ggml.dll/.so dependency.
+//
+// Requirements: CMake >= 3.20 and a C/C++ toolchain.
+//   - Windows: Visual Studio 2022 Build Tools (MSVC).
+//   - macOS:   Xcode command line tools (clang). Pass --metal for the GPU path.
+//   - Linux:   gcc/clang.
+// On macOS and Linux the Ninja generator is used when `ninja` is on PATH.
+import { spawnSync } from "node:child_process";
+import { existsSync, readdirSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+const HERE = dirname(fileURLToPath(import.meta.url)); // directory containing this script
+const BUILD = join(HERE, "build"); // CMake binary dir; libyolo.<ext> is expected directly inside it
+const isWin = process.platform === "win32"; // selects the Visual Studio generator and the .dll extension
+const args = process.argv.slice(2); // CLI flags only (drops the runtime and script paths)
+const withMetal = args.includes("--metal"); // adds -DYOLO_WITH_METAL=ON at configure time
+const withCuda = args.includes("--cuda"); // adds -DYOLO_WITH_CUDA=ON and -DGGML_CUDA=ON at configure time
+function run(cmd, cmdArgs) {
+  console.error(`> ${cmd} ${cmdArgs.join(" ")}`);
+  const r = spawnSync(cmd, cmdArgs, { stdio: "inherit" });
+  if (r.status !== 0) {
+    console.error(`\n[yolo build] FAILED: ${cmd} exited ${r.status}`);
+    process.exit(r.status ?? 1);
+  }
+}
+function hasNinja() {
+  const r = spawnSync("ninja", ["--version"], { stdio: "ignore" });
+  return r.status === 0;
+}
+const configure = [
+  "-S",
+  HERE,
+  "-B",
+  BUILD,
+  "-Wno-dev",
+  "-DGGML_NATIVE=OFF",
+  "-DGGML_OPENMP=OFF",
+];
+if (withMetal) configure.push("-DYOLO_WITH_METAL=ON");
+if (withCuda) configure.push("-DYOLO_WITH_CUDA=ON", "-DGGML_CUDA=ON");
+if (isWin) {
+  configure.push("-G", "Visual Studio 17 2022", "-A", "x64");
+} else {
+  configure.push("-DCMAKE_BUILD_TYPE=Release");
+  if (hasNinja()) configure.push("-G", "Ninja");
+}
+run("cmake", configure);
+run("cmake", ["--build", BUILD, "--config", "Release", "--target", "yolo"]);
+const ext = isWin ? "dll" : process.platform === "darwin" ? "dylib" : "so";
+const artifact = join(BUILD, `libyolo.${ext}`);
+if (!existsSync(artifact)) {
+  // Some generators ignore the per-config output dir override; locate it.
+  const found = readdirSync(BUILD, { recursive: true }).find((f) =>
+    String(f).endsWith(`libyolo.${ext}`),
+  );
+  console.error(
+    found
+      ? `[yolo build] artifact at ${join(BUILD, String(found))} (expected ${artifact})`
+      : `[yolo build] WARNING: libyolo.${ext} not found under ${BUILD}`,
+  );
+} else {
+  console.error(`[yolo build] OK: ${artifact}`);
+}

package/native/yolo.cpp/include/yolo.h ADDED Viewed

@@ -0,0 +1,62 @@
+// yolo.h — C ABI for the ggml-backed YOLOv8 runtime.
+//
+// Forward pass + DFL/anchor decode + class sigmoid. Letterbox preprocessing and
+// the final threshold/NMS stay in TypeScript (src/yolo-detector.ts).
+#ifndef YOLO_H
+#define YOLO_H
+#include <stddef.h>
+#include <stdint.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Export macro so the C ABI symbols are visible to bun:ffi / dlopen. On MSVC,
+// extern "C" functions in a DLL are NOT exported without __declspec(dllexport).
+#ifndef YOLO_API
+#  ifdef _WIN32
+#    define YOLO_API __declspec(dllexport)
+#  else
+#    define YOLO_API __attribute__((visibility("default")))
+#  endif
+#endif
+typedef struct yolo_ctx yolo_ctx;
+#define YOLO_OK            0
+#define YOLO_ERR_FILE     -1
+#define YOLO_ERR_FORMAT   -2
+#define YOLO_ERR_OOM      -3
+#define YOLO_ERR_SHAPE    -4
+#define YOLO_ERR_BACKEND  -5
+// Expected GGUF metadata:
+//   - "yolo.variant"      = "yolov8n" | "yolov8s" | ...
+//   - "yolo.input_h"      = int   (typical 640)
+//   - "yolo.input_w"      = int   (typical 640)
+//   - "yolo.classes"      = utf8 string (newline-separated, e.g. COCO 80)
+//   - "yolo.strides"      = i32[3] (typical [8,16,32])
+YOLO_API yolo_ctx * yolo_init(const char * gguf_path);
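+// rgb_chw: CHW float32 RGB normalized to [0,1] (caller letterboxed to input_h x input_w).
+// h, w: dimensions of rgb_chw. Presumably they must match the model's
+//       input_h/input_w (YOLO_ERR_SHAPE looks like the mismatch code; confirm
+//       against src/yolo.cpp).
+// out_logits: caller-allocated. Size must be (4 + num_classes) * num_anchors float32.
+//       num_anchors is only reported by the call itself, so size the buffer up
+//       front: assuming one anchor per grid cell per stride, the typical
+//       640x640 input with strides [8,16,32] gives 80*80 + 40*40 + 20*20 = 8400.
+// out_channels, out_anchors filled by the call.
+// Returns an int status, presumably YOLO_OK or one of the YOLO_ERR_* codes above.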
+YOLO_API int yolo_run(yolo_ctx * ctx,
+             const float * rgb_chw,
+             int h, int w,
+             float * out_logits,
+             int * out_channels,
+             int * out_anchors);
+// Returns the embedded UTF-8 class-names string (newline-separated, owned by ctx).
+YOLO_API const char * yolo_classes(yolo_ctx * ctx);
+YOLO_API void yolo_free(yolo_ctx * ctx);
+#ifdef __cplusplus
+}
+#endif
+#endif // YOLO_H

package/native/yolo.cpp/scripts/convert.py ADDED Viewed

@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""
+Convert Ultralytics YOLOv8 PyTorch checkpoints to GGUF for the yolo.cpp runtime.
+Usage:
+    python scripts/convert.py --variant yolov8n
+    python scripts/convert.py --variant yolov8n --out <state-dir>/models/vision/yolov8n.gguf
+Requirements (install before running):
+    pip install ultralytics gguf numpy torch
+License note: Ultralytics ships under AGPL-3.0. This script reads the published
+weights and writes them into a GGUF; the runtime (`src/yolo.cpp`) is a
+clean-room ggml implementation. No Ultralytics code is copied into this repo.
+What it does
+------------
+Walks the DetectionModel module tree and emits tensors for two kinds of module:
+  * ultralytics ``Conv`` (Conv2d + BatchNorm2d + SiLU): the BatchNorm is FOLDED
+    into the preceding conv at convert time, producing a plain conv weight +
+    bias. Emitted as ``<module>.weight`` (folded, shape [OC,IC,KH,KW]) and
+    ``<module>.bias`` (folded, [OC]).  e.g. ``model.0.weight``, ``model.2.cv1.weight``.
+  * bare ``Conv2d`` (the head's per-scale stage-2 1x1 projection, which has its
+    own bias and no BN): emitted verbatim as ``<module>.weight`` / ``<module>.bias``.
+    e.g. ``model.22.cv2.0.2.weight``.
+The DFL ``model.22.dfl.conv`` buffer (a fixed arange(16)) is intentionally
+skipped — the C runtime recomputes the DFL expectation directly.
+ggml reads tensor ``ne`` as the REVERSED numpy shape, so a PyTorch conv weight
+of numpy shape ``(OC, IC, KH, KW)`` is read by ggml as ``ne=[KW,KH,IC,OC]`` —
+exactly the ``ggml_conv_2d`` kernel layout. No transpose is needed.
+Metadata KV entries (read by ``yolo_init``):
+    "yolo.variant"  : str
+    "yolo.input_h"  : u32
+    "yolo.input_w"  : u32
+    "yolo.classes"  : str  (utf-8, newline separated, 80 COCO entries)
+    "yolo.strides"  : i32[3]
+"""
+import argparse
+import os
+import sys
+COCO_CLASSES = [  # 80 COCO labels; main() writes them newline-joined as "yolo.classes"
+    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
+    "truck", "boat", "traffic light", "fire hydrant", "stop sign",
+    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag",
+    "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
+    "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
+    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
+    "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza",
+    "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table",
+    "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock",
+    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
+]
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--variant",
+        default="yolov8n",
+        choices=("yolov8n", "yolov8s", "yolov8m", "yolov8l", "yolov8x"),
+    )
+    parser.add_argument(
+        "--out",
+        default=None,
+        help="Output GGUF path. Defaults to "
+        "$ELIZA_STATE_DIR/models/vision/<variant>.gguf "
+        "(or ~/.eliza/models/vision/<variant>.gguf).",
+    )
+    parser.add_argument(
+        "--weights",
+        default=None,
+        help="Path to the .pt checkpoint. Defaults to '<variant>.pt' "
+        "(ultralytics auto-downloads it if absent).",
+    )
+    parser.add_argument(
+        "--trust-checkpoint",
+        action="store_true",
+        help="Allow the torch.load(weights_only=False) fallback when the "
+        "Ultralytics YOLO import fails. Only use with trusted checkpoints.",
+    )
+    args = parser.parse_args()
+    try:
+        import numpy as np
+        import torch
+        import torch.nn as nn
+    except ImportError as exc:
+        print(f"missing dependency: {exc}. pip install torch numpy", file=sys.stderr)
+        return 2
+    try:
+        import gguf
+    except ImportError:
+        print("gguf not installed. pip install gguf", file=sys.stderr)
+        return 2
+    out_path = args.out
+    if not out_path:
+        state_dir = os.environ.get(
+            "ELIZA_STATE_DIR", os.path.join(os.path.expanduser("~"), ".eliza")
+        )
+        out_path = os.path.join(state_dir, "models", "vision", f"{args.variant}.gguf")
+    args.out = out_path
+    weights = args.weights or f"{args.variant}.pt"
+    print(f"[convert] loading {weights}", file=sys.stderr)
+    # Prefer ultralytics; fall back to loading the DetectionModel straight from
+    # the checkpoint only when the operator explicitly trusts the file. PyTorch
+    # full-checkpoint unpickling can execute code.
+    try:
+        from ultralytics import YOLO
+    except Exception as exc:  # noqa: BLE001 - torchvision registration can fail here
+        if isinstance(exc, ModuleNotFoundError) and exc.name == "ultralytics":
+            print("ultralytics not installed. pip install ultralytics", file=sys.stderr)
+            return 2
+        trust_checkpoint = args.trust_checkpoint or os.environ.get(
+            "ELIZA_YOLO_TRUST_CHECKPOINT"
+        ) in {"1", "true", "yes"}
+        if not trust_checkpoint:
+            print(
+                f"[convert] ultralytics import failed ({exc}). Direct "
+                "torch.load fallback requires --trust-checkpoint or "
+                "ELIZA_YOLO_TRUST_CHECKPOINT=1 because PyTorch checkpoint "
+                "unpickling can execute code.",
+                file=sys.stderr,
+            )
+            return 2
+        print(
+            f"[convert] ultralytics unavailable ({exc}); "
+            "loading trusted DetectionModel directly from checkpoint",
+            file=sys.stderr,
+        )
+        try:
+            checkpoint = torch.load(weights, map_location="cpu", weights_only=False)
+        except FileNotFoundError:
+            print(
+                f"checkpoint not found: {weights}. Install ultralytics to auto-download "
+                "default weights, or pass --weights with a local .pt file.",
+                file=sys.stderr,
+            )
+            return 2
+        except Exception as load_exc:
+            print(f"torch.load failed for {weights}: {load_exc}", file=sys.stderr)
+            return 2
+        if isinstance(checkpoint, dict):
+            model = checkpoint.get("ema")
+            if model is None:
+                model = checkpoint.get("model")
+        elif isinstance(checkpoint, nn.Module):
+            model = checkpoint
+        else:
+            model = None
+        if not isinstance(model, nn.Module):
+            print(
+                f"checkpoint {weights} does not contain a recoverable nn.Module "
+                "in 'ema' or 'model'",
+                file=sys.stderr,
+            )
+            return 2
+    else:
+        model = YOLO(weights).model  # DetectionModel (nn.Module)
+    model.eval().float()
+    out_dir = os.path.dirname(os.path.abspath(args.out))
+    os.makedirs(out_dir, exist_ok=True)
+    writer = gguf.GGUFWriter(args.out, "yolo")
+    writer.add_string("yolo.variant", args.variant)
+    writer.add_uint32("yolo.input_h", 640)
+    writer.add_uint32("yolo.input_w", 640)
+    writer.add_string("yolo.classes", "\n".join(COCO_CLASSES))
+    writer.add_array("yolo.strides", [8, 16, 32])
+    def fold_bn(conv, bn):
+        w = conv.weight.detach().float()  # [OC,IC,KH,KW]
+        oc = w.shape[0]
+        b = (
+            conv.bias.detach().float()
+            if conv.bias is not None
+            else torch.zeros(oc, dtype=torch.float32)
+        )
+        gamma = bn.weight.detach().float()
+        beta = bn.bias.detach().float()
+        mean = bn.running_mean.detach().float()
+        var = bn.running_var.detach().float()
+        std = torch.sqrt(var + bn.eps)
+        w_folded = w * (gamma / std).reshape(-1, 1, 1, 1)
+        b_folded = beta + (b - mean) * gamma / std
+        return w_folded, b_folded
+    def as_f32(t):
+        return np.ascontiguousarray(t.detach().cpu().numpy().astype(np.float32))
+    emitted = []
+    def emit(name, w, b):
+        writer.add_tensor(name + ".weight", as_f32(w))
+        writer.add_tensor(name + ".bias", as_f32(b))
+        emitted.append((name, tuple(w.shape), tuple(b.shape)))
+    n_conv = n_bare = 0
+    for name, m in model.named_modules():
+        cls = type(m).__name__
+        if cls == "Conv" and hasattr(m, "conv") and hasattr(m, "bn"):
+            # ultralytics CBS: fold BN into the conv.
+            if isinstance(m.conv, nn.Conv2d) and isinstance(m.bn, nn.BatchNorm2d):
+                w, b = fold_bn(m.conv, m.bn)
+                emit(name, w, b)
+                n_conv += 1
+        elif cls == "Conv2d":
+            # bare Conv2d. Skip the inner conv of a CBS (handled above) and the
+            # fixed DFL buffer (recomputed in C). Keep only the head stage-2 1x1.
+            if name.endswith(".conv"):
+                continue
+            if ".dfl" in name:
+                continue
+            if m.bias is None:
+                b = torch.zeros(m.weight.shape[0], dtype=torch.float32)
+            else:
+                b = m.bias
+            emit(name, m.weight, b)
+            n_bare += 1
+    print(
+        f"[convert] folded {n_conv} CBS convs + {n_bare} bare head convs "
+        f"= {len(emitted)} tensors",
+        file=sys.stderr,
+    )
+    for name, ws, bs in emitted:
+        print(f"  {name:<28} w{ws} b{bs}", file=sys.stderr)
+    writer.write_header_to_file()
+    writer.write_kv_data_to_file()
+    writer.write_tensors_to_file()
+    writer.close()
+    print(f"[convert] wrote {args.out}", file=sys.stderr)
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())