@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +73 -301
  3. package/dist/action.d.ts +3 -0
  4. package/dist/action.d.ts.map +1 -0
  5. package/dist/audio-capture-stream.d.ts +42 -0
  6. package/dist/audio-capture-stream.d.ts.map +1 -0
  7. package/dist/audio-capture.d.ts +25 -0
  8. package/dist/audio-capture.d.ts.map +1 -0
  9. package/dist/computeruse-ocr-bridge.d.ts +50 -0
  10. package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
  11. package/dist/config.d.ts +68 -0
  12. package/dist/config.d.ts.map +1 -0
  13. package/dist/describe-backpressure.d.ts +90 -0
  14. package/dist/describe-backpressure.d.ts.map +1 -0
  15. package/dist/dirty-tile-describer.d.ts +102 -0
  16. package/dist/dirty-tile-describer.d.ts.map +1 -0
  17. package/dist/dirty-tile-scene.d.ts +56 -0
  18. package/dist/dirty-tile-scene.d.ts.map +1 -0
  19. package/dist/entity-tracker.d.ts +33 -0
  20. package/dist/entity-tracker.d.ts.map +1 -0
  21. package/dist/face-detector-ggml.d.ts +60 -0
  22. package/dist/face-detector-ggml.d.ts.map +1 -0
  23. package/dist/face-detector-mediapipe.d.ts +25 -0
  24. package/dist/face-detector-mediapipe.d.ts.map +1 -0
  25. package/dist/face-recognition-ggml.d.ts +94 -0
  26. package/dist/face-recognition-ggml.d.ts.map +1 -0
  27. package/dist/get-screen-elements.d.ts +90 -0
  28. package/dist/get-screen-elements.d.ts.map +1 -0
  29. package/dist/get-screen.d.ts +60 -0
  30. package/dist/get-screen.d.ts.map +1 -0
  31. package/dist/image/sharp-compat.d.ts +89 -0
  32. package/dist/image/sharp-compat.d.ts.map +1 -0
  33. package/dist/image-input.d.ts +15 -0
  34. package/dist/image-input.d.ts.map +1 -0
  35. package/dist/index.d.ts +4 -0
  36. package/dist/index.d.ts.map +1 -0
  37. package/dist/index.js +7957 -6238
  38. package/dist/index.js.map +41 -26
  39. package/dist/lifecycle.d.ts +94 -0
  40. package/dist/lifecycle.d.ts.map +1 -0
  41. package/dist/mobile/capacitor-camera.d.ts +85 -0
  42. package/dist/mobile/capacitor-camera.d.ts.map +1 -0
  43. package/dist/native/doctr-ffi.d.ts +40 -0
  44. package/dist/native/doctr-ffi.d.ts.map +1 -0
  45. package/dist/native/yolo-ffi.d.ts +21 -0
  46. package/dist/native/yolo-ffi.d.ts.map +1 -0
  47. package/dist/ocr-host-windows.d.ts +34 -0
  48. package/dist/ocr-host-windows.d.ts.map +1 -0
  49. package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
  50. package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
  51. package/dist/ocr-service-doctr.d.ts +61 -0
  52. package/dist/ocr-service-doctr.d.ts.map +1 -0
  53. package/dist/ocr-service-linux-tesseract.d.ts +85 -0
  54. package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
  55. package/dist/ocr-service-paddleocr.d.ts +59 -0
  56. package/dist/ocr-service-paddleocr.d.ts.map +1 -0
  57. package/dist/ocr-service-windows.d.ts +41 -0
  58. package/dist/ocr-service-windows.d.ts.map +1 -0
  59. package/dist/ocr-service.d.ts +91 -0
  60. package/dist/ocr-service.d.ts.map +1 -0
  61. package/dist/ocr-with-coords.d.ts +103 -0
  62. package/dist/ocr-with-coords.d.ts.map +1 -0
  63. package/dist/person-detector.d.ts +17 -0
  64. package/dist/person-detector.d.ts.map +1 -0
  65. package/dist/provider.d.ts +3 -0
  66. package/dist/provider.d.ts.map +1 -0
  67. package/dist/routes.d.ts +7 -0
  68. package/dist/routes.d.ts.map +1 -0
  69. package/dist/screen-capture-bridge.d.ts +51 -0
  70. package/dist/screen-capture-bridge.d.ts.map +1 -0
  71. package/dist/screen-capture.d.ts +17 -0
  72. package/dist/screen-capture.d.ts.map +1 -0
  73. package/dist/screen-tiler.d.ts +75 -0
  74. package/dist/screen-tiler.d.ts.map +1 -0
  75. package/dist/service.d.ts +176 -0
  76. package/dist/service.d.ts.map +1 -0
  77. package/dist/set-of-marks-provider.d.ts +64 -0
  78. package/dist/set-of-marks-provider.d.ts.map +1 -0
  79. package/dist/som.d.ts +135 -0
  80. package/dist/som.d.ts.map +1 -0
  81. package/dist/som.js +184 -0
  82. package/dist/som.js.map +11 -0
  83. package/dist/test-input.d.ts +25 -0
  84. package/dist/test-input.d.ts.map +1 -0
  85. package/dist/types.d.ts +241 -0
  86. package/dist/types.d.ts.map +1 -0
  87. package/dist/vision-context-augmenter.d.ts +93 -0
  88. package/dist/vision-context-augmenter.d.ts.map +1 -0
  89. package/dist/vision-worker-manager.d.ts +51 -0
  90. package/dist/vision-worker-manager.d.ts.map +1 -0
  91. package/dist/workers/ocr-worker.d.ts +2 -0
  92. package/dist/workers/ocr-worker.d.ts.map +1 -0
  93. package/dist/workers/ocr-worker.js +1075 -7821
  94. package/dist/workers/ocr-worker.js.map +10 -51
  95. package/dist/workers/screen-capture-worker.d.ts +2 -0
  96. package/dist/workers/screen-capture-worker.d.ts.map +1 -0
  97. package/dist/workers/screen-capture-worker.js +364 -6
  98. package/dist/workers/screen-capture-worker.js.map +5 -4
  99. package/dist/workers/worker-logger.d.ts +10 -0
  100. package/dist/workers/worker-logger.d.ts.map +1 -0
  101. package/dist/yolo-detector.d.ts +37 -0
  102. package/dist/yolo-detector.d.ts.map +1 -0
  103. package/native/doctr.cpp/CMakeLists.txt +58 -0
  104. package/native/doctr.cpp/README.md +62 -0
  105. package/native/doctr.cpp/include/doctr.h +91 -0
  106. package/native/doctr.cpp/scripts/convert.py +98 -0
  107. package/native/doctr.cpp/src/doctr_det.cpp +112 -0
  108. package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
  109. package/native/macos-vision-ocr.swift +113 -0
  110. package/native/mobilefacenet.cpp/README.md +13 -0
  111. package/native/movenet.cpp/README.md +10 -0
  112. package/native/retinaface.cpp/README.md +12 -0
  113. package/native/yolo.cpp/CMakeLists.txt +57 -0
  114. package/native/yolo.cpp/README.md +64 -0
  115. package/native/yolo.cpp/build.mjs +76 -0
  116. package/native/yolo.cpp/include/yolo.h +62 -0
  117. package/native/yolo.cpp/scripts/convert.py +248 -0
  118. package/native/yolo.cpp/src/yolo.cpp +425 -0
  119. package/native/yolo.cpp/verify/compare.py +99 -0
  120. package/native/yolo.cpp/verify/make_ref.py +75 -0
  121. package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
  122. package/native/yolo.cpp/verify/run_ts.mjs +26 -0
  123. package/package.json +39 -21
  124. package/registry-entry.json +43 -0
  125. package/scripts/vendor-tesseract-linux.mjs +177 -0
  126. package/build.config.ts +0 -89
  127. package/dist/workers/florence2-worker.js +0 -779
  128. package/dist/workers/florence2-worker.js.map +0 -13
@@ -0,0 +1,57 @@
1
# Build script for libyolo: the YOLOv8 runtime (src/yolo.cpp) compiled as a
# shared library for the bun:ffi loader, with ggml linked in statically when a
# ggml source tree is available.
cmake_minimum_required(VERSION 3.20)
project(yolo LANGUAGES C CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

option(YOLO_WITH_METAL "Use Metal backend on macOS" OFF)
option(YOLO_WITH_CUDA "Use CUDA backend" OFF)

# Reuse ggml from llama.cpp's vendored copy.
set(YOLO_GGML_DIR "" CACHE PATH "Path to a ggml source tree")
if(NOT YOLO_GGML_DIR)
  set(YOLO_GGML_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../plugin-local-inference/native/llama.cpp/ggml")
endif()

set(_yolo_has_ggml FALSE)
if(EXISTS "${YOLO_GGML_DIR}/include/ggml.h")
  set(_yolo_has_ggml TRUE)
  # Build ggml STATICALLY and link it into libyolo so the FFI artifact is a
  # single self-contained shared library (no ggml.dll/.so co-location or
  # PATH/RPATH dance at load time). yolo itself stays SHARED below.
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
  set(GGML_BACKEND_DL OFF CACHE BOOL "" FORCE)
  # Keep the ggml sub-build in step with the YOLO_WITH_* switches. Without
  # this, configuring with only -DYOLO_WITH_CUDA=ON (or _METAL) would define
  # GGML_USE_* for yolo while ggml itself is built without that backend; the
  # build script works around it by also passing -DGGML_CUDA=ON by hand.
  if(YOLO_WITH_METAL)
    set(GGML_METAL ON CACHE BOOL "" FORCE)
  endif()
  if(YOLO_WITH_CUDA)
    set(GGML_CUDA ON CACHE BOOL "" FORCE)
  endif()
  add_subdirectory("${YOLO_GGML_DIR}" ggml-build EXCLUDE_FROM_ALL)
else()
  # Do not degrade silently: without ggml the library is compiled without
  # YOLO_HAVE_GGML and without linking ggml.
  message(WARNING
    "ggml source tree not found at '${YOLO_GGML_DIR}' (missing include/ggml.h); "
    "building libyolo without YOLO_HAVE_GGML. "
    "Pass -DYOLO_GGML_DIR=<path to a ggml source tree> to enable it.")
endif()

# The FFI consumer (bun:ffi) dlopens this; it must be SHARED regardless of
# BUILD_SHARED_LIBS (which we forced OFF for the ggml subbuild above).
add_library(yolo SHARED src/yolo.cpp)
target_include_directories(yolo PUBLIC include)

if(_yolo_has_ggml)
  target_compile_definitions(yolo PRIVATE YOLO_HAVE_GGML=1)
  # Static link → GGML_API resolves to plain `extern` (no dllimport). Do NOT
  # define GGML_SHARED/GGML_BUILD here.
  target_link_libraries(yolo PRIVATE ggml)
  if(YOLO_WITH_METAL)
    target_compile_definitions(yolo PRIVATE GGML_USE_METAL=1)
  endif()
  if(YOLO_WITH_CUDA)
    target_compile_definitions(yolo PRIVATE GGML_USE_CUDA=1)
  endif()
endif()

# Emit as `libyolo.<ext>` in the build root on every platform/generator so the
# FFI loader (src/native/yolo-ffi.ts) finds it at a single stable path,
# regardless of the multi-config (Release/) subdir MSVC would otherwise use.
set_target_properties(yolo PROPERTIES
  PREFIX "lib"
  OUTPUT_NAME "yolo"
  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build"
  RUNTIME_OUTPUT_DIRECTORY_RELEASE "${CMAKE_CURRENT_SOURCE_DIR}/build"
  RUNTIME_OUTPUT_DIRECTORY_DEBUG "${CMAKE_CURRENT_SOURCE_DIR}/build"
  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build"
  LIBRARY_OUTPUT_DIRECTORY_RELEASE "${CMAKE_CURRENT_SOURCE_DIR}/build"
  LIBRARY_OUTPUT_DIRECTORY_DEBUG "${CMAKE_CURRENT_SOURCE_DIR}/build")
@@ -0,0 +1,64 @@
1
+ # yolo.cpp — ggml YOLOv8n object detector
2
+
3
+ A self-contained C++ forward pass for **YOLOv8n** built directly on
4
+ [ggml](https://github.com/ggml-org/ggml). The CNN (backbone `Conv`/`C2f`/`SPPF`
5
+ → PAN-FPN neck → decoupled head) runs in ggml; letterbox preprocessing, the
6
+ final score threshold and NMS stay in TypeScript (`src/yolo-detector.ts`). The DFL
7
+ distribution decode, anchor/stride decode, and class sigmoid run in C++ here.
8
+
9
+ ggml is linked **statically**, so the build artifact `build/libyolo.<ext>` is a
10
+ single self-contained shared library with no external `ggml.dll`/`.so`
11
+ dependency — `bun:ffi` loads it directly.
12
+
13
+ ## Status: working & verified
14
+
15
+ `src/yolo.cpp` produces detections that match the upstream Ultralytics PyTorch
16
+ model to within fp32 rounding (box max |Δ| ≈ 0.001 px, class scores exact). See
17
+ `verify/` for the numerical check against a PyTorch reference.
18
+
19
+ ## Build
20
+
21
+ Requires CMake ≥ 3.20 and a C/C++ toolchain (MSVC Build Tools on Windows,
22
+ clang/gcc elsewhere). From the plugin root:
23
+
24
+ ```bash
25
+ bun run build:native # → native/yolo.cpp/build/libyolo.{dll,dylib,so}
26
+ # or directly:
27
+ bun native/yolo.cpp/build.mjs # CPU
28
+ bun native/yolo.cpp/build.mjs --metal # macOS GPU
29
+ bun native/yolo.cpp/build.mjs --cuda # NVIDIA GPU
30
+ ```
31
+
32
+ ## Convert weights → GGUF
33
+
34
+ Ultralytics ships under AGPL-3.0; we ship **no weights**. Convert them locally
35
+ (BatchNorm is folded into each conv at convert time):
36
+
37
+ ```bash
38
+ pip install ultralytics gguf numpy torch
39
+ bun run build:weights # → ~/.eliza/models/vision/yolov8n.gguf
40
+ # or directly:
41
+ python native/yolo.cpp/scripts/convert.py --variant yolov8n
42
+ ```
43
+
44
+ The runtime resolves the GGUF at `$ELIZA_STATE_DIR/models/vision/yolov8n.gguf`
45
+ (default `~/.eliza/...`); override with `ELIZA_YOLO_GGUF`. Override the library
46
+ path with `ELIZA_YOLO_LIB` and the CPU thread count with `ELIZA_YOLO_THREADS`
47
+ (defaults to ≈ physical cores).
48
+
49
+ ## Verify (numerical parity with PyTorch)
50
+
51
+ ```bash
52
+ python native/yolo.cpp/verify/make_ref.py # input.bin + ultralytics ref.bin
53
+ bun native/yolo.cpp/verify/run_ggml.mjs build/libyolo.dll <gguf> # → out.bin
54
+ python native/yolo.cpp/verify/compare.py # asserts PASS
55
+
56
+ # full TS path (FFI → parseYoloV8 → NMS) on a real image:
57
+ bun native/yolo.cpp/verify/run_ts.mjs
58
+ ```
59
+
60
+ ## License
61
+
62
+ The runtime in this directory is a clean-room implementation built on ggml. It
63
+ contains no Ultralytics code. YOLOv8 weights are AGPL-3.0 and are **not** bundled
64
+ — end users convert them locally via the script above.
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env node
2
+ // Build the self-contained ggml YOLOv8 native library (libyolo.<ext>).
3
+ //
4
+ // bun native/yolo.cpp/build.mjs # configure + build (Release)
5
+ //
6
+ // Produces native/yolo.cpp/build/libyolo.{dll,dylib,so} — the exact path the
7
+ // FFI loader (src/native/yolo-ffi.ts) probes. ggml is linked statically, so the
8
+ // artifact has no external ggml.dll/.so dependency.
9
+ //
10
+ // Requirements: CMake >= 3.20 and a C/C++ toolchain.
11
+ // - Windows: Visual Studio 2022 Build Tools (MSVC).
12
+ // - macOS: Xcode command line tools (clang). Pass --metal for the GPU path.
13
+ // - Linux: gcc/clang. Ninja is used when available.
14
+ import { spawnSync } from "node:child_process";
15
+ import { existsSync, readdirSync } from "node:fs";
16
+ import { dirname, join } from "node:path";
17
+ import { fileURLToPath } from "node:url";
18
+
19
// Directory that holds this script; the CMake source tree is rooted here.
const HERE = dirname(fileURLToPath(import.meta.url));
// CMake binary directory, also where libyolo.<ext> is expected to land.
const BUILD = join(HERE, "build");
const isWin = process.platform === "win32";
// Everything after the interpreter and the script path.
const [, , ...args] = process.argv;
// Optional GPU backends, selected with --metal / --cuda.
const withMetal = args.indexOf("--metal") !== -1;
const withCuda = args.indexOf("--cuda") !== -1;
25
+
26
/**
 * Run a command synchronously with inherited stdio and abort the script if it
 * does not succeed.
 *
 * The command line is echoed to stderr first. Three failure modes are
 * reported separately, because `spawnSync` sets `status` to null both when
 * the process could not be started and when it was killed by a signal:
 *   - spawn failure (e.g. the executable is not on PATH) → `r.error`
 *   - termination by signal                              → `r.signal`
 *   - non-zero exit code                                 → `r.status`
 *
 * @param {string} cmd Executable to launch.
 * @param {string[]} cmdArgs Arguments passed to the executable.
 * @returns {void} Returns only when the command exited with status 0;
 *   otherwise the process exits with the command's status (or 1).
 */
function run(cmd, cmdArgs) {
  console.error(`> ${cmd} ${cmdArgs.join(" ")}`);
  const r = spawnSync(cmd, cmdArgs, { stdio: "inherit" });
  if (r.error) {
    // The child never ran, so there is no exit status to report.
    console.error(
      `\n[yolo build] FAILED: could not run ${cmd}: ${r.error.message}`,
    );
    process.exit(1);
  }
  if (r.status !== 0) {
    const outcome = r.signal
      ? `was terminated by ${r.signal}`
      : `exited ${r.status}`;
    console.error(`\n[yolo build] FAILED: ${cmd} ${outcome}`);
    process.exit(r.status ?? 1);
  }
}
34
+
35
/**
 * Probe for a usable Ninja by running `ninja --version` with its output
 * discarded.
 *
 * @returns {boolean} True only when the probe exits with status 0.
 */
function hasNinja() {
  const { status } = spawnSync("ninja", ["--version"], { stdio: "ignore" });
  return status === 0;
}
39
+
40
// Arguments for the CMake configure step: source dir, build dir, then the
// ggml options this build always sets.
const configure = [
  "-S", HERE,
  "-B", BUILD,
  "-Wno-dev",
  "-DGGML_NATIVE=OFF",
  "-DGGML_OPENMP=OFF",
];
if (withMetal) {
  configure.push("-DYOLO_WITH_METAL=ON");
}
if (withCuda) {
  configure.push("-DYOLO_WITH_CUDA=ON");
  configure.push("-DGGML_CUDA=ON");
}

if (!isWin) {
  // Single-config generators take the build type at configure time.
  configure.push("-DCMAKE_BUILD_TYPE=Release");
  if (hasNinja()) {
    configure.push("-G", "Ninja");
  }
} else {
  configure.push("-G", "Visual Studio 17 2022", "-A", "x64");
}

// Configure, then build only the `yolo` target in Release.
run("cmake", configure);
run("cmake", ["--build", BUILD, "--config", "Release", "--target", "yolo"]);

// Shared-library extension for the host platform.
const ext = { win32: "dll", darwin: "dylib" }[process.platform] ?? "so";
const artifact = join(BUILD, `libyolo.${ext}`);
if (existsSync(artifact)) {
  console.error(`[yolo build] OK: ${artifact}`);
} else {
  // Some generators ignore the per-config output dir override; locate it.
  const libName = `libyolo.${ext}`;
  const stray = readdirSync(BUILD, { recursive: true }).find((entry) =>
    String(entry).endsWith(libName),
  );
  const report = stray
    ? `[yolo build] artifact at ${join(BUILD, String(stray))} (expected ${artifact})`
    : `[yolo build] WARNING: libyolo.${ext} not found under ${BUILD}`;
  console.error(report);
}
@@ -0,0 +1,62 @@
1
// yolo.h — C ABI for the ggml-backed YOLOv8 runtime.
//
// Forward pass + DFL/anchor decode + class sigmoid. Letterbox preprocessing and
// the final threshold/NMS stay in TypeScript (src/yolo-detector.ts).

#ifndef YOLO_H
#define YOLO_H

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Export macro so the C ABI symbols are visible to bun:ffi / dlopen. On MSVC,
// extern "C" functions in a DLL are NOT exported without __declspec(dllexport).
// Define YOLO_API before including this header to override it.
#ifndef YOLO_API
#  ifdef _WIN32
#    define YOLO_API __declspec(dllexport)
#  else
#    define YOLO_API __attribute__((visibility("default")))
#  endif
#endif

// Opaque runtime handle: obtained from yolo_init(), passed to yolo_run() /
// yolo_classes(), and released with yolo_free().
typedef struct yolo_ctx yolo_ctx;

// Status codes. The negative values are parenthesized so each macro expands
// to a single primary expression wherever it is used.
#define YOLO_OK            0
#define YOLO_ERR_FILE    (-1)
#define YOLO_ERR_FORMAT  (-2)
#define YOLO_ERR_OOM     (-3)
#define YOLO_ERR_SHAPE   (-4)
#define YOLO_ERR_BACKEND (-5)

// Create a runtime context from a GGUF file.
//
// Expected GGUF metadata:
//   - "yolo.variant"  = "yolov8n" | "yolov8s" | ...
//   - "yolo.input_h"  = int (typical 640)
//   - "yolo.input_w"  = int (typical 640)
//   - "yolo.classes"  = utf8 string (newline-separated, e.g. COCO 80)
//   - "yolo.strides"  = i32[3] (typical [8,16,32])
//
// NOTE(review): the failure convention is not visible in this header;
// presumably NULL is returned on error — confirm against src/yolo.cpp.
YOLO_API yolo_ctx * yolo_init(const char * gguf_path);

// Run one forward pass.
//
// rgb_chw:     CHW float32 RGB normalized to [0,1] (caller letterboxed to
//              input_h x input_w).
// h, w:        dimensions of rgb_chw.
// out_logits:  caller-allocated. Size must be (4 + num_classes) * num_anchors
//              float32. Despite the name, the header summary above says the
//              box decode and class sigmoid have already been applied.
// out_channels, out_anchors: filled by the call.
//
// NOTE(review): presumably returns YOLO_OK on success and one of the
// YOLO_ERR_* codes otherwise — confirm against src/yolo.cpp.
YOLO_API int yolo_run(yolo_ctx * ctx,
                      const float * rgb_chw,
                      int h, int w,
                      float * out_logits,
                      int * out_channels,
                      int * out_anchors);

// Returns the embedded UTF-8 class-names string (newline-separated, owned by ctx).
YOLO_API const char * yolo_classes(yolo_ctx * ctx);

// Release a context created by yolo_init().
YOLO_API void yolo_free(yolo_ctx * ctx);

#ifdef __cplusplus
}
#endif

#endif // YOLO_H
@@ -0,0 +1,248 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert Ultralytics YOLOv8 PyTorch checkpoints to GGUF for the yolo.cpp runtime.
4
+
5
+ Usage:
6
+ python scripts/convert.py --variant yolov8n
7
+ python scripts/convert.py --variant yolov8n --out <state-dir>/models/vision/yolov8n.gguf
8
+
9
+ Requirements (install before running):
10
+ pip install ultralytics gguf numpy torch
11
+
12
+ License note: Ultralytics ships under AGPL-3.0. This script reads the published
13
+ weights and writes them into a GGUF; the runtime (`src/yolo.cpp`) is a
14
+ clean-room ggml implementation. No Ultralytics code is copied into this repo.
15
+
16
+ What it does
17
+ ------------
18
+ Walks the DetectionModel module tree and emits a weight/bias tensor pair for each module of one of two kinds:
19
+
20
+ * ultralytics ``Conv`` (Conv2d + BatchNorm2d + SiLU): the BatchNorm is FOLDED
21
+ into the preceding conv at convert time, producing a plain conv weight +
22
+ bias. Emitted as ``<module>.weight`` (folded, shape [OC,IC,KH,KW]) and
23
+ ``<module>.bias`` (folded, [OC]). e.g. ``model.0.weight``, ``model.2.cv1.weight``.
24
+ * bare ``Conv2d`` (the head's per-scale stage-2 1x1 projection, which has its
25
+ own bias and no BN): emitted verbatim as ``<module>.weight`` / ``<module>.bias``.
26
+ e.g. ``model.22.cv2.0.2.weight``.
27
+
28
+ The DFL ``model.22.dfl.conv`` buffer (a fixed arange(16)) is intentionally
29
+ skipped — the C runtime recomputes the DFL expectation directly.
30
+
31
+ ggml reads tensor ``ne`` as the REVERSED numpy shape, so a PyTorch conv weight
32
+ of numpy shape ``(OC, IC, KH, KW)`` is read by ggml as ``ne=[KW,KH,IC,OC]`` —
33
+ exactly the ``ggml_conv_2d`` kernel layout. No transpose is needed.
34
+
35
+ Metadata KV entries (read by ``yolo_init``):
36
+ "yolo.variant" : str
37
+ "yolo.input_h" : u32
38
+ "yolo.input_w" : u32
39
+ "yolo.classes" : str (utf-8, newline separated, 80 COCO entries)
40
+ "yolo.strides" : i32[3]
41
+ """
42
+
43
+ import argparse
44
+ import os
45
+ import sys
46
+
47
# Class names written to the "yolo.classes" GGUF metadata entry. A name's list
# position is its class id, so the order must not change.
COCO_CLASSES = [
    # ids 0-9
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light",
    # ids 10-19
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",
    "cat", "dog", "horse", "sheep", "cow",
    # ids 20-29
    "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee",
    # ids 30-39
    "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    # ids 40-49
    "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange",
    # ids 50-59
    "broccoli", "carrot", "hot dog", "pizza", "donut",
    "cake", "chair", "couch", "potted plant", "bed",
    # ids 60-69
    "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven",
    # ids 70-79
    "toaster", "sink", "refrigerator", "book", "clock",
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
61
+
62
+
63
def main() -> int:
    """Convert an Ultralytics YOLOv8 checkpoint into a GGUF file.

    Parses the command line, loads the detection model, folds every
    BatchNorm into its preceding convolution, and writes the resulting
    weight/bias tensors plus the ``yolo.*`` metadata entries to the output
    path.

    Returns:
        0 on success. 2 when a dependency is missing, the checkpoint cannot
        be loaded (or is not trusted for direct unpickling), or no
        convertible layers were found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--variant",
        default="yolov8n",
        choices=("yolov8n", "yolov8s", "yolov8m", "yolov8l", "yolov8x"),
    )
    parser.add_argument(
        "--out",
        default=None,
        help="Output GGUF path. Defaults to "
        "$ELIZA_STATE_DIR/models/vision/<variant>.gguf "
        "(or ~/.eliza/models/vision/<variant>.gguf).",
    )
    parser.add_argument(
        "--weights",
        default=None,
        help="Path to the .pt checkpoint. Defaults to '<variant>.pt' "
        "(ultralytics auto-downloads it if absent).",
    )
    parser.add_argument(
        "--trust-checkpoint",
        action="store_true",
        help="Allow the torch.load(weights_only=False) fallback when the "
        "Ultralytics YOLO import fails. Only use with trusted checkpoints.",
    )
    args = parser.parse_args()

    # Heavy dependencies are imported only after argument parsing, so --help
    # and usage errors work without them and a missing package yields a hint.
    try:
        import numpy as np
        import torch
        import torch.nn as nn
    except ImportError as exc:
        print(f"missing dependency: {exc}. pip install torch numpy", file=sys.stderr)
        return 2
    try:
        import gguf
    except ImportError:
        print("gguf not installed. pip install gguf", file=sys.stderr)
        return 2

    out_path = args.out
    if not out_path:
        state_dir = os.environ.get(
            "ELIZA_STATE_DIR", os.path.join(os.path.expanduser("~"), ".eliza")
        )
        out_path = os.path.join(state_dir, "models", "vision", f"{args.variant}.gguf")
        args.out = out_path

    weights = args.weights or f"{args.variant}.pt"
    print(f"[convert] loading {weights}", file=sys.stderr)
    # Prefer ultralytics; fall back to loading the DetectionModel straight from
    # the checkpoint only when the operator explicitly trusts the file. PyTorch
    # full-checkpoint unpickling can execute code.
    try:
        from ultralytics import YOLO
    except Exception as exc:  # noqa: BLE001 - torchvision registration can fail here
        if isinstance(exc, ModuleNotFoundError) and exc.name == "ultralytics":
            print("ultralytics not installed. pip install ultralytics", file=sys.stderr)
            return 2
        trust_checkpoint = args.trust_checkpoint or os.environ.get(
            "ELIZA_YOLO_TRUST_CHECKPOINT"
        ) in {"1", "true", "yes"}
        if not trust_checkpoint:
            print(
                f"[convert] ultralytics import failed ({exc}). Direct "
                "torch.load fallback requires --trust-checkpoint or "
                "ELIZA_YOLO_TRUST_CHECKPOINT=1 because PyTorch checkpoint "
                "unpickling can execute code.",
                file=sys.stderr,
            )
            return 2
        print(
            f"[convert] ultralytics unavailable ({exc}); "
            "loading trusted DetectionModel directly from checkpoint",
            file=sys.stderr,
        )
        try:
            # SECURITY: weights_only=False unpickles arbitrary objects; this
            # line is reachable only after the explicit trust opt-in above.
            checkpoint = torch.load(weights, map_location="cpu", weights_only=False)
        except FileNotFoundError:
            print(
                f"checkpoint not found: {weights}. Install ultralytics to auto-download "
                "default weights, or pass --weights with a local .pt file.",
                file=sys.stderr,
            )
            return 2
        except Exception as load_exc:
            print(f"torch.load failed for {weights}: {load_exc}", file=sys.stderr)
            return 2
        if isinstance(checkpoint, dict):
            # Take the "ema" entry when present, otherwise "model".
            model = checkpoint.get("ema")
            if model is None:
                model = checkpoint.get("model")
        elif isinstance(checkpoint, nn.Module):
            model = checkpoint
        else:
            model = None
        if not isinstance(model, nn.Module):
            print(
                f"checkpoint {weights} does not contain a recoverable nn.Module "
                "in 'ema' or 'model'",
                file=sys.stderr,
            )
            return 2
    else:
        model = YOLO(weights).model  # DetectionModel (nn.Module)
    model.eval().float()

    out_dir = os.path.dirname(os.path.abspath(args.out))
    os.makedirs(out_dir, exist_ok=True)

    def fold_bn(conv, bn):
        """Return (weight, bias) of ``conv`` with ``bn``'s affine transform folded in."""
        w = conv.weight.detach().float()  # [OC,IC,KH,KW]
        oc = w.shape[0]
        b = (
            conv.bias.detach().float()
            if conv.bias is not None
            else torch.zeros(oc, dtype=torch.float32)
        )
        gamma = bn.weight.detach().float()
        beta = bn.bias.detach().float()
        mean = bn.running_mean.detach().float()
        var = bn.running_var.detach().float()
        std = torch.sqrt(var + bn.eps)
        # Scale each output channel's kernel by gamma/std, and shift the bias
        # by the normalised mean plus beta.
        w_folded = w * (gamma / std).reshape(-1, 1, 1, 1)
        b_folded = beta + (b - mean) * gamma / std
        return w_folded, b_folded

    def as_f32(t):
        """Convert a tensor to a contiguous float32 numpy array."""
        return np.ascontiguousarray(t.detach().cpu().numpy().astype(np.float32))

    emitted = []

    writer = gguf.GGUFWriter(args.out, "yolo")

    def emit(name, w, b):
        """Queue ``<name>.weight`` / ``<name>.bias`` and record their shapes."""
        writer.add_tensor(name + ".weight", as_f32(w))
        writer.add_tensor(name + ".bias", as_f32(b))
        emitted.append((name, tuple(w.shape), tuple(b.shape)))

    # Everything that touches the writer runs under try/finally so the writer
    # is closed even when tensor conversion or file output raises.
    try:
        writer.add_string("yolo.variant", args.variant)
        writer.add_uint32("yolo.input_h", 640)
        writer.add_uint32("yolo.input_w", 640)
        writer.add_string("yolo.classes", "\n".join(COCO_CLASSES))
        writer.add_array("yolo.strides", [8, 16, 32])

        n_conv = n_bare = 0
        for name, m in model.named_modules():
            cls = type(m).__name__
            if cls == "Conv" and hasattr(m, "conv") and hasattr(m, "bn"):
                # ultralytics CBS: fold BN into the conv.
                if isinstance(m.conv, nn.Conv2d) and isinstance(m.bn, nn.BatchNorm2d):
                    w, b = fold_bn(m.conv, m.bn)
                    emit(name, w, b)
                    n_conv += 1
                else:
                    # Report the layer instead of dropping it silently; its
                    # tensors will be absent from the GGUF.
                    print(
                        f"[convert] WARNING: skipping {name}: expected "
                        "Conv2d + BatchNorm2d, got "
                        f"{type(m.conv).__name__} + {type(m.bn).__name__}",
                        file=sys.stderr,
                    )
            elif cls == "Conv2d":
                # bare Conv2d. Skip the inner conv of a CBS (handled above) and the
                # fixed DFL buffer (recomputed in C). Keep only the head stage-2 1x1.
                if name.endswith(".conv"):
                    continue
                if ".dfl" in name:
                    continue
                if m.bias is None:
                    b = torch.zeros(m.weight.shape[0], dtype=torch.float32)
                else:
                    b = m.bias
                emit(name, m.weight, b)
                n_bare += 1

        if not emitted:
            # A metadata-only GGUF carries no weights, so treat it as a failure.
            print(
                f"[convert] no convertible Conv/Conv2d layers found in {weights}; "
                "nothing written",
                file=sys.stderr,
            )
            return 2

        print(
            f"[convert] folded {n_conv} CBS convs + {n_bare} bare head convs "
            f"= {len(emitted)} tensors",
            file=sys.stderr,
        )
        for name, ws, bs in emitted:
            print(f"  {name:<28} w{ws} b{bs}", file=sys.stderr)

        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file()
    finally:
        writer.close()
    print(f"[convert] wrote {args.out}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())