npm - @elizaos/plugin-vision - Versions diffs - 2.0.0-beta.1 → 2.0.3-beta.5 - Mend

@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (128) hide show

package/LICENSE +21 -0
package/README.md +73 -301
package/dist/action.d.ts +3 -0
package/dist/action.d.ts.map +1 -0
package/dist/audio-capture-stream.d.ts +42 -0
package/dist/audio-capture-stream.d.ts.map +1 -0
package/dist/audio-capture.d.ts +25 -0
package/dist/audio-capture.d.ts.map +1 -0
package/dist/computeruse-ocr-bridge.d.ts +50 -0
package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
package/dist/config.d.ts +68 -0
package/dist/config.d.ts.map +1 -0
package/dist/describe-backpressure.d.ts +90 -0
package/dist/describe-backpressure.d.ts.map +1 -0
package/dist/dirty-tile-describer.d.ts +102 -0
package/dist/dirty-tile-describer.d.ts.map +1 -0
package/dist/dirty-tile-scene.d.ts +56 -0
package/dist/dirty-tile-scene.d.ts.map +1 -0
package/dist/entity-tracker.d.ts +33 -0
package/dist/entity-tracker.d.ts.map +1 -0
package/dist/face-detector-ggml.d.ts +60 -0
package/dist/face-detector-ggml.d.ts.map +1 -0
package/dist/face-detector-mediapipe.d.ts +25 -0
package/dist/face-detector-mediapipe.d.ts.map +1 -0
package/dist/face-recognition-ggml.d.ts +94 -0
package/dist/face-recognition-ggml.d.ts.map +1 -0
package/dist/get-screen-elements.d.ts +90 -0
package/dist/get-screen-elements.d.ts.map +1 -0
package/dist/get-screen.d.ts +60 -0
package/dist/get-screen.d.ts.map +1 -0
package/dist/image/sharp-compat.d.ts +89 -0
package/dist/image/sharp-compat.d.ts.map +1 -0
package/dist/image-input.d.ts +15 -0
package/dist/image-input.d.ts.map +1 -0
package/dist/index.d.ts +4 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +7957 -6238
package/dist/index.js.map +41 -26
package/dist/lifecycle.d.ts +94 -0
package/dist/lifecycle.d.ts.map +1 -0
package/dist/mobile/capacitor-camera.d.ts +85 -0
package/dist/mobile/capacitor-camera.d.ts.map +1 -0
package/dist/native/doctr-ffi.d.ts +40 -0
package/dist/native/doctr-ffi.d.ts.map +1 -0
package/dist/native/yolo-ffi.d.ts +21 -0
package/dist/native/yolo-ffi.d.ts.map +1 -0
package/dist/ocr-host-windows.d.ts +34 -0
package/dist/ocr-host-windows.d.ts.map +1 -0
package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
package/dist/ocr-service-doctr.d.ts +61 -0
package/dist/ocr-service-doctr.d.ts.map +1 -0
package/dist/ocr-service-linux-tesseract.d.ts +85 -0
package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
package/dist/ocr-service-paddleocr.d.ts +59 -0
package/dist/ocr-service-paddleocr.d.ts.map +1 -0
package/dist/ocr-service-windows.d.ts +41 -0
package/dist/ocr-service-windows.d.ts.map +1 -0
package/dist/ocr-service.d.ts +91 -0
package/dist/ocr-service.d.ts.map +1 -0
package/dist/ocr-with-coords.d.ts +103 -0
package/dist/ocr-with-coords.d.ts.map +1 -0
package/dist/person-detector.d.ts +17 -0
package/dist/person-detector.d.ts.map +1 -0
package/dist/provider.d.ts +3 -0
package/dist/provider.d.ts.map +1 -0
package/dist/routes.d.ts +7 -0
package/dist/routes.d.ts.map +1 -0
package/dist/screen-capture-bridge.d.ts +51 -0
package/dist/screen-capture-bridge.d.ts.map +1 -0
package/dist/screen-capture.d.ts +17 -0
package/dist/screen-capture.d.ts.map +1 -0
package/dist/screen-tiler.d.ts +75 -0
package/dist/screen-tiler.d.ts.map +1 -0
package/dist/service.d.ts +176 -0
package/dist/service.d.ts.map +1 -0
package/dist/set-of-marks-provider.d.ts +64 -0
package/dist/set-of-marks-provider.d.ts.map +1 -0
package/dist/som.d.ts +135 -0
package/dist/som.d.ts.map +1 -0
package/dist/som.js +184 -0
package/dist/som.js.map +11 -0
package/dist/test-input.d.ts +25 -0
package/dist/test-input.d.ts.map +1 -0
package/dist/types.d.ts +241 -0
package/dist/types.d.ts.map +1 -0
package/dist/vision-context-augmenter.d.ts +93 -0
package/dist/vision-context-augmenter.d.ts.map +1 -0
package/dist/vision-worker-manager.d.ts +51 -0
package/dist/vision-worker-manager.d.ts.map +1 -0
package/dist/workers/ocr-worker.d.ts +2 -0
package/dist/workers/ocr-worker.d.ts.map +1 -0
package/dist/workers/ocr-worker.js +1075 -7821
package/dist/workers/ocr-worker.js.map +10 -51
package/dist/workers/screen-capture-worker.d.ts +2 -0
package/dist/workers/screen-capture-worker.d.ts.map +1 -0
package/dist/workers/screen-capture-worker.js +364 -6
package/dist/workers/screen-capture-worker.js.map +5 -4
package/dist/workers/worker-logger.d.ts +10 -0
package/dist/workers/worker-logger.d.ts.map +1 -0
package/dist/yolo-detector.d.ts +37 -0
package/dist/yolo-detector.d.ts.map +1 -0
package/native/doctr.cpp/CMakeLists.txt +58 -0
package/native/doctr.cpp/README.md +62 -0
package/native/doctr.cpp/include/doctr.h +91 -0
package/native/doctr.cpp/scripts/convert.py +98 -0
package/native/doctr.cpp/src/doctr_det.cpp +112 -0
package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
package/native/macos-vision-ocr.swift +113 -0
package/native/mobilefacenet.cpp/README.md +13 -0
package/native/movenet.cpp/README.md +10 -0
package/native/retinaface.cpp/README.md +12 -0
package/native/yolo.cpp/CMakeLists.txt +57 -0
package/native/yolo.cpp/README.md +64 -0
package/native/yolo.cpp/build.mjs +76 -0
package/native/yolo.cpp/include/yolo.h +62 -0
package/native/yolo.cpp/scripts/convert.py +248 -0
package/native/yolo.cpp/src/yolo.cpp +425 -0
package/native/yolo.cpp/verify/compare.py +99 -0
package/native/yolo.cpp/verify/make_ref.py +75 -0
package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
package/native/yolo.cpp/verify/run_ts.mjs +26 -0
package/package.json +39 -21
package/registry-entry.json +43 -0
package/scripts/vendor-tesseract-linux.mjs +177 -0
package/build.config.ts +0 -89
package/dist/workers/florence2-worker.js +0 -779
package/dist/workers/florence2-worker.js.map +0 -13

package/native/yolo.cpp/src/yolo.cpp ADDED Viewed

@@ -0,0 +1,425 @@
+// yolo.cpp — YOLOv8n forward pass via ggml.
+//
+// Implements the full YOLOv8 nano graph on ggml's CPU backend:
+//   backbone (Conv / C2f / SPPF) → PAN-FPN neck → decoupled head (cv2 box + cv3 cls).
+// The CNN runs in ggml; the cheap tail (DFL distribution decode, anchor/stride
+// decode to pixel cx/cy/w/h, and class sigmoid) runs in plain C++ here so the
+// emitted tensor is exactly what `src/yolo-detector.ts::parseYoloV8` expects:
+//
+//   out_logits laid out [channels=4+nc=84, anchors=8400], channel-major
+//   (value(c,a) = out_logits[c*8400 + a]); rows 0..3 = cx,cy,w,h in 640x640
+//   letterboxed input pixels; rows 4..83 = per-class probabilities (sigmoid).
+//   Anchor order P3(6400) → P4(1600) → P5(400).
+//
+// Weights come from the GGUF written by scripts/convert.py: BatchNorm is folded
+// into each conv at convert time, so every CBS conv is a plain conv weight+bias
+// followed by SiLU; the head's stage-2 1x1 convs are plain conv+bias (no act).
+// The DFL buffer is NOT stored — the expectation over reg_max=16 bins is
+// computed directly below.
+#include "yolo.h"
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <thread>
+#include <vector>
+#if defined(YOLO_HAVE_GGML)
+#  include "ggml.h"
+#  include "ggml-alloc.h"
+#  include "ggml-backend.h"
+#  include "ggml-cpu.h"
+#  include "gguf.h"
+#endif
+// Per-model runtime state: created by yolo_init, consumed by yolo_run /
+// yolo_classes, released by yolo_free.
+struct yolo_ctx {
+    std::string gguf_path;  // path that was passed to yolo_init
+    std::string classes;    // value of the "yolo.classes" GGUF string key (empty if absent)
+    int input_h = 640;      // input height yolo_run accepts
+    int input_w = 640;      // input width yolo_run accepts
+#if defined(YOLO_HAVE_GGML)
+    ggml_backend_t        backend = nullptr;  // backend from ggml_backend_cpu_init()
+    struct ggml_context * wctx    = nullptr;  // weights (named tensors, backend buffer)
+    ggml_backend_buffer_t wbuf    = nullptr;  // buffer backing the tensors in wctx
+#endif
+};
+#if defined(YOLO_HAVE_GGML)
+// ---- weight lookup ---------------------------------------------------------
+// Fetch the weight tensor called `name` from `wc`. A miss is reported on
+// stderr and signalled to the caller with nullptr.
+static struct ggml_tensor * w_get(struct ggml_context * wc, const std::string & name) {
+    if (struct ggml_tensor * found = ggml_get_tensor(wc, name.c_str())) {
+        return found;
+    }
+    fprintf(stderr, "[yolo] missing tensor '%s'\n", name.c_str());
+    return nullptr;
+}
+// ---- graph helpers ---------------------------------------------------------
+//
+// `g`  = compute-graph context (no_alloc); `wc` = weights context.
+// A "CBS" conv = conv2d(stride) + per-channel bias + SiLU. Padding is derived
+// from the kernel width (k3 -> 1, k1 -> 0). BN is already folded into w/b.
+// Conv2d + per-channel bias, no activation.
+//   g       compute-graph context the new ops are created in
+//   wc      weights context holding "<name>.weight" and "<name>.bias"
+//   x       input activation; nullptr is accepted and propagated
+//   name    tensor-name prefix of the layer
+//   stride  stride used on both spatial axes
+// Returns the biased conv output, or nullptr when the input or either weight
+// tensor is missing.
+static struct ggml_tensor * conv_core(struct ggml_context * g, struct ggml_context * wc,
+                                      struct ggml_tensor * x, const std::string & name,
+                                      int stride) {
+    // A failed upstream layer hands us nullptr; pass the failure along instead
+    // of letting ggml_conv_2d dereference it.
+    if (!x) return nullptr;
+    struct ggml_tensor * w = w_get(wc, name + ".weight");
+    struct ggml_tensor * b = w_get(wc, name + ".bias");
+    if (!w || !b) return nullptr;
+    const int pad = (int) (w->ne[0] / 2);  // ne[0] = KW; k3 -> 1, k1 -> 0
+    struct ggml_tensor * y = ggml_conv_2d(g, w, x, stride, stride, pad, pad, 1, 1);
+    // bias broadcast over [OW, OH, OC, N]
+    y = ggml_add(g, y, ggml_reshape_4d(g, b, 1, 1, b->ne[0], 1));
+    return y;
+}
+// CBS layer: conv_core followed by SiLU. A nullptr from conv_core is returned
+// unchanged.
+static struct ggml_tensor * conv_bn(struct ggml_context * g, struct ggml_context * wc,
+                                    struct ggml_tensor * x, const std::string & name,
+                                    int stride) {
+    struct ggml_tensor * pre_act = conv_core(g, wc, x, name, stride);
+    return pre_act ? ggml_silu(g, pre_act) : nullptr;
+}
+// Activation-free conv used for the head's stage-2 layers: exactly conv_core.
+static struct ggml_tensor * conv_plain(struct ggml_context * g, struct ggml_context * wc,
+                                       struct ggml_tensor * x, const std::string & name,
+                                       int stride) {
+    struct ggml_tensor * linear_out = conv_core(g, wc, x, name, stride);
+    return linear_out;
+}
+// Bottleneck: two chained CBS convs ("<prefix>.cv1" then "<prefix>.cv2"),
+// optionally summed with the input.
+//   x       input activation; nullptr is accepted and propagated
+//   prefix  tensor-name prefix of the bottleneck
+//   add     when true the result is x + conv(conv(x))
+// Returns nullptr if the input or either conv could not be built.
+static struct ggml_tensor * bottleneck(struct ggml_context * g, struct ggml_context * wc,
+                                       struct ggml_tensor * x, const std::string & prefix,
+                                       bool add) {
+    if (!x) return nullptr;
+    struct ggml_tensor * h = conv_bn(g, wc, x, prefix + ".cv1", 1);  // 3x3
+    // Stop here on failure: the second conv must never receive a null input.
+    if (!h) return nullptr;
+    h = conv_bn(g, wc, h, prefix + ".cv2", 1);                       // 3x3
+    if (!h) return nullptr;
+    return add ? ggml_add(g, x, h) : h;
+}
+// C2f block: conv "<prefix>.cv1", split its output channels into two halves,
+// run `n` chained bottlenecks ("<prefix>.m.<j>") starting from the second half,
+// concatenate both halves plus every bottleneck output on the channel axis and
+// fuse the result with conv "<prefix>.cv2".
+//   n    number of bottlenecks
+//   add  forwarded to bottleneck() to enable its residual add
+// Returns nullptr if cv1 or any bottleneck could not be built.
+// NOTE(review): the 3-D views below drop ne[3], so this presumably relies on a
+// batch size of 1 — confirm against callers.
+static struct ggml_tensor * c2f(struct ggml_context * g, struct ggml_context * wc,
+                                struct ggml_tensor * x, const std::string & prefix,
+                                int n, bool add) {
+    struct ggml_tensor * y = conv_bn(g, wc, x, prefix + ".cv1", 1);  // 1x1 -> 2*hidden
+    if (!y) return nullptr;
+    const int64_t W = y->ne[0], H = y->ne[1];
+    const int64_t hid = y->ne[2] / 2;
+    // split channels into two halves; cont() so each is a clean conv input.
+    struct ggml_tensor * y0 =
+        ggml_cont(g, ggml_view_3d(g, y, W, H, hid, y->nb[1], y->nb[2], 0));
+    // second half starts `hid` channel-planes into y
+    struct ggml_tensor * y1 =
+        ggml_cont(g, ggml_view_3d(g, y, W, H, hid, y->nb[1], y->nb[2], (size_t) hid * y->nb[2]));
+    std::vector<struct ggml_tensor *> outs;
+    outs.push_back(y0);
+    outs.push_back(y1);
+    // each bottleneck consumes the previous one's output; all are kept for the concat
+    struct ggml_tensor * prev = y1;
+    for (int j = 0; j < n; j++) {
+        prev = bottleneck(g, wc, prev, prefix + ".m." + std::to_string(j), add);
+        if (!prev) return nullptr;
+        outs.push_back(prev);
+    }
+    struct ggml_tensor * acc = outs[0];
+    for (size_t k = 1; k < outs.size(); k++) {
+        acc = ggml_concat(g, acc, outs[k], 2);  // concat on channels
+    }
+    return conv_bn(g, wc, acc, prefix + ".cv2", 1);  // 1x1 fuse -> c_out
+}
+// SPPF: conv "<prefix>.cv1", three chained 5x5 / stride-1 / pad-2 max-pools,
+// channel-concat of the conv output with all three pooled maps, then conv
+// "<prefix>.cv2". Returns nullptr when cv1 cannot be built.
+static struct ggml_tensor * sppf(struct ggml_context * g, struct ggml_context * wc,
+                                 struct ggml_tensor * x, const std::string & prefix) {
+    struct ggml_tensor * reduced = conv_bn(g, wc, x, prefix + ".cv1", 1);  // 1x1 -> c_
+    if (reduced == nullptr) return nullptr;
+    auto pool5 = [g](struct ggml_tensor * t) {
+        return ggml_pool_2d(g, t, GGML_OP_POOL_MAX, 5, 5, 1, 1, 2.0f, 2.0f);
+    };
+    // stages[0] is the conv output, stages[i] the i-th successive pool of it
+    struct ggml_tensor * stages[4] = { reduced, nullptr, nullptr, nullptr };
+    for (int i = 1; i < 4; i++) {
+        stages[i] = pool5(stages[i - 1]);
+    }
+    struct ggml_tensor * merged = stages[0];
+    for (int i = 1; i < 4; i++) {
+        merged = ggml_concat(g, merged, stages[i], 2);
+    }
+    return conv_bn(g, wc, merged, prefix + ".cv2", 1);  // 1x1 -> c2
+}
+// Build the full YOLOv8n graph; fills box[3] + cls[3] head outputs (P3,P4,P5).
+//   g    compute-graph context the ops are created in
+//   wc   weights context queried by layer name ("model.<i>...")
+//   inp  network input tensor
+// Returns false as soon as any layer cannot be built (e.g. a weight tensor is
+// missing from `wc`), so a null tensor is never handed to a ggml op. box[] and
+// cls[] are only fully populated when true is returned.
+// Trailing comments give (channels, spatial size) for the 640x640 input.
+static bool build_yolov8n(struct ggml_context * g, struct ggml_context * wc,
+                          struct ggml_tensor * inp,
+                          struct ggml_tensor * box[3], struct ggml_tensor * cls[3]) {
+    if (!inp) return false;
+    // backbone
+    struct ggml_tensor * x = conv_bn(g, wc, inp, "model.0", 2);     // 16, 320
+    if (!x) return false;
+    x = conv_bn(g, wc, x, "model.1", 2);                            // 32, 160
+    if (!x) return false;
+    x = c2f(g, wc, x, "model.2", 1, true);                          // 32, 160
+    if (!x) return false;
+    x = conv_bn(g, wc, x, "model.3", 2);                            // 64, 80
+    if (!x) return false;
+    struct ggml_tensor * p3 = c2f(g, wc, x, "model.4", 2, true);    // 64, 80   (P3 src)
+    if (!p3) return false;
+    x = conv_bn(g, wc, p3, "model.5", 2);                           // 128, 40
+    if (!x) return false;
+    struct ggml_tensor * p4 = c2f(g, wc, x, "model.6", 2, true);    // 128, 40  (P4 src)
+    if (!p4) return false;
+    x = conv_bn(g, wc, p4, "model.7", 2);                           // 256, 20
+    if (!x) return false;
+    x = c2f(g, wc, x, "model.8", 1, true);                          // 256, 20
+    if (!x) return false;
+    struct ggml_tensor * p5 = sppf(g, wc, x, "model.9");            // 256, 20  (P5 src)
+    if (!p5) return false;
+    // neck (PAN-FPN): every c2f/conv result is checked before it is upscaled
+    // or concatenated, because those ggml ops dereference their inputs.
+    struct ggml_tensor * u = ggml_upscale(g, p5, 2, GGML_SCALE_MODE_NEAREST);  // 256, 40
+    x = ggml_concat(g, u, p4, 2);                                   // 384, 40
+    struct ggml_tensor * n12 = c2f(g, wc, x, "model.12", 1, false); // 128, 40
+    if (!n12) return false;
+    u = ggml_upscale(g, n12, 2, GGML_SCALE_MODE_NEAREST);           // 128, 80
+    x = ggml_concat(g, u, p3, 2);                                   // 192, 80
+    struct ggml_tensor * n15 = c2f(g, wc, x, "model.15", 1, false); // 64, 80   (head P3)
+    if (!n15) return false;
+    x = conv_bn(g, wc, n15, "model.16", 2);                         // 64, 40
+    if (!x) return false;
+    x = ggml_concat(g, x, n12, 2);                                  // 192, 40
+    struct ggml_tensor * n18 = c2f(g, wc, x, "model.18", 1, false); // 128, 40  (head P4)
+    if (!n18) return false;
+    x = conv_bn(g, wc, n18, "model.19", 2);                         // 128, 20
+    if (!x) return false;
+    x = ggml_concat(g, x, p5, 2);                                   // 384, 20
+    struct ggml_tensor * n21 = c2f(g, wc, x, "model.21", 1, false); // 256, 20  (head P5)
+    if (!n21) return false;
+    // decoupled head: a box branch (cv2) and a class branch (cv3) per scale
+    struct ggml_tensor * feats[3] = { n15, n18, n21 };
+    for (int s = 0; s < 3; s++) {
+        const std::string cv2 = "model.22.cv2." + std::to_string(s);
+        const std::string cv3 = "model.22.cv3." + std::to_string(s);
+        struct ggml_tensor * b = conv_bn(g, wc, feats[s], cv2 + ".0", 1);
+        if (b) b = conv_bn(g, wc, b, cv2 + ".1", 1);
+        if (b) b = conv_plain(g, wc, b, cv2 + ".2", 1);  // 64 ch (4*reg_max)
+        struct ggml_tensor * c = conv_bn(g, wc, feats[s], cv3 + ".0", 1);
+        if (c) c = conv_bn(g, wc, c, cv3 + ".1", 1);
+        if (c) c = conv_plain(g, wc, c, cv3 + ".2", 1);  // 80 ch (nc)
+        if (!b || !c) return false;
+        box[s] = b;
+        cls[s] = c;
+    }
+    return true;
+}
+// CPU-side DFL + decode + sigmoid → out_logits [84, 8400] (channel-major).
+// Decodes one scale's head output into the shared output buffer.
+//   box      box-branch values read as box[(side * 16 + bin) * W*H + cell]
+//   cls      class logits read as cls[class * W*H + cell]
+//   W, H     grid width / height of this scale
+//   stride   factor that scales grid-cell units to output units
+//   base     index of this scale's first anchor in the output
+//   nc       number of classes
+//   anchors  row length of `out` (total anchors over all scales)
+//   out      destination; rows 0..3 = cx,cy,w,h, rows 4..4+nc-1 = sigmoid(class logit)
+// No bounds checks are performed: the caller must size box/cls/out to match.
+static void decode_head(const std::vector<float> & box, const std::vector<float> & cls,
+                        int W, int H, int stride, int base, int nc, int anchors,
+                        float * out) {
+    const int reg = 16;       // reg_max
+    const int WH = W * H;
+    for (int gy = 0; gy < H; gy++) {
+        for (int gx = 0; gx < W; gx++) {
+            const int cell = gy * W + gx;
+            float dist[4];
+            for (int side = 0; side < 4; side++) {
+                // max over the bins, subtracted below so expf cannot overflow
+                float mx = -1e30f;
+                for (int j = 0; j < reg; j++) {
+                    float z = box[(size_t) (side * reg + j) * WH + cell];
+                    if (z > mx) mx = z;
+                }
+                // softmax-weighted mean of the bin index j
+                float sum = 0.0f, acc = 0.0f;
+                for (int j = 0; j < reg; j++) {
+                    float e = expf(box[(size_t) (side * reg + j) * WH + cell] - mx);
+                    sum += e;
+                    acc += e * j;
+                }
+                dist[side] = acc / sum;  // expected distance in grid cells
+            }
+            // anchor point is the cell centre; dist[0..1] are subtracted from
+            // it and dist[2..3] added to get the two box corners
+            const float ax = gx + 0.5f, ay = gy + 0.5f;
+            const float x1 = ax - dist[0], y1 = ay - dist[1];
+            const float x2 = ax + dist[2], y2 = ay + dist[3];
+            const int a = base + cell;  // global anchor index
+            out[0 * anchors + a] = (x1 + x2) * 0.5f * stride;  // cx
+            out[1 * anchors + a] = (y1 + y2) * 0.5f * stride;  // cy
+            out[2 * anchors + a] = (x2 - x1) * stride;         // w
+            out[3 * anchors + a] = (y2 - y1) * stride;         // h
+            for (int cc = 0; cc < nc; cc++) {
+                float v = cls[(size_t) cc * WH + cell];
+                out[(4 + cc) * anchors + a] = 1.0f / (1.0f + expf(-v));
+            }
+        }
+    }
+}
+#endif  // YOLO_HAVE_GGML
+// ---------------------------------------------------------------------------
+//  C ABI
+// ---------------------------------------------------------------------------
+// Load a YOLOv8n GGUF and return a ready-to-run context.
+//   gguf_path  path of the GGUF file; nullptr yields nullptr
+// Returns a context to be released with yolo_free, or nullptr on any failure
+// (allocation, backend init, unreadable GGUF, tensor listed but not loaded).
+// When built without YOLO_HAVE_GGML this always returns nullptr.
+extern "C" yolo_ctx * yolo_init(const char * gguf_path) {
+    if (!gguf_path) return nullptr;
+#if defined(YOLO_HAVE_GGML)
+    yolo_ctx * ctx = new (std::nothrow) yolo_ctx();
+    if (!ctx) return nullptr;
+    ctx->gguf_path = gguf_path;
+    struct ggml_context * tmp  = nullptr;  // throwaway ctx holding the file's tensor data
+    struct gguf_context * gguf = nullptr;
+    // Single teardown path for every failure below: releases whatever has been
+    // created so far and yields the nullptr the caller returns.
+    auto fail = [&]() -> yolo_ctx * {
+        if (gguf)         gguf_free(gguf);
+        if (tmp)          ggml_free(tmp);
+        if (ctx->wbuf)    ggml_backend_buffer_free(ctx->wbuf);
+        if (ctx->wctx)    ggml_free(ctx->wctx);
+        if (ctx->backend) ggml_backend_free(ctx->backend);
+        delete ctx;
+        return nullptr;
+    };
+    ctx->backend = ggml_backend_cpu_init();
+    if (!ctx->backend) return fail();
+    {
+        // YOLOv8n is ~250 small conv ops; ggml's per-op thread barrier means
+        // too many threads spin-waits and gets *slower* (hyperthread
+        // oversubscription is catastrophic here). Default to ~physical cores
+        // (logical/2), capped at 8; allow override via ELIZA_YOLO_THREADS.
+        int nth = 4;
+        unsigned hw = std::thread::hardware_concurrency();
+        if (hw > 0) {
+            nth = (int) (hw / 2);
+            if (nth < 1) nth = 1;
+            if (nth > 8) nth = 8;
+        }
+        if (const char * env = std::getenv("ELIZA_YOLO_THREADS")) {
+            int v = std::atoi(env);
+            if (v > 0 && v <= 128) nth = v;
+        }
+        ggml_backend_cpu_set_n_threads(ctx->backend, nth);
+    }
+    // 1. load the gguf into a throwaway ctx with data (no_alloc=false).
+    struct gguf_init_params gp = { /*no_alloc=*/false, /*ctx=*/&tmp };
+    gguf = gguf_init_from_file(gguf_path, gp);
+    if (!gguf || !tmp) {
+        fprintf(stderr, "[yolo] failed to open gguf %s\n", gguf_path);
+        return fail();
+    }
+    // metadata: class names (string). Input dims are fixed at 640x640; they
+    // are not read from the file.
+    int64_t kc = gguf_find_key(gguf, "yolo.classes");
+    if (kc >= 0 && gguf_get_kv_type(gguf, kc) == GGUF_TYPE_STRING) {
+        ctx->classes = gguf_get_val_str(gguf, kc);
+    }
+    ctx->input_h = 640;
+    ctx->input_w = 640;
+    // 2. metadata-only duplicate into the weights ctx (no_alloc), then back it
+    //    with a CPU buffer and upload each tensor's bytes.
+    const int64_t n = gguf_get_n_tensors(gguf);
+    struct ggml_init_params ip = {
+        /*mem_size=*/ ggml_tensor_overhead() * (size_t) (n + 8),
+        /*mem_buffer=*/ nullptr,
+        /*no_alloc=*/ true,
+    };
+    ctx->wctx = ggml_init(ip);
+    if (!ctx->wctx) return fail();
+    for (int64_t i = 0; i < n; i++) {
+        const char * name = gguf_get_tensor_name(gguf, i);
+        struct ggml_tensor * src = ggml_get_tensor(tmp, name);
+        if (!src) {
+            // listed in the gguf index but absent from the data ctx: refuse
+            // rather than dereferencing null in ggml_dup_tensor.
+            fprintf(stderr, "[yolo] tensor '%s' listed in gguf but not loaded\n", name);
+            return fail();
+        }
+        struct ggml_tensor * dst = ggml_dup_tensor(ctx->wctx, src);
+        ggml_set_name(dst, name);
+    }
+    ctx->wbuf = ggml_backend_alloc_ctx_tensors(ctx->wctx, ctx->backend);
+    if (!ctx->wbuf) return fail();
+    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx->wctx); cur;
+         cur = ggml_get_next_tensor(ctx->wctx, cur)) {
+        struct ggml_tensor * src = ggml_get_tensor(tmp, ggml_get_name(cur));
+        if (!src) {
+            fprintf(stderr, "[yolo] tensor '%s' has no source data\n", ggml_get_name(cur));
+            return fail();
+        }
+        ggml_backend_tensor_set(cur, ggml_get_data(src), 0, ggml_nbytes(src));
+    }
+    gguf_free(gguf);
+    ggml_free(tmp);
+    fprintf(stderr, "[yolo] initialized %s (%lld tensors, backend=%s)\n",
+            gguf_path, (long long) n, ggml_backend_name(ctx->backend));
+    return ctx;
+#else
+    fprintf(stderr, "[yolo] built without YOLO_HAVE_GGML — weights cannot load.\n");
+    return nullptr;
+#endif
+}
+// Run one forward pass and decode the result into `out_logits`.
+//   ctx           context from yolo_init
+//   rgb_chw       input floats; ggml_nbytes of a [w, h, 3, 1] F32 tensor are read
+//   h, w          must equal ctx->input_h / ctx->input_w
+//   out_logits    caller buffer; (4 + 80) * 8400 floats are zeroed, then filled
+//   out_channels  receives 84 on success
+//   out_anchors   receives 8400 on success
+// Returns YOLO_OK, or YOLO_ERR_SHAPE (null argument or size mismatch),
+// YOLO_ERR_OOM (context / graph allocation), YOLO_ERR_FORMAT (graph could not
+// be built) or YOLO_ERR_BACKEND (compute failed). Without YOLO_HAVE_GGML both
+// counts are set to 0 and YOLO_ERR_BACKEND is returned.
+extern "C" int yolo_run(yolo_ctx * ctx,
+                        const float * rgb_chw,
+                        int h, int w,
+                        float * out_logits,
+                        int * out_channels,
+                        int * out_anchors) {
+    if (!ctx || !rgb_chw || !out_logits || !out_channels || !out_anchors) {
+        return YOLO_ERR_SHAPE;
+    }
+    if (h != ctx->input_h || w != ctx->input_w) return YOLO_ERR_SHAPE;
+#if defined(YOLO_HAVE_GGML)
+    const int nc = 80;
+    const int anchors = 8400;
+    // compute-graph context (no_alloc; gallocr assigns activation buffers).
+    size_t cmem = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    struct ggml_context * g = ggml_init({ cmem, nullptr, /*no_alloc=*/true });
+    if (!g) return YOLO_ERR_OOM;
+    struct ggml_tensor * inp = ggml_new_tensor_4d(g, GGML_TYPE_F32, w, h, 3, 1);  // [W,H,C,N]
+    ggml_set_name(inp, "input");
+    ggml_set_input(inp);
+    struct ggml_tensor * box[3] = { nullptr, nullptr, nullptr };
+    struct ggml_tensor * cls[3] = { nullptr, nullptr, nullptr };
+    if (!build_yolov8n(g, ctx->wctx, inp, box, cls)) {
+        ggml_free(g);
+        return YOLO_ERR_FORMAT;
+    }
+    for (int s = 0; s < 3; s++) { ggml_set_output(box[s]); ggml_set_output(cls[s]); }
+    // one graph containing all six head outputs
+    struct ggml_cgraph * gf = ggml_new_graph(g);
+    for (int s = 0; s < 3; s++) {
+        ggml_build_forward_expand(gf, box[s]);
+        ggml_build_forward_expand(gf, cls[s]);
+    }
+    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+    if (!alloc || !ggml_gallocr_alloc_graph(alloc, gf)) {
+        if (alloc) ggml_gallocr_free(alloc);
+        ggml_free(g);
+        return YOLO_ERR_OOM;
+    }
+    // upload preprocessed CHW image, run.
+    ggml_backend_tensor_set(inp, rgb_chw, 0, ggml_nbytes(inp));
+    if (ggml_backend_graph_compute(ctx->backend, gf) != GGML_STATUS_SUCCESS) {
+        ggml_gallocr_free(alloc);
+        ggml_free(g);
+        return YOLO_ERR_BACKEND;
+    }
+    // pull head tensors to host and decode.
+    std::memset(out_logits, 0, sizeof(float) * (size_t) (4 + nc) * anchors);
+    const int strides[3] = { 8, 16, 32 };
+    const int bases[3]   = { 0, 6400, 8000 };  // first anchor index of P3 / P4 / P5
+    for (int s = 0; s < 3; s++) {
+        const int W = (int) box[s]->ne[0];
+        const int H = (int) box[s]->ne[1];
+        std::vector<float> boxbuf(ggml_nelements(box[s]));
+        std::vector<float> clsbuf(ggml_nelements(cls[s]));
+        ggml_backend_tensor_get(box[s], boxbuf.data(), 0, ggml_nbytes(box[s]));
+        ggml_backend_tensor_get(cls[s], clsbuf.data(), 0, ggml_nbytes(cls[s]));
+        decode_head(boxbuf, clsbuf, W, H, strides[s], bases[s], nc, anchors, out_logits);
+    }
+    *out_channels = 4 + nc;  // 84
+    *out_anchors  = anchors; // 8400
+    ggml_gallocr_free(alloc);
+    ggml_free(g);
+    return YOLO_OK;
+#else
+    *out_channels = 0;
+    *out_anchors = 0;
+    return YOLO_ERR_BACKEND;
+#endif
+}
+// Class-name string stored in the context (owned by the context), or nullptr
+// for a null context.
+extern "C" const char * yolo_classes(yolo_ctx * ctx) {
+    if (ctx == nullptr) {
+        return nullptr;
+    }
+    return ctx->classes.c_str();
+}
+// Release a context from yolo_init; a null context is a no-op. The weight
+// buffer is freed first, then the weights context, then the backend.
+extern "C" void yolo_free(yolo_ctx * ctx) {
+    if (ctx != nullptr) {
+#if defined(YOLO_HAVE_GGML)
+        if (ctx->wbuf != nullptr) {
+            ggml_backend_buffer_free(ctx->wbuf);
+        }
+        if (ctx->wctx != nullptr) {
+            ggml_free(ctx->wctx);
+        }
+        if (ctx->backend != nullptr) {
+            ggml_backend_free(ctx->backend);
+        }
+#endif
+        delete ctx;
+    }
+}

package/native/yolo.cpp/verify/compare.py ADDED Viewed

@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+"""Compare verify/out.bin (ggml) against verify/ref.bin (ultralytics)."""
+import os
+import sys
+import numpy as np
+HERE = os.path.dirname(os.path.abspath(__file__))
+# 80 class names; the list index is the class id used to label detections
+# (looked up as COCO[c] when printing).
+COCO = [
+    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
+    "truck", "boat", "traffic light", "fire hydrant", "stop sign",
+    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag",
+    "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
+    "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
+    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
+    "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza",
+    "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table",
+    "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock",
+    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
+]
+def nms(boxes, scores, iou_thr=0.5):
+    # boxes xywh -> xyxy
+    x1 = boxes[:, 0] - boxes[:, 2] / 2
+    y1 = boxes[:, 1] - boxes[:, 3] / 2
+    x2 = boxes[:, 0] + boxes[:, 2] / 2
+    y2 = boxes[:, 1] + boxes[:, 3] / 2
+    areas = (x2 - x1) * (y2 - y1)
+    order = scores.argsort()[::-1]
+    keep = []
+    while order.size:
+        i = order[0]
+        keep.append(i)
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        inter = w * h
+        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-9)
+        order = order[1:][iou <= iou_thr]
+    return keep
+def detect(preds, conf_thr=0.25):
+    boxes = preds[:4].T          # [8400,4]
+    scores = preds[4:]           # [80,8400]
+    cls = scores.argmax(0)
+    conf = scores.max(0)
+    m = conf >= conf_thr
+    b, c, s = boxes[m], cls[m], conf[m]
+    keep = nms(b, s)
+    return [(int(c[k]), float(s[k]), b[k]) for k in keep]
+def main() -> int:
+    ref = np.fromfile(os.path.join(HERE, "ref.bin"), dtype=np.float32)
+    out = np.fromfile(os.path.join(HERE, "out.bin"), dtype=np.float32)
+    if ref.size != 84 * 8400 or out.size != 84 * 8400:
+        print(f"size mismatch ref={ref.size} out={out.size}", file=sys.stderr)
+        return 1
+    ref = ref.reshape(84, 8400)
+    out = out.reshape(84, 8400)
+    box_diff = np.abs(ref[:4] - out[:4])
+    cls_diff = np.abs(ref[4:] - out[4:])
+    print(f"box  max|Δ|={box_diff.max():.4f} mean|Δ|={box_diff.mean():.5f}")
+    print(f"cls  max|Δ|={cls_diff.max():.5f} mean|Δ|={cls_diff.mean():.6f}")
+    print("\n-- ultralytics reference detections --")
+    rd = detect(ref)
+    for c, s, b in rd:
+        print(f"  {COCO[c]:12s} {s:.3f} xywh=({b[0]:.0f},{b[1]:.0f},{b[2]:.0f},{b[3]:.0f})")
+    print("-- ggml detections --")
+    gd = detect(out)
+    for c, s, b in gd:
+        print(f"  {COCO[c]:12s} {s:.3f} xywh=({b[0]:.0f},{b[1]:.0f},{b[2]:.0f},{b[3]:.0f})")
+    # pass criteria: same set of (class) detections, boxes within a few px, scores close
+    ref_set = sorted([(c, round(float(s), 1)) for c, s, _ in rd])
+    gd_set = sorted([(c, round(float(s), 1)) for c, s, _ in gd])
+    ok = (
+        box_diff.max() < 2.0
+        and cls_diff.max() < 0.02
+        and len(rd) == len(gd)
+        and [c for c, _, _ in rd] == [c for c, _, _ in gd]
+    )
+    print(f"\nRESULT: {'PASS' if ok else 'FAIL'}  "
+          f"(ref {len(rd)} dets, ggml {len(gd)} dets)")
+    return 0 if ok else 1
+if __name__ == "__main__":
+    raise SystemExit(main())

package/native/yolo.cpp/verify/make_ref.py ADDED Viewed

@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""
+Build a fixed preprocessed input + an ultralytics reference output, so the ggml
+runtime can be checked numerically against PyTorch on the SAME input bytes.
+Writes (next to this script):
+  input.bin  float32 CHW [3,640,640], RGB /255, letterboxed (gray 114) — fed to BOTH
+  ref.bin    float32 [84,8400] ultralytics DetectionModel output (cx,cy,w,h px + sigmoid cls)
+  meta.txt   shapes + a few top reference detections for human sanity
+"""
+import os
+import sys
+import numpy as np
+import torch
+from PIL import Image
+from ultralytics import YOLO
+from ultralytics.utils import ASSETS
+HERE = os.path.dirname(os.path.abspath(__file__))
+INSIZE = 640
+def letterbox_chw(img: Image.Image) -> np.ndarray:
+    w, h = img.size
+    scale = min(INSIZE / w, INSIZE / h)
+    nw, nh = round(w * scale), round(h * scale)
+    resized = img.resize((nw, nh), Image.BILINEAR)
+    canvas = Image.new("RGB", (INSIZE, INSIZE), (114, 114, 114))
+    padw = round((INSIZE - nw) / 2)
+    padh = round((INSIZE - nh) / 2)
+    canvas.paste(resized, (padw, padh))
+    arr = np.asarray(canvas).astype(np.float32) / 255.0  # HWC RGB
+    chw = np.ascontiguousarray(np.transpose(arr, (2, 0, 1)))  # CHW
+    return chw
+def main() -> int:
+    src = ASSETS / "bus.jpg"
+    img = Image.open(src).convert("RGB")
+    # stage the test image next to this script for run_ts.mjs (gitignored).
+    img.save(os.path.join(HERE, "bus.jpg"))
+    chw = letterbox_chw(img)
+    chw.tofile(os.path.join(HERE, "input.bin"))
+    model = YOLO("yolov8n.pt").model.eval().float()
+    with torch.no_grad():
+        inp = torch.from_numpy(chw[None])  # [1,3,640,640]
+        out = model(inp)
+        if isinstance(out, (list, tuple)):
+            out = out[0]
+        preds = out[0].cpu().numpy().astype(np.float32)  # [84,8400]
+    preds.tofile(os.path.join(HERE, "ref.bin"))
+    # human sanity: decode top reference detections (no NMS, just peek)
+    boxes = preds[:4]          # [4,8400]
+    scores = preds[4:]         # [80,8400]
+    cls = scores.argmax(0)
+    conf = scores.max(0)
+    order = conf.argsort()[::-1][:8]
+    lines = [f"input.bin CHW [3,{INSIZE},{INSIZE}]", f"ref.bin [84,8400] from {src.name}"]
+    for a in order:
+        cx, cy, bw, bh = boxes[:, a]
+        lines.append(
+            f"  anchor {a:5d} cls={cls[a]:2d} conf={conf[a]:.3f} "
+            f"box(cx,cy,w,h)=({cx:.1f},{cy:.1f},{bw:.1f},{bh:.1f})"
+        )
+    meta = "\n".join(lines)
+    open(os.path.join(HERE, "meta.txt"), "w").write(meta + "\n")
+    print(meta)
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

package/native/yolo.cpp/verify/run_ggml.mjs ADDED Viewed

@@ -0,0 +1,78 @@
+// Run the ggml yolo.dll on verify/input.bin and write verify/out.bin [84,8400].
+// Standalone bun:ffi harness (does not depend on the TS plugin) so the native
+// runtime can be checked against the PyTorch reference in isolation.
+//
+//   bun verify/run_ggml.mjs <yolo.dll> <yolov8n.gguf>
+import { dlopen, FFIType, ptr, CString } from "bun:ffi";
+import { readFileSync, writeFileSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+const HERE = dirname(fileURLToPath(import.meta.url));
+const dll = process.argv[2];
+const gguf = process.argv[3];
+if (!dll || !gguf) {
+  console.error("usage: bun run_ggml.mjs <yolo.dll> <yolov8n.gguf>");
+  process.exit(2);
+}
+const lib = dlopen(dll, {
+  yolo_init: { args: [FFIType.cstring], returns: FFIType.pointer },
+  yolo_run: {
+    args: [
+      FFIType.pointer, // ctx
+      FFIType.pointer, // rgb_chw
+      FFIType.i32, // h
+      FFIType.i32, // w
+      FFIType.pointer, // out_logits
+      FFIType.pointer, // out_channels
+      FFIType.pointer, // out_anchors
+    ],
+    returns: FFIType.i32,
+  },
+  yolo_classes: { args: [FFIType.pointer], returns: FFIType.cstring },
+  yolo_free: { args: [FFIType.pointer], returns: FFIType.void },
+});
+const ggufZ = Buffer.from(gguf + "\0", "utf8");
+const ctx = lib.symbols.yolo_init(ptr(ggufZ));
+if (!ctx) {
+  console.error("yolo_init returned NULL");
+  process.exit(1);
+}
+const classesPtr = lib.symbols.yolo_classes(ctx);
+const classes = classesPtr ? new CString(classesPtr).toString() : "";
+console.error(`classes: ${classes.split(/\r?\n/).filter(Boolean).length}`);
+const input = new Float32Array(
+  readFileSync(join(HERE, "input.bin")).buffer.slice(0),
+);
+console.error(`input floats: ${input.length} (expected ${3 * 640 * 640})`);
+const out = new Float32Array(84 * 8400);
+const outChan = new Int32Array(1);
+const outAnch = new Int32Array(1);
+const t0 = performance.now();
+const rc = lib.symbols.yolo_run(
+  ctx,
+  ptr(input),
+  640,
+  640,
+  ptr(out),
+  ptr(outChan),
+  ptr(outAnch),
+);
+const dt = performance.now() - t0;
+console.error(
+  `yolo_run rc=${rc} channels=${outChan[0]} anchors=${outAnch[0]} (${dt.toFixed(0)}ms)`,
+);
+if (rc !== 0) {
+  lib.symbols.yolo_free(ctx);
+  process.exit(1);
+}
+writeFileSync(join(HERE, "out.bin"), Buffer.from(out.buffer));
+lib.symbols.yolo_free(ctx);
+console.error("wrote out.bin");