@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +73 -301
  3. package/dist/action.d.ts +3 -0
  4. package/dist/action.d.ts.map +1 -0
  5. package/dist/audio-capture-stream.d.ts +42 -0
  6. package/dist/audio-capture-stream.d.ts.map +1 -0
  7. package/dist/audio-capture.d.ts +25 -0
  8. package/dist/audio-capture.d.ts.map +1 -0
  9. package/dist/computeruse-ocr-bridge.d.ts +50 -0
  10. package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
  11. package/dist/config.d.ts +68 -0
  12. package/dist/config.d.ts.map +1 -0
  13. package/dist/describe-backpressure.d.ts +90 -0
  14. package/dist/describe-backpressure.d.ts.map +1 -0
  15. package/dist/dirty-tile-describer.d.ts +102 -0
  16. package/dist/dirty-tile-describer.d.ts.map +1 -0
  17. package/dist/dirty-tile-scene.d.ts +56 -0
  18. package/dist/dirty-tile-scene.d.ts.map +1 -0
  19. package/dist/entity-tracker.d.ts +33 -0
  20. package/dist/entity-tracker.d.ts.map +1 -0
  21. package/dist/face-detector-ggml.d.ts +60 -0
  22. package/dist/face-detector-ggml.d.ts.map +1 -0
  23. package/dist/face-detector-mediapipe.d.ts +25 -0
  24. package/dist/face-detector-mediapipe.d.ts.map +1 -0
  25. package/dist/face-recognition-ggml.d.ts +94 -0
  26. package/dist/face-recognition-ggml.d.ts.map +1 -0
  27. package/dist/get-screen-elements.d.ts +90 -0
  28. package/dist/get-screen-elements.d.ts.map +1 -0
  29. package/dist/get-screen.d.ts +60 -0
  30. package/dist/get-screen.d.ts.map +1 -0
  31. package/dist/image/sharp-compat.d.ts +89 -0
  32. package/dist/image/sharp-compat.d.ts.map +1 -0
  33. package/dist/image-input.d.ts +15 -0
  34. package/dist/image-input.d.ts.map +1 -0
  35. package/dist/index.d.ts +4 -0
  36. package/dist/index.d.ts.map +1 -0
  37. package/dist/index.js +7957 -6238
  38. package/dist/index.js.map +41 -26
  39. package/dist/lifecycle.d.ts +94 -0
  40. package/dist/lifecycle.d.ts.map +1 -0
  41. package/dist/mobile/capacitor-camera.d.ts +85 -0
  42. package/dist/mobile/capacitor-camera.d.ts.map +1 -0
  43. package/dist/native/doctr-ffi.d.ts +40 -0
  44. package/dist/native/doctr-ffi.d.ts.map +1 -0
  45. package/dist/native/yolo-ffi.d.ts +21 -0
  46. package/dist/native/yolo-ffi.d.ts.map +1 -0
  47. package/dist/ocr-host-windows.d.ts +34 -0
  48. package/dist/ocr-host-windows.d.ts.map +1 -0
  49. package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
  50. package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
  51. package/dist/ocr-service-doctr.d.ts +61 -0
  52. package/dist/ocr-service-doctr.d.ts.map +1 -0
  53. package/dist/ocr-service-linux-tesseract.d.ts +85 -0
  54. package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
  55. package/dist/ocr-service-paddleocr.d.ts +59 -0
  56. package/dist/ocr-service-paddleocr.d.ts.map +1 -0
  57. package/dist/ocr-service-windows.d.ts +41 -0
  58. package/dist/ocr-service-windows.d.ts.map +1 -0
  59. package/dist/ocr-service.d.ts +91 -0
  60. package/dist/ocr-service.d.ts.map +1 -0
  61. package/dist/ocr-with-coords.d.ts +103 -0
  62. package/dist/ocr-with-coords.d.ts.map +1 -0
  63. package/dist/person-detector.d.ts +17 -0
  64. package/dist/person-detector.d.ts.map +1 -0
  65. package/dist/provider.d.ts +3 -0
  66. package/dist/provider.d.ts.map +1 -0
  67. package/dist/routes.d.ts +7 -0
  68. package/dist/routes.d.ts.map +1 -0
  69. package/dist/screen-capture-bridge.d.ts +51 -0
  70. package/dist/screen-capture-bridge.d.ts.map +1 -0
  71. package/dist/screen-capture.d.ts +17 -0
  72. package/dist/screen-capture.d.ts.map +1 -0
  73. package/dist/screen-tiler.d.ts +75 -0
  74. package/dist/screen-tiler.d.ts.map +1 -0
  75. package/dist/service.d.ts +176 -0
  76. package/dist/service.d.ts.map +1 -0
  77. package/dist/set-of-marks-provider.d.ts +64 -0
  78. package/dist/set-of-marks-provider.d.ts.map +1 -0
  79. package/dist/som.d.ts +135 -0
  80. package/dist/som.d.ts.map +1 -0
  81. package/dist/som.js +184 -0
  82. package/dist/som.js.map +11 -0
  83. package/dist/test-input.d.ts +25 -0
  84. package/dist/test-input.d.ts.map +1 -0
  85. package/dist/types.d.ts +241 -0
  86. package/dist/types.d.ts.map +1 -0
  87. package/dist/vision-context-augmenter.d.ts +93 -0
  88. package/dist/vision-context-augmenter.d.ts.map +1 -0
  89. package/dist/vision-worker-manager.d.ts +51 -0
  90. package/dist/vision-worker-manager.d.ts.map +1 -0
  91. package/dist/workers/ocr-worker.d.ts +2 -0
  92. package/dist/workers/ocr-worker.d.ts.map +1 -0
  93. package/dist/workers/ocr-worker.js +1075 -7821
  94. package/dist/workers/ocr-worker.js.map +10 -51
  95. package/dist/workers/screen-capture-worker.d.ts +2 -0
  96. package/dist/workers/screen-capture-worker.d.ts.map +1 -0
  97. package/dist/workers/screen-capture-worker.js +364 -6
  98. package/dist/workers/screen-capture-worker.js.map +5 -4
  99. package/dist/workers/worker-logger.d.ts +10 -0
  100. package/dist/workers/worker-logger.d.ts.map +1 -0
  101. package/dist/yolo-detector.d.ts +37 -0
  102. package/dist/yolo-detector.d.ts.map +1 -0
  103. package/native/doctr.cpp/CMakeLists.txt +58 -0
  104. package/native/doctr.cpp/README.md +62 -0
  105. package/native/doctr.cpp/include/doctr.h +91 -0
  106. package/native/doctr.cpp/scripts/convert.py +98 -0
  107. package/native/doctr.cpp/src/doctr_det.cpp +112 -0
  108. package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
  109. package/native/macos-vision-ocr.swift +113 -0
  110. package/native/mobilefacenet.cpp/README.md +13 -0
  111. package/native/movenet.cpp/README.md +10 -0
  112. package/native/retinaface.cpp/README.md +12 -0
  113. package/native/yolo.cpp/CMakeLists.txt +57 -0
  114. package/native/yolo.cpp/README.md +64 -0
  115. package/native/yolo.cpp/build.mjs +76 -0
  116. package/native/yolo.cpp/include/yolo.h +62 -0
  117. package/native/yolo.cpp/scripts/convert.py +248 -0
  118. package/native/yolo.cpp/src/yolo.cpp +425 -0
  119. package/native/yolo.cpp/verify/compare.py +99 -0
  120. package/native/yolo.cpp/verify/make_ref.py +75 -0
  121. package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
  122. package/native/yolo.cpp/verify/run_ts.mjs +26 -0
  123. package/package.json +39 -21
  124. package/registry-entry.json +43 -0
  125. package/scripts/vendor-tesseract-linux.mjs +177 -0
  126. package/build.config.ts +0 -89
  127. package/dist/workers/florence2-worker.js +0 -779
  128. package/dist/workers/florence2-worker.js.map +0 -13
@@ -0,0 +1,425 @@
1
+ // yolo.cpp — YOLOv8n forward pass via ggml.
2
+ //
3
+ // Implements the full YOLOv8 nano graph on ggml's CPU backend:
4
+ // backbone (Conv / C2f / SPPF) → PAN-FPN neck → decoupled head (cv2 box + cv3 cls).
5
+ // The CNN runs in ggml; the cheap tail (DFL distribution decode, anchor/stride
6
+ // decode to pixel cx/cy/w/h, and class sigmoid) runs in plain C++ here so the
7
+ // emitted tensor is exactly what `src/yolo-detector.ts::parseYoloV8` expects:
8
+ //
9
+ // out_logits laid out [channels=4+nc=84, anchors=8400], channel-major
10
+ // (value(c,a) = out_logits[c*8400 + a]); rows 0..3 = cx,cy,w,h in 640x640
11
+ // letterboxed input pixels; rows 4..83 = per-class probabilities (sigmoid).
12
+ // Anchor order P3(6400) → P4(1600) → P5(400).
13
+ //
14
+ // Weights come from the GGUF written by scripts/convert.py: BatchNorm is folded
15
+ // into each conv at convert time, so every CBS conv is a plain conv weight+bias
16
+ // followed by SiLU; the head's stage-2 1x1 convs are plain conv+bias (no act).
17
+ // The DFL buffer is NOT stored — the expectation over reg_max=16 bins is
18
+ // computed directly below.
19
+
20
+ #include "yolo.h"
21
+
22
+ #include <cmath>
23
+ #include <cstdio>
24
+ #include <cstdlib>
25
+ #include <cstring>
26
+ #include <string>
27
+ #include <thread>
28
+ #include <vector>
29
+
30
+ #if defined(YOLO_HAVE_GGML)
31
+ # include "ggml.h"
32
+ # include "ggml-alloc.h"
33
+ # include "ggml-backend.h"
34
+ # include "ggml-cpu.h"
35
+ # include "gguf.h"
36
+ #endif
37
+
38
// Opaque per-model state handed out through the C ABI below.
struct yolo_ctx {
    std::string gguf_path;  // path passed to yolo_init
    std::string classes;    // value of the "yolo.classes" GGUF key, when present
    int input_h{640};       // input height yolo_run accepts
    int input_w{640};       // input width yolo_run accepts

#if defined(YOLO_HAVE_GGML)
    ggml_backend_t        backend{nullptr};  // backend used to run the graph
    struct ggml_context * wctx{nullptr};     // weights (named tensors, backend buffer)
    ggml_backend_buffer_t wbuf{nullptr};     // buffer backing the tensors in wctx
#endif
};
50
+
51
+ #if defined(YOLO_HAVE_GGML)
52
+
53
+ // ---- weight lookup ---------------------------------------------------------
54
+
55
+ static struct ggml_tensor * w_get(struct ggml_context * wc, const std::string & name) {
56
+ struct ggml_tensor * t = ggml_get_tensor(wc, name.c_str());
57
+ if (!t) {
58
+ fprintf(stderr, "[yolo] missing tensor '%s'\n", name.c_str());
59
+ }
60
+ return t;
61
+ }
62
+
63
+ // ---- graph helpers ---------------------------------------------------------
64
+ //
65
+ // `g` = compute-graph context (no_alloc); `wc` = weights context.
66
+ // A "CBS" conv = conv2d(stride) + per-channel bias + SiLU. Padding is derived
67
+ // from the kernel width (k3 -> 1, k1 -> 0). BN is already folded into w/b.
68
+
69
+ static struct ggml_tensor * conv_core(struct ggml_context * g, struct ggml_context * wc,
70
+ struct ggml_tensor * x, const std::string & name,
71
+ int stride) {
72
+ struct ggml_tensor * w = w_get(wc, name + ".weight");
73
+ struct ggml_tensor * b = w_get(wc, name + ".bias");
74
+ if (!w || !b) return nullptr;
75
+ const int pad = (int) (w->ne[0] / 2); // ne[0] = KW
76
+ struct ggml_tensor * y = ggml_conv_2d(g, w, x, stride, stride, pad, pad, 1, 1);
77
+ // bias broadcast over [OW, OH, OC, N]
78
+ y = ggml_add(g, y, ggml_reshape_4d(g, b, 1, 1, b->ne[0], 1));
79
+ return y;
80
+ }
81
+
82
+ static struct ggml_tensor * conv_bn(struct ggml_context * g, struct ggml_context * wc,
83
+ struct ggml_tensor * x, const std::string & name,
84
+ int stride) {
85
+ struct ggml_tensor * y = conv_core(g, wc, x, name, stride);
86
+ if (!y) return nullptr;
87
+ return ggml_silu(g, y);
88
+ }
89
+
90
+ static struct ggml_tensor * conv_plain(struct ggml_context * g, struct ggml_context * wc,
91
+ struct ggml_tensor * x, const std::string & name,
92
+ int stride) {
93
+ return conv_core(g, wc, x, name, stride); // no activation (head stage-2)
94
+ }
95
+
96
+ static struct ggml_tensor * bottleneck(struct ggml_context * g, struct ggml_context * wc,
97
+ struct ggml_tensor * x, const std::string & prefix,
98
+ bool add) {
99
+ struct ggml_tensor * h = conv_bn(g, wc, x, prefix + ".cv1", 1); // 3x3
100
+ h = conv_bn(g, wc, h, prefix + ".cv2", 1); // 3x3
101
+ if (!h) return nullptr;
102
+ if (add) h = ggml_add(g, x, h);
103
+ return h;
104
+ }
105
+
106
+ static struct ggml_tensor * c2f(struct ggml_context * g, struct ggml_context * wc,
107
+ struct ggml_tensor * x, const std::string & prefix,
108
+ int n, bool add) {
109
+ struct ggml_tensor * y = conv_bn(g, wc, x, prefix + ".cv1", 1); // 1x1 -> 2*hidden
110
+ if (!y) return nullptr;
111
+ const int64_t W = y->ne[0], H = y->ne[1];
112
+ const int64_t hid = y->ne[2] / 2;
113
+
114
+ // split channels into two halves; cont() so each is a clean conv input.
115
+ struct ggml_tensor * y0 =
116
+ ggml_cont(g, ggml_view_3d(g, y, W, H, hid, y->nb[1], y->nb[2], 0));
117
+ struct ggml_tensor * y1 =
118
+ ggml_cont(g, ggml_view_3d(g, y, W, H, hid, y->nb[1], y->nb[2], (size_t) hid * y->nb[2]));
119
+
120
+ std::vector<struct ggml_tensor *> outs;
121
+ outs.push_back(y0);
122
+ outs.push_back(y1);
123
+ struct ggml_tensor * prev = y1;
124
+ for (int j = 0; j < n; j++) {
125
+ prev = bottleneck(g, wc, prev, prefix + ".m." + std::to_string(j), add);
126
+ if (!prev) return nullptr;
127
+ outs.push_back(prev);
128
+ }
129
+ struct ggml_tensor * acc = outs[0];
130
+ for (size_t k = 1; k < outs.size(); k++) {
131
+ acc = ggml_concat(g, acc, outs[k], 2); // concat on channels
132
+ }
133
+ return conv_bn(g, wc, acc, prefix + ".cv2", 1); // 1x1 fuse -> c_out
134
+ }
135
+
136
+ static struct ggml_tensor * sppf(struct ggml_context * g, struct ggml_context * wc,
137
+ struct ggml_tensor * x, const std::string & prefix) {
138
+ struct ggml_tensor * c = conv_bn(g, wc, x, prefix + ".cv1", 1); // 1x1 -> c_
139
+ if (!c) return nullptr;
140
+ struct ggml_tensor * m1 = ggml_pool_2d(g, c, GGML_OP_POOL_MAX, 5, 5, 1, 1, 2.0f, 2.0f);
141
+ struct ggml_tensor * m2 = ggml_pool_2d(g, m1, GGML_OP_POOL_MAX, 5, 5, 1, 1, 2.0f, 2.0f);
142
+ struct ggml_tensor * m3 = ggml_pool_2d(g, m2, GGML_OP_POOL_MAX, 5, 5, 1, 1, 2.0f, 2.0f);
143
+ struct ggml_tensor * cat = ggml_concat(g, ggml_concat(g, ggml_concat(g, c, m1, 2), m2, 2), m3, 2);
144
+ return conv_bn(g, wc, cat, prefix + ".cv2", 1); // 1x1 -> c2
145
+ }
146
+
147
+ // Build the full YOLOv8n graph; fills box[3] + cls[3] head outputs (P3,P4,P5).
148
+ static bool build_yolov8n(struct ggml_context * g, struct ggml_context * wc,
149
+ struct ggml_tensor * inp,
150
+ struct ggml_tensor * box[3], struct ggml_tensor * cls[3]) {
151
+ // backbone
152
+ struct ggml_tensor * x = conv_bn(g, wc, inp, "model.0", 2); // 16, 320
153
+ x = conv_bn(g, wc, x, "model.1", 2); // 32, 160
154
+ x = c2f(g, wc, x, "model.2", 1, true); // 32, 160
155
+ x = conv_bn(g, wc, x, "model.3", 2); // 64, 80
156
+ struct ggml_tensor * p3 = c2f(g, wc, x, "model.4", 2, true); // 64, 80 (P3 src)
157
+ x = conv_bn(g, wc, p3, "model.5", 2); // 128, 40
158
+ struct ggml_tensor * p4 = c2f(g, wc, x, "model.6", 2, true); // 128, 40 (P4 src)
159
+ x = conv_bn(g, wc, p4, "model.7", 2); // 256, 20
160
+ x = c2f(g, wc, x, "model.8", 1, true); // 256, 20
161
+ struct ggml_tensor * p5 = sppf(g, wc, x, "model.9"); // 256, 20 (P5 src)
162
+ if (!p3 || !p4 || !p5) return false;
163
+
164
+ // neck (PAN-FPN)
165
+ struct ggml_tensor * u = ggml_upscale(g, p5, 2, GGML_SCALE_MODE_NEAREST); // 256, 40
166
+ x = ggml_concat(g, u, p4, 2); // 384, 40
167
+ struct ggml_tensor * n12 = c2f(g, wc, x, "model.12", 1, false); // 128, 40
168
+ u = ggml_upscale(g, n12, 2, GGML_SCALE_MODE_NEAREST); // 128, 80
169
+ x = ggml_concat(g, u, p3, 2); // 192, 80
170
+ struct ggml_tensor * n15 = c2f(g, wc, x, "model.15", 1, false); // 64, 80 (head P3)
171
+ x = conv_bn(g, wc, n15, "model.16", 2); // 64, 40
172
+ x = ggml_concat(g, x, n12, 2); // 192, 40
173
+ struct ggml_tensor * n18 = c2f(g, wc, x, "model.18", 1, false); // 128, 40 (head P4)
174
+ x = conv_bn(g, wc, n18, "model.19", 2); // 128, 20
175
+ x = ggml_concat(g, x, p5, 2); // 384, 20
176
+ struct ggml_tensor * n21 = c2f(g, wc, x, "model.21", 1, false); // 256, 20 (head P5)
177
+ if (!n12 || !n15 || !n18 || !n21) return false;
178
+
179
+ struct ggml_tensor * feats[3] = { n15, n18, n21 };
180
+ for (int s = 0; s < 3; s++) {
181
+ const std::string cv2 = "model.22.cv2." + std::to_string(s);
182
+ const std::string cv3 = "model.22.cv3." + std::to_string(s);
183
+ struct ggml_tensor * b = conv_bn(g, wc, feats[s], cv2 + ".0", 1);
184
+ b = conv_bn(g, wc, b, cv2 + ".1", 1);
185
+ b = conv_plain(g, wc, b, cv2 + ".2", 1); // 64 ch (4*reg_max)
186
+ struct ggml_tensor * c = conv_bn(g, wc, feats[s], cv3 + ".0", 1);
187
+ c = conv_bn(g, wc, c, cv3 + ".1", 1);
188
+ c = conv_plain(g, wc, c, cv3 + ".2", 1); // 80 ch (nc)
189
+ if (!b || !c) return false;
190
+ box[s] = b;
191
+ cls[s] = c;
192
+ }
193
+ return true;
194
+ }
195
+
196
// CPU-side DFL + decode + sigmoid for one head scale, written into the
// channel-major output (value(c, a) = out[c * anchors + a]).
//
//   box      box logits, indexed [(side * 16 + bin) * W*H + cell]
//   cls      class logits, indexed [class * W*H + cell]
//   W, H     grid size of this scale
//   stride   multiplier from grid cells to output units
//   base     index of this scale's first anchor in the output
//   nc       number of classes
//   anchors  total anchors per output row
//   out      rows 0..3 receive cx, cy, w, h; rows 4.. receive sigmoid(class logit)
static void decode_head(const std::vector<float> & box, const std::vector<float> & cls,
                        int W, int H, int stride, int base, int nc, int anchors,
                        float * out) {
    const int kRegMax = 16;   // reg_max
    const int plane   = W * H;

    // Softmax-weighted mean bin index for one box side at one cell, i.e. the
    // expected distance in grid cells. The max logit is subtracted first.
    const auto expected_bin = [&](int side, int cell) -> float {
        float peak = -1e30f;
        for (int bin = 0; bin < kRegMax; ++bin) {
            const float logit = box[(size_t) (side * kRegMax + bin) * plane + cell];
            if (logit > peak) peak = logit;
        }
        float denom    = 0.0f;
        float weighted = 0.0f;
        for (int bin = 0; bin < kRegMax; ++bin) {
            const float e = expf(box[(size_t) (side * kRegMax + bin) * plane + cell] - peak);
            denom    += e;
            weighted += e * bin;
        }
        return weighted / denom;
    };

    for (int row = 0; row < H; ++row) {
        for (int col = 0; col < W; ++col) {
            const int cell = row * W + col;

            float reach[4];
            for (int side = 0; side < 4; ++side) {
                reach[side] = expected_bin(side, cell);
            }

            // Cell centre, then the four box edges measured from it.
            const float centre_x = col + 0.5f;
            const float centre_y = row + 0.5f;
            const float left   = centre_x - reach[0];
            const float top    = centre_y - reach[1];
            const float right  = centre_x + reach[2];
            const float bottom = centre_y + reach[3];

            const int anchor = base + cell;  // global anchor index
            out[0 * anchors + anchor] = (left + right) * 0.5f * stride;  // cx
            out[1 * anchors + anchor] = (top + bottom) * 0.5f * stride;  // cy
            out[2 * anchors + anchor] = (right - left) * stride;         // w
            out[3 * anchors + anchor] = (bottom - top) * stride;         // h

            for (int klass = 0; klass < nc; ++klass) {
                const float logit = cls[(size_t) klass * plane + cell];
                out[(4 + klass) * anchors + anchor] = 1.0f / (1.0f + expf(-logit));
            }
        }
    }
}
235
+
236
+ #endif // YOLO_HAVE_GGML
237
+
238
+ // ---------------------------------------------------------------------------
239
+ // C ABI
240
+ // ---------------------------------------------------------------------------
241
+
242
+ extern "C" yolo_ctx * yolo_init(const char * gguf_path) {
243
+ if (!gguf_path) return nullptr;
244
+
245
+ #if defined(YOLO_HAVE_GGML)
246
+ yolo_ctx * ctx = new (std::nothrow) yolo_ctx();
247
+ if (!ctx) return nullptr;
248
+ ctx->gguf_path = gguf_path;
249
+
250
+ ctx->backend = ggml_backend_cpu_init();
251
+ if (!ctx->backend) { delete ctx; return nullptr; }
252
+ {
253
+ // YOLOv8n is ~250 small conv ops; ggml's per-op thread barrier means
254
+ // too many threads spin-waits and gets *slower* (hyperthread
255
+ // oversubscription is catastrophic here). Default to ~physical cores
256
+ // (logical/2), capped at 8; allow override via ELIZA_YOLO_THREADS.
257
+ int nth = 4;
258
+ unsigned hw = std::thread::hardware_concurrency();
259
+ if (hw > 0) {
260
+ nth = (int) (hw / 2);
261
+ if (nth < 1) nth = 1;
262
+ if (nth > 8) nth = 8;
263
+ }
264
+ if (const char * env = std::getenv("ELIZA_YOLO_THREADS")) {
265
+ int v = std::atoi(env);
266
+ if (v > 0 && v <= 128) nth = v;
267
+ }
268
+ ggml_backend_cpu_set_n_threads(ctx->backend, nth);
269
+ }
270
+
271
+ // 1. load the gguf into a throwaway ctx with data (no_alloc=false).
272
+ struct ggml_context * tmp = nullptr;
273
+ struct gguf_init_params gp = { /*no_alloc=*/false, /*ctx=*/&tmp };
274
+ struct gguf_context * gguf = gguf_init_from_file(gguf_path, gp);
275
+ if (!gguf || !tmp) {
276
+ fprintf(stderr, "[yolo] failed to open gguf %s\n", gguf_path);
277
+ if (gguf) gguf_free(gguf);
278
+ ggml_backend_free(ctx->backend);
279
+ delete ctx;
280
+ return nullptr;
281
+ }
282
+
283
+ // metadata: class names (string) + input dims (fall back to 640).
284
+ int64_t kc = gguf_find_key(gguf, "yolo.classes");
285
+ if (kc >= 0 && gguf_get_kv_type(gguf, kc) == GGUF_TYPE_STRING) {
286
+ ctx->classes = gguf_get_val_str(gguf, kc);
287
+ }
288
+ ctx->input_h = 640;
289
+ ctx->input_w = 640;
290
+
291
+ // 2. metadata-only duplicate into the weights ctx (no_alloc), then back it
292
+ // with a CPU buffer and upload each tensor's bytes.
293
+ const int64_t n = gguf_get_n_tensors(gguf);
294
+ struct ggml_init_params ip = {
295
+ /*mem_size=*/ ggml_tensor_overhead() * (size_t) (n + 8),
296
+ /*mem_buffer=*/ nullptr,
297
+ /*no_alloc=*/ true,
298
+ };
299
+ ctx->wctx = ggml_init(ip);
300
+ if (!ctx->wctx) {
301
+ gguf_free(gguf); ggml_free(tmp); ggml_backend_free(ctx->backend);
302
+ delete ctx; return nullptr;
303
+ }
304
+ for (int64_t i = 0; i < n; i++) {
305
+ const char * name = gguf_get_tensor_name(gguf, i);
306
+ struct ggml_tensor * src = ggml_get_tensor(tmp, name);
307
+ struct ggml_tensor * dst = ggml_dup_tensor(ctx->wctx, src);
308
+ ggml_set_name(dst, name);
309
+ }
310
+ ctx->wbuf = ggml_backend_alloc_ctx_tensors(ctx->wctx, ctx->backend);
311
+ if (!ctx->wbuf) {
312
+ gguf_free(gguf); ggml_free(tmp); ggml_free(ctx->wctx);
313
+ ggml_backend_free(ctx->backend); delete ctx; return nullptr;
314
+ }
315
+ for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx->wctx); cur;
316
+ cur = ggml_get_next_tensor(ctx->wctx, cur)) {
317
+ struct ggml_tensor * src = ggml_get_tensor(tmp, ggml_get_name(cur));
318
+ ggml_backend_tensor_set(cur, ggml_get_data(src), 0, ggml_nbytes(src));
319
+ }
320
+
321
+ gguf_free(gguf);
322
+ ggml_free(tmp);
323
+
324
+ fprintf(stderr, "[yolo] initialized %s (%lld tensors, backend=%s)\n",
325
+ gguf_path, (long long) n, ggml_backend_name(ctx->backend));
326
+ return ctx;
327
+ #else
328
+ fprintf(stderr, "[yolo] built without YOLO_HAVE_GGML — weights cannot load.\n");
329
+ return nullptr;
330
+ #endif
331
+ }
332
+
333
+ extern "C" int yolo_run(yolo_ctx * ctx,
334
+ const float * rgb_chw,
335
+ int h, int w,
336
+ float * out_logits,
337
+ int * out_channels,
338
+ int * out_anchors) {
339
+ if (!ctx || !rgb_chw || !out_logits || !out_channels || !out_anchors) {
340
+ return YOLO_ERR_SHAPE;
341
+ }
342
+ if (h != ctx->input_h || w != ctx->input_w) return YOLO_ERR_SHAPE;
343
+
344
+ #if defined(YOLO_HAVE_GGML)
345
+ const int nc = 80;
346
+ const int anchors = 8400;
347
+
348
+ // compute-graph context (no_alloc; gallocr assigns activation buffers).
349
+ size_t cmem = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
350
+ struct ggml_context * g = ggml_init({ cmem, nullptr, /*no_alloc=*/true });
351
+ if (!g) return YOLO_ERR_OOM;
352
+
353
+ struct ggml_tensor * inp = ggml_new_tensor_4d(g, GGML_TYPE_F32, w, h, 3, 1); // [W,H,C,N]
354
+ ggml_set_name(inp, "input");
355
+ ggml_set_input(inp);
356
+
357
+ struct ggml_tensor * box[3] = { nullptr, nullptr, nullptr };
358
+ struct ggml_tensor * cls[3] = { nullptr, nullptr, nullptr };
359
+ if (!build_yolov8n(g, ctx->wctx, inp, box, cls)) {
360
+ ggml_free(g);
361
+ return YOLO_ERR_FORMAT;
362
+ }
363
+ for (int s = 0; s < 3; s++) { ggml_set_output(box[s]); ggml_set_output(cls[s]); }
364
+
365
+ struct ggml_cgraph * gf = ggml_new_graph(g);
366
+ for (int s = 0; s < 3; s++) {
367
+ ggml_build_forward_expand(gf, box[s]);
368
+ ggml_build_forward_expand(gf, cls[s]);
369
+ }
370
+
371
+ ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
372
+ if (!alloc || !ggml_gallocr_alloc_graph(alloc, gf)) {
373
+ if (alloc) ggml_gallocr_free(alloc);
374
+ ggml_free(g);
375
+ return YOLO_ERR_OOM;
376
+ }
377
+
378
+ // upload preprocessed CHW image, run.
379
+ ggml_backend_tensor_set(inp, rgb_chw, 0, ggml_nbytes(inp));
380
+ if (ggml_backend_graph_compute(ctx->backend, gf) != GGML_STATUS_SUCCESS) {
381
+ ggml_gallocr_free(alloc);
382
+ ggml_free(g);
383
+ return YOLO_ERR_BACKEND;
384
+ }
385
+
386
+ // pull head tensors to host and decode.
387
+ std::memset(out_logits, 0, sizeof(float) * (size_t) (4 + nc) * anchors);
388
+ const int strides[3] = { 8, 16, 32 };
389
+ const int bases[3] = { 0, 6400, 8000 };
390
+ for (int s = 0; s < 3; s++) {
391
+ const int W = (int) box[s]->ne[0];
392
+ const int H = (int) box[s]->ne[1];
393
+ std::vector<float> boxbuf(ggml_nelements(box[s]));
394
+ std::vector<float> clsbuf(ggml_nelements(cls[s]));
395
+ ggml_backend_tensor_get(box[s], boxbuf.data(), 0, ggml_nbytes(box[s]));
396
+ ggml_backend_tensor_get(cls[s], clsbuf.data(), 0, ggml_nbytes(cls[s]));
397
+ decode_head(boxbuf, clsbuf, W, H, strides[s], bases[s], nc, anchors, out_logits);
398
+ }
399
+
400
+ *out_channels = 4 + nc; // 84
401
+ *out_anchors = anchors; // 8400
402
+
403
+ ggml_gallocr_free(alloc);
404
+ ggml_free(g);
405
+ return YOLO_OK;
406
+ #else
407
+ *out_channels = 0;
408
+ *out_anchors = 0;
409
+ return YOLO_ERR_BACKEND;
410
+ #endif
411
+ }
412
+
413
+ extern "C" const char * yolo_classes(yolo_ctx * ctx) {
414
+ return ctx ? ctx->classes.c_str() : nullptr;
415
+ }
416
+
417
+ extern "C" void yolo_free(yolo_ctx * ctx) {
418
+ if (!ctx) return;
419
+ #if defined(YOLO_HAVE_GGML)
420
+ if (ctx->wbuf) ggml_backend_buffer_free(ctx->wbuf);
421
+ if (ctx->wctx) ggml_free(ctx->wctx);
422
+ if (ctx->backend) ggml_backend_free(ctx->backend);
423
+ #endif
424
+ delete ctx;
425
+ }
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env python3
2
+ """Compare verify/out.bin (ggml) against verify/ref.bin (ultralytics)."""
3
+ import os
4
+ import sys
5
+
6
+ import numpy as np
7
+
8
+ HERE = os.path.dirname(os.path.abspath(__file__))
9
+
10
+ COCO = [
11
+ "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
12
+ "truck", "boat", "traffic light", "fire hydrant", "stop sign",
13
+ "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
14
+ "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag",
15
+ "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
16
+ "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
17
+ "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
18
+ "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza",
19
+ "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table",
20
+ "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
21
+ "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock",
22
+ "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
23
+ ]
24
+
25
+
26
def nms(boxes, scores, iou_thr=0.5):
    """Greedy non-maximum suppression.

    boxes   -- array [N, 4] of (cx, cy, w, h)
    scores  -- array [N] of confidences
    iou_thr -- a box is dropped when its IoU with an already kept,
               higher-scoring box exceeds this value

    Returns the list of kept indices into ``boxes``, highest score first.
    """
    half_w = boxes[:, 2] / 2
    half_h = boxes[:, 3] / 2
    left = boxes[:, 0] - half_w
    top = boxes[:, 1] - half_h
    right = boxes[:, 0] + half_w
    bottom = boxes[:, 1] + half_h
    area = (right - left) * (bottom - top)

    remaining = scores.argsort()[::-1]
    kept = []
    while remaining.size:
        best, rest = remaining[0], remaining[1:]
        kept.append(best)

        # Intersection of the best box with every box still in play.
        overlap_w = np.maximum(
            0.0, np.minimum(right[best], right[rest]) - np.maximum(left[best], left[rest])
        )
        overlap_h = np.maximum(
            0.0, np.minimum(bottom[best], bottom[rest]) - np.maximum(top[best], top[rest])
        )
        inter = overlap_w * overlap_h
        iou = inter / (area[best] + area[rest] - inter + 1e-9)

        remaining = rest[iou <= iou_thr]
    return kept
48
+
49
+
50
def detect(preds, conf_thr=0.25):
    """Turn a raw prediction tensor into a list of detections.

    preds    -- array whose first 4 rows are box (cx, cy, w, h) and whose
                remaining rows are per-class scores, one column per anchor
    conf_thr -- anchors whose best class score is below this are discarded

    Returns ``(class_index, score, box)`` tuples for the anchors that survive
    ``nms``, in the order ``nms`` yields them.
    """
    class_scores = preds[4:]   # [80,8400]
    xywh = preds[:4].T         # [8400,4]

    best_cls = class_scores.argmax(0)
    best_conf = class_scores.max(0)
    confident = best_conf >= conf_thr

    kept_boxes = xywh[confident]
    kept_cls = best_cls[confident]
    kept_conf = best_conf[confident]

    detections = []
    for k in nms(kept_boxes, kept_conf):
        detections.append((int(kept_cls[k]), float(kept_conf[k]), kept_boxes[k]))
    return detections
59
+
60
+
61
def main() -> int:
    """Compare ``out.bin`` against ``ref.bin`` (both next to this script).

    Prints the max/mean absolute differences of the box and class rows and
    the detections decoded from each tensor, then a PASS/FAIL line.

    Returns 0 on PASS, 1 on FAIL or when either file does not hold exactly
    84 * 8400 float32 values.
    """
    ref = np.fromfile(os.path.join(HERE, "ref.bin"), dtype=np.float32)
    out = np.fromfile(os.path.join(HERE, "out.bin"), dtype=np.float32)
    if ref.size != 84 * 8400 or out.size != 84 * 8400:
        print(f"size mismatch ref={ref.size} out={out.size}", file=sys.stderr)
        return 1
    ref = ref.reshape(84, 8400)
    out = out.reshape(84, 8400)

    box_diff = np.abs(ref[:4] - out[:4])
    cls_diff = np.abs(ref[4:] - out[4:])
    print(f"box  max|Δ|={box_diff.max():.4f}  mean|Δ|={box_diff.mean():.5f}")
    print(f"cls  max|Δ|={cls_diff.max():.5f}  mean|Δ|={cls_diff.mean():.6f}")

    print("\n-- ultralytics reference detections --")
    rd = detect(ref)
    for c, s, b in rd:
        print(f"  {COCO[c]:12s} {s:.3f}  xywh=({b[0]:.0f},{b[1]:.0f},{b[2]:.0f},{b[3]:.0f})")
    print("-- ggml detections --")
    gd = detect(out)
    for c, s, b in gd:
        print(f"  {COCO[c]:12s} {s:.3f}  xywh=({b[0]:.0f},{b[1]:.0f},{b[2]:.0f},{b[3]:.0f})")

    # Pass criteria: every raw box value within 2 px and every class score
    # within 0.02 of the reference, and the decoded detections have the same
    # count and the same class sequence.
    ok = (
        box_diff.max() < 2.0
        and cls_diff.max() < 0.02
        and len(rd) == len(gd)
        and [c for c, _, _ in rd] == [c for c, _, _ in gd]
    )
    print(f"\nRESULT: {'PASS' if ok else 'FAIL'}  "
          f"(ref {len(rd)} dets, ggml {len(gd)} dets)")
    return 0 if ok else 1
96
+
97
+
98
+ if __name__ == "__main__":
99
+ raise SystemExit(main())
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Build a fixed preprocessed input + an ultralytics reference output, so the ggml
4
+ runtime can be checked numerically against PyTorch on the SAME input bytes.
5
+
6
+ Writes (next to this script):
7
+ input.bin float32 CHW [3,640,640], RGB /255, letterboxed (gray 114) — fed to BOTH
8
+ ref.bin float32 [84,8400] ultralytics DetectionModel output (cx,cy,w,h px + sigmoid cls)
9
+ meta.txt shapes + a few top reference detections for human sanity
10
+ """
11
+ import os
12
+ import sys
13
+
14
+ import numpy as np
15
+ import torch
16
+ from PIL import Image
17
+ from ultralytics import YOLO
18
+ from ultralytics.utils import ASSETS
19
+
20
+ HERE = os.path.dirname(os.path.abspath(__file__))
21
+ INSIZE = 640
22
+
23
+
24
def letterbox_chw(img: Image.Image) -> np.ndarray:
    """Letterbox ``img`` onto an INSIZE x INSIZE gray (114) canvas.

    The image is scaled with bilinear resampling to fit inside the canvas
    while keeping its aspect ratio, then pasted centred. Returns a contiguous
    float32 array in CHW order with values divided by 255.
    """
    src_w, src_h = img.size
    ratio = min(INSIZE / src_w, INSIZE / src_h)
    new_w = round(src_w * ratio)
    new_h = round(src_h * ratio)
    offset = (round((INSIZE - new_w) / 2), round((INSIZE - new_h) / 2))

    canvas = Image.new("RGB", (INSIZE, INSIZE), (114, 114, 114))
    canvas.paste(img.resize((new_w, new_h), Image.BILINEAR), offset)

    hwc = np.asarray(canvas).astype(np.float32) / 255.0  # HWC RGB
    return np.ascontiguousarray(hwc.transpose(2, 0, 1))   # CHW
36
+
37
+
38
def main() -> int:
    """Write the fixed test input and the ultralytics reference output.

    Next to this script it writes ``bus.jpg`` (the staged test image),
    ``input.bin`` (the letterboxed CHW float32 input), ``ref.bin`` (the model
    output for that input) and ``meta.txt`` (shapes plus the eight
    highest-confidence reference anchors). The meta text is also printed.

    Returns 0.
    """
    src = ASSETS / "bus.jpg"
    img = Image.open(src).convert("RGB")
    # stage the test image next to this script for run_ts.mjs (gitignored).
    img.save(os.path.join(HERE, "bus.jpg"))
    chw = letterbox_chw(img)
    chw.tofile(os.path.join(HERE, "input.bin"))

    model = YOLO("yolov8n.pt").model.eval().float()
    with torch.no_grad():
        inp = torch.from_numpy(chw[None])  # [1,3,640,640]
        out = model(inp)
    if isinstance(out, (list, tuple)):
        out = out[0]
    preds = out[0].cpu().numpy().astype(np.float32)  # [84,8400]
    preds.tofile(os.path.join(HERE, "ref.bin"))

    # human sanity: decode top reference detections (no NMS, just peek)
    boxes = preds[:4]    # [4,8400]
    scores = preds[4:]   # [80,8400]
    cls = scores.argmax(0)
    conf = scores.max(0)
    order = conf.argsort()[::-1][:8]
    lines = [f"input.bin CHW [3,{INSIZE},{INSIZE}]", f"ref.bin [84,8400] from {src.name}"]
    for a in order:
        cx, cy, bw, bh = boxes[:, a]
        lines.append(
            f"  anchor {a:5d} cls={cls[a]:2d} conf={conf[a]:.3f} "
            f"box(cx,cy,w,h)=({cx:.1f},{cy:.1f},{bw:.1f},{bh:.1f})"
        )
    meta = "\n".join(lines)
    # Context manager so the handle is flushed and closed deterministically.
    with open(os.path.join(HERE, "meta.txt"), "w") as fh:
        fh.write(meta + "\n")
    print(meta)
    return 0
72
+
73
+
74
+ if __name__ == "__main__":
75
+ raise SystemExit(main())
@@ -0,0 +1,78 @@
1
+ // Run the ggml yolo.dll on verify/input.bin and write verify/out.bin [84,8400].
2
+ // Standalone bun:ffi harness (does not depend on the TS plugin) so the native
3
+ // runtime can be checked against the PyTorch reference in isolation.
4
+ //
5
+ // bun verify/run_ggml.mjs <yolo.dll> <yolov8n.gguf>
6
+ import { dlopen, FFIType, ptr, CString } from "bun:ffi";
7
+ import { readFileSync, writeFileSync } from "node:fs";
8
+ import { join, dirname } from "node:path";
9
+ import { fileURLToPath } from "node:url";
10
+
11
const HERE = dirname(fileURLToPath(import.meta.url));

// Sizes the native library is called with; compare.py expects out.bin to be
// exactly OUT_CHANNELS * OUT_ANCHORS floats.
const INPUT_SIDE = 640;
const INPUT_FLOATS = 3 * INPUT_SIDE * INPUT_SIDE;
const OUT_CHANNELS = 84;
const OUT_ANCHORS = 8400;

const dll = process.argv[2];
const gguf = process.argv[3];
if (!dll || !gguf) {
  console.error("usage: bun run_ggml.mjs <yolo.dll> <yolov8n.gguf>");
  process.exit(2);
}

const lib = dlopen(dll, {
  yolo_init: { args: [FFIType.cstring], returns: FFIType.pointer },
  yolo_run: {
    args: [
      FFIType.pointer, // ctx
      FFIType.pointer, // rgb_chw
      FFIType.i32, // h
      FFIType.i32, // w
      FFIType.pointer, // out_logits
      FFIType.pointer, // out_channels
      FFIType.pointer, // out_anchors
    ],
    returns: FFIType.i32,
  },
  yolo_classes: { args: [FFIType.pointer], returns: FFIType.cstring },
  yolo_free: { args: [FFIType.pointer], returns: FFIType.void },
});

// NUL-terminated copy of the path for the C side.
const ggufZ = Buffer.from(`${gguf}\0`, "utf8");
const ctx = lib.symbols.yolo_init(ptr(ggufZ));
if (!ctx) {
  console.error("yolo_init returned NULL");
  process.exit(1);
}

/** Release the native context, then terminate with the given exit code. */
function bail(code) {
  lib.symbols.yolo_free(ctx);
  process.exit(code);
}

const classesPtr = lib.symbols.yolo_classes(ctx);
const classes = classesPtr ? new CString(classesPtr).toString() : "";
console.error(`classes: ${classes.split(/\r?\n/).filter(Boolean).length}`);

const inputBytes = readFileSync(join(HERE, "input.bin"));
const inputFloats = Math.floor(
  inputBytes.byteLength / Float32Array.BYTES_PER_ELEMENT,
);
console.error(`input floats: ${inputFloats} (expected ${INPUT_FLOATS})`);
// yolo_run is told the image is INPUT_SIDE x INPUT_SIDE, so a shorter buffer
// would be read past its end by the native code. Refuse to continue.
if (inputBytes.byteLength !== INPUT_FLOATS * Float32Array.BYTES_PER_ELEMENT) {
  console.error("input.bin has the wrong size; regenerate it with make_ref.py");
  bail(1);
}
// Copy exactly the file's bytes: a Buffer may be a view into a larger
// ArrayBuffer, so honour byteOffset/byteLength rather than slicing from 0.
const input = new Float32Array(
  inputBytes.buffer.slice(
    inputBytes.byteOffset,
    inputBytes.byteOffset + inputBytes.byteLength,
  ),
);

const out = new Float32Array(OUT_CHANNELS * OUT_ANCHORS);
const outChan = new Int32Array(1);
const outAnch = new Int32Array(1);

const t0 = performance.now();
const rc = lib.symbols.yolo_run(
  ctx,
  ptr(input),
  INPUT_SIDE,
  INPUT_SIDE,
  ptr(out),
  ptr(outChan),
  ptr(outAnch),
);
const dt = performance.now() - t0;
console.error(
  `yolo_run rc=${rc} channels=${outChan[0]} anchors=${outAnch[0]} (${dt.toFixed(0)}ms)`,
);
if (rc !== 0) {
  bail(1);
}
// out.bin is only meaningful if the library produced the layout `out` was
// sized for.
if (outChan[0] !== OUT_CHANNELS || outAnch[0] !== OUT_ANCHORS) {
  console.error(
    `unexpected output shape [${outChan[0]},${outAnch[0]}], expected [${OUT_CHANNELS},${OUT_ANCHORS}]`,
  );
  bail(1);
}

writeFileSync(join(HERE, "out.bin"), Buffer.from(out.buffer));
lib.symbols.yolo_free(ctx);
console.error("wrote out.bin");