@elizaos/plugin-vision 2.0.0-beta.1 → 2.0.3-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +73 -301
  3. package/dist/action.d.ts +3 -0
  4. package/dist/action.d.ts.map +1 -0
  5. package/dist/audio-capture-stream.d.ts +42 -0
  6. package/dist/audio-capture-stream.d.ts.map +1 -0
  7. package/dist/audio-capture.d.ts +25 -0
  8. package/dist/audio-capture.d.ts.map +1 -0
  9. package/dist/computeruse-ocr-bridge.d.ts +50 -0
  10. package/dist/computeruse-ocr-bridge.d.ts.map +1 -0
  11. package/dist/config.d.ts +68 -0
  12. package/dist/config.d.ts.map +1 -0
  13. package/dist/describe-backpressure.d.ts +90 -0
  14. package/dist/describe-backpressure.d.ts.map +1 -0
  15. package/dist/dirty-tile-describer.d.ts +102 -0
  16. package/dist/dirty-tile-describer.d.ts.map +1 -0
  17. package/dist/dirty-tile-scene.d.ts +56 -0
  18. package/dist/dirty-tile-scene.d.ts.map +1 -0
  19. package/dist/entity-tracker.d.ts +33 -0
  20. package/dist/entity-tracker.d.ts.map +1 -0
  21. package/dist/face-detector-ggml.d.ts +60 -0
  22. package/dist/face-detector-ggml.d.ts.map +1 -0
  23. package/dist/face-detector-mediapipe.d.ts +25 -0
  24. package/dist/face-detector-mediapipe.d.ts.map +1 -0
  25. package/dist/face-recognition-ggml.d.ts +94 -0
  26. package/dist/face-recognition-ggml.d.ts.map +1 -0
  27. package/dist/get-screen-elements.d.ts +90 -0
  28. package/dist/get-screen-elements.d.ts.map +1 -0
  29. package/dist/get-screen.d.ts +60 -0
  30. package/dist/get-screen.d.ts.map +1 -0
  31. package/dist/image/sharp-compat.d.ts +89 -0
  32. package/dist/image/sharp-compat.d.ts.map +1 -0
  33. package/dist/image-input.d.ts +15 -0
  34. package/dist/image-input.d.ts.map +1 -0
  35. package/dist/index.d.ts +4 -0
  36. package/dist/index.d.ts.map +1 -0
  37. package/dist/index.js +7957 -6238
  38. package/dist/index.js.map +41 -26
  39. package/dist/lifecycle.d.ts +94 -0
  40. package/dist/lifecycle.d.ts.map +1 -0
  41. package/dist/mobile/capacitor-camera.d.ts +85 -0
  42. package/dist/mobile/capacitor-camera.d.ts.map +1 -0
  43. package/dist/native/doctr-ffi.d.ts +40 -0
  44. package/dist/native/doctr-ffi.d.ts.map +1 -0
  45. package/dist/native/yolo-ffi.d.ts +21 -0
  46. package/dist/native/yolo-ffi.d.ts.map +1 -0
  47. package/dist/ocr-host-windows.d.ts +34 -0
  48. package/dist/ocr-host-windows.d.ts.map +1 -0
  49. package/dist/ocr-service-apple-vision-macos.d.ts +51 -0
  50. package/dist/ocr-service-apple-vision-macos.d.ts.map +1 -0
  51. package/dist/ocr-service-doctr.d.ts +61 -0
  52. package/dist/ocr-service-doctr.d.ts.map +1 -0
  53. package/dist/ocr-service-linux-tesseract.d.ts +85 -0
  54. package/dist/ocr-service-linux-tesseract.d.ts.map +1 -0
  55. package/dist/ocr-service-paddleocr.d.ts +59 -0
  56. package/dist/ocr-service-paddleocr.d.ts.map +1 -0
  57. package/dist/ocr-service-windows.d.ts +41 -0
  58. package/dist/ocr-service-windows.d.ts.map +1 -0
  59. package/dist/ocr-service.d.ts +91 -0
  60. package/dist/ocr-service.d.ts.map +1 -0
  61. package/dist/ocr-with-coords.d.ts +103 -0
  62. package/dist/ocr-with-coords.d.ts.map +1 -0
  63. package/dist/person-detector.d.ts +17 -0
  64. package/dist/person-detector.d.ts.map +1 -0
  65. package/dist/provider.d.ts +3 -0
  66. package/dist/provider.d.ts.map +1 -0
  67. package/dist/routes.d.ts +7 -0
  68. package/dist/routes.d.ts.map +1 -0
  69. package/dist/screen-capture-bridge.d.ts +51 -0
  70. package/dist/screen-capture-bridge.d.ts.map +1 -0
  71. package/dist/screen-capture.d.ts +17 -0
  72. package/dist/screen-capture.d.ts.map +1 -0
  73. package/dist/screen-tiler.d.ts +75 -0
  74. package/dist/screen-tiler.d.ts.map +1 -0
  75. package/dist/service.d.ts +176 -0
  76. package/dist/service.d.ts.map +1 -0
  77. package/dist/set-of-marks-provider.d.ts +64 -0
  78. package/dist/set-of-marks-provider.d.ts.map +1 -0
  79. package/dist/som.d.ts +135 -0
  80. package/dist/som.d.ts.map +1 -0
  81. package/dist/som.js +184 -0
  82. package/dist/som.js.map +11 -0
  83. package/dist/test-input.d.ts +25 -0
  84. package/dist/test-input.d.ts.map +1 -0
  85. package/dist/types.d.ts +241 -0
  86. package/dist/types.d.ts.map +1 -0
  87. package/dist/vision-context-augmenter.d.ts +93 -0
  88. package/dist/vision-context-augmenter.d.ts.map +1 -0
  89. package/dist/vision-worker-manager.d.ts +51 -0
  90. package/dist/vision-worker-manager.d.ts.map +1 -0
  91. package/dist/workers/ocr-worker.d.ts +2 -0
  92. package/dist/workers/ocr-worker.d.ts.map +1 -0
  93. package/dist/workers/ocr-worker.js +1075 -7821
  94. package/dist/workers/ocr-worker.js.map +10 -51
  95. package/dist/workers/screen-capture-worker.d.ts +2 -0
  96. package/dist/workers/screen-capture-worker.d.ts.map +1 -0
  97. package/dist/workers/screen-capture-worker.js +364 -6
  98. package/dist/workers/screen-capture-worker.js.map +5 -4
  99. package/dist/workers/worker-logger.d.ts +10 -0
  100. package/dist/workers/worker-logger.d.ts.map +1 -0
  101. package/dist/yolo-detector.d.ts +37 -0
  102. package/dist/yolo-detector.d.ts.map +1 -0
  103. package/native/doctr.cpp/CMakeLists.txt +58 -0
  104. package/native/doctr.cpp/README.md +62 -0
  105. package/native/doctr.cpp/include/doctr.h +91 -0
  106. package/native/doctr.cpp/scripts/convert.py +98 -0
  107. package/native/doctr.cpp/src/doctr_det.cpp +112 -0
  108. package/native/doctr.cpp/src/doctr_rec.cpp +103 -0
  109. package/native/macos-vision-ocr.swift +113 -0
  110. package/native/mobilefacenet.cpp/README.md +13 -0
  111. package/native/movenet.cpp/README.md +10 -0
  112. package/native/retinaface.cpp/README.md +12 -0
  113. package/native/yolo.cpp/CMakeLists.txt +57 -0
  114. package/native/yolo.cpp/README.md +64 -0
  115. package/native/yolo.cpp/build.mjs +76 -0
  116. package/native/yolo.cpp/include/yolo.h +62 -0
  117. package/native/yolo.cpp/scripts/convert.py +248 -0
  118. package/native/yolo.cpp/src/yolo.cpp +425 -0
  119. package/native/yolo.cpp/verify/compare.py +99 -0
  120. package/native/yolo.cpp/verify/make_ref.py +75 -0
  121. package/native/yolo.cpp/verify/run_ggml.mjs +78 -0
  122. package/native/yolo.cpp/verify/run_ts.mjs +26 -0
  123. package/package.json +39 -21
  124. package/registry-entry.json +43 -0
  125. package/scripts/vendor-tesseract-linux.mjs +177 -0
  126. package/build.config.ts +0 -89
  127. package/dist/workers/florence2-worker.js +0 -779
  128. package/dist/workers/florence2-worker.js.map +0 -13
@@ -0,0 +1,58 @@
1
+ cmake_minimum_required(VERSION 3.20)
2
+ project(doctr LANGUAGES CXX)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
7
+
8
+ option(DOCTR_BUILD_SHARED "Build shared library (bun:ffi)" ON)
9
+ option(DOCTR_WITH_METAL "Use Metal backend on macOS" OFF)
10
+ option(DOCTR_WITH_CUDA "Use CUDA backend" OFF)
11
+
12
+ # ----------------------------------------------------------------------------
13
+ # ggml dependency
14
+ # ----------------------------------------------------------------------------
15
+ # The runtime depends on ggml. The expected layout is:
16
+ # plugins/plugin-local-inference/native/llama.cpp/ggml/ (existing submodule)
17
+ # We point at that to avoid pinning a second copy. If the path is missing the
18
+ # build still produces libdoctr.<ext> with a fallback ABI (DOCTR_HAVE_GGML
19
+ # undefined) so the FFI surface exists for the JS layer to call into; the init
20
+ # functions then return NULL and the JS layer throws a clear "GGUF not ready" error at runtime.
21
+
22
+ set(DOCTR_GGML_DIR "" CACHE PATH "Path to a ggml source tree (defaults to llama.cpp's vendored copy)")
23
+ if(NOT DOCTR_GGML_DIR)
24
+ set(DOCTR_GGML_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../plugin-local-inference/native/llama.cpp/ggml")
25
+ endif()
26
+
27
+ set(_doctr_has_ggml FALSE)
28
+ if(EXISTS "${DOCTR_GGML_DIR}/include/ggml.h")
29
+ set(_doctr_has_ggml TRUE)
30
+ add_subdirectory(${DOCTR_GGML_DIR} ggml-build EXCLUDE_FROM_ALL)
31
+ endif()
32
+
33
+ # ----------------------------------------------------------------------------
34
+ # Library
35
+ # ----------------------------------------------------------------------------
36
+ set(_doctr_sources
37
+ src/doctr_det.cpp
38
+ src/doctr_rec.cpp
39
+ )
40
+
41
+ if(DOCTR_BUILD_SHARED)
42
+ add_library(doctr SHARED ${_doctr_sources})
43
+ else()
44
+ add_library(doctr STATIC ${_doctr_sources})
45
+ endif()
46
+
47
+ target_include_directories(doctr PUBLIC include)
48
+
49
+ if(_doctr_has_ggml)
50
+ target_compile_definitions(doctr PRIVATE DOCTR_HAVE_GGML=1)
51
+ target_link_libraries(doctr PRIVATE ggml)
52
+ if(DOCTR_WITH_METAL)
53
+ target_compile_definitions(doctr PRIVATE GGML_USE_METAL=1)
54
+ endif()
55
+ if(DOCTR_WITH_CUDA)
56
+ target_compile_definitions(doctr PRIVATE GGML_USE_CUDA=1)
57
+ endif()
58
+ endif()
@@ -0,0 +1,62 @@
1
+ # doctr.cpp — ggml port of doCTR
2
+
3
+ C++ port of [Mindee doCTR](https://github.com/mindee/doctr) built directly on
4
+ [ggml](https://github.com/ggml-org/ggml). Two stages:
5
+
6
+ - **Detection** — `db_mobilenet_v3_large` backbone + DBNet head → probability map.
7
+ - **Recognition** — `crnn_mobilenet_v3_small` backbone + BiLSTM + CTC head → per-crop logits.
8
+
9
+ The post-processing (DBNet contour → bbox, CTC greedy decode) stays in
10
+ TypeScript — both are trivial and runtime-portable. This C++ library runs only
11
+ the forward pass; the JS caller orchestrates det → crop → rec → decode.
12
+
13
+ ## Status
14
+
15
+ **Phase 1 (current):** FFI surface scaffolded; weight conversion script
16
+ authored; build glue authored. **GGUF weight files are not yet built.** The TS
17
+ binding throws a clear error until `vision/doctr-det.gguf` and
18
+ `vision/doctr-rec.gguf` are present on disk.
19
+
20
+ ## Build (when implemented)
21
+
22
+ ```bash
23
+ cd plugins/plugin-vision/native/doctr.cpp
24
+ cmake -B build -S . -DDOCTR_WITH_METAL=ON -DGGML_METAL=ON # macOS arm64
25
+ cmake --build build --config Release
26
+ ```
27
+
28
+ Produces a single shared library `libdoctr.dylib` / `.so` / `.dll` consumed
29
+ via `bun:ffi` from `plugin-vision/src/native/doctr-ffi.ts`.
30
+
31
+ ## Convert weights (when implemented)
32
+
33
+ ```bash
34
+ python scripts/convert.py \
35
+ --variant db_mobilenet_v3_large \
36
+ --out vision/doctr-det.gguf
37
+
38
+ python scripts/convert.py \
39
+ --variant crnn_mobilenet_v3_small \
40
+ --out vision/doctr-rec.gguf
41
+ ```
42
+
43
+ The detection variant writes a single tensor graph + mean/std metadata.
44
+ The recognition variant additionally writes the character vocabulary as a
45
+ `doctr.charset` KV entry inside the GGUF file.
46
+
47
+ ## ABI
48
+
49
+ See `include/doctr.h`. The ABI is intentionally minimal:
50
+
51
+ ```c
52
+ doctr_det_ctx * doctr_det_init(const char * gguf_path);
53
+ int doctr_det_run(doctr_det_ctx *, const float * rgb_chw, int h, int w,
54
+ float * out_prob, int * out_h, int * out_w);
55
+ void doctr_det_free(doctr_det_ctx *);
56
+
57
+ doctr_rec_ctx * doctr_rec_init(const char * gguf_path);
58
+ int doctr_rec_run(doctr_rec_ctx *, const float * rgb_chw, int h, int w,
59
+ float * out_logits, int * out_T, int * out_C);
60
+ const char * doctr_rec_charset(doctr_rec_ctx *);
61
+ void doctr_rec_free(doctr_rec_ctx *);
62
+ ```
@@ -0,0 +1,91 @@
1
+ // doctr.h — C ABI for the ggml-backed doCTR runtime.
2
+ //
3
+ // Stable across detection / recognition variants. Both variants own their own
4
+ // context; they're built around ggml's compute-graph + GGUF weight loader and
5
+ // expose only the forward pass. Post-processing (DBNet contouring, CTC decode)
6
+ // happens in TypeScript so that the C side stays a pure tensor pipeline.
7
+ //
8
+ // Threading model: each context is single-threaded. Callers wanting parallel
9
+ // recognition over multiple crops should hold a pool of contexts.
10
+
11
+ #ifndef DOCTR_H
12
+ #define DOCTR_H
13
+
14
+ #include <stddef.h>
15
+ #include <stdint.h>
16
+
17
+ #ifdef __cplusplus
18
+ extern "C" {
19
+ #endif
20
+
21
+ typedef struct doctr_det_ctx doctr_det_ctx;
22
+ typedef struct doctr_rec_ctx doctr_rec_ctx;
23
+
24
+ // Return codes. Anything non-zero is an error.
25
+ #define DOCTR_OK 0
26
+ #define DOCTR_ERR_FILE -1
27
+ #define DOCTR_ERR_FORMAT -2
28
+ #define DOCTR_ERR_OOM -3
29
+ #define DOCTR_ERR_SHAPE -4
30
+ #define DOCTR_ERR_BACKEND -5
31
+
32
+ // === Detection ===
33
+ //
34
+ // Loads a `db_mobilenet_v3_large`-based detection model from GGUF. Expected
35
+ // metadata KV entries:
36
+ // - "doctr.det.variant" = "db_mobilenet_v3_large"
37
+ // - "doctr.det.mean" = float[3] (per-channel RGB mean, 0..1)
38
+ // - "doctr.det.std" = float[3] (per-channel RGB std, 0..1)
39
+ // - "doctr.det.input_h" = int (typical 1024)
40
+ // - "doctr.det.input_w" = int (typical 1024)
41
+ //
42
+ // Returns NULL on failure; check stderr for diagnostics.
43
+ doctr_det_ctx * doctr_det_init(const char * gguf_path);
44
+
45
+ // rgb_chw: CHW float32 RGB, normalized externally to [0,1] (the C side
46
+ // applies the model's mean/std from GGUF metadata).
47
+ // h, w: spatial dims of rgb_chw. Must match the GGUF input_h/input_w
48
+ // (callers letterbox/resize beforehand).
49
+ // out_prob: caller-allocated. Size must be (h/4) * (w/4) float32.
50
+ // out_h, out_w: filled with the actual probability-map dims (h/4, w/4).
51
+ int doctr_det_run(doctr_det_ctx * ctx,
52
+ const float * rgb_chw,
53
+ int h, int w,
54
+ float * out_prob,
55
+ int * out_h, int * out_w);
56
+
57
+ void doctr_det_free(doctr_det_ctx * ctx);
58
+
59
+ // === Recognition ===
60
+ //
61
+ // Loads a `crnn_mobilenet_v3_small`-based recognition model. Expected GGUF
62
+ // metadata:
63
+ // - "doctr.rec.variant" = "crnn_mobilenet_v3_small"
64
+ // - "doctr.rec.mean" = float[3]
65
+ // - "doctr.rec.std" = float[3]
66
+ // - "doctr.rec.input_h" = int (typical 32)
67
+ // - "doctr.rec.input_w" = int (typical 128)
68
+ // - "doctr.rec.charset" = utf8 string (newline-separated)
69
+ doctr_rec_ctx * doctr_rec_init(const char * gguf_path);
70
+
71
+ // rgb_chw: CHW float32 RGB crop normalized to [0,1].
72
+ // h must equal input_h (32). w is dynamic: any value from 1 up to the model's input_w.
73
+ // out_logits: caller-allocated. Size must be at least T * C float32
74
+ // where T = w/8 (typical CRNN stride) and C = charset.size()+1.
75
+ // out_T, out_C: written by the call.
76
+ int doctr_rec_run(doctr_rec_ctx * ctx,
77
+ const float * rgb_chw,
78
+ int h, int w,
79
+ float * out_logits,
80
+ int * out_T, int * out_C);
81
+
82
+ // Returns the embedded UTF-8 charset string (newline-separated, owned by ctx).
83
+ const char * doctr_rec_charset(doctr_rec_ctx * ctx);
84
+
85
+ void doctr_rec_free(doctr_rec_ctx * ctx);
86
+
87
+ #ifdef __cplusplus
88
+ }
89
+ #endif
90
+
91
+ #endif // DOCTR_H
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert Mindee doCTR PyTorch checkpoints to GGUF for the doctr.cpp runtime.
4
+
5
+ Usage:
6
+ python scripts/convert.py --variant db_mobilenet_v3_large --out vision/doctr-det.gguf
7
+ python scripts/convert.py --variant crnn_mobilenet_v3_small --out vision/doctr-rec.gguf
8
+
9
+ This script defines the conversion entrypoint and expected GGUF contract. The
10
+ actual tensor-name mapping table must be completed on a build host with
11
+ `python-doctr` and `gguf` installed, then validated end-to-end against
12
+ src/doctr_det.cpp / src/doctr_rec.cpp.
13
+
14
+ Requirements (install before running):
15
+ pip install python-doctr[torch] gguf numpy
16
+
17
+ Tensor naming convention written to the GGUF file:
18
+ Detection:
19
+ backbone.stem.conv.weight
20
+ backbone.stem.bn.{weight,bias,running_mean,running_var}
21
+ backbone.blocks.<i>.conv1.weight
22
+ backbone.blocks.<i>.bn1.{weight,bias,running_mean,running_var}
23
+ ... (per inverted-residual block; see torchvision MobileNetV3 mapping)
24
+ head.conv1.weight
25
+ head.bn1.weight
26
+ head.up1.weight
27
+ head.up2.weight
28
+ head.out.weight
29
+ Recognition:
30
+ backbone.* (mobilenetv3-small mapping)
31
+ lstm.weight_ih_l{0,1}
32
+ lstm.weight_hh_l{0,1}
33
+ lstm.bias_ih_l{0,1}
34
+ lstm.bias_hh_l{0,1}
35
+ head.weight
36
+ head.bias
37
+
38
+ Metadata KV entries:
39
+ "doctr.det.variant" | "doctr.rec.variant" : str
40
+ "doctr.<stage>.mean" : f32[3]
41
+ "doctr.<stage>.std" : f32[3]
42
+ "doctr.<stage>.input_h" : i32
43
+ "doctr.<stage>.input_w" : i32
44
+ "doctr.rec.charset" : str (utf-8, newline-separated)
45
+ """
46
+
47
+ import argparse
48
+ import sys
49
+
50
+ VALID_VARIANTS = (
51
+ "db_mobilenet_v3_large",
52
+ "crnn_mobilenet_v3_small",
53
+ )
54
+
55
+
56
+ def main() -> int:
57
+ parser = argparse.ArgumentParser()
58
+ parser.add_argument("--variant", required=True, choices=VALID_VARIANTS)
59
+ parser.add_argument("--out", required=True, help="Output GGUF path")
60
+ parser.add_argument(
61
+ "--quantize",
62
+ default="f16",
63
+ choices=["f32", "f16", "q4_0", "q8_0"],
64
+ help="Tensor quantization for conv/linear weights",
65
+ )
66
+ args = parser.parse_args()
67
+
68
+ try:
69
+ from doctr.models import recognition, detection # noqa: F401
70
+ except ImportError:
71
+ print(
72
+ "python-doctr not installed. Install with: pip install 'python-doctr[torch]'",
73
+ file=sys.stderr,
74
+ )
75
+ return 2
76
+
77
+ try:
78
+ import gguf # noqa: F401
79
+ except ImportError:
80
+ print(
81
+ "gguf library not installed. Install with: pip install gguf",
82
+ file=sys.stderr,
83
+ )
84
+ return 2
85
+
86
+ print(f"[convert] variant={args.variant} out={args.out} quantize={args.quantize}",
87
+ file=sys.stderr)
88
+ print(
89
+ "[convert] WEIGHT MAPPING UNAVAILABLE — run this on a build host with "
90
+ "the full python-doctr environment and fill in the per-tensor mapping "
91
+ "table per the docstring above.",
92
+ file=sys.stderr,
93
+ )
94
+ return 1
95
+
96
+
97
+ if __name__ == "__main__":
98
+ raise SystemExit(main())
@@ -0,0 +1,112 @@
1
+ // doctr_det.cpp — detection forward pass (db_mobilenet_v3_large + DBNet head).
2
+ //
3
+ // This file pins the detection API and high-level graph structure. Builds that
4
+ // do not link a complete ggml dependency tree refuse initialization through the
5
+ // explicit nullptr path below. The matching CMakeLists.txt does not vendor its own
6
+ // `ggml`; it reuses the copy under plugin-local-inference's llama.cpp git submodule when that path exists. Keeping this file compilable-in-isolation lets the plugin
7
+ // build while the native model runtime remains unavailable.
8
+
9
+ #include "doctr.h"
10
+
11
+ #include <cstdio>
12
+ #include <cstdlib>
13
+ #include <cstring>
14
+ #include <string>
15
+ #include <vector>
16
+
17
+ #if defined(DOCTR_HAVE_GGML)
18
+ # include "ggml.h"
19
+ # include "ggml-backend.h"
20
+ #endif
21
+
22
+ struct doctr_det_ctx {
23
+ std::string gguf_path;
24
+
25
+ // Hyperparameters from GGUF metadata.
26
+ int input_h = 1024;
27
+ int input_w = 1024;
28
+ float mean[3] = {0.798f, 0.785f, 0.772f}; // doCTR defaults
29
+ float std_[3] = {0.264f, 0.275f, 0.286f};
30
+
31
+ #if defined(DOCTR_HAVE_GGML)
32
+ struct ggml_context * gctx = nullptr;
33
+ ggml_backend_t backend = nullptr;
34
+ // The compute graph + parameter tensors get built lazily on the first
35
+ // run, so the model file can be opened and validated without paying for
36
+ // the graph allocation cost.
37
+ struct ggml_cgraph * graph = nullptr;
38
+ #endif
39
+ };
40
+
41
+ extern "C" doctr_det_ctx * doctr_det_init(const char * gguf_path) {
42
+ if (!gguf_path) return nullptr;
43
+
44
+ auto * ctx = new (std::nothrow) doctr_det_ctx();
45
+ if (!ctx) return nullptr;
46
+ ctx->gguf_path = gguf_path;
47
+
48
+ #if defined(DOCTR_HAVE_GGML)
49
+ // 1. Open GGUF, validate `doctr.det.variant == db_mobilenet_v3_large`.
50
+ // 2. Read mean/std/input_{h,w} metadata into ctx.
51
+ // 3. Pick backend (Metal on darwin, CUDA when available, CPU else).
52
+ // 4. Load conv/bn/linear parameter tensors into ctx->gctx.
53
+ //
54
+ // The runtime loader is intentionally unavailable until it mirrors the
55
+ // tensor naming implemented by the conversion harness in `scripts/convert.py`.
56
+ // See README.md for the conversion pipeline.
57
+ fprintf(stderr,
58
+ "[doctr_det] init called for %s — GGML path not yet wired; weights must be built first.\n",
59
+ gguf_path);
60
+ delete ctx;
61
+ return nullptr;
62
+ #else
63
+ // Build without ggml linked: refuse to initialize at all so the caller's
64
+ // JS layer can throw a clear "GGUF not ready" error.
65
+ fprintf(stderr,
66
+ "[doctr_det] built without DOCTR_HAVE_GGML — weights cannot load.\n");
67
+ delete ctx;
68
+ return nullptr;
69
+ #endif
70
+ }
71
+
72
+ extern "C" int doctr_det_run(doctr_det_ctx * ctx,
73
+ const float * rgb_chw,
74
+ int h, int w,
75
+ float * out_prob,
76
+ int * out_h, int * out_w) {
77
+ if (!ctx || !rgb_chw || !out_prob || !out_h || !out_w) {
78
+ return DOCTR_ERR_SHAPE;
79
+ }
80
+ if (h != ctx->input_h || w != ctx->input_w) {
81
+ return DOCTR_ERR_SHAPE;
82
+ }
83
+ #if defined(DOCTR_HAVE_GGML)
84
+ // Forward pass:
85
+ // 1. apply mean/std normalization in-place to a scratch tensor
86
+ // 2. run db_mobilenet_v3_large backbone (12 inverted-residual blocks,
87
+ // hidden-state dims [16,24,40,80,112,160])
88
+ // 3. FPN-like neck producing a single (B, 256, H/4, W/4) feature map
89
+ // 4. DBNet head: 3x3 conv → conv-transpose ×2 → 1x1 conv → sigmoid
90
+ //
91
+ // Output: (1, 1, H/4, W/4) probability map copied into out_prob.
92
+ *out_h = h / 4;
93
+ *out_w = w / 4;
94
+ std::memset(out_prob, 0, sizeof(float) * (*out_h) * (*out_w));
95
+ return DOCTR_ERR_BACKEND;
96
+ #else
97
+ (void)out_prob;
98
+ *out_h = 0;
99
+ *out_w = 0;
100
+ return DOCTR_ERR_BACKEND;
101
+ #endif
102
+ }
103
+
104
+ extern "C" void doctr_det_free(doctr_det_ctx * ctx) {
105
+ if (!ctx) return;
106
+ #if defined(DOCTR_HAVE_GGML)
107
+ if (ctx->graph) { /* ggml_graph_free handled by gctx */ }
108
+ if (ctx->gctx) { ggml_free(ctx->gctx); }
109
+ if (ctx->backend) { ggml_backend_free(ctx->backend); }
110
+ #endif
111
+ delete ctx;
112
+ }
@@ -0,0 +1,103 @@
1
+ // doctr_rec.cpp — recognition forward pass (crnn_mobilenet_v3_small + BiLSTM).
2
+ //
3
+ // Same scaffolding rationale as doctr_det.cpp: API + graph structure are
4
+ // pinned; the ggml-backed loader/forward is gated behind DOCTR_HAVE_GGML and
5
+ // will be wired once the GGUF weight files exist.
6
+
7
+ #include "doctr.h"
8
+
9
+ #include <cstdio>
10
+ #include <cstdlib>
11
+ #include <cstring>
12
+ #include <string>
13
+ #include <vector>
14
+
15
+ #if defined(DOCTR_HAVE_GGML)
16
+ # include "ggml.h"
17
+ # include "ggml-backend.h"
18
+ #endif
19
+
20
+ struct doctr_rec_ctx {
21
+ std::string gguf_path;
22
+ std::string charset; // utf-8, newline separated
23
+ int charset_size = 0; // number of glyph entries (charset_size+1 logits)
24
+
25
+ int input_h = 32;
26
+ int input_w = 128; // dynamic; this is the maximum
27
+ float mean[3] = {0.694f, 0.695f, 0.693f};
28
+ float std_[3] = {0.299f, 0.296f, 0.301f};
29
+
30
+ #if defined(DOCTR_HAVE_GGML)
31
+ struct ggml_context * gctx = nullptr;
32
+ ggml_backend_t backend = nullptr;
33
+ struct ggml_cgraph * graph = nullptr;
34
+ #endif
35
+ };
36
+
37
+ extern "C" doctr_rec_ctx * doctr_rec_init(const char * gguf_path) {
38
+ if (!gguf_path) return nullptr;
39
+ auto * ctx = new (std::nothrow) doctr_rec_ctx();
40
+ if (!ctx) return nullptr;
41
+ ctx->gguf_path = gguf_path;
42
+
43
+ #if defined(DOCTR_HAVE_GGML)
44
+ // 1. Open GGUF, validate `doctr.rec.variant == crnn_mobilenet_v3_small`.
45
+ // 2. Read mean/std/input_h/input_w and charset KV.
46
+ // 3. Pick backend.
47
+ // 4. Load conv/bn/linear + LSTM gate weights.
48
+ fprintf(stderr,
49
+ "[doctr_rec] init called for %s — GGML path not yet wired; weights must be built first.\n",
50
+ gguf_path);
51
+ delete ctx;
52
+ return nullptr;
53
+ #else
54
+ fprintf(stderr,
55
+ "[doctr_rec] built without DOCTR_HAVE_GGML — weights cannot load.\n");
56
+ delete ctx;
57
+ return nullptr;
58
+ #endif
59
+ }
60
+
61
+ extern "C" int doctr_rec_run(doctr_rec_ctx * ctx,
62
+ const float * rgb_chw,
63
+ int h, int w,
64
+ float * out_logits,
65
+ int * out_T, int * out_C) {
66
+ if (!ctx || !rgb_chw || !out_logits || !out_T || !out_C) {
67
+ return DOCTR_ERR_SHAPE;
68
+ }
69
+ if (h != ctx->input_h) return DOCTR_ERR_SHAPE;
70
+ if (w <= 0 || w > ctx->input_w) return DOCTR_ERR_SHAPE;
71
+
72
+ #if defined(DOCTR_HAVE_GGML)
73
+ // Forward pass:
74
+ // 1. normalize input with mean/std
75
+ // 2. mobilenetv3-small backbone — outputs (1, 256, 1, w/8)
76
+ // 3. squeeze height -> (1, 256, w/8)
77
+ // 4. BiLSTM 128 hidden × 2 layers
78
+ // 5. linear projection to (charset_size + 1)
79
+ //
80
+ // Output: (T, C) row-major float32 logits.
81
+ *out_T = w / 8;
82
+ *out_C = ctx->charset_size + 1;
83
+ std::memset(out_logits, 0, sizeof(float) * (*out_T) * (*out_C));
84
+ return DOCTR_ERR_BACKEND;
85
+ #else
86
+ *out_T = 0;
87
+ *out_C = 0;
88
+ return DOCTR_ERR_BACKEND;
89
+ #endif
90
+ }
91
+
92
+ extern "C" const char * doctr_rec_charset(doctr_rec_ctx * ctx) {
93
+ return ctx ? ctx->charset.c_str() : nullptr;
94
+ }
95
+
96
+ extern "C" void doctr_rec_free(doctr_rec_ctx * ctx) {
97
+ if (!ctx) return;
98
+ #if defined(DOCTR_HAVE_GGML)
99
+ if (ctx->gctx) { ggml_free(ctx->gctx); }
100
+ if (ctx->backend) { ggml_backend_free(ctx->backend); }
101
+ #endif
102
+ delete ctx;
103
+ }
@@ -0,0 +1,113 @@
1
+ // macOS Apple Vision OCR helper for @elizaos/plugin-vision (issue #9105 — per-OS
2
+ // native OCR fallback). Reads PNG/JPEG bytes from stdin, runs an accurate
3
+ // `VNRecognizeTextRequest` with language correction, and prints ONE JSON object
4
+ // on stdout:
5
+ //
6
+ // {"lines":[{"text":..,"confidence":..,"boundingBox":{"x":..,"y":..,"width":..,"height":..}}],"fullText":..}
7
+ //
8
+ // Vision reports normalized, BOTTOM-LEFT-origin bounding boxes (x,y,width,height
9
+ // in 0..1, y growing upward). The other plugin-vision OCR providers use
10
+ // TOP-LEFT-origin PIXEL coordinates (display-absolute convention), so we convert
11
+ // here: pixelX = x*W, pixelY = (1 - y - height)*H, and scale width/height by the
12
+ // image dimensions. Empty/zero results still print a well-formed empty object so
13
+ // the Node side never has to special-case a missing stdout.
14
+
15
+ import AppKit
16
+ import Foundation
17
+ import Vision
18
+
19
+ struct OcrLine: Encodable {
20
+ let text: String
21
+ let confidence: Double
22
+ let boundingBox: BBox
23
+ }
24
+
25
+ struct BBox: Encodable {
26
+ let x: Double
27
+ let y: Double
28
+ let width: Double
29
+ let height: Double
30
+ }
31
+
32
+ struct OcrPayload: Encodable {
33
+ let lines: [OcrLine]
34
+ let fullText: String
35
+ }
36
+
37
+ func emit(_ payload: OcrPayload) {
38
+ let encoder = JSONEncoder()
39
+ encoder.outputFormatting = [.withoutEscapingSlashes]
40
+ if let data = try? encoder.encode(payload),
41
+ let json = String(data: data, encoding: .utf8)
42
+ {
43
+ print(json)
44
+ } else {
45
+ print("{\"lines\":[],\"fullText\":\"\"}")
46
+ }
47
+ }
48
+
49
+ let emptyPayload = OcrPayload(lines: [], fullText: "")
50
+
51
+ // Read the full image from stdin.
52
+ let inputData = FileHandle.standardInput.readDataToEndOfFile()
53
+ guard !inputData.isEmpty, let nsImage = NSImage(data: inputData),
54
+ let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil)
55
+ else {
56
+ emit(emptyPayload)
57
+ exit(0)
58
+ }
59
+
60
+ let pixelWidth = Double(cgImage.width)
61
+ let pixelHeight = Double(cgImage.height)
62
+ if pixelWidth <= 0 || pixelHeight <= 0 {
63
+ emit(emptyPayload)
64
+ exit(0)
65
+ }
66
+
67
+ let request = VNRecognizeTextRequest()
68
+ request.recognitionLevel = .accurate
69
+ request.usesLanguageCorrection = true
70
+
71
+ let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
72
+ do {
73
+ try handler.perform([request])
74
+ } catch {
75
+ emit(emptyPayload)
76
+ exit(0)
77
+ }
78
+
79
+ guard let observations = request.results, !observations.isEmpty else {
80
+ emit(emptyPayload)
81
+ exit(0)
82
+ }
83
+
84
+ var lines: [OcrLine] = []
85
+ for observation in observations {
86
+ guard let candidate = observation.topCandidates(1).first else { continue }
87
+ let text = candidate.string
88
+ if text.isEmpty { continue }
89
+
90
+ // `observation.boundingBox` is normalized, bottom-left origin. Convert to
91
+ // top-left-origin pixel coordinates matching the other OCR providers.
92
+ let box = observation.boundingBox
93
+ let pixelX = box.origin.x * pixelWidth
94
+ let pixelY = (1.0 - box.origin.y - box.size.height) * pixelHeight
95
+ let pixelW = box.size.width * pixelWidth
96
+ let pixelH = box.size.height * pixelHeight
97
+
98
+ lines.append(
99
+ OcrLine(
100
+ text: text,
101
+ confidence: Double(candidate.confidence),
102
+ boundingBox: BBox(
103
+ x: pixelX,
104
+ y: pixelY,
105
+ width: pixelW,
106
+ height: pixelH
107
+ )
108
+ )
109
+ )
110
+ }
111
+
112
+ let fullText = lines.map { $0.text }.joined(separator: "\n")
113
+ emit(OcrPayload(lines: lines, fullText: fullText))
@@ -0,0 +1,13 @@
1
+ # mobilefacenet.cpp — Phase 3 planned port
2
+
3
+ ggml port of MobileFaceNet for 128-d face embeddings.
4
+
5
+ **Status: planned.** See `plugins/plugin-vision/VISION_RUNTIME_MIGRATION.md`
6
+ ("Phase 3 plan" / "MobileFaceNet (face embedding)") for the conversion strategy.
7
+
8
+ Replaces `face-api.js::faceRecognitionNet` (Inception/ResNet variant).
9
+
10
+ Reference checkpoint: `deepinsight/insightface` MobileFaceNet weights, MIT.
11
+
12
+ The embedding-compare logic in `src/face-recognition.ts::euclideanDistance`
13
+ is reusable as-is; only the model load + forward pass need replacing.
@@ -0,0 +1,10 @@
1
+ # movenet.cpp — Phase 3 planned port
2
+
3
+ ggml port of Google MoveNet (MultiPose Lightning) for keypoint detection.
4
+
5
+ **Status: planned.** See `plugins/plugin-vision/VISION_RUNTIME_MIGRATION.md`
6
+ ("Phase 3 plan" / "MoveNet (pose)") for the conversion strategy.
7
+
8
+ Until this port lands, `src/vision-models.ts::detectPoses` continues to depend
9
+ on `@tensorflow/tfjs-node` + `@tensorflow-models/pose-detection`. Those deps
10
+ are scheduled for removal once MoveNet ggml is built.
@@ -0,0 +1,12 @@
1
+ # retinaface.cpp — Phase 3 planned port
2
+
3
+ ggml port of RetinaFace (MobileNet 0.25× backbone) for face detection.
4
+
5
+ **Status: planned.** See `plugins/plugin-vision/VISION_RUNTIME_MIGRATION.md`
6
+ ("Phase 3 plan" / "RetinaFace (face detection)") for the conversion strategy.
7
+
8
+ Replaces:
9
+ - `src/face-recognition.ts` SSD-MobileNet-v1 face detector (face-api.js).
10
+ - `src/face-detector-mediapipe.ts` BlazeFace alt path (deprecated migration shim; was onnxruntime).
11
+
12
+ Reference checkpoint: `biubug6/Pytorch_Retinaface`, MIT-licensed.