@geravant/sinain 1.7.2 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/onboard.js ADDED
@@ -0,0 +1,345 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * sinain onboard — interactive setup wizard using @clack/prompts
4
+ * Modeled after `openclaw onboard` for a familiar, polished experience.
5
+ */
6
+ import * as p from "@clack/prompts";
7
+ import fs from "fs";
8
+ import path from "path";
9
+ import { execFileSync } from "child_process";
10
+ import {
11
+ c, guard, maskKey, readEnv, writeEnv, summarizeConfig, runHealthCheck,
12
+ stepApiKey, stepTranscription, stepGateway, stepPrivacy, stepModel,
13
+ HOME, SINAIN_DIR, ENV_PATH, PKG_DIR, IS_WINDOWS, IS_MAC,
14
+ } from "./config-shared.js";
15
+
16
+ // ── Header ──────────────────────────────────────────────────────────────────
17
+
18
/**
 * Print the boxed SINAIN banner to stdout, with one blank line before and
 * one after it.
 */
function printHeader() {
  // Left and right box edges shared by the rows that carry coloured content.
  const leftEdge = " │";
  const rightEdge = "│";
  const emptyRow = " │ │";

  const rows = [
    c.bold(" ┌─────────────────────────────────────────┐"),
    c.bold(emptyRow),
    c.bold(leftEdge) + c.cyan(" ╔═╗╦╔╗╔╔═╗╦╔╗╔ ╦ ╦╦ ╦╔╦╗ ") + c.bold(rightEdge),
    c.bold(leftEdge) + c.cyan(" ╚═╗║║║║╠═╣║║║║ ╠═╣║ ║ ║║ ") + c.bold(rightEdge),
    c.bold(leftEdge) + c.cyan(" ╚═╝╩╝╚╝╩ ╩╩╝╚╝ ╩ ╩╚═╝═╩╝ ") + c.bold(rightEdge),
    c.bold(leftEdge) + c.dim(" Privacy-first AI overlay ") + c.bold(rightEdge),
    c.bold(emptyRow),
    c.bold(" └─────────────────────────────────────────┘"),
  ];

  console.log();
  for (const row of rows) {
    console.log(row);
  }
  console.log();
}
30
+
31
+ // ── Steps (imported from config-shared.js) ──────────────────────────────────
32
+ // stepApiKey, stepTranscription, stepGateway, stepPrivacy, stepModel
33
+ // are imported above and accept an optional label parameter.
34
+
35
/**
 * Ask how the overlay app should be installed and, unless the user skips,
 * run the installer by importing `./setup-overlay.js`.
 *
 * The "skip" option is pre-selected when an overlay bundle already exists
 * under `SINAIN_DIR/overlay`; otherwise "download" is pre-selected.
 * Installer failures are reported through the spinner and a note with the
 * manual command; they are not rethrown.
 *
 * @param {object} existing - Existing config values. Not read by this step;
 *   kept so current call sites stay valid.
 * @returns {Promise<void>}
 */
async function stepOverlay(existing) {
  // Known install locations: macOS app bundle and Windows executable.
  const overlayCandidates = [
    path.join(SINAIN_DIR, "overlay", "SinainHUD.app"),
    path.join(SINAIN_DIR, "overlay", "sinain_hud.exe"),
  ];
  // Named `candidate` (not `p`) so the @clack/prompts import is not shadowed.
  const installedPath = overlayCandidates.find((candidate) => fs.existsSync(candidate));
  const overlayInstalled = installedPath !== undefined;

  const choice = guard(await p.select({
    message: "Install overlay",
    options: [
      {
        value: "download",
        label: "Download pre-built app (recommended)",
        hint: "No Flutter SDK needed",
      },
      {
        value: "source",
        label: "Build from source",
        hint: "Requires Flutter SDK",
      },
      {
        value: "skip",
        label: overlayInstalled ? "Skip (already installed)" : "Skip for now",
        // Name the artifact that was actually found, so the hint is also
        // correct when the Windows executable is the one detected.
        hint: overlayInstalled
          ? `${path.basename(installedPath)} detected`
          : "Install later: sinain setup-overlay",
      },
    ],
    initialValue: overlayInstalled ? "skip" : "download",
  }));

  if (choice !== "download" && choice !== "source") {
    return;
  }

  const s = p.spinner();
  s.start(choice === "download" ? "Downloading overlay..." : "Building overlay from source...");
  try {
    // setup-overlay.js handles both modes via process.argv. Only append the
    // flag once so repeated runs of this step do not accumulate duplicates.
    if (choice === "source" && !process.argv.includes("--from-source")) {
      process.argv.push("--from-source");
    }
    await import("./setup-overlay.js");
    s.stop(c.green("Overlay installed."));
  } catch (err) {
    s.stop(c.yellow(`Failed: ${err.message}`));
    p.note("Install manually: sinain setup-overlay", "Overlay");
  }
}
80
+
81
+ // ── Main ────────────────────────────────────────────────────────────────────
82
+
83
/**
 * Run the interactive onboarding wizard.
 *
 * Flow, in order:
 *   1. Print the banner and read the current env file.
 *   2. If config exists, ask whether to keep, update, or reset it. "keep"
 *      only runs the overlay step, the health check and the outro, then
 *      returns without rewriting the config.
 *   3. Pick QuickStart or Advanced (skipped when `args.flow` is given).
 *   4. Collect values, fill common defaults, write the env file.
 *   5. Overlay step, health check, outro, and an optional launch.
 *
 * @param {object} [args] - Parsed CLI flags.
 * @param {string} [args.flow] - "quickstart" or "advanced"; when set, the
 *   setup-mode prompt is not shown.
 * @returns {Promise<void>}
 */
export async function runOnboard(args = {}) {
  printHeader();
  p.intro("SinainHUD setup");

  const existing = readEnv(ENV_PATH);
  const hasExisting = Object.keys(existing).length > 0;

  // ── Existing config handling ────────────────────────────────────────────

  let configAction = "fresh";
  if (hasExisting) {
    p.note(summarizeConfig(existing).join("\n"), "Existing config detected");

    configAction = guard(await p.select({
      message: "Config handling",
      options: [
        { value: "keep", label: "Use existing values" },
        { value: "update", label: "Update values" },
        { value: "reset", label: "Reset (start fresh)" },
      ],
      initialValue: "keep",
    }));

    if (configAction === "keep") {
      p.log.success("Using existing configuration.");
      await stepOverlay(existing);
      await runHealthCheck();
      printOutro();
      return;
    }

    if (configAction === "reset") {
      // Guarded like the CLI --reset path: a file that vanished since it was
      // read must not abort the wizard with ENOENT.
      if (fs.existsSync(ENV_PATH)) {
        fs.unlinkSync(ENV_PATH);
      }
      p.log.info("Config reset.");
    }
  }

  // Only "update" carries previous values forward as prompt defaults.
  const base = configAction === "update" ? existing : {};

  // ── Flow selection ──────────────────────────────────────────────────────

  const flow = args.flow || guard(await p.select({
    message: "Setup mode",
    options: [
      {
        value: "quickstart",
        label: "QuickStart",
        hint: "Get running in 2 minutes. Configure details later.",
      },
      {
        value: "advanced",
        label: "Advanced",
        hint: "Full control over privacy, models, and connections.",
      },
    ],
    initialValue: "quickstart",
  }));

  const totalSteps = flow === "quickstart" ? 2 : 5;

  // ── Collect vars ────────────────────────────────────────────────────────

  const vars = { ...base };

  // Step 1: API key (both flows)
  const apiKey = await stepApiKey(base, `[1/${totalSteps}] OpenRouter API key`);
  vars.OPENROUTER_API_KEY = apiKey;
  p.log.success("API key saved.");

  if (flow === "quickstart") {
    // QuickStart: sensible defaults
    vars.TRANSCRIPTION_BACKEND = base.TRANSCRIPTION_BACKEND || "openrouter";
    vars.PRIVACY_MODE = base.PRIVACY_MODE || "standard";
    vars.AGENT_MODEL = base.AGENT_MODEL || "google/gemini-2.5-flash-lite";
    vars.ESCALATION_MODE = base.ESCALATION_MODE || "off";
    vars.SINAIN_AGENT = base.SINAIN_AGENT || "claude";
    if (!vars.OPENCLAW_WS_URL) {
      vars.OPENCLAW_WS_URL = "";
      vars.OPENCLAW_HTTP_URL = "";
    }

    p.note(
      [
        `Transcription: ${vars.TRANSCRIPTION_BACKEND}`,
        `Privacy: ${vars.PRIVACY_MODE}`,
        `Model: ${vars.AGENT_MODEL}`,
        `Escalation: ${vars.ESCALATION_MODE}`,
        "",
        `Change later: sinain config`,
      ].join("\n"),
      "QuickStart defaults",
    );
  } else {
    // Advanced flow: steps 2-5
    const transcription = await stepTranscription(base, "[2/5] Audio transcription");
    vars.TRANSCRIPTION_BACKEND = transcription;
    p.log.success(`Using ${transcription === "openrouter" ? "cloud" : "local"} transcription.`);

    if (transcription === "local") {
      const modelDir = path.join(HOME, "models");
      const modelPath = path.join(modelDir, "ggml-large-v3-turbo.bin");
      if (!fs.existsSync(modelPath)) {
        const download = guard(await p.confirm({
          message: "Download Whisper model (~1.5 GB)?",
          initialValue: true,
        }));
        if (download) {
          const s = p.spinner();
          s.start("Downloading Whisper model...");
          try {
            downloadWhisperModel(modelDir, modelPath);
            s.stop(c.green("Model downloaded."));
            vars.LOCAL_WHISPER_MODEL = modelPath;
          } catch {
            // Best-effort: the wizard continues without a local model path.
            s.stop(c.yellow("Download failed. You can download manually later."));
          }
        }
      } else {
        vars.LOCAL_WHISPER_MODEL = modelPath;
        p.log.info(`Whisper model found: ${c.dim(modelPath)}`);
      }
    }

    const gatewayVars = await stepGateway(base, "[3/5] OpenClaw gateway");
    Object.assign(vars, gatewayVars);
    if (gatewayVars.ESCALATION_MODE === "off") {
      p.log.info("Standalone mode (no gateway).");
    } else {
      p.log.success("Gateway configured.");
    }

    const privacy = await stepPrivacy(base, "[4/5] Privacy mode");
    vars.PRIVACY_MODE = privacy;
    p.log.success(`Privacy: ${privacy}.`);

    const model = await stepModel(base, "[5/5] AI model for HUD analysis");
    vars.AGENT_MODEL = model;
    p.log.success(`Model: ${model}.`);

    vars.SINAIN_AGENT = base.SINAIN_AGENT || "claude";
  }

  // ── Common defaults ───────────────────────────────────────────────────

  vars.SINAIN_CORE_URL = vars.SINAIN_CORE_URL || "http://localhost:9500";
  vars.SINAIN_POLL_INTERVAL = vars.SINAIN_POLL_INTERVAL || "5";
  vars.SINAIN_HEARTBEAT_INTERVAL = vars.SINAIN_HEARTBEAT_INTERVAL || "900";
  vars.AUDIO_CAPTURE_CMD = vars.AUDIO_CAPTURE_CMD || "screencapturekit";
  vars.AUDIO_AUTO_START = vars.AUDIO_AUTO_START || "true";
  vars.PORT = vars.PORT || "9500";

  // ── Write config ──────────────────────────────────────────────────────

  const s = p.spinner();
  s.start("Writing configuration...");
  writeEnv(vars);
  s.stop(c.green(`Config saved: ${c.dim(ENV_PATH)}`));

  // ── Overlay ───────────────────────────────────────────────────────────

  await stepOverlay(base);

  // ── Health check ──────────────────────────────────────────────────────

  await runHealthCheck();

  // ── What now ──────────────────────────────────────────────────────────

  printOutro();

  // ── Start? ────────────────────────────────────────────────────────────

  const startNow = guard(await p.confirm({
    message: "Start sinain now?",
    initialValue: true,
  }));

  if (startNow) {
    p.outro("Launching sinain...");
    try {
      await import("./launcher.js");
    } catch (err) {
      console.log(c.yellow(` Launch failed: ${err.message}`));
      console.log(c.dim(" Try manually: sinain start"));
    }
  } else {
    p.outro("Run when ready: sinain start");
  }
}

/**
 * Download the Whisper model with curl into `modelPath`.
 *
 * The transfer goes to a sibling `.part` file and is renamed into place only
 * after curl exits successfully, and curl runs with `--fail` so an HTTP error
 * response is not saved as if it were the model. Together these ensure a
 * failed or interrupted download never leaves a file at `modelPath`, which
 * the wizard would otherwise treat as an installed model on the next run.
 *
 * @param {string} modelDir - Directory to create (recursively) for the model.
 * @param {string} modelPath - Final path of the model file.
 * @throws {Error} When curl cannot be run or exits non-zero, or when the
 *   rename fails; any partial file is removed first.
 */
function downloadWhisperModel(modelDir, modelPath) {
  const partialPath = `${modelPath}.part`;
  fs.mkdirSync(modelDir, { recursive: true });
  try {
    execFileSync("curl", [
      "-L", "--fail", "--progress-bar",
      "-o", partialPath,
      "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin",
    ], { stdio: "inherit" });
    fs.renameSync(partialPath, modelPath);
  } catch (err) {
    if (fs.existsSync(partialPath)) {
      fs.unlinkSync(partialPath);
    }
    throw err;
  }
}
277
+
278
/**
 * Show the closing "What now" note: the overlay hotkeys (with the modifier
 * chosen from IS_WINDOWS), the docs link, and how to re-run onboarding.
 */
function printOutro() {
  let hotkey;
  if (IS_WINDOWS) {
    hotkey = "Ctrl+Shift";
  } else {
    hotkey = "Cmd+Shift";
  }

  const hotkeyLines = [
    ` ${hotkey}+Space — Show/hide overlay`,
    ` ${hotkey}+E — Cycle tabs (Stream → Agent → Tasks)`,
    ` ${hotkey}+C — Toggle click-through`,
    ` ${hotkey}+M — Cycle display mode`,
  ];
  const footerLines = [
    "Docs: https://github.com/geravant/sinain-hud",
    "Re-run: sinain onboard (or sinain onboard --advanced)",
  ];

  const body = ["Hotkeys:", ...hotkeyLines, "", ...footerLines].join("\n");
  p.note(body, "What now");
}
294
+
295
+ // ── CLI entry point ─────────────────────────────────────────────────────────
296
+
297
// Parse the supported command-line flags; anything else is ignored.
const cliArgs = process.argv.slice(2);
const flags = {};
for (const arg of cliArgs) {
  if (arg.startsWith("--key=")) {
    flags.key = arg.slice("--key=".length);
    continue;
  }
  switch (arg) {
    case "--advanced":
      flags.flow = "advanced";
      break;
    case "--quickstart":
      flags.flow = "quickstart";
      break;
    case "--non-interactive":
      flags.nonInteractive = true;
      break;
    case "--reset":
      flags.reset = true;
      break;
    default:
      break;
  }
}

// --reset removes the env file (when present) before either mode runs.
if (flags.reset && fs.existsSync(ENV_PATH)) {
  fs.unlinkSync(ENV_PATH);
  console.log(c.green(" Config reset."));
}

if (flags.nonInteractive) {
  // Non-interactive mode: the key comes from --key=, else the environment;
  // every other value is a fixed default.
  const apiKey = flags.key || process.env.OPENROUTER_API_KEY || "";
  if (!apiKey) {
    console.error(c.red(" --key=<key> or OPENROUTER_API_KEY required for non-interactive mode."));
    process.exit(1);
  }

  const vars = {
    OPENROUTER_API_KEY: apiKey,
    TRANSCRIPTION_BACKEND: "openrouter",
    PRIVACY_MODE: "standard",
    AGENT_MODEL: "google/gemini-2.5-flash-lite",
    ESCALATION_MODE: "off",
    SINAIN_AGENT: "claude",
    OPENCLAW_WS_URL: "",
    OPENCLAW_HTTP_URL: "",
    PORT: "9500",
    AUDIO_CAPTURE_CMD: "screencapturekit",
    AUDIO_AUTO_START: "true",
    SINAIN_CORE_URL: "http://localhost:9500",
    SINAIN_POLL_INTERVAL: "5",
    SINAIN_HEARTBEAT_INTERVAL: "900",
  };

  writeEnv(vars);
  console.log(c.green(` Config written to ${ENV_PATH}`));
  process.exit(0);
} else {
  // Interactive mode: run the wizard and exit non-zero if it rejects.
  const onWizardError = (err) => {
    console.error(c.red(` Error: ${err.message}`));
    process.exit(1);
  };
  runOnboard(flags).catch(onWizardError);
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@geravant/sinain",
3
- "version": "1.7.2",
3
+ "version": "1.9.0",
4
4
  "description": "Ambient intelligence that sees what you see, hears what you hear, and acts on your behalf",
5
5
  "type": "module",
6
6
  "bin": {
@@ -14,6 +14,9 @@
14
14
  },
15
15
  "files": [
16
16
  "cli.js",
17
+ "config.js",
18
+ "config-shared.js",
19
+ "onboard.js",
17
20
  "launcher.js",
18
21
  "setup-overlay.js",
19
22
  "setup-sck-capture.js",
@@ -47,5 +50,8 @@
47
50
  "./index.ts"
48
51
  ]
49
52
  },
50
- "license": "MIT"
53
+ "license": "MIT",
54
+ "dependencies": {
55
+ "@clack/prompts": "^1.1.0"
56
+ }
51
57
  }
@@ -371,13 +371,17 @@ def main():
371
371
  try:
372
372
  from PIL import Image as PILImage
373
373
  pil = PILImage.fromarray(frame) if isinstance(frame, np.ndarray) else frame
374
- scene = vision_provider.describe(pil, prompt=prompt or None)
375
- if scene:
376
- log(f"vision: {scene[:80]}...")
374
+ result = vision_provider.describe(pil, prompt=prompt or None)
375
+ scene = result.text
376
+ v_cost = result.cost
377
+ if scene or v_cost:
378
+ if scene:
379
+ log(f"vision: {scene[:80]}...")
377
380
  ctx_ev = SenseEvent(type="context", ts=ts)
378
- ctx_ev.observation = SenseObservation(scene=scene)
381
+ ctx_ev.observation = SenseObservation(scene=scene or "")
379
382
  ctx_ev.meta = meta
380
383
  ctx_ev.roi = package_full_frame(frame)
384
+ ctx_ev.vision_cost = v_cost
381
385
  sender.send(ctx_ev)
382
386
  except Exception as e:
383
387
  log(f"vision error: {e}")
@@ -43,6 +43,7 @@ class SenseEvent:
43
43
  diff: dict | None = None
44
44
  meta: SenseMeta = field(default_factory=SenseMeta)
45
45
  observation: SenseObservation = field(default_factory=SenseObservation)
46
+ vision_cost: dict | None = None # {cost, tokens_in, tokens_out, model}
46
47
 
47
48
 
48
49
  class DecisionGate:
@@ -1,4 +1,5 @@
1
1
  """OCR backends for UI text extraction: macOS Vision, Windows.Media.Ocr, and Tesseract."""
2
+
2
3
  from __future__ import annotations
3
4
 
4
5
  import io
@@ -24,8 +25,13 @@ class OCRResult:
24
25
  class LocalOCR:
25
26
  """Tesseract OCR wrapper for UI text extraction."""
26
27
 
27
- def __init__(self, lang: str = "eng", psm: int = 11,
28
- min_confidence: int = 30, enabled: bool = True):
28
+ def __init__(
29
+ self,
30
+ lang: str = "eng",
31
+ psm: int = 11,
32
+ min_confidence: int = 30,
33
+ enabled: bool = True,
34
+ ):
29
35
  self.lang = lang
30
36
  self.psm = psm
31
37
  self.min_confidence = min_confidence
@@ -87,8 +93,12 @@ class LocalOCR:
87
93
  class VisionOCR:
88
94
  """macOS Vision framework OCR using pyobjc."""
89
95
 
90
- def __init__(self, languages: list[str] | None = None,
91
- min_confidence: float = 0.5, enabled: bool = True):
96
+ def __init__(
97
+ self,
98
+ languages: list[str] | None = None,
99
+ min_confidence: float = 0.5,
100
+ enabled: bool = True,
101
+ ):
92
102
  self.languages = languages or ["en", "ru"]
93
103
  self.min_confidence = min_confidence
94
104
  self.enabled = enabled
@@ -101,8 +111,12 @@ class VisionOCR:
101
111
  import objc # noqa: F401
102
112
  import Quartz # noqa: F401
103
113
  from Foundation import NSURL, NSData # noqa: F401
104
- objc.loadBundle('Vision', bundle_path='/System/Library/Frameworks/Vision.framework',
105
- module_globals=globals())
114
+
115
+ objc.loadBundle(
116
+ "Vision",
117
+ bundle_path="/System/Library/Frameworks/Vision.framework",
118
+ module_globals=globals(),
119
+ )
106
120
  self._available = True
107
121
  except Exception as e:
108
122
  print(f"[ocr] Vision framework unavailable: {e}", flush=True)
@@ -120,9 +134,8 @@ class VisionOCR:
120
134
 
121
135
  def _do_extract(self, image: Image.Image) -> OCRResult:
122
136
  import objc
123
- import Vision
124
- from Foundation import NSData
125
137
  import Quartz
138
+ from Foundation import NSData
126
139
 
127
140
  # Convert PIL Image to CGImage via PNG bytes
128
141
  buf = io.BytesIO()
@@ -138,13 +151,13 @@ class VisionOCR:
138
151
  return OCRResult(text="", confidence=0, word_count=0)
139
152
 
140
153
  # Create and configure request
141
- request = Vision.VNRecognizeTextRequest.alloc().init()
142
- request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
154
+ request = VNRecognizeTextRequest.alloc().init()
155
+ request.setRecognitionLevel_(0) # VNRequestTextRecognitionLevelAccurate
143
156
  request.setRecognitionLanguages_(self.languages)
144
157
  request.setUsesLanguageCorrection_(True)
145
158
 
146
159
  # Execute
147
- handler = Vision.VNImageRequestHandler.alloc().initWithCGImage_options_(cg_image, None)
160
+ handler = VNImageRequestHandler.alloc().initWithCGImage_options_(cg_image, None)
148
161
  success = handler.performRequests_error_([request], objc.nil)
149
162
  if not success[0]:
150
163
  return OCRResult(text="", confidence=0, word_count=0)
@@ -197,8 +210,9 @@ class VisionOCR:
197
210
  class WinOCR:
198
211
  """Windows.Media.Ocr backend via winrt-python (Windows 10+)."""
199
212
 
200
- def __init__(self, language: str = "en", min_confidence: float = 0.5,
201
- enabled: bool = True):
213
+ def __init__(
214
+ self, language: str = "en", min_confidence: float = 0.5, enabled: bool = True
215
+ ):
202
216
  self.language = language
203
217
  self.min_confidence = min_confidence
204
218
  self.enabled = enabled
@@ -209,8 +223,8 @@ class WinOCR:
209
223
  return
210
224
 
211
225
  try:
212
- from winrt.windows.media.ocr import OcrEngine
213
226
  from winrt.windows.globalization import Language
227
+ from winrt.windows.media.ocr import OcrEngine
214
228
 
215
229
  lang = Language(language)
216
230
  if OcrEngine.is_language_supported(lang):
@@ -234,11 +248,15 @@ class WinOCR:
234
248
 
235
249
  def _do_extract(self, image: Image.Image) -> OCRResult:
236
250
  import asyncio
251
+
237
252
  from winrt.windows.graphics.imaging import (
238
- SoftwareBitmap, BitmapPixelFormat, BitmapAlphaMode,
253
+ BitmapAlphaMode,
254
+ BitmapPixelFormat,
255
+ SoftwareBitmap,
239
256
  )
240
257
  from winrt.windows.storage.streams import (
241
- InMemoryRandomAccessStream, DataWriter,
258
+ DataWriter,
259
+ InMemoryRandomAccessStream,
242
260
  )
243
261
 
244
262
  # Convert PIL to BMP bytes and load as SoftwareBitmap
@@ -254,13 +272,15 @@ class WinOCR:
254
272
  stream.seek(0)
255
273
 
256
274
  from winrt.windows.graphics.imaging import BitmapDecoder
275
+
257
276
  decoder = await BitmapDecoder.create_async(stream)
258
277
  bitmap = await decoder.get_software_bitmap_async()
259
278
 
260
279
  # Convert to supported pixel format if needed
261
280
  if bitmap.bitmap_pixel_format != BitmapPixelFormat.BGRA8:
262
- bitmap = SoftwareBitmap.convert(bitmap, BitmapPixelFormat.BGRA8,
263
- BitmapAlphaMode.PREMULTIPLIED)
281
+ bitmap = SoftwareBitmap.convert(
282
+ bitmap, BitmapPixelFormat.BGRA8, BitmapAlphaMode.PREMULTIPLIED
283
+ )
264
284
 
265
285
  result = await self._engine.recognize_async(bitmap)
266
286
  return result
@@ -318,10 +338,15 @@ def create_ocr(config: dict):
318
338
  enabled=enabled,
319
339
  )
320
340
  if vision._available:
321
- print(f"[ocr] using Vision backend (languages={vision.languages})", flush=True)
341
+ print(
342
+ f"[ocr] using Vision backend (languages={vision.languages})", flush=True
343
+ )
322
344
  return vision
323
345
  if backend == "vision":
324
- print("[ocr] Vision requested but unavailable, falling back to Tesseract", flush=True)
346
+ print(
347
+ "[ocr] Vision requested but unavailable, falling back to Tesseract",
348
+ flush=True,
349
+ )
325
350
 
326
351
  # Windows: try Windows.Media.Ocr
327
352
  if sys.platform == "win32" and backend in ("auto", "winocr"):
@@ -332,10 +357,15 @@ def create_ocr(config: dict):
332
357
  enabled=enabled,
333
358
  )
334
359
  if winocr._available:
335
- print(f"[ocr] using WinOCR backend (language={winocr.language})", flush=True)
360
+ print(
361
+ f"[ocr] using WinOCR backend (language={winocr.language})", flush=True
362
+ )
336
363
  return winocr
337
364
  if backend == "winocr":
338
- print("[ocr] WinOCR requested but unavailable, falling back to Tesseract", flush=True)
365
+ print(
366
+ "[ocr] WinOCR requested but unavailable, falling back to Tesseract",
367
+ flush=True,
368
+ )
339
369
 
340
370
  # Fallback to Tesseract (cross-platform)
341
371
  print("[ocr] using Tesseract backend", flush=True)
@@ -51,6 +51,8 @@ class SenseSender:
51
51
  "narrative": event.observation.narrative,
52
52
  "concepts": event.observation.concepts,
53
53
  }
54
+ if event.vision_cost:
55
+ payload["vision_cost"] = event.vision_cost
54
56
 
55
57
  for attempt in range(_MAX_RETRIES):
56
58
  try:
@@ -18,6 +18,7 @@ import json
18
18
  import logging
19
19
  import os
20
20
  import time
21
+ import uuid
21
22
  from abc import ABC, abstractmethod
22
23
  from typing import TYPE_CHECKING, Optional
23
24
 
@@ -27,14 +28,23 @@ if TYPE_CHECKING:
27
28
  logger = logging.getLogger("sinain.vision")
28
29
 
29
30
 
31
class VisionResult:
    """Outcome of one vision call.

    Attributes:
        text: The description text, or None when no text was produced.
        cost: Optional cost record, or None. When present it is a dict with
            the keys cost, tokens_in, tokens_out, model and cost_id.
    """

    __slots__ = ("text", "cost")

    def __init__(self, text: Optional[str], cost: Optional[dict] = None):
        # {cost, tokens_in, tokens_out, model, cost_id}
        self.cost = cost
        self.text = text
38
+
39
+
30
40
  class VisionProvider(ABC):
31
41
  """Abstract base for vision inference backends."""
32
42
 
33
43
  name: str = "unknown"
34
44
 
35
45
  @abstractmethod
36
- def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
37
- """Describe image content. Returns None on failure."""
46
+ def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
47
+ """Describe image content. Returns VisionResult (text may be None on failure)."""
38
48
  ...
39
49
 
40
50
  @abstractmethod
@@ -53,8 +63,8 @@ class OllamaVisionProvider(VisionProvider):
53
63
  timeout=timeout, max_tokens=max_tokens)
54
64
  self.name = f"ollama ({model})"
55
65
 
56
- def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
57
- return self._client.describe(image, prompt)
66
+ def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
67
+ return VisionResult(self._client.describe(image, prompt))
58
68
 
59
69
  def is_available(self) -> bool:
60
70
  return self._client.is_available()
@@ -73,9 +83,9 @@ class OpenRouterVisionProvider(VisionProvider):
73
83
  self._max_tokens = max_tokens
74
84
  self.name = f"openrouter ({model})"
75
85
 
76
- def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
86
+ def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
77
87
  if not self._api_key:
78
- return None
88
+ return VisionResult(None)
79
89
 
80
90
  try:
81
91
  import requests
@@ -83,7 +93,7 @@ class OpenRouterVisionProvider(VisionProvider):
83
93
  # Encode image
84
94
  img_b64 = self._encode(image)
85
95
  if not img_b64:
86
- return None
96
+ return VisionResult(None)
87
97
 
88
98
  prompt_text = prompt or "Describe what's on this screen concisely (2-3 sentences)."
89
99
 
@@ -112,13 +122,23 @@ class OpenRouterVisionProvider(VisionProvider):
112
122
  resp.raise_for_status()
113
123
  data = resp.json()
114
124
  content = data["choices"][0]["message"]["content"].strip()
115
- logger.debug("openrouter vision: model=%s tokens=%s",
116
- self._model, data.get("usage", {}).get("total_tokens", "?"))
117
- return content if content else None
125
+ usage = data.get("usage", {})
126
+ logger.debug("openrouter vision: model=%s tokens=%s cost=%s",
127
+ self._model, usage.get("total_tokens", "?"), usage.get("cost", "?"))
128
+ cost_info = None
129
+ if usage.get("cost") is not None:
130
+ cost_info = {
131
+ "cost": usage["cost"],
132
+ "tokens_in": usage.get("prompt_tokens", 0),
133
+ "tokens_out": usage.get("completion_tokens", 0),
134
+ "model": self._model,
135
+ "cost_id": uuid.uuid4().hex[:16],
136
+ }
137
+ return VisionResult(content if content else None, cost_info)
118
138
 
119
139
  except Exception as e:
120
140
  logger.debug("openrouter vision failed: %s", e)
121
- return None
141
+ return VisionResult(None)
122
142
 
123
143
  def is_available(self) -> bool:
124
144
  return bool(self._api_key)