@geravant/sinain 1.5.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example ADDED
@@ -0,0 +1,108 @@
1
+ # sinain configuration
2
+ # Location: ~/.sinain/.env (created by `sinain start` wizard or manually)
3
+ # The launcher reads this file on every start. sinain-core and sinain-agent
4
+ # inherit all vars via the launcher's process environment.
5
+
6
+ # ── Required ─────────────────────────────────────────────────────────────────
7
+ OPENROUTER_API_KEY= # get one free at https://openrouter.ai
8
+
9
+ # ── Privacy ──────────────────────────────────────────────────────────────────
10
+ PRIVACY_MODE=standard # off | standard | strict | paranoid
11
+ # standard: auto-redacts credentials before cloud APIs
12
+ # strict: only summaries leave your machine
13
+ # paranoid: almost nothing leaves your machine
14
+
15
+ # ── Agent ────────────────────────────────────────────────────────────────────
16
+ SINAIN_AGENT=claude # claude | codex | junie | goose | aider | <custom command>
17
+ # MCP agents (claude, codex, junie, goose) call sinain tools directly
18
+ # Pipe agents (aider, custom) receive escalation text on stdin
19
+ SINAIN_CORE_URL=http://localhost:9500
20
+ SINAIN_POLL_INTERVAL=5 # seconds between escalation polls
21
+ SINAIN_HEARTBEAT_INTERVAL=900 # seconds between heartbeat ticks (15 min)
22
+ SINAIN_WORKSPACE=~/.openclaw/workspace # knowledge files, curation scripts, playbook
23
+
24
+ # ── Escalation ───────────────────────────────────────────────────────────────
25
+ ESCALATION_MODE=rich # off | selective | focus | rich
26
+ # off: no escalation
27
+ # selective: score-based (errors, questions trigger it)
28
+ # focus: always escalate every tick
29
+ # rich: always escalate with maximum context
30
+ ESCALATION_COOLDOWN_MS=30000
31
+ # ESCALATION_TRANSPORT=auto # ws | http | auto
32
+ # auto = WS when gateway connected, HTTP fallback
33
+ # http = bare agent only (no gateway)
34
+
35
+ # ── Server ───────────────────────────────────────────────────────────────────
36
+ PORT=9500
37
+
38
+ # ── System Audio ─────────────────────────────────────────────────────────────
39
+ # Default: ScreenCaptureKit (zero-setup, macOS 13+). Fallback: ffmpeg + BlackHole.
40
+ # Windows: win-audio-capture.exe (WASAPI, auto-built by setup-windows.sh)
41
+ AUDIO_CAPTURE_CMD=screencapturekit # screencapturekit | sox | ffmpeg
42
+ AUDIO_DEVICE=BlackHole 2ch # macOS audio device (only used by sox/ffmpeg fallback)
43
+ AUDIO_SAMPLE_RATE=16000
44
+ AUDIO_CHUNK_MS=5000
45
+ AUDIO_VAD_ENABLED=true
46
+ AUDIO_VAD_THRESHOLD=0.003
47
+ AUDIO_AUTO_START=true
48
+ AUDIO_GAIN_DB=20
49
+
50
+ # ── Microphone (opt-in) ─────────────────────────────────────────────────────
51
+ MIC_ENABLED=false # set true to capture user's microphone
52
+ MIC_DEVICE=default # "default" = system mic
53
+ MIC_CAPTURE_CMD=sox # sox or ffmpeg
54
+ MIC_SAMPLE_RATE=16000
55
+ MIC_CHUNK_MS=5000
56
+ MIC_VAD_ENABLED=true
57
+ MIC_VAD_THRESHOLD=0.008 # higher threshold (ambient noise)
58
+ MIC_AUTO_START=false
59
+ MIC_GAIN_DB=0
60
+
61
+ # ── Transcription ────────────────────────────────────────────────────────────
62
+ TRANSCRIPTION_BACKEND=openrouter # openrouter | local (local = whisper.cpp on-device)
63
+ TRANSCRIPTION_MODEL=google/gemini-2.5-flash
64
+ TRANSCRIPTION_LANGUAGE=en-US
65
+
66
+ # ── Local Transcription (only when TRANSCRIPTION_BACKEND=local) ──────────────
67
+ # Install: brew install whisper-cpp
68
+ # Models: https://huggingface.co/ggerganov/whisper.cpp/tree/main
69
+ # LOCAL_WHISPER_BIN=whisper-cli
70
+ # LOCAL_WHISPER_MODEL=~/models/ggml-large-v3-turbo.bin
71
+ # LOCAL_WHISPER_TIMEOUT_MS=15000
72
+
73
+ # ── Local Agent Loop ─────────────────────────────────────────────────────────
74
+ AGENT_ENABLED=true
75
+ AGENT_MODEL=google/gemini-2.5-flash-lite
76
+ # AGENT_FALLBACK_MODELS=google/gemini-2.5-flash,anthropic/claude-3.5-haiku
77
+ AGENT_MAX_TOKENS=300
78
+ AGENT_TEMPERATURE=0.3
79
+ AGENT_PUSH_TO_FEED=true
80
+ AGENT_DEBOUNCE_MS=3000
81
+ AGENT_MAX_INTERVAL_MS=30000
82
+ AGENT_COOLDOWN_MS=10000
83
+ AGENT_MAX_AGE_MS=120000 # context window lookback (2 min)
84
+
85
+ # ── OpenClaw / NemoClaw Gateway ──────────────────────────────────────────────
86
+ # Leave blank to run without a gateway (bare agent mode).
87
+ # The setup wizard fills these in if you have an OpenClaw gateway.
88
+ OPENCLAW_WS_URL=ws://localhost:18789
89
+ OPENCLAW_WS_TOKEN= # 48-char hex — from gateway config
90
+ OPENCLAW_HTTP_URL=http://localhost:18789/hooks/agent
91
+ OPENCLAW_HTTP_TOKEN= # same token as WS_TOKEN
92
+ OPENCLAW_SESSION_KEY=agent:main:sinain
93
+ # OPENCLAW_PHASE1_TIMEOUT_MS=10000
94
+ # OPENCLAW_PHASE2_TIMEOUT_MS=120000
95
+ # OPENCLAW_QUEUE_TTL_MS=300000
96
+ # OPENCLAW_QUEUE_MAX_SIZE=10
97
+ # OPENCLAW_PING_INTERVAL_MS=30000
98
+
99
+ # ── SITUATION.md ─────────────────────────────────────────────────────────────
100
+ SITUATION_MD_PATH=~/.openclaw/workspace/SITUATION.md
101
+ # OPENCLAW_WORKSPACE_DIR=~/.openclaw/workspace
102
+
103
+ # ── Debug ────────────────────────────────────────────────────────────────────
104
+ # DEBUG=true # verbose logging (every tick, every chunk)
105
+
106
+ # ── Tracing ──────────────────────────────────────────────────────────────────
107
+ TRACE_ENABLED=true
108
+ TRACE_DIR=~/.sinain-core/traces
package/cli.js CHANGED
@@ -29,6 +29,17 @@ switch (cmd) {
29
29
  await import("./setup-overlay.js");
30
30
  break;
31
31
 
32
+ case "setup-sck-capture": {
33
+ const { downloadBinary } = await import("./setup-sck-capture.js");
34
+ if (os.platform() === "win32") {
35
+ console.log("sck-capture is macOS-only (Windows uses win-audio-capture.exe)");
36
+ } else {
37
+ const forceUpdate = process.argv.includes("--update");
38
+ await downloadBinary({ forceUpdate });
39
+ }
40
+ break;
41
+ }
42
+
32
43
  case "install":
33
44
  // --if-openclaw: only run if OpenClaw is installed (for postinstall)
34
45
  if (process.argv.includes("--if-openclaw")) {
@@ -156,6 +167,10 @@ async function runSetupWizard() {
156
167
  }
157
168
  vars.OPENCLAW_HTTP_URL = vars.OPENCLAW_WS_URL.replace(/^ws/, "http") + "/hooks/agent";
158
169
  vars.OPENCLAW_SESSION_KEY = "agent:main:sinain";
170
+ } else {
171
+ // No gateway — disable WS connection attempts
172
+ vars.OPENCLAW_WS_URL = "";
173
+ vars.OPENCLAW_HTTP_URL = "";
159
174
  }
160
175
 
161
176
  vars.SINAIN_POLL_INTERVAL = "5";
@@ -346,6 +361,7 @@ Usage:
346
361
  sinain status Check what's running
347
362
  sinain setup Run interactive setup wizard (~/.sinain/.env)
348
363
  sinain setup-overlay Download pre-built overlay app
364
+ sinain setup-sck-capture Download sck-capture audio binary (macOS)
349
365
  sinain install Install OpenClaw plugin (server-side)
350
366
 
351
367
  Start options:
package/launcher.js CHANGED
@@ -78,6 +78,25 @@ async function main() {
78
78
  // Install deps if needed
79
79
  await installDeps();
80
80
 
81
+ // Auto-download sck-capture binary if missing (macOS only)
82
+ if (!IS_WINDOWS) {
83
+ const sckBinary = path.join(SINAIN_DIR, "sck-capture", "sck-capture");
84
+ if (!fs.existsSync(sckBinary)) {
85
+ log("sck-capture not found — downloading from GitHub Releases...");
86
+ try {
87
+ const { downloadBinary } = await import("./setup-sck-capture.js");
88
+ const success = await downloadBinary({ silent: true });
89
+ if (success) {
90
+ ok("sck-capture downloaded");
91
+ } else {
92
+ warn("sck-capture download failed — audio capture may not work");
93
+ }
94
+ } catch (e) {
95
+ warn(`sck-capture auto-download failed: ${e.message}`);
96
+ }
97
+ }
98
+ }
99
+
81
100
  // Start core
82
101
  log("Starting sinain-core...");
83
102
  const coreDir = path.join(PKG_DIR, "sinain-core");
@@ -108,7 +127,7 @@ async function main() {
108
127
  const scDir = path.join(PKG_DIR, "sense_client");
109
128
  // Check if key package is importable to skip pip
110
129
  try {
111
- execSync('python3 -c "import cv2; import skimage"', { stdio: "pipe" });
130
+ execSync('python3 -c "import PIL; import skimage"', { stdio: "pipe" });
112
131
  } catch {
113
132
  log("Installing sense_client Python dependencies...");
114
133
  try {
@@ -289,6 +308,27 @@ async function preflight() {
289
308
  } else {
290
309
  ok("port 9500 free");
291
310
  }
311
+
312
+ // Ollama (if local vision enabled)
313
+ if (process.env.LOCAL_VISION_ENABLED === "true") {
314
+ try {
315
+ const resp = await fetch("http://localhost:11434/api/tags", { signal: AbortSignal.timeout(2000) });
316
+ if (resp.ok) {
317
+ ok("ollama server running");
318
+ } else {
319
+ warn("ollama server not responding — local vision will be unavailable");
320
+ }
321
+ } catch {
322
+ // Try to start Ollama in background
323
+ try {
324
+ const { spawn: spawnProc } = await import("child_process");
325
+ spawnProc("ollama", ["serve"], { detached: true, stdio: "ignore" }).unref();
326
+ ok("ollama server started in background");
327
+ } catch {
328
+ warn("ollama not running and could not auto-start — local vision disabled");
329
+ }
330
+ }
331
+ }
292
332
  }
293
333
 
294
334
  // ── Setup wizard ─────────────────────────────────────────────────────────────
@@ -376,6 +416,52 @@ async function setupWizard(envPath) {
376
416
  const agentChoice = await ask(` Agent? [${BOLD}claude${RESET}/codex/goose/junie/aider]: `);
377
417
  vars.SINAIN_AGENT = agentChoice.trim().toLowerCase() || "claude";
378
418
 
419
+ // 3b. Local vision (Ollama)
420
+ const IS_MACOS = os.platform() === "darwin";
421
+ const hasOllama = commandExists("ollama");
422
+ if (hasOllama) {
423
+ const useVision = await ask(` Enable local vision AI? [Y/n] (Ollama — screen understanding without cloud API): `);
424
+ if (!useVision.trim() || useVision.trim().toLowerCase() === "y") {
425
+ vars.LOCAL_VISION_ENABLED = "true";
426
+ try {
427
+ const models = execSync("ollama list 2>/dev/null", { encoding: "utf-8" });
428
+ if (!models.includes("llava")) {
429
+ const pull = await ask(` Pull llava vision model (~4GB)? [Y/n]: `);
430
+ if (!pull.trim() || pull.trim().toLowerCase() === "y") {
431
+ console.log(` ${DIM}Pulling llava...${RESET}`);
432
+ execSync("ollama pull llava", { stdio: "inherit" });
433
+ ok("llava model pulled");
434
+ }
435
+ } else {
436
+ ok("llava model already available");
437
+ }
438
+ } catch {
439
+ warn("Could not check Ollama models");
440
+ }
441
+ vars.LOCAL_VISION_MODEL = "llava";
442
+ }
443
+ } else {
444
+ const installOllama = await ask(` Install Ollama for local vision AI? [y/N]: `);
445
+ if (installOllama.trim().toLowerCase() === "y") {
446
+ try {
447
+ if (IS_MACOS) {
448
+ console.log(` ${DIM}Installing Ollama via Homebrew...${RESET}`);
449
+ execSync("brew install ollama", { stdio: "inherit" });
450
+ } else {
451
+ console.log(` ${DIM}Installing Ollama...${RESET}`);
452
+ execSync("curl -fsSL https://ollama.com/install.sh | sh", { stdio: "inherit" });
453
+ }
454
+ console.log(` ${DIM}Pulling llava vision model...${RESET}`);
455
+ execSync("ollama pull llava", { stdio: "inherit" });
456
+ vars.LOCAL_VISION_ENABLED = "true";
457
+ vars.LOCAL_VISION_MODEL = "llava";
458
+ ok("Ollama + llava installed");
459
+ } catch {
460
+ warn("Ollama installation failed — local vision disabled");
461
+ }
462
+ }
463
+ }
464
+
379
465
  // 4. Escalation mode
380
466
  console.log();
381
467
  console.log(` ${DIM}Escalation modes:${RESET}`);
@@ -402,6 +488,10 @@ async function setupWizard(envPath) {
402
488
  const httpBase = vars.OPENCLAW_WS_URL.replace(/^ws/, "http");
403
489
  vars.OPENCLAW_HTTP_URL = `${httpBase}/hooks/agent`;
404
490
  vars.OPENCLAW_SESSION_KEY = "agent:main:sinain";
491
+ } else {
492
+ // No gateway — disable WS connection attempts
493
+ vars.OPENCLAW_WS_URL = "";
494
+ vars.OPENCLAW_HTTP_URL = "";
405
495
  }
406
496
 
407
497
  // 6. Agent-specific defaults
package/pack-prepare.js CHANGED
@@ -5,7 +5,7 @@
5
5
  import fs from "fs";
6
6
  import path from "path";
7
7
 
8
- const LINKS = ["sinain-core", "sinain-mcp-server", "sinain-agent", "sense_client"];
8
+ const LINKS = ["sinain-core", "sinain-mcp-server", "sinain-agent", "sense_client", ".env.example"];
9
9
  const PKG_DIR = path.dirname(new URL(import.meta.url).pathname);
10
10
 
11
11
  const action = process.argv[2]; // "pre" or "post"
@@ -18,7 +18,12 @@ if (action === "pre") {
18
18
  if (!stat.isSymbolicLink()) continue;
19
19
  const target = fs.realpathSync(linkPath);
20
20
  fs.unlinkSync(linkPath);
21
- copyDir(target, linkPath);
21
+ const targetStat = fs.statSync(target);
22
+ if (targetStat.isDirectory()) {
23
+ copyDir(target, linkPath);
24
+ } else {
25
+ fs.copyFileSync(target, linkPath);
26
+ }
22
27
  }
23
28
  console.log("prepack: symlinks → copies");
24
29
  } else if (action === "post") {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@geravant/sinain",
3
- "version": "1.5.1",
3
+ "version": "1.6.0",
4
4
  "description": "Ambient AI overlay invisible to screen capture — real-time insights from audio + screen context",
5
5
  "type": "module",
6
6
  "bin": {
@@ -33,6 +33,7 @@ from .sender import SenseSender, package_full_frame, package_roi
33
33
  from .app_detector import AppDetector
34
34
  from .config import load_config
35
35
  from .privacy import apply_privacy
36
+ from .vision import create_vision
36
37
 
37
38
  if sys.platform == "win32":
38
39
  CONTROL_FILE = os.path.join(os.environ.get("TEMP", "C:\\Temp"), "sinain-sense-control.json")
@@ -128,6 +129,17 @@ def main():
128
129
  app_detector = AppDetector()
129
130
  ocr_pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)
130
131
 
132
+ # Vision provider — routes to Ollama (local) or OpenRouter (cloud) based on config/privacy
133
+ vision_cfg = config.get("vision", {})
134
+ vision_provider = create_vision(config)
135
+ vision_throttle_s = vision_cfg.get("throttleSeconds", 5)
136
+ last_vision_time = 0.0
137
+ vision_prompt = vision_cfg.get("prompt", "")
138
+ if vision_provider:
139
+ log(f" vision: {vision_provider.name}")
140
+ else:
141
+ log(" vision: disabled (no provider available)")
142
+
131
143
  # Adaptive SSIM threshold state
132
144
  ssim_stable_threshold = config["detection"]["ssimThreshold"] # 0.92
133
145
  ssim_sensitive_threshold = 0.85
@@ -343,6 +355,19 @@ def main():
343
355
  title=title, subtitle=subtitle, facts=facts,
344
356
  )
345
357
 
358
+ # Vision scene analysis (throttled, non-blocking on failure)
359
+ if vision_provider and time.time() - last_vision_time >= vision_throttle_s:
360
+ try:
361
+ from PIL import Image as PILImage
362
+ pil_frame = PILImage.fromarray(use_frame) if isinstance(use_frame, np.ndarray) else use_frame
363
+ scene = vision_provider.describe(pil_frame, prompt=vision_prompt or None)
364
+ if scene:
365
+ event.observation.scene = scene
366
+ last_vision_time = time.time()
367
+ log(f"vision: {scene[:80]}...")
368
+ except Exception as e:
369
+ log(f"vision error: {e}")
370
+
346
371
  # Send small thumbnail for ALL event types (agent uses vision)
347
372
  # Privacy matrix: gate image sending based on PRIVACY_IMAGES_OPENROUTER
348
373
  if _privacy_images_openrouter == "none":
@@ -38,6 +38,15 @@ DEFAULTS = {
38
38
  "sendThumbnails": True,
39
39
  "maxImageKB": 500,
40
40
  },
41
+ "vision": {
42
+ "enabled": False,
43
+ "backend": "ollama",
44
+ "model": "llava",
45
+ "ollamaUrl": "http://localhost:11434",
46
+ "timeout": 10.0,
47
+ "throttleSeconds": 5,
48
+ "prompt": "Describe what's on this screen: the application, UI state, any errors or notable content. Be concise (2-3 sentences).",
49
+ },
41
50
  "optimization": {
42
51
  "backpressure": False,
43
52
  "textDedup": False,
@@ -31,6 +31,7 @@ class SenseObservation:
31
31
  facts: list[str] = field(default_factory=list)
32
32
  narrative: str = ""
33
33
  concepts: list[str] = field(default_factory=list)
34
+ scene: str = "" # Local vision model scene description (Ollama)
34
35
 
35
36
 
36
37
  @dataclass
@@ -0,0 +1,162 @@
1
+ """Ollama Vision — local multimodal inference for screen scene understanding.
2
+
3
+ Provides a thin client for Ollama's vision models (llava, llama3.2-vision,
4
+ moondream, nanollava). Used by sense_client for scene descriptions and
5
+ optionally by sinain-core's agent analyzer for local vision analysis.
6
+
7
+ Falls back gracefully when Ollama is unavailable — never crashes the pipeline.
8
+ """
9
+
10
+ import base64
11
+ import io
12
+ import json
13
+ import logging
14
+ import time
15
+ from typing import Optional
16
+
17
+ try:
18
+ from PIL import Image
19
+ except ImportError:
20
+ Image = None # type: ignore
21
+
22
+ logger = logging.getLogger("sinain.vision")
23
+
24
+ DEFAULT_PROMPT = (
25
+ "Describe what's on this screen: the application, UI state, any errors "
26
+ "or notable content. Be concise (2-3 sentences)."
27
+ )
28
+
29
+
30
class OllamaVision:
    """Local vision inference via the Ollama HTTP API.

    Sends a single image plus a text prompt to the ``/api/chat`` endpoint and
    returns the model's reply. Images are downscaled and encoded as base64
    JPEG before sending. Every failure (timeout, connection error, malformed
    response, encoding problem) is logged at debug level and reported to the
    caller as ``None``; nothing is raised.
    """

    def __init__(
        self,
        model: str = "llava",
        base_url: str = "http://localhost:11434",
        timeout: float = 10.0,
        max_tokens: int = 200,
    ):
        """Store connection settings; no network traffic happens here.

        Args:
            model: Ollama model name sent in the request payload.
            base_url: Root URL of the Ollama server (trailing slash stripped).
            timeout: Seconds to wait for the ``/api/chat`` response.
            max_tokens: Passed to Ollama as the ``num_predict`` option.
        """
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.max_tokens = max_tokens
        self._available: Optional[bool] = None  # None = never probed
        self._last_check: float = 0
        self._check_interval = 30.0  # re-check availability every 30s

    def is_available(self) -> bool:
        """Check if the Ollama server is reachable. Caches the result for 30s."""
        now = time.time()
        if self._available is not None and now - self._last_check < self._check_interval:
            return self._available

        try:
            import urllib.request
            req = urllib.request.Request(f"{self.base_url}/api/tags", method="GET")
            with urllib.request.urlopen(req, timeout=2) as resp:
                self._available = resp.status == 200
        except Exception:
            self._available = False

        self._last_check = now
        return self._available

    def describe(
        self,
        image: "Image.Image",
        prompt: Optional[str] = None,
    ) -> Optional[str]:
        """Describe image content using the local vision model.

        Args:
            image: PIL Image to analyze.
            prompt: Custom prompt (defaults to the screen description prompt).

        Returns:
            Stripped text description, or None on failure, timeout, empty
            reply, or when the server is (cached as) unavailable.
        """
        if not self.is_available():
            return None

        try:
            # Encode image to base64 JPEG
            img_b64 = self._encode_image(image)
            if not img_b64:
                return None

            # Build Ollama /api/chat request
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt or DEFAULT_PROMPT,
                        "images": [img_b64],
                    }
                ],
                "stream": False,
                "options": {
                    "num_predict": self.max_tokens,
                },
            }

            import urllib.request
            data = json.dumps(payload).encode("utf-8")
            req = urllib.request.Request(
                f"{self.base_url}/api/chat",
                data=data,
                headers={"Content-Type": "application/json"},
                method="POST",
            )

            t0 = time.time()
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                result = json.loads(resp.read().decode("utf-8"))

            content = result.get("message", {}).get("content", "").strip()
            latency_ms = int((time.time() - t0) * 1000)
            logger.debug(
                "ollama vision: model=%s latency=%dms tokens=%s",
                self.model,
                latency_ms,
                result.get("eval_count", "?"),
            )
            return content if content else None

        except Exception as e:
            logger.debug("ollama vision failed: %s", e)
            # Mark unavailable on connection/timeout errors so we don't retry
            # every frame; is_available() re-probes after the cache interval.
            if self._is_connectivity_error(e):
                self._available = False
                self._last_check = time.time()
            return None

    @staticmethod
    def _is_connectivity_error(exc: BaseException) -> bool:
        """Return True when ``exc`` means the server could not be reached in time.

        Classifies by exception type instead of matching message text, because
        OS error strings differ across platforms and locales. An HTTP error
        status proves the server answered, so it is not a connectivity error.
        """
        import urllib.error

        if isinstance(exc, urllib.error.HTTPError):
            return False
        # URLError, ConnectionError, TimeoutError and socket.timeout all derive
        # from OSError.
        return isinstance(exc, OSError)

    def _encode_image(self, image: "Image.Image", max_dim: int = 512, quality: int = 80) -> Optional[str]:
        """Encode a PIL Image to a base64 JPEG string for Ollama.

        The image is downscaled so its longer side is at most ``max_dim`` and
        converted to RGB (RGBA is flattened onto white). Returns None if any
        step fails.
        """
        try:
            # Resize if too large
            w, h = image.size
            if max(w, h) > max_dim:
                scale = max_dim / max(w, h)
                # Clamp to 1px so extreme aspect ratios don't round down to 0.
                new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
                image = image.resize(new_size, Image.LANCZOS)

            # Convert RGBA to RGB
            if image.mode == "RGBA":
                bg = Image.new("RGB", image.size, (255, 255, 255))
                bg.paste(image, mask=image.split()[3])
                image = bg
            elif image.mode != "RGB":
                image = image.convert("RGB")

            buf = io.BytesIO()
            image.save(buf, format="JPEG", quality=quality)
            return base64.b64encode(buf.getvalue()).decode("ascii")
        except Exception as e:
            logger.debug("image encoding failed: %s", e)
            return None
@@ -3,6 +3,7 @@ scikit-image>=0.22
3
3
  numpy>=1.24
4
4
  pytesseract>=0.3
5
5
  requests>=2.31
6
+ pyobjc-framework-Quartz>=10.0; sys_platform == "darwin"
6
7
  mss>=9.0; sys_platform == "win32"
7
8
  psutil>=5.9; sys_platform == "win32"
8
9
  winrt-Windows.Media.Ocr>=2.0; sys_platform == "win32"
@@ -0,0 +1,189 @@
1
+ """Vision Provider — abstract interface for local and cloud image analysis.
2
+
3
+ Routes vision requests to either Ollama (local) or OpenRouter (cloud) based
4
+ on configuration, privacy mode, and API key availability.
5
+
6
+ Usage:
7
+ from .vision import create_vision
8
+ provider = create_vision(config)
9
+ if provider:
10
+ scene = provider.describe(image, "What's on this screen?")
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import base64
16
+ import io
17
+ import json
18
+ import logging
19
+ import os
20
+ import time
21
+ from abc import ABC, abstractmethod
22
+ from typing import TYPE_CHECKING, Optional
23
+
24
+ if TYPE_CHECKING:
25
+ from PIL import Image
26
+
27
+ logger = logging.getLogger("sinain.vision")
28
+
29
+
30
class VisionProvider(ABC):
    """Interface that every vision inference backend implements."""

    # Human-readable backend label; concrete providers overwrite it.
    name: str = "unknown"

    @abstractmethod
    def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
        """Return a text description of ``image``, or None on failure.

        ``prompt`` is optional; when omitted the backend picks its own prompt.
        """
        ...

    @abstractmethod
    def is_available(self) -> bool:
        """Report whether the backend can currently be used."""
        ...
44
+
45
+
46
class OllamaVisionProvider(VisionProvider):
    """Vision backend that delegates every call to an ``OllamaVision`` client."""

    def __init__(self, model: str = "llava", base_url: str = "http://localhost:11434",
                 timeout: float = 10.0, max_tokens: int = 200):
        # Imported lazily, inside the constructor, as in the original module.
        from .ollama_vision import OllamaVision

        client_settings = {
            "model": model,
            "base_url": base_url,
            "timeout": timeout,
            "max_tokens": max_tokens,
        }
        self._client = OllamaVision(**client_settings)
        self.name = f"ollama ({model})"

    def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
        """Forward to the client; returns its description or None."""
        description = self._client.describe(image, prompt)
        return description

    def is_available(self) -> bool:
        """Forward the client's availability check."""
        reachable = self._client.is_available()
        return reachable
61
+
62
+
63
class OpenRouterVisionProvider(VisionProvider):
    """Cloud vision via the OpenRouter chat-completions API.

    The image is downscaled, JPEG-encoded and sent inline as a base64 data
    URL. Any failure (missing key, encoding error, HTTP error, unexpected
    response shape) is logged at debug level and reported as ``None``.
    """

    name = "openrouter"

    def __init__(self, api_key: str, model: str = "google/gemini-2.5-flash-lite",
                 timeout: float = 15.0, max_tokens: int = 200):
        """Store credentials and request settings; no network traffic happens here.

        Args:
            api_key: OpenRouter API key used as the bearer token.
            model: Model identifier sent in the request body.
            timeout: Seconds passed to ``requests.post`` as the timeout.
            max_tokens: Upper bound on generated tokens requested from the API.
        """
        self._api_key = api_key
        self._model = model
        self._timeout = timeout
        self._max_tokens = max_tokens
        self.name = f"openrouter ({model})"

    def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
        """Describe ``image`` using the configured OpenRouter model.

        Returns the stripped reply text, or None when no API key is set, the
        image cannot be encoded, the request fails, or the reply is empty.
        """
        if not self._api_key:
            return None

        try:
            import requests

            # Encode image
            img_b64 = self._encode(image)
            if not img_b64:
                return None

            prompt_text = prompt or "Describe what's on this screen concisely (2-3 sentences)."

            resp = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {self._api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self._model,
                    "max_tokens": self._max_tokens,
                    "messages": [{
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt_text},
                            {"type": "image_url", "image_url": {
                                "url": f"data:image/jpeg;base64,{img_b64}",
                                "detail": "low",
                            }},
                        ],
                    }],
                },
                timeout=self._timeout,
            )
            resp.raise_for_status()
            data = resp.json()
            content = data["choices"][0]["message"]["content"].strip()
            logger.debug("openrouter vision: model=%s tokens=%s",
                         self._model, data.get("usage", {}).get("total_tokens", "?"))
            return content if content else None

        except Exception as e:
            logger.debug("openrouter vision failed: %s", e)
            return None

    def is_available(self) -> bool:
        """Return True when an API key is configured (no network probe is made)."""
        return bool(self._api_key)

    @staticmethod
    def _encode(image: "Image.Image", max_dim: int = 512, quality: int = 80) -> Optional[str]:
        """Encode a PIL Image to a base64 JPEG string.

        The image is downscaled so its longer side is at most ``max_dim`` and
        converted to RGB (RGBA is flattened onto white). Returns None if any
        step fails; the failure is logged at debug level.
        """
        try:
            from PIL import Image as PILImage

            w, h = image.size
            if max(w, h) > max_dim:
                scale = max_dim / max(w, h)
                # Clamp to 1px so extreme aspect ratios don't round down to 0.
                new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
                image = image.resize(new_size, PILImage.LANCZOS)

            if image.mode == "RGBA":
                bg = PILImage.new("RGB", image.size, (255, 255, 255))
                bg.paste(image, mask=image.split()[3])
                image = bg
            elif image.mode != "RGB":
                image = image.convert("RGB")

            buf = io.BytesIO()
            image.save(buf, format="JPEG", quality=quality)
            return base64.b64encode(buf.getvalue()).decode("ascii")
        except Exception as e:
            # Log instead of swallowing silently, matching the Ollama encoder.
            logger.debug("image encoding failed: %s", e)
            return None
148
+
149
+
150
def create_vision(config: dict) -> Optional[VisionProvider]:
    """Factory: create the appropriate vision provider based on config and environment.

    Inputs read:
      * ``config["vision"]``: ``enabled``, ``model``, ``ollamaUrl``, ``timeout``
      * env ``LOCAL_VISION_ENABLED`` / ``LOCAL_VISION_MODEL`` (model env wins over config)
      * env ``PRIVACY_MODE`` (compared case-insensitively, default "off")
      * env ``OPENROUTER_API_KEY``

    Cloud vision is blocked when the privacy mode is "paranoid" or "strict",
    or when no API key is set.

    Selection order:
      1. Local vision enabled and Ollama reachable → Ollama provider.
      2. Local vision enabled, Ollama unreachable, cloud blocked → None.
      3. Cloud not blocked → OpenRouter provider.
      4. Otherwise → None (vision disabled).

    Note that Ollama is only tried when local vision is explicitly enabled;
    a blocked cloud on its own does not switch it on.
    """
    # Normalize so values like "Paranoid" or " strict " still block the cloud.
    privacy = os.environ.get("PRIVACY_MODE", "off").strip().lower()
    # A whitespace-only key is as unusable as a missing one.
    api_key = os.environ.get("OPENROUTER_API_KEY", "").strip()
    vision_cfg = config.get("vision", {})

    local_enabled = (
        vision_cfg.get("enabled", False)
        or os.environ.get("LOCAL_VISION_ENABLED", "").lower() == "true"
    )
    local_model = os.environ.get("LOCAL_VISION_MODEL", vision_cfg.get("model", "llava"))
    local_url = vision_cfg.get("ollamaUrl", "http://localhost:11434")
    local_timeout = vision_cfg.get("timeout", 10.0)

    cloud_blocked = privacy in ("paranoid", "strict") or not api_key

    # Local vision is preferred whenever it is enabled
    if local_enabled:
        provider = OllamaVisionProvider(
            model=local_model, base_url=local_url, timeout=local_timeout,
        )
        if provider.is_available():
            return provider
        logger.info("Ollama not available, %s",
                    "vision disabled (cloud blocked)" if cloud_blocked else "falling back to OpenRouter")
        if cloud_blocked:
            return None

    # Cloud vision (only if not blocked)
    if not cloud_blocked:
        return OpenRouterVisionProvider(api_key=api_key)

    return None
@@ -11,6 +11,10 @@ if [ -f "$SCRIPT_DIR/.env" ]; then
11
11
  [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
12
12
  key=$(echo "$key" | xargs) # trim whitespace
13
13
  val=$(echo "$val" | xargs)
14
+ # Strip inline comments (e.g. "5 # seconds" → "5")
15
+ val="${val%%#*}"
16
+ val=$(echo "$val" | xargs) # re-trim after comment strip
17
+ [[ -z "$val" ]] && continue
14
18
  # Only set if not already in environment
15
19
  if [ -z "${!key+x}" ]; then
16
20
  export "$key=$val"
@@ -5,6 +5,9 @@ import { levelFor, applyLevel } from "../privacy/index.js";
5
5
 
6
6
  const TAG = "agent";
7
7
 
8
+ /** Guard: only one Ollama vision call at a time (latest-wins, skip if busy). */
9
+ let ollamaInFlight = false;
10
+
8
11
  /**
9
12
  * Model-specific timeouts in milliseconds.
10
13
  * Only increases timeouts for slow models to avoid false timeouts.
@@ -223,6 +226,30 @@ export async function analyzeContext(
223
226
  } catch { /* privacy not initialized, keep images */ }
224
227
  const systemPrompt = traitSystemPrompt ?? SYSTEM_PROMPT;
225
228
 
229
+ // Try local Ollama first when enabled (handles both vision and text-only ticks)
230
+ // Guard: skip if a previous Ollama call is still in-flight (avoids "no slots available")
231
+ if (config.localVisionEnabled && !ollamaInFlight) {
232
+ ollamaInFlight = true;
233
+ try {
234
+ const result = await callOllamaVision(systemPrompt, userPrompt, images, config);
235
+ const mode = images.length > 0 ? "vision" : "text";
236
+ log(TAG, `local ollama (${config.localVisionModel}, ${mode}): success`);
237
+ return result;
238
+ } catch (err: any) {
239
+ log(TAG, `local ollama failed: ${err.message || err}, falling back to OpenRouter`);
240
+ } finally {
241
+ ollamaInFlight = false;
242
+ }
243
+ }
244
+
245
+ // Skip OpenRouter entirely if no API key (local-only mode)
246
+ if (!config.openrouterApiKey) {
247
+ if (config.localVisionEnabled) {
248
+ throw new Error("local ollama failed and no OpenRouter API key — cannot analyze");
249
+ }
250
+ throw new Error("no OpenRouter API key configured");
251
+ }
252
+
226
253
  const models = [config.model, ...config.fallbackModels];
227
254
 
228
255
  // Auto-upgrade: use vision model when images are present
@@ -364,3 +391,101 @@ async function callModel(
364
391
  clearTimeout(timeout);
365
392
  }
366
393
  }
394
+
395
/**
 * Run one analysis request against a local Ollama model.
 *
 * Posts the system and user prompts (plus any base64 images) to the
 * `/api/chat` endpoint and converts the reply into an AgentResult.
 * The request is aborted after `config.localVisionTimeout` ms.
 *
 * This function does not fall back to another provider itself: a non-2xx
 * response throws, and a failed or aborted fetch rejects, so recovery is
 * left to the caller.
 *
 * Reply handling, in order:
 *   1. strip an optional code fence and parse the whole reply as JSON;
 *   2. otherwise parse the first `{...}` span, accepted only if it has `hud`;
 *   3. otherwise return the raw text with `parsedOk: false`.
 */
async function callOllamaVision(
  systemPrompt: string,
  userPrompt: string,
  images: ContextWindow["images"],
  config: AgentConfig,
): Promise<AgentResult> {
  const startedAt = Date.now();
  const abortCtl = new AbortController();
  const timer = setTimeout(() => abortCtl.abort(), config.localVisionTimeout);

  try {
    const encodedImages = images ? images.map((img) => img.data) : [];

    const requestBody = JSON.stringify({
      model: config.localVisionModel,
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt, images: encodedImages },
      ],
      stream: false,
      options: { num_predict: config.maxTokens },
    });

    const response = await fetch(`${config.localVisionUrl}/api/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: requestBody,
      signal: abortCtl.signal,
    });

    if (!response.ok) {
      throw new Error(`Ollama ${response.status}: ${await response.text()}`);
    }

    const data = await response.json() as {
      message?: { content?: string };
      eval_count?: number;
      prompt_eval_count?: number;
    };

    const content = data.message?.content?.trim() || "";
    const latencyMs = Date.now() - startedAt;
    const tokensIn = data.prompt_eval_count || 0;
    const tokensOut = data.eval_count || 0;

    log(TAG, `ollama vision: model=${config.localVisionModel} latency=${latencyMs}ms tokens=${tokensIn}+${tokensOut}`);

    // Fields shared by every result shape returned below.
    const metrics = {
      latencyMs,
      tokensIn,
      tokensOut,
      model: config.localVisionModel,
    };

    // Attempt 1: the whole reply (minus an optional code fence) is JSON.
    try {
      const unfenced = content.replace(/^```\w*\s*\n?/, "").replace(/\n?\s*```\s*$/, "").trim();
      const parsed = JSON.parse(unfenced);
      return {
        hud: parsed.hud || "\u2014",
        digest: parsed.digest || "\u2014",
        record: parseRecord(parsed),
        task: parseTask(parsed),
        ...metrics,
        parsedOk: true,
      };
    } catch {
      // Attempt 2: a JSON object embedded somewhere in surrounding text.
      const embedded = content.match(/\{[\s\S]*\}/);
      if (embedded) {
        try {
          const parsed = JSON.parse(embedded[0]);
          if (parsed.hud) {
            return {
              hud: parsed.hud,
              digest: parsed.digest || "\u2014",
              record: parseRecord(parsed),
              task: parseTask(parsed),
              ...metrics,
              parsedOk: true,
            };
          }
        } catch {}
      }

      // Attempt 3: no usable JSON; surface the raw text instead.
      return {
        hud: content.slice(0, 160) || "\u2014",
        digest: content || "\u2014",
        ...metrics,
        parsedOk: false,
      };
    }
  } finally {
    clearTimeout(timer);
  }
}
@@ -2,6 +2,7 @@ import os from "node:os";
2
2
  import { spawn, type ChildProcess } from "node:child_process";
3
3
  import { resolve, dirname } from "node:path";
4
4
  import { fileURLToPath } from "node:url";
5
+ import { existsSync } from "node:fs";
5
6
  import type { AudioPipelineConfig, AudioSourceTag } from "../types.js";
6
7
  import type { CaptureSpawner } from "./capture-spawner.js";
7
8
  import { log } from "../log.js";
@@ -16,7 +17,18 @@ const TAG = "audio";
16
17
  */
17
18
  export class MacOSCaptureSpawner implements CaptureSpawner {
18
19
  spawn(config: AudioPipelineConfig, source: AudioSourceTag): ChildProcess {
19
- const binaryPath = resolve(__dirname, "..", "..", "..", "tools", "sck-capture", "sck-capture");
20
+ // Check ~/.sinain/sck-capture/ first (npx install), then dev path
21
+ const homeBinary = resolve(os.homedir(), ".sinain", "sck-capture", "sck-capture");
22
+ const devBinary = resolve(__dirname, "..", "..", "..", "tools", "sck-capture", "sck-capture");
23
+ const binaryPath = existsSync(homeBinary) ? homeBinary : devBinary;
24
+
25
+ if (!existsSync(binaryPath)) {
26
+ throw new Error(
27
+ `sck-capture binary not found at ${binaryPath}. ` +
28
+ `Run: npx @geravant/sinain setup-sck-capture`
29
+ );
30
+ }
31
+
20
32
  const args = [
21
33
  "--sample-rate", String(config.sampleRate),
22
34
  "--channels", String(config.channels),
@@ -63,6 +63,13 @@ function boolEnv(key: string, fallback: boolean): boolean {
63
63
  return v === "true";
64
64
  }
65
65
 
66
/**
 * Like env(), but treats a defined-but-empty value as "" instead of falling
 * through to the fallback.
 *
 * Lookup order:
 *   1. `process.env[key]`, if defined (an empty string counts as defined);
 *   2. `process.env[fallbackKey]`, if `fallbackKey` is given and that variable
 *      is defined (again, an empty string counts);
 *   3. `defaultVal`.
 *
 * @param key - Primary environment variable name.
 * @param fallbackKey - Optional secondary variable name, consulted only when
 *   the primary variable is undefined.
 * @param defaultVal - Value returned when neither variable is defined.
 * @returns The first defined value in the order above.
 */
function envAllowEmpty(key: string, fallbackKey?: string, defaultVal = ""): string {
  // Read each variable once into a local so the compiler can narrow it,
  // avoiding a second lookup and a non-null assertion.
  const primary = process.env[key];
  if (primary !== undefined) return primary;

  // An omitted (or empty) fallbackKey means there is no secondary variable.
  const secondary = fallbackKey ? process.env[fallbackKey] : undefined;

  // `??` rather than `||`: an explicitly empty fallback value must be kept.
  return secondary ?? defaultVal;
}
72
+
66
73
  function resolvePath(p: string): string {
67
74
  if (process.platform === "win32") {
68
75
  // Expand %APPDATA%, %USERPROFILE%, %TEMP% etc.
@@ -172,6 +179,10 @@ export function loadConfig(): CoreConfig {
172
179
  model: env("AGENT_MODEL", "google/gemini-2.5-flash-lite"),
173
180
  visionModel: env("AGENT_VISION_MODEL", "google/gemini-2.5-flash"),
174
181
  visionEnabled: boolEnv("AGENT_VISION_ENABLED", true),
182
+ localVisionEnabled: boolEnv("LOCAL_VISION_ENABLED", false),
183
+ localVisionModel: env("LOCAL_VISION_MODEL", "llava"),
184
+ localVisionUrl: env("LOCAL_VISION_URL", "http://localhost:11434"),
185
+ localVisionTimeout: intEnv("LOCAL_VISION_TIMEOUT", 10000),
175
186
  openrouterApiKey: env("OPENROUTER_API_KEY", ""),
176
187
  maxTokens: intEnv("AGENT_MAX_TOKENS", 800),
177
188
  temperature: floatEnv("AGENT_TEMPERATURE", 0.3),
@@ -194,9 +205,9 @@ export function loadConfig(): CoreConfig {
194
205
  };
195
206
 
196
207
  const openclawConfig: OpenClawConfig = {
197
- gatewayWsUrl: env("OPENCLAW_WS_URL", env("OPENCLAW_GATEWAY_WS_URL", "ws://localhost:18789")),
208
+ gatewayWsUrl: envAllowEmpty("OPENCLAW_WS_URL", "OPENCLAW_GATEWAY_WS_URL", "ws://localhost:18789"),
198
209
  gatewayToken: env("OPENCLAW_WS_TOKEN", env("OPENCLAW_GATEWAY_TOKEN", "")),
199
- hookUrl: env("OPENCLAW_HTTP_URL", env("OPENCLAW_HOOK_URL", "http://localhost:18789/hooks/agent")),
210
+ hookUrl: envAllowEmpty("OPENCLAW_HTTP_URL", "OPENCLAW_HOOK_URL", "http://localhost:18789/hooks/agent"),
200
211
  hookToken: env("OPENCLAW_HTTP_TOKEN", env("OPENCLAW_HOOK_TOKEN", "")),
201
212
  sessionKey: env("OPENCLAW_SESSION_KEY", "agent:main:sinain"),
202
213
  phase1TimeoutMs: intEnv("OPENCLAW_PHASE1_TIMEOUT_MS", 30_000),
@@ -161,6 +161,10 @@ export class OpenClawWsClient extends EventEmitter {
161
161
 
162
162
  /** Connect to the OpenClaw gateway. */
163
163
  connect(): void {
164
+ if (!this.config.gatewayWsUrl) {
165
+ log(TAG, "connect: no gateway URL configured — skipping");
166
+ return;
167
+ }
164
168
  if (!this.config.gatewayToken && !this.config.hookUrl) {
165
169
  log(TAG, "connect: no gateway token or hookUrl — skipping");
166
170
  return;
@@ -223,6 +223,10 @@ export interface AgentConfig {
223
223
  model: string;
224
224
  visionModel: string;
225
225
  visionEnabled: boolean;
226
+ localVisionEnabled: boolean;
227
+ localVisionModel: string;
228
+ localVisionUrl: string;
229
+ localVisionTimeout: number;
226
230
  openrouterApiKey: string;
227
231
  maxTokens: number;
228
232
  temperature: number;