npm - @geravant/sinain - Versions diffs - 1.5.0 → 1.6.0 - Mend

@geravant/sinain 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/.env.example +108 -0
package/cli.js +16 -0
package/launcher.js +91 -1
package/pack-prepare.js +7 -2
package/package.json +1 -1
package/sense_client/__main__.py +25 -0
package/sense_client/config.py +9 -0
package/sense_client/gate.py +1 -0
package/sense_client/ollama_vision.py +162 -0
package/sense_client/requirements.txt +1 -0
package/sense_client/vision.py +189 -0
package/sinain-agent/run.sh +4 -0
package/sinain-core/src/agent/analyzer.ts +125 -0
package/sinain-core/src/audio/capture-spawner-macos.ts +13 -1
package/sinain-core/src/config.ts +13 -2
package/sinain-core/src/escalation/openclaw-ws.ts +4 -0
package/sinain-core/src/types.ts +4 -0

package/.env.example ADDED Viewed

@@ -0,0 +1,108 @@
+# sinain configuration
+# Location: ~/.sinain/.env (created by `sinain start` wizard or manually)
+# The launcher reads this file on every start. sinain-core and sinain-agent
+# inherit all vars via the launcher's process environment.
+# ── Required ─────────────────────────────────────────────────────────────────
+OPENROUTER_API_KEY=                # get one free at https://openrouter.ai
+# ── Privacy ──────────────────────────────────────────────────────────────────
+PRIVACY_MODE=standard              # off | standard | strict | paranoid
+                                   # standard: auto-redacts credentials before cloud APIs
+                                   # strict: only summaries leave your machine
+                                   # paranoid: almost nothing leaves your machine
+# ── Agent ────────────────────────────────────────────────────────────────────
+SINAIN_AGENT=claude                # claude | codex | junie | goose | aider | <custom command>
+                                   # MCP agents (claude, codex, junie, goose) call sinain tools directly
+                                   # Pipe agents (aider, custom) receive escalation text on stdin
+SINAIN_CORE_URL=http://localhost:9500
+SINAIN_POLL_INTERVAL=5             # seconds between escalation polls
+SINAIN_HEARTBEAT_INTERVAL=900      # seconds between heartbeat ticks (15 min)
+SINAIN_WORKSPACE=~/.openclaw/workspace  # knowledge files, curation scripts, playbook
+# ── Escalation ───────────────────────────────────────────────────────────────
+ESCALATION_MODE=rich               # off | selective | focus | rich
+                                   # off: no escalation
+                                   # selective: score-based (errors, questions trigger it)
+                                   # focus: always escalate every tick
+                                   # rich: always escalate with maximum context
+ESCALATION_COOLDOWN_MS=30000
+# ESCALATION_TRANSPORT=auto        # ws | http | auto
+                                   # auto = WS when gateway connected, HTTP fallback
+                                   # http = bare agent only (no gateway)
+# ── Server ───────────────────────────────────────────────────────────────────
+PORT=9500
+# ── System Audio ─────────────────────────────────────────────────────────────
+# Default: ScreenCaptureKit (zero-setup, macOS 13+). Fallback: ffmpeg + BlackHole.
+# Windows: win-audio-capture.exe (WASAPI, auto-built by setup-windows.sh)
+AUDIO_CAPTURE_CMD=screencapturekit # screencapturekit | sox | ffmpeg
+AUDIO_DEVICE=BlackHole 2ch        # macOS audio device (only used by sox/ffmpeg fallback)
+AUDIO_SAMPLE_RATE=16000
+AUDIO_CHUNK_MS=5000
+AUDIO_VAD_ENABLED=true
+AUDIO_VAD_THRESHOLD=0.003
+AUDIO_AUTO_START=true
+AUDIO_GAIN_DB=20
+# ── Microphone (opt-in) ─────────────────────────────────────────────────────
+MIC_ENABLED=false                  # set true to capture user's microphone
+MIC_DEVICE=default                 # "default" = system mic
+MIC_CAPTURE_CMD=sox                # sox or ffmpeg
+MIC_SAMPLE_RATE=16000
+MIC_CHUNK_MS=5000
+MIC_VAD_ENABLED=true
+MIC_VAD_THRESHOLD=0.008            # higher threshold (ambient noise)
+MIC_AUTO_START=false
+MIC_GAIN_DB=0
+# ── Transcription ────────────────────────────────────────────────────────────
+TRANSCRIPTION_BACKEND=openrouter   # openrouter | local (local = whisper.cpp on-device)
+TRANSCRIPTION_MODEL=google/gemini-2.5-flash
+TRANSCRIPTION_LANGUAGE=en-US
+# ── Local Transcription (only when TRANSCRIPTION_BACKEND=local) ──────────────
+# Install: brew install whisper-cpp
+# Models:  https://huggingface.co/ggerganov/whisper.cpp/tree/main
+# LOCAL_WHISPER_BIN=whisper-cli
+# LOCAL_WHISPER_MODEL=~/models/ggml-large-v3-turbo.bin
+# LOCAL_WHISPER_TIMEOUT_MS=15000
+# ── Local Agent Loop ─────────────────────────────────────────────────────────
+AGENT_ENABLED=true
+AGENT_MODEL=google/gemini-2.5-flash-lite
+# AGENT_FALLBACK_MODELS=google/gemini-2.5-flash,anthropic/claude-3.5-haiku
+AGENT_MAX_TOKENS=300
+AGENT_TEMPERATURE=0.3
+AGENT_PUSH_TO_FEED=true
+AGENT_DEBOUNCE_MS=3000
+AGENT_MAX_INTERVAL_MS=30000
+AGENT_COOLDOWN_MS=10000
+AGENT_MAX_AGE_MS=120000            # context window lookback (2 min)
+# ── OpenClaw / NemoClaw Gateway ──────────────────────────────────────────────
+# Leave blank to run without a gateway (bare agent mode).
+# The setup wizard fills these in if you have an OpenClaw gateway.
+OPENCLAW_WS_URL=ws://localhost:18789
+OPENCLAW_WS_TOKEN=                 # 48-char hex — from gateway config
+OPENCLAW_HTTP_URL=http://localhost:18789/hooks/agent
+OPENCLAW_HTTP_TOKEN=               # same token as WS_TOKEN
+OPENCLAW_SESSION_KEY=agent:main:sinain
+# OPENCLAW_PHASE1_TIMEOUT_MS=10000
+# OPENCLAW_PHASE2_TIMEOUT_MS=120000
+# OPENCLAW_QUEUE_TTL_MS=300000
+# OPENCLAW_QUEUE_MAX_SIZE=10
+# OPENCLAW_PING_INTERVAL_MS=30000
+# ── SITUATION.md ─────────────────────────────────────────────────────────────
+SITUATION_MD_PATH=~/.openclaw/workspace/SITUATION.md
+# OPENCLAW_WORKSPACE_DIR=~/.openclaw/workspace
+# ── Debug ────────────────────────────────────────────────────────────────────
+# DEBUG=true                       # verbose logging (every tick, every chunk)
+# ── Tracing ──────────────────────────────────────────────────────────────────
+TRACE_ENABLED=true
+TRACE_DIR=~/.sinain-core/traces

package/cli.js CHANGED Viewed

@@ -29,6 +29,17 @@ switch (cmd) {
     await import("./setup-overlay.js");
     break;
+  case "setup-sck-capture": {
+    const { downloadBinary } = await import("./setup-sck-capture.js");
+    if (os.platform() === "win32") {
+      console.log("sck-capture is macOS-only (Windows uses win-audio-capture.exe)");
+    } else {
+      const forceUpdate = process.argv.includes("--update");
+      await downloadBinary({ forceUpdate });
+    }
+    break;
+  }
   case "install":
     // --if-openclaw: only run if OpenClaw is installed (for postinstall)
     if (process.argv.includes("--if-openclaw")) {
@@ -156,6 +167,10 @@ async function runSetupWizard() {
     }
     vars.OPENCLAW_HTTP_URL = vars.OPENCLAW_WS_URL.replace(/^ws/, "http") + "/hooks/agent";
     vars.OPENCLAW_SESSION_KEY = "agent:main:sinain";
+  } else {
+    // No gateway — disable WS connection attempts
+    vars.OPENCLAW_WS_URL = "";
+    vars.OPENCLAW_HTTP_URL = "";
   }
   vars.SINAIN_POLL_INTERVAL = "5";
@@ -346,6 +361,7 @@ Usage:
   sinain status                Check what's running
   sinain setup                 Run interactive setup wizard (~/.sinain/.env)
   sinain setup-overlay         Download pre-built overlay app
+  sinain setup-sck-capture     Download sck-capture audio binary (macOS)
   sinain install               Install OpenClaw plugin (server-side)
 Start options:

package/launcher.js CHANGED Viewed

@@ -78,6 +78,25 @@ async function main() {
   // Install deps if needed
   await installDeps();
+  // Auto-download sck-capture binary if missing (macOS only)
+  if (!IS_WINDOWS) {
+    const sckBinary = path.join(SINAIN_DIR, "sck-capture", "sck-capture");
+    if (!fs.existsSync(sckBinary)) {
+      log("sck-capture not found — downloading from GitHub Releases...");
+      try {
+        const { downloadBinary } = await import("./setup-sck-capture.js");
+        const success = await downloadBinary({ silent: true });
+        if (success) {
+          ok("sck-capture downloaded");
+        } else {
+          warn("sck-capture download failed — audio capture may not work");
+        }
+      } catch (e) {
+        warn(`sck-capture auto-download failed: ${e.message}`);
+      }
+    }
+  }
   // Start core
   log("Starting sinain-core...");
   const coreDir = path.join(PKG_DIR, "sinain-core");
@@ -108,7 +127,7 @@ async function main() {
         const scDir = path.join(PKG_DIR, "sense_client");
         // Check if key package is importable to skip pip
         try {
-          execSync('python3 -c "import cv2; import skimage"', { stdio: "pipe" });
+          execSync('python3 -c "import PIL; import skimage"', { stdio: "pipe" });
         } catch {
           log("Installing sense_client Python dependencies...");
           try {
@@ -289,6 +308,27 @@ async function preflight() {
   } else {
     ok("port 9500 free");
   }
+  // Ollama (if local vision enabled)
+  if (process.env.LOCAL_VISION_ENABLED === "true") {
+    try {
+      const resp = await fetch("http://localhost:11434/api/tags", { signal: AbortSignal.timeout(2000) });
+      if (resp.ok) {
+        ok("ollama server running");
+      } else {
+        warn("ollama server not responding — local vision will be unavailable");
+      }
+    } catch {
+      // Try to start Ollama in background
+      try {
+        const { spawn: spawnProc } = await import("child_process");
+        spawnProc("ollama", ["serve"], { detached: true, stdio: "ignore" }).unref();
+        ok("ollama server started in background");
+      } catch {
+        warn("ollama not running and could not auto-start — local vision disabled");
+      }
+    }
+  }
 }
 // ── Setup wizard ─────────────────────────────────────────────────────────────
@@ -376,6 +416,52 @@ async function setupWizard(envPath) {
   const agentChoice = await ask(`  Agent? [${BOLD}claude${RESET}/codex/goose/junie/aider]: `);
   vars.SINAIN_AGENT = agentChoice.trim().toLowerCase() || "claude";
+  // 3b. Local vision (Ollama)
+  const IS_MACOS = os.platform() === "darwin";
+  const hasOllama = commandExists("ollama");
+  if (hasOllama) {
+    const useVision = await ask(`  Enable local vision AI? [Y/n] (Ollama — screen understanding without cloud API): `);
+    if (!useVision.trim() || useVision.trim().toLowerCase() === "y") {
+      vars.LOCAL_VISION_ENABLED = "true";
+      try {
+        const models = execSync("ollama list 2>/dev/null", { encoding: "utf-8" });
+        if (!models.includes("llava")) {
+          const pull = await ask(`  Pull llava vision model (~4GB)? [Y/n]: `);
+          if (!pull.trim() || pull.trim().toLowerCase() === "y") {
+            console.log(`  ${DIM}Pulling llava...${RESET}`);
+            execSync("ollama pull llava", { stdio: "inherit" });
+            ok("llava model pulled");
+          }
+        } else {
+          ok("llava model already available");
+        }
+      } catch {
+        warn("Could not check Ollama models");
+      }
+      vars.LOCAL_VISION_MODEL = "llava";
+    }
+  } else {
+    const installOllama = await ask(`  Install Ollama for local vision AI? [y/N]: `);
+    if (installOllama.trim().toLowerCase() === "y") {
+      try {
+        if (IS_MACOS) {
+          console.log(`  ${DIM}Installing Ollama via Homebrew...${RESET}`);
+          execSync("brew install ollama", { stdio: "inherit" });
+        } else {
+          console.log(`  ${DIM}Installing Ollama...${RESET}`);
+          execSync("curl -fsSL https://ollama.com/install.sh | sh", { stdio: "inherit" });
+        }
+        console.log(`  ${DIM}Pulling llava vision model...${RESET}`);
+        execSync("ollama pull llava", { stdio: "inherit" });
+        vars.LOCAL_VISION_ENABLED = "true";
+        vars.LOCAL_VISION_MODEL = "llava";
+        ok("Ollama + llava installed");
+      } catch {
+        warn("Ollama installation failed — local vision disabled");
+      }
+    }
+  }
   // 4. Escalation mode
   console.log();
   console.log(`  ${DIM}Escalation modes:${RESET}`);
@@ -402,6 +488,10 @@ async function setupWizard(envPath) {
     const httpBase = vars.OPENCLAW_WS_URL.replace(/^ws/, "http");
     vars.OPENCLAW_HTTP_URL = `${httpBase}/hooks/agent`;
     vars.OPENCLAW_SESSION_KEY = "agent:main:sinain";
+  } else {
+    // No gateway — disable WS connection attempts
+    vars.OPENCLAW_WS_URL = "";
+    vars.OPENCLAW_HTTP_URL = "";
   }
   // 6. Agent-specific defaults

package/pack-prepare.js CHANGED Viewed

@@ -5,7 +5,7 @@
 import fs from "fs";
 import path from "path";
-const LINKS = ["sinain-core", "sinain-mcp-server", "sinain-agent", "sense_client"];
+const LINKS = ["sinain-core", "sinain-mcp-server", "sinain-agent", "sense_client", ".env.example"];
 const PKG_DIR = path.dirname(new URL(import.meta.url).pathname);
 const action = process.argv[2]; // "pre" or "post"
@@ -18,7 +18,12 @@ if (action === "pre") {
     if (!stat.isSymbolicLink()) continue;
     const target = fs.realpathSync(linkPath);
     fs.unlinkSync(linkPath);
-    copyDir(target, linkPath);
+    const targetStat = fs.statSync(target);
+    if (targetStat.isDirectory()) {
+      copyDir(target, linkPath);
+    } else {
+      fs.copyFileSync(target, linkPath);
+    }
   }
   console.log("prepack: symlinks → copies");
 } else if (action === "post") {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@geravant/sinain",
-  "version": "1.5.0",
+  "version": "1.6.0",
   "description": "Ambient AI overlay invisible to screen capture — real-time insights from audio + screen context",
   "type": "module",
   "bin": {

package/sense_client/__main__.py CHANGED Viewed

@@ -33,6 +33,7 @@ from .sender import SenseSender, package_full_frame, package_roi
 from .app_detector import AppDetector
 from .config import load_config
 from .privacy import apply_privacy
+from .vision import create_vision
 if sys.platform == "win32":
     CONTROL_FILE = os.path.join(os.environ.get("TEMP", "C:\\Temp"), "sinain-sense-control.json")
@@ -128,6 +129,17 @@ def main():
     app_detector = AppDetector()
     ocr_pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)
+    # Vision provider — routes to Ollama (local) or OpenRouter (cloud) based on config/privacy
+    vision_cfg = config.get("vision", {})
+    vision_provider = create_vision(config)
+    vision_throttle_s = vision_cfg.get("throttleSeconds", 5)
+    last_vision_time = 0.0
+    vision_prompt = vision_cfg.get("prompt", "")
+    if vision_provider:
+        log(f"  vision: {vision_provider.name}")
+    else:
+        log("  vision: disabled (no provider available)")
     # Adaptive SSIM threshold state
     ssim_stable_threshold = config["detection"]["ssimThreshold"]  # 0.92
     ssim_sensitive_threshold = 0.85
@@ -343,6 +355,19 @@ def main():
             title=title, subtitle=subtitle, facts=facts,
         )
+        # Vision scene analysis (throttled, non-blocking on failure)
+        if vision_provider and time.time() - last_vision_time >= vision_throttle_s:
+            try:
+                from PIL import Image as PILImage
+                pil_frame = PILImage.fromarray(use_frame) if isinstance(use_frame, np.ndarray) else use_frame
+                scene = vision_provider.describe(pil_frame, prompt=vision_prompt or None)
+                if scene:
+                    event.observation.scene = scene
+                    last_vision_time = time.time()
+                    log(f"vision: {scene[:80]}...")
+            except Exception as e:
+                log(f"vision error: {e}")
         # Send small thumbnail for ALL event types (agent uses vision)
         # Privacy matrix: gate image sending based on PRIVACY_IMAGES_OPENROUTER
         if _privacy_images_openrouter == "none":

package/sense_client/config.py CHANGED Viewed

@@ -38,6 +38,15 @@ DEFAULTS = {
         "sendThumbnails": True,
         "maxImageKB": 500,
     },
+    "vision": {
+        "enabled": False,
+        "backend": "ollama",
+        "model": "llava",
+        "ollamaUrl": "http://localhost:11434",
+        "timeout": 10.0,
+        "throttleSeconds": 5,
+        "prompt": "Describe what's on this screen: the application, UI state, any errors or notable content. Be concise (2-3 sentences).",
+    },
     "optimization": {
         "backpressure": False,
         "textDedup": False,

package/sense_client/gate.py CHANGED Viewed

@@ -31,6 +31,7 @@ class SenseObservation:
     facts: list[str] = field(default_factory=list)
     narrative: str = ""
     concepts: list[str] = field(default_factory=list)
+    scene: str = ""  # Local vision model scene description (Ollama)
 @dataclass

package/sense_client/ollama_vision.py ADDED Viewed

@@ -0,0 +1,162 @@
+"""Ollama Vision — local multimodal inference for screen scene understanding.
+Provides a thin client for Ollama's vision models (llava, llama3.2-vision,
+moondream, nanollava). Used by sense_client for scene descriptions and
+optionally by sinain-core's agent analyzer for local vision analysis.
+Falls back gracefully when Ollama is unavailable — never crashes the pipeline.
+"""
+import base64
+import io
+import json
+import logging
+import time
+from typing import Optional
+# Pillow is treated as optional at import time: when it is missing, `Image`
+# is bound to None instead of raising ImportError.
+try:
+    from PIL import Image
+except ImportError:
+    Image = None  # type: ignore
+# Module logger; every message in this file is emitted under "sinain.vision".
+logger = logging.getLogger("sinain.vision")
+# Prompt sent with the image when describe() is called without a prompt.
+DEFAULT_PROMPT = (
+    "Describe what's on this screen: the application, UI state, any errors "
+    "or notable content. Be concise (2-3 sentences)."
+)
+class OllamaVision:
+    """Local vision inference via Ollama HTTP API.
+    Uses the /api/chat endpoint with image support. Auto-encodes PIL images
+    to base64 JPEG. describe() returns None on any failure: the server is
+    not reachable, the image cannot be encoded, the request raises, or the
+    reply contains no text.
+    Server reachability is probed through /api/tags and cached, so a down
+    server is not re-probed on every call.
+    """
+    def __init__(
+        self,
+        model: str = "llava",
+        base_url: str = "http://localhost:11434",
+        timeout: float = 10.0,
+        max_tokens: int = 200,
+    ):
+        """Store connection settings; no network traffic happens here.
+        Args:
+            model: Model name sent in the "model" field of each request.
+            base_url: Ollama server root; a trailing "/" is stripped.
+            timeout: Seconds allowed for the /api/chat request.
+            max_tokens: Sent to Ollama as options.num_predict.
+        """
+        self.model = model
+        self.base_url = base_url.rstrip("/")
+        self.timeout = timeout
+        self.max_tokens = max_tokens
+        # None = never probed yet; otherwise the result of the last probe.
+        self._available: Optional[bool] = None
+        # time.time() of the last probe (or of the last failure recorded by describe()).
+        self._last_check: float = 0
+        self._check_interval = 30.0  # re-check availability every 30s
+    def is_available(self) -> bool:
+        """Check if Ollama server is reachable. Caches result for 30s.
+        The probe is a GET on /api/tags with a fixed 2-second timeout
+        (independent of self.timeout). Any exception counts as unavailable.
+        Returns:
+            True when the probe answered with HTTP 200, else False.
+        """
+        now = time.time()
+        # Serve the cached answer while it is younger than _check_interval.
+        if self._available is not None and now - self._last_check < self._check_interval:
+            return self._available
+        try:
+            import urllib.request
+            req = urllib.request.Request(f"{self.base_url}/api/tags", method="GET")
+            with urllib.request.urlopen(req, timeout=2) as resp:
+                self._available = resp.status == 200
+        except Exception:
+            self._available = False
+        # Timestamp is the time the probe started, not the time it finished.
+        self._last_check = now
+        return self._available
+    def describe(
+        self,
+        image: "Image.Image",
+        prompt: Optional[str] = None,
+    ) -> Optional[str]:
+        """Describe image content using the local vision model.
+        Args:
+            image: PIL Image to analyze
+            prompt: Custom prompt (defaults to screen description prompt)
+        Returns:
+            Text description or None on failure/timeout. None is also
+            returned when the server is marked unavailable, when the image
+            cannot be encoded, and when the model replies with empty text.
+        """
+        if not self.is_available():
+            return None
+        try:
+            # Encode image to base64 JPEG
+            img_b64 = self._encode_image(image)
+            if not img_b64:
+                return None
+            # Build Ollama /api/chat request
+            # The image travels as a base64 string in the message's "images" list.
+            payload = {
+                "model": self.model,
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": prompt or DEFAULT_PROMPT,
+                        "images": [img_b64],
+                    }
+                ],
+                # Ask for a single JSON reply rather than a streamed one.
+                "stream": False,
+                "options": {
+                    "num_predict": self.max_tokens,
+                },
+            }
+            import urllib.request
+            data = json.dumps(payload).encode("utf-8")
+            req = urllib.request.Request(
+                f"{self.base_url}/api/chat",
+                data=data,
+                headers={"Content-Type": "application/json"},
+                method="POST",
+            )
+            t0 = time.time()
+            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
+                result = json.loads(resp.read().decode("utf-8"))
+            # Missing "message" or "content" keys degrade to an empty string.
+            content = result.get("message", {}).get("content", "").strip()
+            # Latency covers the HTTP round trip plus JSON decoding.
+            latency_ms = int((time.time() - t0) * 1000)
+            logger.debug(
+                "ollama vision: model=%s latency=%dms tokens=%s",
+                self.model,
+                latency_ms,
+                result.get("eval_count", "?"),
+            )
+            return content if content else None
+        except Exception as e:
+            logger.debug("ollama vision failed: %s", e)
+            # Mark unavailable on connection errors so we don't retry every frame
+            # Detection is a substring match on the exception text; other
+            # failures leave the cached availability untouched.
+            if "Connection refused" in str(e) or "timed out" in str(e):
+                self._available = False
+                self._last_check = time.time()
+            return None
+    def _encode_image(self, image: "Image.Image", max_dim: int = 512, quality: int = 80) -> Optional[str]:
+        """Encode PIL Image to base64 JPEG string for Ollama.
+        Args:
+            image: PIL Image to encode; the caller's object is not modified
+                (resize/convert return new images).
+            max_dim: Longest allowed side in pixels; larger images are
+                scaled down proportionally.
+            quality: JPEG quality passed to Image.save.
+        Returns:
+            ASCII base64 text of the JPEG bytes, or None if any step raises.
+        """
+        try:
+            # Resize if too large
+            w, h = image.size
+            if max(w, h) > max_dim:
+                scale = max_dim / max(w, h)
+                image = image.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
+            # Convert RGBA to RGB
+            # JPEG has no alpha channel: composite onto white using the alpha band as mask.
+            if image.mode == "RGBA":
+                bg = Image.new("RGB", image.size, (255, 255, 255))
+                bg.paste(image, mask=image.split()[3])
+                image = bg
+            elif image.mode != "RGB":
+                image = image.convert("RGB")
+            buf = io.BytesIO()
+            image.save(buf, format="JPEG", quality=quality)
+            return base64.b64encode(buf.getvalue()).decode("ascii")
+        except Exception as e:
+            logger.debug("image encoding failed: %s", e)
+            return None

package/sense_client/requirements.txt CHANGED Viewed

@@ -3,6 +3,7 @@ scikit-image>=0.22
 numpy>=1.24
 pytesseract>=0.3
 requests>=2.31
+pyobjc-framework-Quartz>=10.0; sys_platform == "darwin"
 mss>=9.0; sys_platform == "win32"
 psutil>=5.9; sys_platform == "win32"
 winrt-Windows.Media.Ocr>=2.0; sys_platform == "win32"

package/sense_client/vision.py ADDED Viewed

@@ -0,0 +1,189 @@
+"""Vision Provider — abstract interface for local and cloud image analysis.
+Routes vision requests to either Ollama (local) or OpenRouter (cloud) based
+on configuration, privacy mode, and API key availability.
+Usage:
+    from .vision import create_vision
+    provider = create_vision(config)
+    if provider:
+        scene = provider.describe(image, "What's on this screen?")
+"""
+from __future__ import annotations
+import base64
+import io
+import json
+import logging
+import os
+import time
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Optional
+# Pillow is imported for static type checking only; the "Image.Image"
+# annotations below are written as strings and need no runtime import.
+if TYPE_CHECKING:
+    from PIL import Image
+# Module logger; every message in this file is emitted under "sinain.vision".
+logger = logging.getLogger("sinain.vision")
+class VisionProvider(ABC):
+    """Abstract base for vision inference backends.
+    Subclasses must implement describe() and is_available(); both are
+    abstract, so this class cannot be instantiated directly.
+    """
+    # Human-readable backend label; subclasses overwrite it.
+    name: str = "unknown"
+    @abstractmethod
+    def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
+        """Describe image content. Returns None on failure.
+        Args:
+            image: PIL Image to analyze.
+            prompt: Optional instruction text; None lets the backend pick
+                its own default.
+        """
+        ...
+    @abstractmethod
+    def is_available(self) -> bool:
+        """Check if the backend is reachable."""
+        ...
+class OllamaVisionProvider(VisionProvider):
+    """Local vision via Ollama HTTP API.
+    Thin adapter: every call is forwarded to an OllamaVision client.
+    """
+    def __init__(self, model: str = "llava", base_url: str = "http://localhost:11434",
+                 timeout: float = 10.0, max_tokens: int = 200):
+        """Build the wrapped OllamaVision client with the given settings.
+        Args:
+            model: Ollama model name; also shown in `name`.
+            base_url: Ollama server root URL.
+            timeout: Passed through to the client as its request timeout.
+            max_tokens: Passed through to the client.
+        """
+        # Imported here rather than at module top, so ollama_vision is only
+        # loaded when an Ollama provider is actually constructed.
+        from .ollama_vision import OllamaVision
+        self._client = OllamaVision(model=model, base_url=base_url,
+                                     timeout=timeout, max_tokens=max_tokens)
+        self.name = f"ollama ({model})"
+    def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
+        """Forward to the wrapped client's describe()."""
+        return self._client.describe(image, prompt)
+    def is_available(self) -> bool:
+        """Forward to the wrapped client's is_available()."""
+        return self._client.is_available()
+class OpenRouterVisionProvider(VisionProvider):
+    """Cloud vision via OpenRouter API.
+    Sends the image as a base64 JPEG data URL in a chat-completions request.
+    describe() never raises: every failure is logged at debug level and
+    turned into None.
+    """
+    # Class-level default; __init__ replaces it with a label that includes the model.
+    name = "openrouter"
+    def __init__(self, api_key: str, model: str = "google/gemini-2.5-flash-lite",
+                 timeout: float = 15.0, max_tokens: int = 200):
+        """Store request settings; no network traffic happens here.
+        Args:
+            api_key: OpenRouter key sent as a Bearer token.
+            model: Model identifier sent in the request body.
+            timeout: Seconds passed to requests.post.
+            max_tokens: Sent as "max_tokens" in the request body.
+        """
+        self._api_key = api_key
+        self._model = model
+        self._timeout = timeout
+        self._max_tokens = max_tokens
+        self.name = f"openrouter ({model})"
+    def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
+        """Describe image content through OpenRouter.
+        Args:
+            image: PIL Image to analyze.
+            prompt: Optional instruction text; a built-in one is used when
+                it is None or empty.
+        Returns:
+            The stripped reply text, or None when there is no API key, the
+            image cannot be encoded, the request or response parsing
+            raises, or the reply text is empty.
+        """
+        if not self._api_key:
+            return None
+        try:
+            # Imported inside the call, so a missing `requests` package
+            # surfaces here and is handled by the except below.
+            import requests
+            # Encode image
+            img_b64 = self._encode(image)
+            if not img_b64:
+                return None
+            prompt_text = prompt or "Describe what's on this screen concisely (2-3 sentences)."
+            resp = requests.post(
+                "https://openrouter.ai/api/v1/chat/completions",
+                headers={
+                    "Authorization": f"Bearer {self._api_key}",
+                    "Content-Type": "application/json",
+                },
+                json={
+                    "model": self._model,
+                    "max_tokens": self._max_tokens,
+                    "messages": [{
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt_text},
+                            {"type": "image_url", "image_url": {
+                                "url": f"data:image/jpeg;base64,{img_b64}",
+                                # NOTE(review): presumably requests the cheaper low-detail image mode; confirm against the API docs.
+                                "detail": "low",
+                            }},
+                        ],
+                    }],
+                },
+                timeout=self._timeout,
+            )
+            # Non-2xx responses raise here and end up in the except below.
+            resp.raise_for_status()
+            data = resp.json()
+            content = data["choices"][0]["message"]["content"].strip()
+            logger.debug("openrouter vision: model=%s tokens=%s",
+                         self._model, data.get("usage", {}).get("total_tokens", "?"))
+            return content if content else None
+        except Exception as e:
+            logger.debug("openrouter vision failed: %s", e)
+            return None
+    def is_available(self) -> bool:
+        """Return True when an API key is set; no network check is made."""
+        return bool(self._api_key)
+    @staticmethod
+    def _encode(image: "Image.Image", max_dim: int = 512, quality: int = 80) -> Optional[str]:
+        """Encode a PIL Image as a base64 JPEG string.
+        Scales the image down so its longest side is at most max_dim,
+        flattens RGBA onto a white background (other non-RGB modes are
+        converted to RGB), then saves as JPEG at the given quality.
+        Returns:
+            ASCII base64 text, or None if any step raises (the error is
+            not logged).
+        """
+        try:
+            from PIL import Image as PILImage
+            w, h = image.size
+            if max(w, h) > max_dim:
+                scale = max_dim / max(w, h)
+                image = image.resize((int(w * scale), int(h * scale)), PILImage.LANCZOS)
+            # JPEG has no alpha channel: composite onto white using the alpha band as mask.
+            if image.mode == "RGBA":
+                bg = PILImage.new("RGB", image.size, (255, 255, 255))
+                bg.paste(image, mask=image.split()[3])
+                image = bg
+            elif image.mode != "RGB":
+                image = image.convert("RGB")
+            buf = io.BytesIO()
+            image.save(buf, format="JPEG", quality=quality)
+            return base64.b64encode(buf.getvalue()).decode("ascii")
+        except Exception:
+            return None
+def create_vision(config: dict) -> Optional[VisionProvider]:
+    """Factory: create the appropriate vision provider based on config and environment.
+    Inputs read:
+    - config["vision"]: "enabled", "model", "ollamaUrl", "timeout"
+    - environment: PRIVACY_MODE (default "off"), OPENROUTER_API_KEY,
+      LOCAL_VISION_ENABLED, LOCAL_VISION_MODEL (overrides the config model)
+    Cloud is "blocked" when PRIVACY_MODE is "paranoid" or "strict", or when
+    no API key is set.
+    Selection order:
+    1. Local vision enabled (config or LOCAL_VISION_ENABLED=true) and Ollama
+       reachable → local (Ollama)
+    2. Local vision enabled, Ollama unreachable, cloud blocked → None
+    3. Cloud not blocked → cloud (OpenRouter, with its default model)
+    4. Otherwise → None (vision disabled)
+    Note: Ollama is only tried when local vision is explicitly enabled; a
+    blocked cloud on its own does not select it.
+    """
+    privacy = os.environ.get("PRIVACY_MODE", "off")
+    api_key = os.environ.get("OPENROUTER_API_KEY", "")
+    vision_cfg = config.get("vision", {})
+    # Either the config flag or the environment variable turns local vision on.
+    local_enabled = (
+        vision_cfg.get("enabled", False)
+        or os.environ.get("LOCAL_VISION_ENABLED", "").lower() == "true"
+    )
+    # Environment wins over config for the model; URL and timeout are config-only.
+    local_model = os.environ.get("LOCAL_VISION_MODEL", vision_cfg.get("model", "llava"))
+    local_url = vision_cfg.get("ollamaUrl", "http://localhost:11434")
+    local_timeout = vision_cfg.get("timeout", 10.0)
+    cloud_blocked = privacy in ("paranoid", "strict") or not api_key
+    # Local vision is tried first, but only when it is explicitly enabled
+    if local_enabled:
+        provider = OllamaVisionProvider(
+            model=local_model, base_url=local_url, timeout=local_timeout,
+        )
+        # One-time reachability probe at start-up decides whether Ollama is used.
+        if provider.is_available():
+            return provider
+        logger.info("Ollama not available, %s",
+                     "vision disabled (cloud blocked)" if cloud_blocked else "falling back to OpenRouter")
+        if cloud_blocked:
+            return None
+    # Cloud vision (only if not blocked)
+    if not cloud_blocked:
+        return OpenRouterVisionProvider(api_key=api_key)
+    return None

package/sinain-agent/run.sh CHANGED Viewed

@@ -11,6 +11,10 @@ if [ -f "$SCRIPT_DIR/.env" ]; then
     [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
     key=$(echo "$key" | xargs)  # trim whitespace
     val=$(echo "$val" | xargs)
+    # Strip inline comments (e.g. "5 # seconds" → "5")
+    val="${val%%#*}"
+    val=$(echo "$val" | xargs)  # re-trim after comment strip
+    [[ -z "$val" ]] && continue
     # Only set if not already in environment
     if [ -z "${!key+x}" ]; then
       export "$key=$val"

package/sinain-core/src/agent/analyzer.ts CHANGED Viewed

@@ -5,6 +5,9 @@ import { levelFor, applyLevel } from "../privacy/index.js";
 const TAG = "agent";
+/** Guard: only one Ollama vision call at a time (latest-wins, skip if busy). */
+let ollamaInFlight = false;
 /**
  * Model-specific timeouts in milliseconds.
  * Only increases timeouts for slow models to avoid false timeouts.
@@ -223,6 +226,30 @@ export async function analyzeContext(
   } catch { /* privacy not initialized, keep images */ }
   const systemPrompt = traitSystemPrompt ?? SYSTEM_PROMPT;
+  // Try local Ollama first when enabled (handles both vision and text-only ticks)
+  // Guard: skip if a previous Ollama call is still in-flight (avoids "no slots available")
+  if (config.localVisionEnabled && !ollamaInFlight) {
+    ollamaInFlight = true;
+    try {
+      const result = await callOllamaVision(systemPrompt, userPrompt, images, config);
+      const mode = images.length > 0 ? "vision" : "text";
+      log(TAG, `local ollama (${config.localVisionModel}, ${mode}): success`);
+      return result;
+    } catch (err: any) {
+      log(TAG, `local ollama failed: ${err.message || err}, falling back to OpenRouter`);
+    } finally {
+      ollamaInFlight = false;
+    }
+  }
+  // Skip OpenRouter entirely if no API key (local-only mode)
+  if (!config.openrouterApiKey) {
+    if (config.localVisionEnabled) {
+      throw new Error("local ollama failed and no OpenRouter API key — cannot analyze");
+    }
+    throw new Error("no OpenRouter API key configured");
+  }
   const models = [config.model, ...config.fallbackModels];
   // Auto-upgrade: use vision model when images are present
@@ -364,3 +391,101 @@ async function callModel(
     clearTimeout(timeout);
   }
 }
+/**
+ * Call a local Ollama model through its /api/chat endpoint.
+ *
+ * Images are sent as base64 strings in the user message's `images` field (an empty
+ * list when the tick has no images). The request is aborted once
+ * `config.localVisionTimeout` milliseconds have elapsed.
+ *
+ * This function does not fall back by itself: it throws, and the caller
+ * (analyzeContext) catches the error and continues with OpenRouter.
+ *
+ * @param systemPrompt System message sent as the first chat message.
+ * @param userPrompt   User message text.
+ * @param images       Images attached to the user message; may be empty or missing.
+ * @param config       Supplies the Ollama URL, model, timeout and token limit.
+ * @returns The agent result. `parsedOk` is false when the reply could not be read as
+ *          JSON and the raw text was used for `hud` / `digest` instead.
+ * @throws Error on timeout, request failure, or a non-OK HTTP status from Ollama.
+ */
+async function callOllamaVision(
+  systemPrompt: string,
+  userPrompt: string,
+  images: ContextWindow["images"],
+  config: AgentConfig,
+): Promise<AgentResult> {
+  const start = Date.now();
+  const controller = new AbortController();
+  // Remember that *we* aborted, so the caller sees a timeout message rather than
+  // the generic abort error raised by fetch.
+  let timedOut = false;
+  const timeout = setTimeout(() => {
+    timedOut = true;
+    controller.abort();
+  }, config.localVisionTimeout);
+  try {
+    const imageB64List = (images || []).map((img) => img.data);
+    const response = await fetch(`${config.localVisionUrl}/api/chat`, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({
+        model: config.localVisionModel,
+        messages: [
+          { role: "system", content: systemPrompt },
+          { role: "user", content: userPrompt, images: imageB64List },
+        ],
+        stream: false,
+        options: { num_predict: config.maxTokens },
+      }),
+      signal: controller.signal,
+    });
+    if (!response.ok) {
+      throw new Error(`Ollama ${response.status}: ${await response.text()}`);
+    }
+    const data = await response.json() as {
+      message?: { content?: string };
+      eval_count?: number;
+      prompt_eval_count?: number;
+    };
+    const content = data.message?.content?.trim() || "";
+    const latencyMs = Date.now() - start;
+    const tokensIn = data.prompt_eval_count || 0;
+    const tokensOut = data.eval_count || 0;
+    log(TAG, `ollama vision: model=${config.localVisionModel} latency=${latencyMs}ms tokens=${tokensIn}+${tokensOut}`);
+    // Fields shared by every result shape below.
+    const base = { latencyMs, tokensIn, tokensOut, model: config.localVisionModel };
+    // Attempt 1: the whole reply is JSON, optionally wrapped in a ``` code fence.
+    try {
+      const jsonStr = content.replace(/^```\w*\s*\n?/, "").replace(/\n?\s*```\s*$/, "").trim();
+      const parsed = JSON.parse(jsonStr);
+      return {
+        hud: parsed.hud || "\u2014",
+        digest: parsed.digest || "\u2014",
+        record: parseRecord(parsed),
+        task: parseTask(parsed),
+        ...base,
+        parsedOk: true,
+      };
+    } catch {
+      // Attempt 2: a JSON object embedded in surrounding text; accepted only if it has a hud.
+      const match = content.match(/\{[\s\S]*\}/);
+      if (match) {
+        try {
+          const parsed = JSON.parse(match[0]);
+          if (parsed.hud) {
+            return {
+              hud: parsed.hud,
+              digest: parsed.digest || "\u2014",
+              record: parseRecord(parsed),
+              task: parseTask(parsed),
+              ...base,
+              parsedOk: true,
+            };
+          }
+        } catch {
+          // Embedded candidate was not valid JSON; fall through to the raw-text result.
+        }
+      }
+      // Last resort: surface the raw reply, truncated for the HUD.
+      return {
+        hud: content.slice(0, 160) || "\u2014",
+        digest: content || "\u2014",
+        ...base,
+        parsedOk: false,
+      };
+    }
+  } catch (err) {
+    if (timedOut) {
+      throw new Error(`Ollama timed out after ${config.localVisionTimeout}ms`);
+    }
+    throw err;
+  } finally {
+    clearTimeout(timeout);
+  }
+}

package/sinain-core/src/audio/capture-spawner-macos.ts CHANGED Viewed

@@ -2,6 +2,7 @@ import os from "node:os";
 import { spawn, type ChildProcess } from "node:child_process";
 import { resolve, dirname } from "node:path";
 import { fileURLToPath } from "node:url";
+import { existsSync } from "node:fs";
 import type { AudioPipelineConfig, AudioSourceTag } from "../types.js";
 import type { CaptureSpawner } from "./capture-spawner.js";
 import { log } from "../log.js";
@@ -16,7 +17,18 @@ const TAG = "audio";
  */
 export class MacOSCaptureSpawner implements CaptureSpawner {
   spawn(config: AudioPipelineConfig, source: AudioSourceTag): ChildProcess {
-    const binaryPath = resolve(__dirname, "..", "..", "..", "tools", "sck-capture", "sck-capture");
+    // Check ~/.sinain/sck-capture/ first (npx install), then dev path
+    const homeBinary = resolve(os.homedir(), ".sinain", "sck-capture", "sck-capture");
+    const devBinary = resolve(__dirname, "..", "..", "..", "tools", "sck-capture", "sck-capture");
+    const binaryPath = existsSync(homeBinary) ? homeBinary : devBinary;
+    if (!existsSync(binaryPath)) {
+      throw new Error(
+        `sck-capture binary not found at ${binaryPath}. ` +
+        `Run: npx @geravant/sinain setup-sck-capture`
+      );
+    }
     const args = [
       "--sample-rate", String(config.sampleRate),
       "--channels", String(config.channels),

package/sinain-core/src/config.ts CHANGED Viewed

@@ -63,6 +63,13 @@ function boolEnv(key: string, fallback: boolean): boolean {
   return v === "true";
 }
+/**
+ * Read an environment variable, keeping a defined-but-empty value ("") as-is.
+ * Lookup order: `key`, then `fallbackKey` (when given), then `defaultVal`.
+ */
+function envAllowEmpty(key: string, fallbackKey?: string, defaultVal = ""): string {
+  const primary = process.env[key];
+  if (primary !== undefined) return primary;
+  const secondary = fallbackKey ? process.env[fallbackKey] : undefined;
+  return secondary !== undefined ? secondary : defaultVal;
+}
 function resolvePath(p: string): string {
   if (process.platform === "win32") {
     // Expand %APPDATA%, %USERPROFILE%, %TEMP% etc.
@@ -172,6 +179,10 @@ export function loadConfig(): CoreConfig {
     model: env("AGENT_MODEL", "google/gemini-2.5-flash-lite"),
     visionModel: env("AGENT_VISION_MODEL", "google/gemini-2.5-flash"),
     visionEnabled: boolEnv("AGENT_VISION_ENABLED", true),
+    localVisionEnabled: boolEnv("LOCAL_VISION_ENABLED", false),
+    localVisionModel: env("LOCAL_VISION_MODEL", "llava"),
+    localVisionUrl: env("LOCAL_VISION_URL", "http://localhost:11434"),
+    localVisionTimeout: intEnv("LOCAL_VISION_TIMEOUT", 10000),
     openrouterApiKey: env("OPENROUTER_API_KEY", ""),
     maxTokens: intEnv("AGENT_MAX_TOKENS", 800),
     temperature: floatEnv("AGENT_TEMPERATURE", 0.3),
@@ -194,9 +205,9 @@ export function loadConfig(): CoreConfig {
   };
   const openclawConfig: OpenClawConfig = {
-    gatewayWsUrl: env("OPENCLAW_WS_URL", env("OPENCLAW_GATEWAY_WS_URL", "ws://localhost:18789")),
+    gatewayWsUrl: envAllowEmpty("OPENCLAW_WS_URL", "OPENCLAW_GATEWAY_WS_URL", "ws://localhost:18789"),
     gatewayToken: env("OPENCLAW_WS_TOKEN", env("OPENCLAW_GATEWAY_TOKEN", "")),
-    hookUrl: env("OPENCLAW_HTTP_URL", env("OPENCLAW_HOOK_URL", "http://localhost:18789/hooks/agent")),
+    hookUrl: envAllowEmpty("OPENCLAW_HTTP_URL", "OPENCLAW_HOOK_URL", "http://localhost:18789/hooks/agent"),
     hookToken: env("OPENCLAW_HTTP_TOKEN", env("OPENCLAW_HOOK_TOKEN", "")),
     sessionKey: env("OPENCLAW_SESSION_KEY", "agent:main:sinain"),
     phase1TimeoutMs: intEnv("OPENCLAW_PHASE1_TIMEOUT_MS", 30_000),

package/sinain-core/src/escalation/openclaw-ws.ts CHANGED Viewed

@@ -161,6 +161,10 @@ export class OpenClawWsClient extends EventEmitter {
   /** Connect to the OpenClaw gateway. */
   connect(): void {
+    if (!this.config.gatewayWsUrl) {
+      log(TAG, "connect: no gateway URL configured — skipping");
+      return;
+    }
     if (!this.config.gatewayToken && !this.config.hookUrl) {
       log(TAG, "connect: no gateway token or hookUrl — skipping");
       return;

package/sinain-core/src/types.ts CHANGED Viewed

@@ -223,6 +223,10 @@ export interface AgentConfig {
   model: string;
   visionModel: string;
   visionEnabled: boolean;
+  localVisionEnabled: boolean;
+  localVisionModel: string;
+  localVisionUrl: string;
+  localVisionTimeout: number;
   openrouterApiKey: string;
   maxTokens: number;
   temperature: number;