@geravant/sinain 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +108 -0
- package/cli.js +16 -0
- package/launcher.js +91 -1
- package/pack-prepare.js +7 -2
- package/package.json +1 -1
- package/sense_client/__main__.py +25 -0
- package/sense_client/config.py +9 -0
- package/sense_client/gate.py +1 -0
- package/sense_client/ollama_vision.py +162 -0
- package/sense_client/requirements.txt +1 -0
- package/sense_client/vision.py +189 -0
- package/sinain-agent/run.sh +4 -0
- package/sinain-core/src/agent/analyzer.ts +125 -0
- package/sinain-core/src/audio/capture-spawner-macos.ts +13 -1
- package/sinain-core/src/config.ts +13 -2
- package/sinain-core/src/escalation/openclaw-ws.ts +4 -0
- package/sinain-core/src/types.ts +4 -0
package/.env.example
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# sinain configuration
|
|
2
|
+
# Location: ~/.sinain/.env (created by `sinain start` wizard or manually)
|
|
3
|
+
# The launcher reads this file on every start. sinain-core and sinain-agent
|
|
4
|
+
# inherit all vars via the launcher's process environment.
|
|
5
|
+
|
|
6
|
+
# ── Required ─────────────────────────────────────────────────────────────────
|
|
7
|
+
OPENROUTER_API_KEY= # get one free at https://openrouter.ai
|
|
8
|
+
|
|
9
|
+
# ── Privacy ──────────────────────────────────────────────────────────────────
|
|
10
|
+
PRIVACY_MODE=standard # off | standard | strict | paranoid
|
|
11
|
+
# standard: auto-redacts credentials before cloud APIs
|
|
12
|
+
# strict: only summaries leave your machine
|
|
13
|
+
# paranoid: almost nothing leaves your machine
|
|
14
|
+
|
|
15
|
+
# ── Agent ────────────────────────────────────────────────────────────────────
|
|
16
|
+
SINAIN_AGENT=claude # claude | codex | junie | goose | aider | <custom command>
|
|
17
|
+
# MCP agents (claude, codex, junie, goose) call sinain tools directly
|
|
18
|
+
# Pipe agents (aider, custom) receive escalation text on stdin
|
|
19
|
+
SINAIN_CORE_URL=http://localhost:9500
|
|
20
|
+
SINAIN_POLL_INTERVAL=5 # seconds between escalation polls
|
|
21
|
+
SINAIN_HEARTBEAT_INTERVAL=900 # seconds between heartbeat ticks (15 min)
|
|
22
|
+
SINAIN_WORKSPACE=~/.openclaw/workspace # knowledge files, curation scripts, playbook
|
|
23
|
+
|
|
24
|
+
# ── Escalation ───────────────────────────────────────────────────────────────
|
|
25
|
+
ESCALATION_MODE=rich # off | selective | focus | rich
|
|
26
|
+
# off: no escalation
|
|
27
|
+
# selective: score-based (errors, questions trigger it)
|
|
28
|
+
# focus: always escalate every tick
|
|
29
|
+
# rich: always escalate with maximum context
|
|
30
|
+
ESCALATION_COOLDOWN_MS=30000
|
|
31
|
+
# ESCALATION_TRANSPORT=auto # ws | http | auto
|
|
32
|
+
# auto = WS when gateway connected, HTTP fallback
|
|
33
|
+
# http = bare agent only (no gateway)
|
|
34
|
+
|
|
35
|
+
# ── Server ───────────────────────────────────────────────────────────────────
|
|
36
|
+
PORT=9500
|
|
37
|
+
|
|
38
|
+
# ── System Audio ─────────────────────────────────────────────────────────────
|
|
39
|
+
# Default: ScreenCaptureKit (zero-setup, macOS 13+). Fallback: ffmpeg + BlackHole.
|
|
40
|
+
# Windows: win-audio-capture.exe (WASAPI, auto-built by setup-windows.sh)
|
|
41
|
+
AUDIO_CAPTURE_CMD=screencapturekit # screencapturekit | sox | ffmpeg
|
|
42
|
+
AUDIO_DEVICE=BlackHole 2ch # macOS audio device (only used by sox/ffmpeg fallback)
|
|
43
|
+
AUDIO_SAMPLE_RATE=16000
|
|
44
|
+
AUDIO_CHUNK_MS=5000
|
|
45
|
+
AUDIO_VAD_ENABLED=true
|
|
46
|
+
AUDIO_VAD_THRESHOLD=0.003
|
|
47
|
+
AUDIO_AUTO_START=true
|
|
48
|
+
AUDIO_GAIN_DB=20
|
|
49
|
+
|
|
50
|
+
# ── Microphone (opt-in) ─────────────────────────────────────────────────────
|
|
51
|
+
MIC_ENABLED=false # set true to capture user's microphone
|
|
52
|
+
MIC_DEVICE=default # "default" = system mic
|
|
53
|
+
MIC_CAPTURE_CMD=sox # sox or ffmpeg
|
|
54
|
+
MIC_SAMPLE_RATE=16000
|
|
55
|
+
MIC_CHUNK_MS=5000
|
|
56
|
+
MIC_VAD_ENABLED=true
|
|
57
|
+
MIC_VAD_THRESHOLD=0.008 # higher threshold (ambient noise)
|
|
58
|
+
MIC_AUTO_START=false
|
|
59
|
+
MIC_GAIN_DB=0
|
|
60
|
+
|
|
61
|
+
# ── Transcription ────────────────────────────────────────────────────────────
|
|
62
|
+
TRANSCRIPTION_BACKEND=openrouter # openrouter | local (local = whisper.cpp on-device)
|
|
63
|
+
TRANSCRIPTION_MODEL=google/gemini-2.5-flash
|
|
64
|
+
TRANSCRIPTION_LANGUAGE=en-US
|
|
65
|
+
|
|
66
|
+
# ── Local Transcription (only when TRANSCRIPTION_BACKEND=local) ──────────────
|
|
67
|
+
# Install: brew install whisper-cpp
|
|
68
|
+
# Models: https://huggingface.co/ggerganov/whisper.cpp/tree/main
|
|
69
|
+
# LOCAL_WHISPER_BIN=whisper-cli
|
|
70
|
+
# LOCAL_WHISPER_MODEL=~/models/ggml-large-v3-turbo.bin
|
|
71
|
+
# LOCAL_WHISPER_TIMEOUT_MS=15000
|
|
72
|
+
|
|
73
|
+
# ── Local Agent Loop ─────────────────────────────────────────────────────────
|
|
74
|
+
AGENT_ENABLED=true
|
|
75
|
+
AGENT_MODEL=google/gemini-2.5-flash-lite
|
|
76
|
+
# AGENT_FALLBACK_MODELS=google/gemini-2.5-flash,anthropic/claude-3.5-haiku
|
|
77
|
+
AGENT_MAX_TOKENS=300
|
|
78
|
+
AGENT_TEMPERATURE=0.3
|
|
79
|
+
AGENT_PUSH_TO_FEED=true
|
|
80
|
+
AGENT_DEBOUNCE_MS=3000
|
|
81
|
+
AGENT_MAX_INTERVAL_MS=30000
|
|
82
|
+
AGENT_COOLDOWN_MS=10000
|
|
83
|
+
AGENT_MAX_AGE_MS=120000 # context window lookback (2 min)
|
|
84
|
+
|
|
85
|
+
# ── OpenClaw / NemoClaw Gateway ──────────────────────────────────────────────
|
|
86
|
+
# Leave blank to run without a gateway (bare agent mode).
|
|
87
|
+
# The setup wizard fills these in if you have an OpenClaw gateway.
|
|
88
|
+
OPENCLAW_WS_URL=ws://localhost:18789
|
|
89
|
+
OPENCLAW_WS_TOKEN= # 48-char hex — from gateway config
|
|
90
|
+
OPENCLAW_HTTP_URL=http://localhost:18789/hooks/agent
|
|
91
|
+
OPENCLAW_HTTP_TOKEN= # same token as WS_TOKEN
|
|
92
|
+
OPENCLAW_SESSION_KEY=agent:main:sinain
|
|
93
|
+
# OPENCLAW_PHASE1_TIMEOUT_MS=10000
|
|
94
|
+
# OPENCLAW_PHASE2_TIMEOUT_MS=120000
|
|
95
|
+
# OPENCLAW_QUEUE_TTL_MS=300000
|
|
96
|
+
# OPENCLAW_QUEUE_MAX_SIZE=10
|
|
97
|
+
# OPENCLAW_PING_INTERVAL_MS=30000
|
|
98
|
+
|
|
99
|
+
# ── SITUATION.md ─────────────────────────────────────────────────────────────
|
|
100
|
+
SITUATION_MD_PATH=~/.openclaw/workspace/SITUATION.md
|
|
101
|
+
# OPENCLAW_WORKSPACE_DIR=~/.openclaw/workspace
|
|
102
|
+
|
|
103
|
+
# ── Debug ────────────────────────────────────────────────────────────────────
|
|
104
|
+
# DEBUG=true # verbose logging (every tick, every chunk)
|
|
105
|
+
|
|
106
|
+
# ── Tracing ──────────────────────────────────────────────────────────────────
|
|
107
|
+
TRACE_ENABLED=true
|
|
108
|
+
TRACE_DIR=~/.sinain-core/traces
|
package/cli.js
CHANGED
|
@@ -29,6 +29,17 @@ switch (cmd) {
|
|
|
29
29
|
await import("./setup-overlay.js");
|
|
30
30
|
break;
|
|
31
31
|
|
|
32
|
+
case "setup-sck-capture": {
|
|
33
|
+
const { downloadBinary } = await import("./setup-sck-capture.js");
|
|
34
|
+
if (os.platform() === "win32") {
|
|
35
|
+
console.log("sck-capture is macOS-only (Windows uses win-audio-capture.exe)");
|
|
36
|
+
} else {
|
|
37
|
+
const forceUpdate = process.argv.includes("--update");
|
|
38
|
+
await downloadBinary({ forceUpdate });
|
|
39
|
+
}
|
|
40
|
+
break;
|
|
41
|
+
}
|
|
42
|
+
|
|
32
43
|
case "install":
|
|
33
44
|
// --if-openclaw: only run if OpenClaw is installed (for postinstall)
|
|
34
45
|
if (process.argv.includes("--if-openclaw")) {
|
|
@@ -156,6 +167,10 @@ async function runSetupWizard() {
|
|
|
156
167
|
}
|
|
157
168
|
vars.OPENCLAW_HTTP_URL = vars.OPENCLAW_WS_URL.replace(/^ws/, "http") + "/hooks/agent";
|
|
158
169
|
vars.OPENCLAW_SESSION_KEY = "agent:main:sinain";
|
|
170
|
+
} else {
|
|
171
|
+
// No gateway — disable WS connection attempts
|
|
172
|
+
vars.OPENCLAW_WS_URL = "";
|
|
173
|
+
vars.OPENCLAW_HTTP_URL = "";
|
|
159
174
|
}
|
|
160
175
|
|
|
161
176
|
vars.SINAIN_POLL_INTERVAL = "5";
|
|
@@ -346,6 +361,7 @@ Usage:
|
|
|
346
361
|
sinain status Check what's running
|
|
347
362
|
sinain setup Run interactive setup wizard (~/.sinain/.env)
|
|
348
363
|
sinain setup-overlay Download pre-built overlay app
|
|
364
|
+
sinain setup-sck-capture Download sck-capture audio binary (macOS)
|
|
349
365
|
sinain install Install OpenClaw plugin (server-side)
|
|
350
366
|
|
|
351
367
|
Start options:
|
package/launcher.js
CHANGED
|
@@ -78,6 +78,25 @@ async function main() {
|
|
|
78
78
|
// Install deps if needed
|
|
79
79
|
await installDeps();
|
|
80
80
|
|
|
81
|
+
// Auto-download sck-capture binary if missing (macOS only)
|
|
82
|
+
if (!IS_WINDOWS) {
|
|
83
|
+
const sckBinary = path.join(SINAIN_DIR, "sck-capture", "sck-capture");
|
|
84
|
+
if (!fs.existsSync(sckBinary)) {
|
|
85
|
+
log("sck-capture not found — downloading from GitHub Releases...");
|
|
86
|
+
try {
|
|
87
|
+
const { downloadBinary } = await import("./setup-sck-capture.js");
|
|
88
|
+
const success = await downloadBinary({ silent: true });
|
|
89
|
+
if (success) {
|
|
90
|
+
ok("sck-capture downloaded");
|
|
91
|
+
} else {
|
|
92
|
+
warn("sck-capture download failed — audio capture may not work");
|
|
93
|
+
}
|
|
94
|
+
} catch (e) {
|
|
95
|
+
warn(`sck-capture auto-download failed: ${e.message}`);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
81
100
|
// Start core
|
|
82
101
|
log("Starting sinain-core...");
|
|
83
102
|
const coreDir = path.join(PKG_DIR, "sinain-core");
|
|
@@ -108,7 +127,7 @@ async function main() {
|
|
|
108
127
|
const scDir = path.join(PKG_DIR, "sense_client");
|
|
109
128
|
// Check if key package is importable to skip pip
|
|
110
129
|
try {
|
|
111
|
-
execSync('python3 -c "import
|
|
130
|
+
execSync('python3 -c "import PIL; import skimage"', { stdio: "pipe" });
|
|
112
131
|
} catch {
|
|
113
132
|
log("Installing sense_client Python dependencies...");
|
|
114
133
|
try {
|
|
@@ -289,6 +308,27 @@ async function preflight() {
|
|
|
289
308
|
} else {
|
|
290
309
|
ok("port 9500 free");
|
|
291
310
|
}
|
|
311
|
+
|
|
312
|
+
// Ollama (if local vision enabled)
|
|
313
|
+
if (process.env.LOCAL_VISION_ENABLED === "true") {
|
|
314
|
+
try {
|
|
315
|
+
const resp = await fetch("http://localhost:11434/api/tags", { signal: AbortSignal.timeout(2000) });
|
|
316
|
+
if (resp.ok) {
|
|
317
|
+
ok("ollama server running");
|
|
318
|
+
} else {
|
|
319
|
+
warn("ollama server not responding — local vision will be unavailable");
|
|
320
|
+
}
|
|
321
|
+
} catch {
|
|
322
|
+
// Try to start Ollama in background
|
|
323
|
+
try {
|
|
324
|
+
const { spawn: spawnProc } = await import("child_process");
|
|
325
|
+
spawnProc("ollama", ["serve"], { detached: true, stdio: "ignore" }).unref();
|
|
326
|
+
ok("ollama server started in background");
|
|
327
|
+
} catch {
|
|
328
|
+
warn("ollama not running and could not auto-start — local vision disabled");
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
}
|
|
292
332
|
}
|
|
293
333
|
|
|
294
334
|
// ── Setup wizard ─────────────────────────────────────────────────────────────
|
|
@@ -376,6 +416,52 @@ async function setupWizard(envPath) {
|
|
|
376
416
|
const agentChoice = await ask(` Agent? [${BOLD}claude${RESET}/codex/goose/junie/aider]: `);
|
|
377
417
|
vars.SINAIN_AGENT = agentChoice.trim().toLowerCase() || "claude";
|
|
378
418
|
|
|
419
|
+
// 3b. Local vision (Ollama)
|
|
420
|
+
const IS_MACOS = os.platform() === "darwin";
|
|
421
|
+
const hasOllama = commandExists("ollama");
|
|
422
|
+
if (hasOllama) {
|
|
423
|
+
const useVision = await ask(` Enable local vision AI? [Y/n] (Ollama — screen understanding without cloud API): `);
|
|
424
|
+
if (!useVision.trim() || useVision.trim().toLowerCase() === "y") {
|
|
425
|
+
vars.LOCAL_VISION_ENABLED = "true";
|
|
426
|
+
try {
|
|
427
|
+
const models = execSync("ollama list 2>/dev/null", { encoding: "utf-8" });
|
|
428
|
+
if (!models.includes("llava")) {
|
|
429
|
+
const pull = await ask(` Pull llava vision model (~4GB)? [Y/n]: `);
|
|
430
|
+
if (!pull.trim() || pull.trim().toLowerCase() === "y") {
|
|
431
|
+
console.log(` ${DIM}Pulling llava...${RESET}`);
|
|
432
|
+
execSync("ollama pull llava", { stdio: "inherit" });
|
|
433
|
+
ok("llava model pulled");
|
|
434
|
+
}
|
|
435
|
+
} else {
|
|
436
|
+
ok("llava model already available");
|
|
437
|
+
}
|
|
438
|
+
} catch {
|
|
439
|
+
warn("Could not check Ollama models");
|
|
440
|
+
}
|
|
441
|
+
vars.LOCAL_VISION_MODEL = "llava";
|
|
442
|
+
}
|
|
443
|
+
} else {
|
|
444
|
+
const installOllama = await ask(` Install Ollama for local vision AI? [y/N]: `);
|
|
445
|
+
if (installOllama.trim().toLowerCase() === "y") {
|
|
446
|
+
try {
|
|
447
|
+
if (IS_MACOS) {
|
|
448
|
+
console.log(` ${DIM}Installing Ollama via Homebrew...${RESET}`);
|
|
449
|
+
execSync("brew install ollama", { stdio: "inherit" });
|
|
450
|
+
} else {
|
|
451
|
+
console.log(` ${DIM}Installing Ollama...${RESET}`);
|
|
452
|
+
execSync("curl -fsSL https://ollama.com/install.sh | sh", { stdio: "inherit" });
|
|
453
|
+
}
|
|
454
|
+
console.log(` ${DIM}Pulling llava vision model...${RESET}`);
|
|
455
|
+
execSync("ollama pull llava", { stdio: "inherit" });
|
|
456
|
+
vars.LOCAL_VISION_ENABLED = "true";
|
|
457
|
+
vars.LOCAL_VISION_MODEL = "llava";
|
|
458
|
+
ok("Ollama + llava installed");
|
|
459
|
+
} catch {
|
|
460
|
+
warn("Ollama installation failed — local vision disabled");
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
|
|
379
465
|
// 4. Escalation mode
|
|
380
466
|
console.log();
|
|
381
467
|
console.log(` ${DIM}Escalation modes:${RESET}`);
|
|
@@ -402,6 +488,10 @@ async function setupWizard(envPath) {
|
|
|
402
488
|
const httpBase = vars.OPENCLAW_WS_URL.replace(/^ws/, "http");
|
|
403
489
|
vars.OPENCLAW_HTTP_URL = `${httpBase}/hooks/agent`;
|
|
404
490
|
vars.OPENCLAW_SESSION_KEY = "agent:main:sinain";
|
|
491
|
+
} else {
|
|
492
|
+
// No gateway — disable WS connection attempts
|
|
493
|
+
vars.OPENCLAW_WS_URL = "";
|
|
494
|
+
vars.OPENCLAW_HTTP_URL = "";
|
|
405
495
|
}
|
|
406
496
|
|
|
407
497
|
// 6. Agent-specific defaults
|
package/pack-prepare.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
import fs from "fs";
|
|
6
6
|
import path from "path";
|
|
7
7
|
|
|
8
|
-
const LINKS = ["sinain-core", "sinain-mcp-server", "sinain-agent", "sense_client"];
|
|
8
|
+
const LINKS = ["sinain-core", "sinain-mcp-server", "sinain-agent", "sense_client", ".env.example"];
|
|
9
9
|
const PKG_DIR = path.dirname(new URL(import.meta.url).pathname);
|
|
10
10
|
|
|
11
11
|
const action = process.argv[2]; // "pre" or "post"
|
|
@@ -18,7 +18,12 @@ if (action === "pre") {
|
|
|
18
18
|
if (!stat.isSymbolicLink()) continue;
|
|
19
19
|
const target = fs.realpathSync(linkPath);
|
|
20
20
|
fs.unlinkSync(linkPath);
|
|
21
|
-
|
|
21
|
+
const targetStat = fs.statSync(target);
|
|
22
|
+
if (targetStat.isDirectory()) {
|
|
23
|
+
copyDir(target, linkPath);
|
|
24
|
+
} else {
|
|
25
|
+
fs.copyFileSync(target, linkPath);
|
|
26
|
+
}
|
|
22
27
|
}
|
|
23
28
|
console.log("prepack: symlinks → copies");
|
|
24
29
|
} else if (action === "post") {
|
package/package.json
CHANGED
package/sense_client/__main__.py
CHANGED
|
@@ -33,6 +33,7 @@ from .sender import SenseSender, package_full_frame, package_roi
|
|
|
33
33
|
from .app_detector import AppDetector
|
|
34
34
|
from .config import load_config
|
|
35
35
|
from .privacy import apply_privacy
|
|
36
|
+
from .vision import create_vision
|
|
36
37
|
|
|
37
38
|
if sys.platform == "win32":
|
|
38
39
|
CONTROL_FILE = os.path.join(os.environ.get("TEMP", "C:\\Temp"), "sinain-sense-control.json")
|
|
@@ -128,6 +129,17 @@ def main():
|
|
|
128
129
|
app_detector = AppDetector()
|
|
129
130
|
ocr_pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)
|
|
130
131
|
|
|
132
|
+
# Vision provider — routes to Ollama (local) or OpenRouter (cloud) based on config/privacy
|
|
133
|
+
vision_cfg = config.get("vision", {})
|
|
134
|
+
vision_provider = create_vision(config)
|
|
135
|
+
vision_throttle_s = vision_cfg.get("throttleSeconds", 5)
|
|
136
|
+
last_vision_time = 0.0
|
|
137
|
+
vision_prompt = vision_cfg.get("prompt", "")
|
|
138
|
+
if vision_provider:
|
|
139
|
+
log(f" vision: {vision_provider.name}")
|
|
140
|
+
else:
|
|
141
|
+
log(" vision: disabled (no provider available)")
|
|
142
|
+
|
|
131
143
|
# Adaptive SSIM threshold state
|
|
132
144
|
ssim_stable_threshold = config["detection"]["ssimThreshold"] # 0.92
|
|
133
145
|
ssim_sensitive_threshold = 0.85
|
|
@@ -343,6 +355,19 @@ def main():
|
|
|
343
355
|
title=title, subtitle=subtitle, facts=facts,
|
|
344
356
|
)
|
|
345
357
|
|
|
358
|
+
# Vision scene analysis (throttled, non-blocking on failure)
|
|
359
|
+
if vision_provider and time.time() - last_vision_time >= vision_throttle_s:
|
|
360
|
+
try:
|
|
361
|
+
from PIL import Image as PILImage
|
|
362
|
+
pil_frame = PILImage.fromarray(use_frame) if isinstance(use_frame, np.ndarray) else use_frame
|
|
363
|
+
scene = vision_provider.describe(pil_frame, prompt=vision_prompt or None)
|
|
364
|
+
if scene:
|
|
365
|
+
event.observation.scene = scene
|
|
366
|
+
last_vision_time = time.time()
|
|
367
|
+
log(f"vision: {scene[:80]}...")
|
|
368
|
+
except Exception as e:
|
|
369
|
+
log(f"vision error: {e}")
|
|
370
|
+
|
|
346
371
|
# Send small thumbnail for ALL event types (agent uses vision)
|
|
347
372
|
# Privacy matrix: gate image sending based on PRIVACY_IMAGES_OPENROUTER
|
|
348
373
|
if _privacy_images_openrouter == "none":
|
package/sense_client/config.py
CHANGED
|
@@ -38,6 +38,15 @@ DEFAULTS = {
|
|
|
38
38
|
"sendThumbnails": True,
|
|
39
39
|
"maxImageKB": 500,
|
|
40
40
|
},
|
|
41
|
+
"vision": {
|
|
42
|
+
"enabled": False,
|
|
43
|
+
"backend": "ollama",
|
|
44
|
+
"model": "llava",
|
|
45
|
+
"ollamaUrl": "http://localhost:11434",
|
|
46
|
+
"timeout": 10.0,
|
|
47
|
+
"throttleSeconds": 5,
|
|
48
|
+
"prompt": "Describe what's on this screen: the application, UI state, any errors or notable content. Be concise (2-3 sentences).",
|
|
49
|
+
},
|
|
41
50
|
"optimization": {
|
|
42
51
|
"backpressure": False,
|
|
43
52
|
"textDedup": False,
|
package/sense_client/gate.py
CHANGED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Ollama Vision — local multimodal inference for screen scene understanding.
|
|
2
|
+
|
|
3
|
+
Provides a thin client for Ollama's vision models (llava, llama3.2-vision,
|
|
4
|
+
moondream, nanollava). Used by sense_client for scene descriptions and
|
|
5
|
+
optionally by sinain-core's agent analyzer for local vision analysis.
|
|
6
|
+
|
|
7
|
+
Falls back gracefully when Ollama is unavailable — never crashes the pipeline.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import io
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import time
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
from PIL import Image
|
|
19
|
+
except ImportError:
|
|
20
|
+
Image = None # type: ignore
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("sinain.vision")
|
|
23
|
+
|
|
24
|
+
DEFAULT_PROMPT = (
|
|
25
|
+
"Describe what's on this screen: the application, UI state, any errors "
|
|
26
|
+
"or notable content. Be concise (2-3 sentences)."
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class OllamaVision:
|
|
31
|
+
"""Local vision inference via Ollama HTTP API.
|
|
32
|
+
|
|
33
|
+
Uses the /api/chat endpoint with image support. Auto-encodes PIL images
|
|
34
|
+
to base64 JPEG. Returns None on any failure (timeout, connection error,
|
|
35
|
+
model not loaded).
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def __init__(
|
|
39
|
+
self,
|
|
40
|
+
model: str = "llava",
|
|
41
|
+
base_url: str = "http://localhost:11434",
|
|
42
|
+
timeout: float = 10.0,
|
|
43
|
+
max_tokens: int = 200,
|
|
44
|
+
):
|
|
45
|
+
self.model = model
|
|
46
|
+
self.base_url = base_url.rstrip("/")
|
|
47
|
+
self.timeout = timeout
|
|
48
|
+
self.max_tokens = max_tokens
|
|
49
|
+
self._available: Optional[bool] = None
|
|
50
|
+
self._last_check: float = 0
|
|
51
|
+
self._check_interval = 30.0 # re-check availability every 30s
|
|
52
|
+
|
|
53
|
+
def is_available(self) -> bool:
|
|
54
|
+
"""Check if Ollama server is reachable. Caches result for 30s."""
|
|
55
|
+
now = time.time()
|
|
56
|
+
if self._available is not None and now - self._last_check < self._check_interval:
|
|
57
|
+
return self._available
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
import urllib.request
|
|
61
|
+
req = urllib.request.Request(f"{self.base_url}/api/tags", method="GET")
|
|
62
|
+
with urllib.request.urlopen(req, timeout=2) as resp:
|
|
63
|
+
self._available = resp.status == 200
|
|
64
|
+
except Exception:
|
|
65
|
+
self._available = False
|
|
66
|
+
|
|
67
|
+
self._last_check = now
|
|
68
|
+
return self._available
|
|
69
|
+
|
|
70
|
+
def describe(
|
|
71
|
+
self,
|
|
72
|
+
image: "Image.Image",
|
|
73
|
+
prompt: Optional[str] = None,
|
|
74
|
+
) -> Optional[str]:
|
|
75
|
+
"""Describe image content using the local vision model.
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
image: PIL Image to analyze
|
|
79
|
+
prompt: Custom prompt (defaults to screen description prompt)
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
Text description or None on failure/timeout.
|
|
83
|
+
"""
|
|
84
|
+
if not self.is_available():
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
try:
|
|
88
|
+
# Encode image to base64 JPEG
|
|
89
|
+
img_b64 = self._encode_image(image)
|
|
90
|
+
if not img_b64:
|
|
91
|
+
return None
|
|
92
|
+
|
|
93
|
+
# Build Ollama /api/chat request
|
|
94
|
+
payload = {
|
|
95
|
+
"model": self.model,
|
|
96
|
+
"messages": [
|
|
97
|
+
{
|
|
98
|
+
"role": "user",
|
|
99
|
+
"content": prompt or DEFAULT_PROMPT,
|
|
100
|
+
"images": [img_b64],
|
|
101
|
+
}
|
|
102
|
+
],
|
|
103
|
+
"stream": False,
|
|
104
|
+
"options": {
|
|
105
|
+
"num_predict": self.max_tokens,
|
|
106
|
+
},
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
import urllib.request
|
|
110
|
+
data = json.dumps(payload).encode("utf-8")
|
|
111
|
+
req = urllib.request.Request(
|
|
112
|
+
f"{self.base_url}/api/chat",
|
|
113
|
+
data=data,
|
|
114
|
+
headers={"Content-Type": "application/json"},
|
|
115
|
+
method="POST",
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
t0 = time.time()
|
|
119
|
+
with urllib.request.urlopen(req, timeout=self.timeout) as resp:
|
|
120
|
+
result = json.loads(resp.read().decode("utf-8"))
|
|
121
|
+
|
|
122
|
+
content = result.get("message", {}).get("content", "").strip()
|
|
123
|
+
latency_ms = int((time.time() - t0) * 1000)
|
|
124
|
+
logger.debug(
|
|
125
|
+
"ollama vision: model=%s latency=%dms tokens=%s",
|
|
126
|
+
self.model,
|
|
127
|
+
latency_ms,
|
|
128
|
+
result.get("eval_count", "?"),
|
|
129
|
+
)
|
|
130
|
+
return content if content else None
|
|
131
|
+
|
|
132
|
+
except Exception as e:
|
|
133
|
+
logger.debug("ollama vision failed: %s", e)
|
|
134
|
+
# Mark unavailable on connection errors so we don't retry every frame
|
|
135
|
+
if "Connection refused" in str(e) or "timed out" in str(e):
|
|
136
|
+
self._available = False
|
|
137
|
+
self._last_check = time.time()
|
|
138
|
+
return None
|
|
139
|
+
|
|
140
|
+
def _encode_image(self, image: "Image.Image", max_dim: int = 512, quality: int = 80) -> Optional[str]:
|
|
141
|
+
"""Encode PIL Image to base64 JPEG string for Ollama."""
|
|
142
|
+
try:
|
|
143
|
+
# Resize if too large
|
|
144
|
+
w, h = image.size
|
|
145
|
+
if max(w, h) > max_dim:
|
|
146
|
+
scale = max_dim / max(w, h)
|
|
147
|
+
image = image.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
|
|
148
|
+
|
|
149
|
+
# Convert RGBA to RGB
|
|
150
|
+
if image.mode == "RGBA":
|
|
151
|
+
bg = Image.new("RGB", image.size, (255, 255, 255))
|
|
152
|
+
bg.paste(image, mask=image.split()[3])
|
|
153
|
+
image = bg
|
|
154
|
+
elif image.mode != "RGB":
|
|
155
|
+
image = image.convert("RGB")
|
|
156
|
+
|
|
157
|
+
buf = io.BytesIO()
|
|
158
|
+
image.save(buf, format="JPEG", quality=quality)
|
|
159
|
+
return base64.b64encode(buf.getvalue()).decode("ascii")
|
|
160
|
+
except Exception as e:
|
|
161
|
+
logger.debug("image encoding failed: %s", e)
|
|
162
|
+
return None
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Vision Provider — abstract interface for local and cloud image analysis.
|
|
2
|
+
|
|
3
|
+
Routes vision requests to either Ollama (local) or OpenRouter (cloud) based
|
|
4
|
+
on configuration, privacy mode, and API key availability.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from .vision import create_vision
|
|
8
|
+
provider = create_vision(config)
|
|
9
|
+
if provider:
|
|
10
|
+
scene = provider.describe(image, "What's on this screen?")
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import base64
|
|
16
|
+
import io
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import time
|
|
21
|
+
from abc import ABC, abstractmethod
|
|
22
|
+
from typing import TYPE_CHECKING, Optional
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from PIL import Image
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger("sinain.vision")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class VisionProvider(ABC):
|
|
31
|
+
"""Abstract base for vision inference backends."""
|
|
32
|
+
|
|
33
|
+
name: str = "unknown"
|
|
34
|
+
|
|
35
|
+
@abstractmethod
|
|
36
|
+
def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
|
|
37
|
+
"""Describe image content. Returns None on failure."""
|
|
38
|
+
...
|
|
39
|
+
|
|
40
|
+
@abstractmethod
|
|
41
|
+
def is_available(self) -> bool:
|
|
42
|
+
"""Check if the backend is reachable."""
|
|
43
|
+
...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class OllamaVisionProvider(VisionProvider):
|
|
47
|
+
"""Local vision via Ollama HTTP API."""
|
|
48
|
+
|
|
49
|
+
def __init__(self, model: str = "llava", base_url: str = "http://localhost:11434",
|
|
50
|
+
timeout: float = 10.0, max_tokens: int = 200):
|
|
51
|
+
from .ollama_vision import OllamaVision
|
|
52
|
+
self._client = OllamaVision(model=model, base_url=base_url,
|
|
53
|
+
timeout=timeout, max_tokens=max_tokens)
|
|
54
|
+
self.name = f"ollama ({model})"
|
|
55
|
+
|
|
56
|
+
def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
|
|
57
|
+
return self._client.describe(image, prompt)
|
|
58
|
+
|
|
59
|
+
def is_available(self) -> bool:
|
|
60
|
+
return self._client.is_available()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class OpenRouterVisionProvider(VisionProvider):
|
|
64
|
+
"""Cloud vision via OpenRouter API."""
|
|
65
|
+
|
|
66
|
+
name = "openrouter"
|
|
67
|
+
|
|
68
|
+
def __init__(self, api_key: str, model: str = "google/gemini-2.5-flash-lite",
|
|
69
|
+
timeout: float = 15.0, max_tokens: int = 200):
|
|
70
|
+
self._api_key = api_key
|
|
71
|
+
self._model = model
|
|
72
|
+
self._timeout = timeout
|
|
73
|
+
self._max_tokens = max_tokens
|
|
74
|
+
self.name = f"openrouter ({model})"
|
|
75
|
+
|
|
76
|
+
def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
|
|
77
|
+
if not self._api_key:
|
|
78
|
+
return None
|
|
79
|
+
|
|
80
|
+
try:
|
|
81
|
+
import requests
|
|
82
|
+
|
|
83
|
+
# Encode image
|
|
84
|
+
img_b64 = self._encode(image)
|
|
85
|
+
if not img_b64:
|
|
86
|
+
return None
|
|
87
|
+
|
|
88
|
+
prompt_text = prompt or "Describe what's on this screen concisely (2-3 sentences)."
|
|
89
|
+
|
|
90
|
+
resp = requests.post(
|
|
91
|
+
"https://openrouter.ai/api/v1/chat/completions",
|
|
92
|
+
headers={
|
|
93
|
+
"Authorization": f"Bearer {self._api_key}",
|
|
94
|
+
"Content-Type": "application/json",
|
|
95
|
+
},
|
|
96
|
+
json={
|
|
97
|
+
"model": self._model,
|
|
98
|
+
"max_tokens": self._max_tokens,
|
|
99
|
+
"messages": [{
|
|
100
|
+
"role": "user",
|
|
101
|
+
"content": [
|
|
102
|
+
{"type": "text", "text": prompt_text},
|
|
103
|
+
{"type": "image_url", "image_url": {
|
|
104
|
+
"url": f"data:image/jpeg;base64,{img_b64}",
|
|
105
|
+
"detail": "low",
|
|
106
|
+
}},
|
|
107
|
+
],
|
|
108
|
+
}],
|
|
109
|
+
},
|
|
110
|
+
timeout=self._timeout,
|
|
111
|
+
)
|
|
112
|
+
resp.raise_for_status()
|
|
113
|
+
data = resp.json()
|
|
114
|
+
content = data["choices"][0]["message"]["content"].strip()
|
|
115
|
+
logger.debug("openrouter vision: model=%s tokens=%s",
|
|
116
|
+
self._model, data.get("usage", {}).get("total_tokens", "?"))
|
|
117
|
+
return content if content else None
|
|
118
|
+
|
|
119
|
+
except Exception as e:
|
|
120
|
+
logger.debug("openrouter vision failed: %s", e)
|
|
121
|
+
return None
|
|
122
|
+
|
|
123
|
+
def is_available(self) -> bool:
|
|
124
|
+
return bool(self._api_key)
|
|
125
|
+
|
|
126
|
+
@staticmethod
|
|
127
|
+
def _encode(image: "Image.Image", max_dim: int = 512, quality: int = 80) -> Optional[str]:
|
|
128
|
+
try:
|
|
129
|
+
from PIL import Image as PILImage
|
|
130
|
+
|
|
131
|
+
w, h = image.size
|
|
132
|
+
if max(w, h) > max_dim:
|
|
133
|
+
scale = max_dim / max(w, h)
|
|
134
|
+
image = image.resize((int(w * scale), int(h * scale)), PILImage.LANCZOS)
|
|
135
|
+
|
|
136
|
+
if image.mode == "RGBA":
|
|
137
|
+
bg = PILImage.new("RGB", image.size, (255, 255, 255))
|
|
138
|
+
bg.paste(image, mask=image.split()[3])
|
|
139
|
+
image = bg
|
|
140
|
+
elif image.mode != "RGB":
|
|
141
|
+
image = image.convert("RGB")
|
|
142
|
+
|
|
143
|
+
buf = io.BytesIO()
|
|
144
|
+
image.save(buf, format="JPEG", quality=quality)
|
|
145
|
+
return base64.b64encode(buf.getvalue()).decode("ascii")
|
|
146
|
+
except Exception:
|
|
147
|
+
return None
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def create_vision(config: dict) -> Optional[VisionProvider]:
|
|
151
|
+
"""Factory: create the appropriate vision provider based on config and environment.
|
|
152
|
+
|
|
153
|
+
Priority:
|
|
154
|
+
1. Paranoid privacy or no API key → local only (Ollama)
|
|
155
|
+
2. LOCAL_VISION_ENABLED=true → local (Ollama)
|
|
156
|
+
3. API key available → cloud (OpenRouter)
|
|
157
|
+
4. Nothing available → None (vision disabled, OCR still works)
|
|
158
|
+
"""
|
|
159
|
+
privacy = os.environ.get("PRIVACY_MODE", "off")
|
|
160
|
+
api_key = os.environ.get("OPENROUTER_API_KEY", "")
|
|
161
|
+
vision_cfg = config.get("vision", {})
|
|
162
|
+
|
|
163
|
+
local_enabled = (
|
|
164
|
+
vision_cfg.get("enabled", False)
|
|
165
|
+
or os.environ.get("LOCAL_VISION_ENABLED", "").lower() == "true"
|
|
166
|
+
)
|
|
167
|
+
local_model = os.environ.get("LOCAL_VISION_MODEL", vision_cfg.get("model", "llava"))
|
|
168
|
+
local_url = vision_cfg.get("ollamaUrl", "http://localhost:11434")
|
|
169
|
+
local_timeout = vision_cfg.get("timeout", 10.0)
|
|
170
|
+
|
|
171
|
+
cloud_blocked = privacy in ("paranoid", "strict") or not api_key
|
|
172
|
+
|
|
173
|
+
# Local vision preferred when enabled or when cloud is blocked
|
|
174
|
+
if local_enabled:
|
|
175
|
+
provider = OllamaVisionProvider(
|
|
176
|
+
model=local_model, base_url=local_url, timeout=local_timeout,
|
|
177
|
+
)
|
|
178
|
+
if provider.is_available():
|
|
179
|
+
return provider
|
|
180
|
+
logger.info("Ollama not available, %s",
|
|
181
|
+
"vision disabled (cloud blocked)" if cloud_blocked else "falling back to OpenRouter")
|
|
182
|
+
if cloud_blocked:
|
|
183
|
+
return None
|
|
184
|
+
|
|
185
|
+
# Cloud vision (only if not blocked)
|
|
186
|
+
if not cloud_blocked:
|
|
187
|
+
return OpenRouterVisionProvider(api_key=api_key)
|
|
188
|
+
|
|
189
|
+
return None
|
package/sinain-agent/run.sh
CHANGED
|
@@ -11,6 +11,10 @@ if [ -f "$SCRIPT_DIR/.env" ]; then
|
|
|
11
11
|
[[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
|
|
12
12
|
key=$(echo "$key" | xargs) # trim whitespace
|
|
13
13
|
val=$(echo "$val" | xargs)
|
|
14
|
+
# Strip inline comments (e.g. "5 # seconds" → "5")
|
|
15
|
+
val="${val%%#*}"
|
|
16
|
+
val=$(echo "$val" | xargs) # re-trim after comment strip
|
|
17
|
+
[[ -z "$val" ]] && continue
|
|
14
18
|
# Only set if not already in environment
|
|
15
19
|
if [ -z "${!key+x}" ]; then
|
|
16
20
|
export "$key=$val"
|
|
@@ -5,6 +5,9 @@ import { levelFor, applyLevel } from "../privacy/index.js";
|
|
|
5
5
|
|
|
6
6
|
const TAG = "agent";
|
|
7
7
|
|
|
8
|
+
/** Guard: only one Ollama call at a time. A tick that arrives while a call is in flight skips Ollama (it is not queued and does not replace the running call). */
|
|
9
|
+
let ollamaInFlight = false;
|
|
10
|
+
|
|
8
11
|
/**
|
|
9
12
|
* Model-specific timeouts in milliseconds.
|
|
10
13
|
* Only increases timeouts for slow models to avoid false timeouts.
|
|
@@ -223,6 +226,30 @@ export async function analyzeContext(
|
|
|
223
226
|
} catch { /* privacy not initialized, keep images */ }
|
|
224
227
|
const systemPrompt = traitSystemPrompt ?? SYSTEM_PROMPT;
|
|
225
228
|
|
|
229
|
+
// Try local Ollama first when enabled (handles both vision and text-only ticks)
|
|
230
|
+
// Guard: skip if a previous Ollama call is still in-flight (avoids "no slots available")
|
|
231
|
+
if (config.localVisionEnabled && !ollamaInFlight) {
|
|
232
|
+
ollamaInFlight = true;
|
|
233
|
+
try {
|
|
234
|
+
const result = await callOllamaVision(systemPrompt, userPrompt, images, config);
|
|
235
|
+
const mode = images.length > 0 ? "vision" : "text";
|
|
236
|
+
log(TAG, `local ollama (${config.localVisionModel}, ${mode}): success`);
|
|
237
|
+
return result;
|
|
238
|
+
} catch (err: any) {
|
|
239
|
+
log(TAG, `local ollama failed: ${err.message || err}, falling back to OpenRouter`);
|
|
240
|
+
} finally {
|
|
241
|
+
ollamaInFlight = false;
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
// No OpenRouter API key (local-only mode): there is nothing to fall back to, so throw here instead of calling OpenRouter
|
|
246
|
+
if (!config.openrouterApiKey) {
|
|
247
|
+
if (config.localVisionEnabled) {
|
|
248
|
+
throw new Error("local ollama failed and no OpenRouter API key — cannot analyze");
|
|
249
|
+
}
|
|
250
|
+
throw new Error("no OpenRouter API key configured");
|
|
251
|
+
}
|
|
252
|
+
|
|
226
253
|
const models = [config.model, ...config.fallbackModels];
|
|
227
254
|
|
|
228
255
|
// Auto-upgrade: use vision model when images are present
|
|
@@ -364,3 +391,101 @@ async function callModel(
|
|
|
364
391
|
clearTimeout(timeout);
|
|
365
392
|
}
|
|
366
393
|
}
|
|
394
|
+
|
|
395
|
+
/**
|
|
396
|
+
* Call the local Ollama model; images are attached when present, otherwise the request is text-only.
|
|
397
|
+
* Uses the /api/chat endpoint with base64 images.
|
|
398
|
+
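* Throws on a non-OK HTTP response or when the request is aborted by the timeout; the fallback to OpenRouter is handled by the caller, not here.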
|
|
399
|
+
*/
|
|
400
|
+
async function callOllamaVision(
|
|
401
|
+
systemPrompt: string,
|
|
402
|
+
userPrompt: string,
|
|
403
|
+
images: ContextWindow["images"],
|
|
404
|
+
config: AgentConfig,
|
|
405
|
+
): Promise<AgentResult> {
|
|
406
|
+
const start = Date.now();
|
|
407
|
+
const controller = new AbortController();
|
|
408
|
+
const timeout = setTimeout(() => controller.abort(), config.localVisionTimeout);
|
|
409
|
+
|
|
410
|
+
try {
|
|
411
|
+
const imageB64List = (images || []).map((img) => img.data);
|
|
412
|
+
|
|
413
|
+
const response = await fetch(`${config.localVisionUrl}/api/chat`, {
|
|
414
|
+
method: "POST",
|
|
415
|
+
headers: { "Content-Type": "application/json" },
|
|
416
|
+
body: JSON.stringify({
|
|
417
|
+
model: config.localVisionModel,
|
|
418
|
+
messages: [
|
|
419
|
+
{ role: "system", content: systemPrompt },
|
|
420
|
+
{ role: "user", content: userPrompt, images: imageB64List },
|
|
421
|
+
],
|
|
422
|
+
stream: false,
|
|
423
|
+
options: { num_predict: config.maxTokens },
|
|
424
|
+
}),
|
|
425
|
+
signal: controller.signal,
|
|
426
|
+
});
|
|
427
|
+
|
|
428
|
+
if (!response.ok) {
|
|
429
|
+
throw new Error(`Ollama ${response.status}: ${await response.text()}`);
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
const data = await response.json() as {
|
|
433
|
+
message?: { content?: string };
|
|
434
|
+
eval_count?: number;
|
|
435
|
+
prompt_eval_count?: number;
|
|
436
|
+
};
|
|
437
|
+
|
|
438
|
+
const content = data.message?.content?.trim() || "";
|
|
439
|
+
const latencyMs = Date.now() - start;
|
|
440
|
+
const tokensIn = data.prompt_eval_count || 0;
|
|
441
|
+
const tokensOut = data.eval_count || 0;
|
|
442
|
+
|
|
443
|
+
log(TAG, `ollama vision: model=${config.localVisionModel} latency=${latencyMs}ms tokens=${tokensIn}+${tokensOut}`);
|
|
444
|
+
|
|
445
|
+
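// Parse the response (same format as OpenRouter): strip an optional code fence and parse as JSON; otherwise try the first {...} span; otherwise return the raw text with parsedOk=false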
|
|
446
|
+
// Parse JSON response (same logic as callModel)
|
|
447
|
+
try {
|
|
448
|
+
const jsonStr = content.replace(/^```\w*\s*\n?/, "").replace(/\n?\s*```\s*$/, "").trim();
|
|
449
|
+
const parsed = JSON.parse(jsonStr);
|
|
450
|
+
return {
|
|
451
|
+
hud: parsed.hud || "\u2014",
|
|
452
|
+
digest: parsed.digest || "\u2014",
|
|
453
|
+
record: parseRecord(parsed),
|
|
454
|
+
task: parseTask(parsed),
|
|
455
|
+
latencyMs,
|
|
456
|
+
tokensIn, tokensOut,
|
|
457
|
+
model: config.localVisionModel,
|
|
458
|
+
parsedOk: true,
|
|
459
|
+
};
|
|
460
|
+
} catch {
|
|
461
|
+
const match = content.match(/\{[\s\S]*\}/);
|
|
462
|
+
if (match) {
|
|
463
|
+
try {
|
|
464
|
+
const parsed = JSON.parse(match[0]);
|
|
465
|
+
if (parsed.hud) {
|
|
466
|
+
return {
|
|
467
|
+
hud: parsed.hud,
|
|
468
|
+
digest: parsed.digest || "\u2014",
|
|
469
|
+
record: parseRecord(parsed),
|
|
470
|
+
task: parseTask(parsed),
|
|
471
|
+
latencyMs,
|
|
472
|
+
tokensIn, tokensOut,
|
|
473
|
+
model: config.localVisionModel,
|
|
474
|
+
parsedOk: true,
|
|
475
|
+
};
|
|
476
|
+
}
|
|
477
|
+
} catch {}
|
|
478
|
+
}
|
|
479
|
+
return {
|
|
480
|
+
hud: content.slice(0, 160) || "\u2014",
|
|
481
|
+
digest: content || "\u2014",
|
|
482
|
+
latencyMs,
|
|
483
|
+
tokensIn, tokensOut,
|
|
484
|
+
model: config.localVisionModel,
|
|
485
|
+
parsedOk: false,
|
|
486
|
+
};
|
|
487
|
+
}
|
|
488
|
+
} finally {
|
|
489
|
+
clearTimeout(timeout);
|
|
490
|
+
}
|
|
491
|
+
}
|
|
@@ -2,6 +2,7 @@ import os from "node:os";
|
|
|
2
2
|
import { spawn, type ChildProcess } from "node:child_process";
|
|
3
3
|
import { resolve, dirname } from "node:path";
|
|
4
4
|
import { fileURLToPath } from "node:url";
|
|
5
|
+
import { existsSync } from "node:fs";
|
|
5
6
|
import type { AudioPipelineConfig, AudioSourceTag } from "../types.js";
|
|
6
7
|
import type { CaptureSpawner } from "./capture-spawner.js";
|
|
7
8
|
import { log } from "../log.js";
|
|
@@ -16,7 +17,18 @@ const TAG = "audio";
|
|
|
16
17
|
*/
|
|
17
18
|
export class MacOSCaptureSpawner implements CaptureSpawner {
|
|
18
19
|
spawn(config: AudioPipelineConfig, source: AudioSourceTag): ChildProcess {
|
|
19
|
-
|
|
20
|
+
// Check ~/.sinain/sck-capture/ first (npx install), then dev path
|
|
21
|
+
const homeBinary = resolve(os.homedir(), ".sinain", "sck-capture", "sck-capture");
|
|
22
|
+
const devBinary = resolve(__dirname, "..", "..", "..", "tools", "sck-capture", "sck-capture");
|
|
23
|
+
const binaryPath = existsSync(homeBinary) ? homeBinary : devBinary;
|
|
24
|
+
|
|
25
|
+
if (!existsSync(binaryPath)) {
|
|
26
|
+
throw new Error(
|
|
27
|
+
`sck-capture binary not found at ${binaryPath}. ` +
|
|
28
|
+
`Run: npx @geravant/sinain setup-sck-capture`
|
|
29
|
+
);
|
|
30
|
+
}
|
|
31
|
+
|
|
20
32
|
const args = [
|
|
21
33
|
"--sample-rate", String(config.sampleRate),
|
|
22
34
|
"--channels", String(config.channels),
|
|
@@ -63,6 +63,13 @@ function boolEnv(key: string, fallback: boolean): boolean {
|
|
|
63
63
|
return v === "true";
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
+
/** Like env() but treats a defined-but-empty value as "" instead of falling through to fallback. */
|
|
67
|
+
function envAllowEmpty(key: string, fallbackKey?: string, defaultVal = ""): string {
|
|
68
|
+
if (process.env[key] !== undefined) return process.env[key]!;
|
|
69
|
+
if (fallbackKey && process.env[fallbackKey] !== undefined) return process.env[fallbackKey]!;
|
|
70
|
+
return defaultVal;
|
|
71
|
+
}
|
|
72
|
+
|
|
66
73
|
function resolvePath(p: string): string {
|
|
67
74
|
if (process.platform === "win32") {
|
|
68
75
|
// Expand %APPDATA%, %USERPROFILE%, %TEMP% etc.
|
|
@@ -172,6 +179,10 @@ export function loadConfig(): CoreConfig {
|
|
|
172
179
|
model: env("AGENT_MODEL", "google/gemini-2.5-flash-lite"),
|
|
173
180
|
visionModel: env("AGENT_VISION_MODEL", "google/gemini-2.5-flash"),
|
|
174
181
|
visionEnabled: boolEnv("AGENT_VISION_ENABLED", true),
|
|
182
|
+
localVisionEnabled: boolEnv("LOCAL_VISION_ENABLED", false),
|
|
183
|
+
localVisionModel: env("LOCAL_VISION_MODEL", "llava"),
|
|
184
|
+
localVisionUrl: env("LOCAL_VISION_URL", "http://localhost:11434"),
|
|
185
|
+
localVisionTimeout: intEnv("LOCAL_VISION_TIMEOUT", 10000),
|
|
175
186
|
openrouterApiKey: env("OPENROUTER_API_KEY", ""),
|
|
176
187
|
maxTokens: intEnv("AGENT_MAX_TOKENS", 800),
|
|
177
188
|
temperature: floatEnv("AGENT_TEMPERATURE", 0.3),
|
|
@@ -194,9 +205,9 @@ export function loadConfig(): CoreConfig {
|
|
|
194
205
|
};
|
|
195
206
|
|
|
196
207
|
const openclawConfig: OpenClawConfig = {
|
|
197
|
-
gatewayWsUrl:
|
|
208
|
+
gatewayWsUrl: envAllowEmpty("OPENCLAW_WS_URL", "OPENCLAW_GATEWAY_WS_URL", "ws://localhost:18789"),
|
|
198
209
|
gatewayToken: env("OPENCLAW_WS_TOKEN", env("OPENCLAW_GATEWAY_TOKEN", "")),
|
|
199
|
-
hookUrl:
|
|
210
|
+
hookUrl: envAllowEmpty("OPENCLAW_HTTP_URL", "OPENCLAW_HOOK_URL", "http://localhost:18789/hooks/agent"),
|
|
200
211
|
hookToken: env("OPENCLAW_HTTP_TOKEN", env("OPENCLAW_HOOK_TOKEN", "")),
|
|
201
212
|
sessionKey: env("OPENCLAW_SESSION_KEY", "agent:main:sinain"),
|
|
202
213
|
phase1TimeoutMs: intEnv("OPENCLAW_PHASE1_TIMEOUT_MS", 30_000),
|
|
@@ -161,6 +161,10 @@ export class OpenClawWsClient extends EventEmitter {
|
|
|
161
161
|
|
|
162
162
|
/** Connect to the OpenClaw gateway. */
|
|
163
163
|
connect(): void {
|
|
164
|
+
if (!this.config.gatewayWsUrl) {
|
|
165
|
+
log(TAG, "connect: no gateway URL configured — skipping");
|
|
166
|
+
return;
|
|
167
|
+
}
|
|
164
168
|
if (!this.config.gatewayToken && !this.config.hookUrl) {
|
|
165
169
|
log(TAG, "connect: no gateway token or hookUrl — skipping");
|
|
166
170
|
return;
|
package/sinain-core/src/types.ts
CHANGED
|
@@ -223,6 +223,10 @@ export interface AgentConfig {
|
|
|
223
223
|
model: string;
|
|
224
224
|
visionModel: string;
|
|
225
225
|
visionEnabled: boolean;
|
|
226
|
+
localVisionEnabled: boolean;
|
|
227
|
+
localVisionModel: string;
|
|
228
|
+
localVisionUrl: string;
|
|
229
|
+
localVisionTimeout: number;
|
|
226
230
|
openrouterApiKey: string;
|
|
227
231
|
maxTokens: number;
|
|
228
232
|
temperature: number;
|