@geravant/sinain 1.8.0 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +14 -13
- package/HEARTBEAT.md +1 -1
- package/README.md +4 -7
- package/cli.js +16 -2
- package/config-shared.js +469 -0
- package/config.js +152 -0
- package/index.ts +1 -3
- package/launcher.js +7 -1
- package/onboard.js +345 -0
- package/package.json +8 -2
- package/sense_client/__main__.py +8 -4
- package/sense_client/gate.py +1 -0
- package/sense_client/ocr.py +58 -25
- package/sense_client/sender.py +2 -0
- package/sense_client/vision.py +31 -11
- package/sinain-agent/CLAUDE.md +0 -1
- package/sinain-agent/run.sh +2 -1
- package/sinain-core/src/agent/analyzer.ts +56 -58
- package/sinain-core/src/agent/loop.ts +37 -11
- package/sinain-core/src/audio/transcription.ts +20 -5
- package/sinain-core/src/config.ts +20 -16
- package/sinain-core/src/cost/tracker.ts +64 -0
- package/sinain-core/src/escalation/escalator.ts +31 -59
- package/sinain-core/src/index.ts +41 -45
- package/sinain-core/src/overlay/commands.ts +12 -0
- package/sinain-core/src/overlay/ws-handler.ts +27 -0
- package/sinain-core/src/server.ts +41 -0
- package/sinain-core/src/types.ts +46 -11
- package/sinain-knowledge/curation/engine.ts +0 -17
- package/sinain-knowledge/protocol/heartbeat.md +1 -1
- package/sinain-mcp-server/index.ts +4 -20
- package/sinain-memory/git_backup.sh +0 -19
package/sense_client/ocr.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""OCR backends for UI text extraction: macOS Vision, Windows.Media.Ocr, and Tesseract."""
|
|
2
|
+
|
|
2
3
|
from __future__ import annotations
|
|
3
4
|
|
|
4
5
|
import io
|
|
@@ -24,8 +25,13 @@ class OCRResult:
|
|
|
24
25
|
class LocalOCR:
|
|
25
26
|
"""Tesseract OCR wrapper for UI text extraction."""
|
|
26
27
|
|
|
27
|
-
def __init__(
|
|
28
|
-
|
|
28
|
+
def __init__(
|
|
29
|
+
self,
|
|
30
|
+
lang: str = "eng",
|
|
31
|
+
psm: int = 11,
|
|
32
|
+
min_confidence: int = 30,
|
|
33
|
+
enabled: bool = True,
|
|
34
|
+
):
|
|
29
35
|
self.lang = lang
|
|
30
36
|
self.psm = psm
|
|
31
37
|
self.min_confidence = min_confidence
|
|
@@ -87,8 +93,12 @@ class LocalOCR:
|
|
|
87
93
|
class VisionOCR:
|
|
88
94
|
"""macOS Vision framework OCR using pyobjc."""
|
|
89
95
|
|
|
90
|
-
def __init__(
|
|
91
|
-
|
|
96
|
+
def __init__(
|
|
97
|
+
self,
|
|
98
|
+
languages: list[str] | None = None,
|
|
99
|
+
min_confidence: float = 0.5,
|
|
100
|
+
enabled: bool = True,
|
|
101
|
+
):
|
|
92
102
|
self.languages = languages or ["en", "ru"]
|
|
93
103
|
self.min_confidence = min_confidence
|
|
94
104
|
self.enabled = enabled
|
|
@@ -101,8 +111,12 @@ class VisionOCR:
|
|
|
101
111
|
import objc # noqa: F401
|
|
102
112
|
import Quartz # noqa: F401
|
|
103
113
|
from Foundation import NSURL, NSData # noqa: F401
|
|
104
|
-
|
|
105
|
-
|
|
114
|
+
|
|
115
|
+
objc.loadBundle(
|
|
116
|
+
"Vision",
|
|
117
|
+
bundle_path="/System/Library/Frameworks/Vision.framework",
|
|
118
|
+
module_globals=globals(),
|
|
119
|
+
)
|
|
106
120
|
self._available = True
|
|
107
121
|
except Exception as e:
|
|
108
122
|
print(f"[ocr] Vision framework unavailable: {e}", flush=True)
|
|
@@ -120,9 +134,8 @@ class VisionOCR:
|
|
|
120
134
|
|
|
121
135
|
def _do_extract(self, image: Image.Image) -> OCRResult:
|
|
122
136
|
import objc
|
|
123
|
-
import Vision
|
|
124
|
-
from Foundation import NSData
|
|
125
137
|
import Quartz
|
|
138
|
+
from Foundation import NSData
|
|
126
139
|
|
|
127
140
|
# Convert PIL Image to CGImage via PNG bytes
|
|
128
141
|
buf = io.BytesIO()
|
|
@@ -138,15 +151,17 @@ class VisionOCR:
|
|
|
138
151
|
return OCRResult(text="", confidence=0, word_count=0)
|
|
139
152
|
|
|
140
153
|
# Create and configure request
|
|
141
|
-
request =
|
|
142
|
-
request.setRecognitionLevel_(
|
|
154
|
+
request = VNRecognizeTextRequest.alloc().init()
|
|
155
|
+
request.setRecognitionLevel_(0) # VNRequestTextRecognitionLevelAccurate
|
|
143
156
|
request.setRecognitionLanguages_(self.languages)
|
|
144
157
|
request.setUsesLanguageCorrection_(True)
|
|
145
158
|
|
|
146
159
|
# Execute
|
|
147
|
-
handler =
|
|
148
|
-
|
|
149
|
-
|
|
160
|
+
handler = VNImageRequestHandler.alloc().initWithCGImage_options_(cg_image, None)
|
|
161
|
+
result = handler.performRequests_error_([request], objc.nil)
|
|
162
|
+
# PyObjC may return (bool, error) tuple or just bool depending on version
|
|
163
|
+
success = result[0] if isinstance(result, tuple) else result
|
|
164
|
+
if not success:
|
|
150
165
|
return OCRResult(text="", confidence=0, word_count=0)
|
|
151
166
|
|
|
152
167
|
results = request.results()
|
|
@@ -159,7 +174,8 @@ class VisionOCR:
|
|
|
159
174
|
|
|
160
175
|
for observation in results:
|
|
161
176
|
candidate = observation.topCandidates_(1)
|
|
162
|
-
|
|
177
|
+
# PyObjC may return bool instead of list depending on version
|
|
178
|
+
if not candidate or isinstance(candidate, bool):
|
|
163
179
|
continue
|
|
164
180
|
text = candidate[0].string()
|
|
165
181
|
conf = candidate[0].confidence()
|
|
@@ -197,8 +213,9 @@ class VisionOCR:
|
|
|
197
213
|
class WinOCR:
|
|
198
214
|
"""Windows.Media.Ocr backend via winrt-python (Windows 10+)."""
|
|
199
215
|
|
|
200
|
-
def __init__(
|
|
201
|
-
|
|
216
|
+
def __init__(
|
|
217
|
+
self, language: str = "en", min_confidence: float = 0.5, enabled: bool = True
|
|
218
|
+
):
|
|
202
219
|
self.language = language
|
|
203
220
|
self.min_confidence = min_confidence
|
|
204
221
|
self.enabled = enabled
|
|
@@ -209,8 +226,8 @@ class WinOCR:
|
|
|
209
226
|
return
|
|
210
227
|
|
|
211
228
|
try:
|
|
212
|
-
from winrt.windows.media.ocr import OcrEngine
|
|
213
229
|
from winrt.windows.globalization import Language
|
|
230
|
+
from winrt.windows.media.ocr import OcrEngine
|
|
214
231
|
|
|
215
232
|
lang = Language(language)
|
|
216
233
|
if OcrEngine.is_language_supported(lang):
|
|
@@ -234,11 +251,15 @@ class WinOCR:
|
|
|
234
251
|
|
|
235
252
|
def _do_extract(self, image: Image.Image) -> OCRResult:
|
|
236
253
|
import asyncio
|
|
254
|
+
|
|
237
255
|
from winrt.windows.graphics.imaging import (
|
|
238
|
-
|
|
256
|
+
BitmapAlphaMode,
|
|
257
|
+
BitmapPixelFormat,
|
|
258
|
+
SoftwareBitmap,
|
|
239
259
|
)
|
|
240
260
|
from winrt.windows.storage.streams import (
|
|
241
|
-
|
|
261
|
+
DataWriter,
|
|
262
|
+
InMemoryRandomAccessStream,
|
|
242
263
|
)
|
|
243
264
|
|
|
244
265
|
# Convert PIL to BMP bytes and load as SoftwareBitmap
|
|
@@ -254,13 +275,15 @@ class WinOCR:
|
|
|
254
275
|
stream.seek(0)
|
|
255
276
|
|
|
256
277
|
from winrt.windows.graphics.imaging import BitmapDecoder
|
|
278
|
+
|
|
257
279
|
decoder = await BitmapDecoder.create_async(stream)
|
|
258
280
|
bitmap = await decoder.get_software_bitmap_async()
|
|
259
281
|
|
|
260
282
|
# Convert to supported pixel format if needed
|
|
261
283
|
if bitmap.bitmap_pixel_format != BitmapPixelFormat.BGRA8:
|
|
262
|
-
bitmap = SoftwareBitmap.convert(
|
|
263
|
-
|
|
284
|
+
bitmap = SoftwareBitmap.convert(
|
|
285
|
+
bitmap, BitmapPixelFormat.BGRA8, BitmapAlphaMode.PREMULTIPLIED
|
|
286
|
+
)
|
|
264
287
|
|
|
265
288
|
result = await self._engine.recognize_async(bitmap)
|
|
266
289
|
return result
|
|
@@ -318,10 +341,15 @@ def create_ocr(config: dict):
|
|
|
318
341
|
enabled=enabled,
|
|
319
342
|
)
|
|
320
343
|
if vision._available:
|
|
321
|
-
print(
|
|
344
|
+
print(
|
|
345
|
+
f"[ocr] using Vision backend (languages={vision.languages})", flush=True
|
|
346
|
+
)
|
|
322
347
|
return vision
|
|
323
348
|
if backend == "vision":
|
|
324
|
-
print(
|
|
349
|
+
print(
|
|
350
|
+
"[ocr] Vision requested but unavailable, falling back to Tesseract",
|
|
351
|
+
flush=True,
|
|
352
|
+
)
|
|
325
353
|
|
|
326
354
|
# Windows: try Windows.Media.Ocr
|
|
327
355
|
if sys.platform == "win32" and backend in ("auto", "winocr"):
|
|
@@ -332,10 +360,15 @@ def create_ocr(config: dict):
|
|
|
332
360
|
enabled=enabled,
|
|
333
361
|
)
|
|
334
362
|
if winocr._available:
|
|
335
|
-
print(
|
|
363
|
+
print(
|
|
364
|
+
f"[ocr] using WinOCR backend (language={winocr.language})", flush=True
|
|
365
|
+
)
|
|
336
366
|
return winocr
|
|
337
367
|
if backend == "winocr":
|
|
338
|
-
print(
|
|
368
|
+
print(
|
|
369
|
+
"[ocr] WinOCR requested but unavailable, falling back to Tesseract",
|
|
370
|
+
flush=True,
|
|
371
|
+
)
|
|
339
372
|
|
|
340
373
|
# Fallback to Tesseract (cross-platform)
|
|
341
374
|
print("[ocr] using Tesseract backend", flush=True)
|
package/sense_client/sender.py
CHANGED
package/sense_client/vision.py
CHANGED
|
@@ -18,6 +18,7 @@ import json
|
|
|
18
18
|
import logging
|
|
19
19
|
import os
|
|
20
20
|
import time
|
|
21
|
+
import uuid
|
|
21
22
|
from abc import ABC, abstractmethod
|
|
22
23
|
from typing import TYPE_CHECKING, Optional
|
|
23
24
|
|
|
@@ -27,14 +28,23 @@ if TYPE_CHECKING:
|
|
|
27
28
|
logger = logging.getLogger("sinain.vision")
|
|
28
29
|
|
|
29
30
|
|
|
31
|
+
class VisionResult:
|
|
32
|
+
"""Result of a vision call: text + optional cost info."""
|
|
33
|
+
__slots__ = ("text", "cost")
|
|
34
|
+
|
|
35
|
+
def __init__(self, text: Optional[str], cost: Optional[dict] = None):
|
|
36
|
+
self.text = text
|
|
37
|
+
self.cost = cost # {cost, tokens_in, tokens_out, model, cost_id}
|
|
38
|
+
|
|
39
|
+
|
|
30
40
|
class VisionProvider(ABC):
|
|
31
41
|
"""Abstract base for vision inference backends."""
|
|
32
42
|
|
|
33
43
|
name: str = "unknown"
|
|
34
44
|
|
|
35
45
|
@abstractmethod
|
|
36
|
-
def describe(self, image: "Image.Image", prompt: Optional[str] = None) ->
|
|
37
|
-
"""Describe image content. Returns None on failure."""
|
|
46
|
+
def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
|
|
47
|
+
"""Describe image content. Returns VisionResult (text may be None on failure)."""
|
|
38
48
|
...
|
|
39
49
|
|
|
40
50
|
@abstractmethod
|
|
@@ -53,8 +63,8 @@ class OllamaVisionProvider(VisionProvider):
|
|
|
53
63
|
timeout=timeout, max_tokens=max_tokens)
|
|
54
64
|
self.name = f"ollama ({model})"
|
|
55
65
|
|
|
56
|
-
def describe(self, image: "Image.Image", prompt: Optional[str] = None) ->
|
|
57
|
-
return self._client.describe(image, prompt)
|
|
66
|
+
def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
|
|
67
|
+
return VisionResult(self._client.describe(image, prompt))
|
|
58
68
|
|
|
59
69
|
def is_available(self) -> bool:
|
|
60
70
|
return self._client.is_available()
|
|
@@ -73,9 +83,9 @@ class OpenRouterVisionProvider(VisionProvider):
|
|
|
73
83
|
self._max_tokens = max_tokens
|
|
74
84
|
self.name = f"openrouter ({model})"
|
|
75
85
|
|
|
76
|
-
def describe(self, image: "Image.Image", prompt: Optional[str] = None) ->
|
|
86
|
+
def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
|
|
77
87
|
if not self._api_key:
|
|
78
|
-
return None
|
|
88
|
+
return VisionResult(None)
|
|
79
89
|
|
|
80
90
|
try:
|
|
81
91
|
import requests
|
|
@@ -83,7 +93,7 @@ class OpenRouterVisionProvider(VisionProvider):
|
|
|
83
93
|
# Encode image
|
|
84
94
|
img_b64 = self._encode(image)
|
|
85
95
|
if not img_b64:
|
|
86
|
-
return None
|
|
96
|
+
return VisionResult(None)
|
|
87
97
|
|
|
88
98
|
prompt_text = prompt or "Describe what's on this screen concisely (2-3 sentences)."
|
|
89
99
|
|
|
@@ -112,13 +122,23 @@ class OpenRouterVisionProvider(VisionProvider):
|
|
|
112
122
|
resp.raise_for_status()
|
|
113
123
|
data = resp.json()
|
|
114
124
|
content = data["choices"][0]["message"]["content"].strip()
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
125
|
+
usage = data.get("usage", {})
|
|
126
|
+
logger.debug("openrouter vision: model=%s tokens=%s cost=%s",
|
|
127
|
+
self._model, usage.get("total_tokens", "?"), usage.get("cost", "?"))
|
|
128
|
+
cost_info = None
|
|
129
|
+
if usage.get("cost") is not None:
|
|
130
|
+
cost_info = {
|
|
131
|
+
"cost": usage["cost"],
|
|
132
|
+
"tokens_in": usage.get("prompt_tokens", 0),
|
|
133
|
+
"tokens_out": usage.get("completion_tokens", 0),
|
|
134
|
+
"model": self._model,
|
|
135
|
+
"cost_id": uuid.uuid4().hex[:16],
|
|
136
|
+
}
|
|
137
|
+
return VisionResult(content if content else None, cost_info)
|
|
118
138
|
|
|
119
139
|
except Exception as e:
|
|
120
140
|
logger.debug("openrouter vision failed: %s", e)
|
|
121
|
-
return None
|
|
141
|
+
return VisionResult(None)
|
|
122
142
|
|
|
123
143
|
def is_available(self) -> bool:
|
|
124
144
|
return bool(self._api_key)
|
package/sinain-agent/CLAUDE.md
CHANGED
|
@@ -47,7 +47,6 @@ When responding to escalations:
|
|
|
47
47
|
|
|
48
48
|
1. Call `sinain_heartbeat_tick` with a brief session summary
|
|
49
49
|
2. The tool runs the full pipeline automatically:
|
|
50
|
-
- Git backup of memory directory
|
|
51
50
|
- Signal analysis (detects opportunities from session patterns)
|
|
52
51
|
- **Session distillation** — fetches new feed items from sinain-core, distills patterns/learnings
|
|
53
52
|
- **Knowledge integration** — updates playbook (working memory) and knowledge graph (long-term memory)
|
package/sinain-agent/run.sh
CHANGED
|
@@ -78,6 +78,7 @@ invoke_agent() {
|
|
|
78
78
|
codex)
|
|
79
79
|
codex exec -s danger-full-access \
|
|
80
80
|
--dangerously-bypass-approvals-and-sandbox \
|
|
81
|
+
--skip-git-repo-check \
|
|
81
82
|
"$prompt"
|
|
82
83
|
;;
|
|
83
84
|
junie)
|
|
@@ -271,7 +272,7 @@ while true; do
|
|
|
271
272
|
# MCP path: agent runs task with sinain tools available
|
|
272
273
|
SPAWN_PROMPT="You have a background task to complete. Task: $SPAWN_TASK
|
|
273
274
|
|
|
274
|
-
Complete this task thoroughly. Use sinain_get_knowledge and sinain_knowledge_query if you need context from past sessions.
|
|
275
|
+
Complete this task thoroughly. Use sinain_get_knowledge and sinain_knowledge_query if you need context from past sessions. Summarize your findings concisely."
|
|
275
276
|
SPAWN_RESULT=$(invoke_agent "$SPAWN_PROMPT" "$SPAWN_MAX_TURNS" || echo "ERROR: agent invocation failed")
|
|
276
277
|
else
|
|
277
278
|
# Pipe path: agent gets task text directly
|
|
@@ -1,13 +1,10 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type { AnalysisConfig, AgentResult, ContextWindow, RecorderStatus, RecordCommand } from "../types.js";
|
|
2
2
|
import { normalizeAppName } from "./context-window.js";
|
|
3
3
|
import { log, error } from "../log.js";
|
|
4
4
|
import { levelFor, applyLevel } from "../privacy/index.js";
|
|
5
5
|
|
|
6
6
|
const TAG = "agent";
|
|
7
7
|
|
|
8
|
-
/** Guard: only one Ollama vision call at a time (latest-wins, skip if busy). */
|
|
9
|
-
let ollamaInFlight = false;
|
|
10
|
-
|
|
11
8
|
/**
|
|
12
9
|
* Model-specific timeouts in milliseconds.
|
|
13
10
|
* Only increases timeouts for slow models to avoid false timeouts.
|
|
@@ -56,12 +53,13 @@ You produce outputs as JSON.
|
|
|
56
53
|
Respond ONLY with valid JSON. No markdown, no code fences, no explanation.
|
|
57
54
|
Your entire response must be parseable by JSON.parse().
|
|
58
55
|
|
|
59
|
-
{"hud":"...","digest":"...","record":{"command":"start"|"stop","label":"..."}}
|
|
56
|
+
{"hud":"...","digest":"...","record":{"command":"start"|"stop","label":"..."},"task":"..."}
|
|
60
57
|
|
|
61
58
|
Output fields:
|
|
62
59
|
- "hud" (required): max 60 words describing what user is doing NOW
|
|
63
60
|
- "digest" (required): 5-8 sentences with detailed activity description
|
|
64
61
|
- "record" (optional): control recording — {"command":"start","label":"Meeting name"} or {"command":"stop"}
|
|
62
|
+
- "task" (optional): natural language instruction to spawn a background task
|
|
65
63
|
|
|
66
64
|
When to use "record":
|
|
67
65
|
- START when user begins a meeting, call, lecture, YouTube video, or important audio content
|
|
@@ -69,7 +67,24 @@ When to use "record":
|
|
|
69
67
|
- Provide descriptive labels like "Team standup", "Client call", "YouTube: [video title from OCR]"
|
|
70
68
|
- For YouTube/video content: extract video title from screen OCR for the label
|
|
71
69
|
|
|
72
|
-
|
|
70
|
+
When to use "task":
|
|
71
|
+
- User explicitly asks for research, lookup, or action
|
|
72
|
+
- Something needs external search or processing that isn't a real-time response
|
|
73
|
+
- Example: "Search for React 19 migration guide", "Find docs for this API"
|
|
74
|
+
|
|
75
|
+
When to spawn "task" for video content:
|
|
76
|
+
- If user watches a YouTube video for 2+ minutes AND no task has been spawned for this video yet, spawn: "Summarize YouTube video: [title or URL from OCR]"
|
|
77
|
+
- ONLY spawn ONCE per video - do not repeat spawn for the same video in subsequent ticks
|
|
78
|
+
- Extract video title or URL from screen OCR to include in the task
|
|
79
|
+
|
|
80
|
+
When to spawn "task" for coding problems:
|
|
81
|
+
- If user is actively working on a coding problem/challenge for 1+ minutes:
|
|
82
|
+
- Spawn: "Solve coding problem: [problem description/title from OCR]"
|
|
83
|
+
- This includes LeetCode, HackerRank, interviews, coding assessments, or any visible coding challenge
|
|
84
|
+
- Look for problem signals: "Input:", "Output:", "Example", "Constraints:", problem titles, test cases
|
|
85
|
+
- Include as much context as possible from the screen OCR (problem description, examples, constraints)
|
|
86
|
+
- ONLY spawn ONCE per distinct problem - do not repeat for the same problem
|
|
87
|
+
- The spawned task should provide a complete solution with code and explanation
|
|
73
88
|
|
|
74
89
|
Audio sources: [\ud83d\udd0a]=system/speaker audio, [\ud83c\udf99]=microphone (user's voice).
|
|
75
90
|
Treat [\ud83c\udf99] as direct user speech. Treat [\ud83d\udd0a] as external audio.
|
|
@@ -193,75 +208,54 @@ function parseTask(parsed: any): string | undefined {
|
|
|
193
208
|
*/
|
|
194
209
|
export async function analyzeContext(
|
|
195
210
|
contextWindow: ContextWindow,
|
|
196
|
-
config:
|
|
211
|
+
config: AnalysisConfig,
|
|
197
212
|
recorderStatus: RecorderStatus | null = null,
|
|
198
213
|
traitSystemPrompt?: string,
|
|
199
214
|
): Promise<AgentResult> {
|
|
200
215
|
const userPrompt = buildUserPrompt(contextWindow, recorderStatus);
|
|
201
|
-
|
|
216
|
+
|
|
217
|
+
// Apply privacy gating for images based on provider
|
|
202
218
|
let images = contextWindow.images || [];
|
|
219
|
+
const privacyDest = config.provider === "ollama" ? "local_llm" : "openrouter";
|
|
203
220
|
try {
|
|
204
|
-
|
|
205
|
-
if (imgLevel === "none") {
|
|
206
|
-
images = [];
|
|
207
|
-
}
|
|
221
|
+
if (levelFor("screen_images", privacyDest) === "none") images = [];
|
|
208
222
|
} catch { /* privacy not initialized, keep images */ }
|
|
223
|
+
|
|
209
224
|
const systemPrompt = traitSystemPrompt ?? SYSTEM_PROMPT;
|
|
210
225
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
if (config.localVisionEnabled && !ollamaInFlight) {
|
|
214
|
-
ollamaInFlight = true;
|
|
215
|
-
try {
|
|
216
|
-
const result = await callOllamaVision(systemPrompt, userPrompt, images, config);
|
|
217
|
-
const mode = images.length > 0 ? "vision" : "text";
|
|
218
|
-
log(TAG, `local ollama (${config.localVisionModel}, ${mode}): success`);
|
|
219
|
-
return result;
|
|
220
|
-
} catch (err: any) {
|
|
221
|
-
log(TAG, `local ollama failed: ${err.message || err}, falling back to OpenRouter`);
|
|
222
|
-
} finally {
|
|
223
|
-
ollamaInFlight = false;
|
|
224
|
-
}
|
|
226
|
+
if (config.provider === "ollama") {
|
|
227
|
+
return await callOllama(systemPrompt, userPrompt, images, config);
|
|
225
228
|
}
|
|
226
229
|
|
|
227
|
-
//
|
|
228
|
-
if (!config.
|
|
229
|
-
|
|
230
|
-
throw new Error("local ollama failed and no OpenRouter API key — cannot analyze");
|
|
231
|
-
}
|
|
232
|
-
throw new Error("no OpenRouter API key configured");
|
|
230
|
+
// OpenRouter path: model chain with fallbacks
|
|
231
|
+
if (!config.apiKey) {
|
|
232
|
+
throw new Error("ANALYSIS_API_KEY / OPENROUTER_API_KEY not set");
|
|
233
233
|
}
|
|
234
234
|
|
|
235
235
|
const models = [config.model, ...config.fallbackModels];
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
// Insert vision model at the front if not already there
|
|
240
|
-
if (!models.includes(config.visionModel)) {
|
|
241
|
-
models.unshift(config.visionModel);
|
|
242
|
-
}
|
|
236
|
+
// Auto-upgrade to vision model when images are present
|
|
237
|
+
if (images.length > 0 && config.visionModel && !models.includes(config.visionModel)) {
|
|
238
|
+
models.unshift(config.visionModel);
|
|
243
239
|
}
|
|
244
240
|
|
|
245
241
|
let lastError: Error | null = null;
|
|
246
|
-
|
|
247
242
|
for (const model of models) {
|
|
248
243
|
try {
|
|
249
|
-
return await
|
|
244
|
+
return await callOpenRouter(systemPrompt, userPrompt, images, model, config);
|
|
250
245
|
} catch (err: any) {
|
|
251
246
|
lastError = err;
|
|
252
247
|
log(TAG, `model ${model} failed: ${err.message || err}, trying next...`);
|
|
253
248
|
}
|
|
254
249
|
}
|
|
255
|
-
|
|
256
250
|
throw lastError || new Error("all models failed");
|
|
257
251
|
}
|
|
258
252
|
|
|
259
|
-
async function
|
|
253
|
+
async function callOpenRouter(
|
|
260
254
|
systemPrompt: string,
|
|
261
255
|
userPrompt: string,
|
|
262
256
|
images: ContextWindow["images"],
|
|
263
257
|
model: string,
|
|
264
|
-
config:
|
|
258
|
+
config: AnalysisConfig,
|
|
265
259
|
): Promise<AgentResult> {
|
|
266
260
|
const start = Date.now();
|
|
267
261
|
const controller = new AbortController();
|
|
@@ -289,10 +283,10 @@ async function callModel(
|
|
|
289
283
|
|
|
290
284
|
const imageCount = images?.length || 0;
|
|
291
285
|
|
|
292
|
-
const response = await fetch(
|
|
286
|
+
const response = await fetch(config.endpoint, {
|
|
293
287
|
method: "POST",
|
|
294
288
|
headers: {
|
|
295
|
-
"Authorization": `Bearer ${config.
|
|
289
|
+
"Authorization": `Bearer ${config.apiKey}`,
|
|
296
290
|
"Content-Type": "application/json",
|
|
297
291
|
},
|
|
298
292
|
body: JSON.stringify({
|
|
@@ -324,6 +318,7 @@ async function callModel(
|
|
|
324
318
|
try {
|
|
325
319
|
const jsonStr = raw.replace(/^```\w*\s*\n?/, "").replace(/\n?\s*```\s*$/, "").trim();
|
|
326
320
|
const parsed = JSON.parse(jsonStr);
|
|
321
|
+
const apiCost = typeof data.usage?.cost === "number" ? data.usage.cost : undefined;
|
|
327
322
|
return {
|
|
328
323
|
hud: parsed.hud || "\u2014",
|
|
329
324
|
digest: parsed.digest || "\u2014",
|
|
@@ -334,10 +329,12 @@ async function callModel(
|
|
|
334
329
|
tokensOut: data.usage?.completion_tokens || 0,
|
|
335
330
|
model,
|
|
336
331
|
parsedOk: true,
|
|
332
|
+
cost: apiCost,
|
|
337
333
|
};
|
|
338
334
|
} catch {
|
|
339
335
|
// Second chance: extract embedded JSON object
|
|
340
336
|
const match = raw.match(/\{[\s\S]*\}/);
|
|
337
|
+
const apiCost = typeof data.usage?.cost === "number" ? data.usage.cost : undefined;
|
|
341
338
|
if (match) {
|
|
342
339
|
try {
|
|
343
340
|
const parsed = JSON.parse(match[0]);
|
|
@@ -352,6 +349,7 @@ async function callModel(
|
|
|
352
349
|
tokensOut: data.usage?.completion_tokens || 0,
|
|
353
350
|
model,
|
|
354
351
|
parsedOk: true,
|
|
352
|
+
cost: apiCost,
|
|
355
353
|
};
|
|
356
354
|
}
|
|
357
355
|
} catch { /* fall through */ }
|
|
@@ -367,6 +365,7 @@ async function callModel(
|
|
|
367
365
|
tokensOut: data.usage?.completion_tokens || 0,
|
|
368
366
|
model,
|
|
369
367
|
parsedOk: false,
|
|
368
|
+
cost: apiCost,
|
|
370
369
|
};
|
|
371
370
|
}
|
|
372
371
|
} finally {
|
|
@@ -375,28 +374,27 @@ async function callModel(
|
|
|
375
374
|
}
|
|
376
375
|
|
|
377
376
|
/**
|
|
378
|
-
* Call Ollama local
|
|
379
|
-
* Uses the /api/chat endpoint with base64 images.
|
|
380
|
-
* Falls back to OpenRouter on any failure.
|
|
377
|
+
* Call Ollama local model for context analysis.
|
|
378
|
+
* Uses the /api/chat endpoint with optional base64 images.
|
|
381
379
|
*/
|
|
382
|
-
async function
|
|
380
|
+
async function callOllama(
|
|
383
381
|
systemPrompt: string,
|
|
384
382
|
userPrompt: string,
|
|
385
383
|
images: ContextWindow["images"],
|
|
386
|
-
config:
|
|
384
|
+
config: AnalysisConfig,
|
|
387
385
|
): Promise<AgentResult> {
|
|
388
386
|
const start = Date.now();
|
|
389
387
|
const controller = new AbortController();
|
|
390
|
-
const timeout = setTimeout(() => controller.abort(), config.
|
|
388
|
+
const timeout = setTimeout(() => controller.abort(), config.timeout);
|
|
391
389
|
|
|
392
390
|
try {
|
|
393
391
|
const imageB64List = (images || []).map((img) => img.data);
|
|
394
392
|
|
|
395
|
-
const response = await fetch(`${config.
|
|
393
|
+
const response = await fetch(`${config.endpoint}/api/chat`, {
|
|
396
394
|
method: "POST",
|
|
397
395
|
headers: { "Content-Type": "application/json" },
|
|
398
396
|
body: JSON.stringify({
|
|
399
|
-
model: config.
|
|
397
|
+
model: config.model,
|
|
400
398
|
messages: [
|
|
401
399
|
{ role: "system", content: systemPrompt },
|
|
402
400
|
{ role: "user", content: userPrompt, images: imageB64List },
|
|
@@ -422,7 +420,7 @@ async function callOllamaVision(
|
|
|
422
420
|
const tokensIn = data.prompt_eval_count || 0;
|
|
423
421
|
const tokensOut = data.eval_count || 0;
|
|
424
422
|
|
|
425
|
-
log(TAG, `ollama vision: model=${config.
|
|
423
|
+
log(TAG, `ollama vision: model=${config.model} latency=${latencyMs}ms tokens=${tokensIn}+${tokensOut}`);
|
|
426
424
|
|
|
427
425
|
// Parse the response (same format as OpenRouter)
|
|
428
426
|
// Parse JSON response (same logic as callModel)
|
|
@@ -436,7 +434,7 @@ async function callOllamaVision(
|
|
|
436
434
|
task: parseTask(parsed),
|
|
437
435
|
latencyMs,
|
|
438
436
|
tokensIn, tokensOut,
|
|
439
|
-
model: config.
|
|
437
|
+
model: config.model,
|
|
440
438
|
parsedOk: true,
|
|
441
439
|
};
|
|
442
440
|
} catch {
|
|
@@ -452,7 +450,7 @@ async function callOllamaVision(
|
|
|
452
450
|
task: parseTask(parsed),
|
|
453
451
|
latencyMs,
|
|
454
452
|
tokensIn, tokensOut,
|
|
455
|
-
model: config.
|
|
453
|
+
model: config.model,
|
|
456
454
|
parsedOk: true,
|
|
457
455
|
};
|
|
458
456
|
}
|
|
@@ -463,7 +461,7 @@ async function callOllamaVision(
|
|
|
463
461
|
digest: content || "\u2014",
|
|
464
462
|
latencyMs,
|
|
465
463
|
tokensIn, tokensOut,
|
|
466
|
-
model: config.
|
|
464
|
+
model: config.model,
|
|
467
465
|
parsedOk: false,
|
|
468
466
|
};
|
|
469
467
|
}
|