@geravant/sinain 1.8.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  """OCR backends for UI text extraction: macOS Vision, Windows.Media.Ocr, and Tesseract."""
2
+
2
3
  from __future__ import annotations
3
4
 
4
5
  import io
@@ -24,8 +25,13 @@ class OCRResult:
24
25
  class LocalOCR:
25
26
  """Tesseract OCR wrapper for UI text extraction."""
26
27
 
27
- def __init__(self, lang: str = "eng", psm: int = 11,
28
- min_confidence: int = 30, enabled: bool = True):
28
+ def __init__(
29
+ self,
30
+ lang: str = "eng",
31
+ psm: int = 11,
32
+ min_confidence: int = 30,
33
+ enabled: bool = True,
34
+ ):
29
35
  self.lang = lang
30
36
  self.psm = psm
31
37
  self.min_confidence = min_confidence
@@ -87,8 +93,12 @@ class LocalOCR:
87
93
  class VisionOCR:
88
94
  """macOS Vision framework OCR using pyobjc."""
89
95
 
90
- def __init__(self, languages: list[str] | None = None,
91
- min_confidence: float = 0.5, enabled: bool = True):
96
+ def __init__(
97
+ self,
98
+ languages: list[str] | None = None,
99
+ min_confidence: float = 0.5,
100
+ enabled: bool = True,
101
+ ):
92
102
  self.languages = languages or ["en", "ru"]
93
103
  self.min_confidence = min_confidence
94
104
  self.enabled = enabled
@@ -101,8 +111,12 @@ class VisionOCR:
101
111
  import objc # noqa: F401
102
112
  import Quartz # noqa: F401
103
113
  from Foundation import NSURL, NSData # noqa: F401
104
- objc.loadBundle('Vision', bundle_path='/System/Library/Frameworks/Vision.framework',
105
- module_globals=globals())
114
+
115
+ objc.loadBundle(
116
+ "Vision",
117
+ bundle_path="/System/Library/Frameworks/Vision.framework",
118
+ module_globals=globals(),
119
+ )
106
120
  self._available = True
107
121
  except Exception as e:
108
122
  print(f"[ocr] Vision framework unavailable: {e}", flush=True)
@@ -120,9 +134,8 @@ class VisionOCR:
120
134
 
121
135
  def _do_extract(self, image: Image.Image) -> OCRResult:
122
136
  import objc
123
- import Vision
124
- from Foundation import NSData
125
137
  import Quartz
138
+ from Foundation import NSData
126
139
 
127
140
  # Convert PIL Image to CGImage via PNG bytes
128
141
  buf = io.BytesIO()
@@ -138,15 +151,17 @@ class VisionOCR:
138
151
  return OCRResult(text="", confidence=0, word_count=0)
139
152
 
140
153
  # Create and configure request
141
- request = Vision.VNRecognizeTextRequest.alloc().init()
142
- request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
154
+ request = VNRecognizeTextRequest.alloc().init()
155
+ request.setRecognitionLevel_(0) # VNRequestTextRecognitionLevelAccurate
143
156
  request.setRecognitionLanguages_(self.languages)
144
157
  request.setUsesLanguageCorrection_(True)
145
158
 
146
159
  # Execute
147
- handler = Vision.VNImageRequestHandler.alloc().initWithCGImage_options_(cg_image, None)
148
- success = handler.performRequests_error_([request], objc.nil)
149
- if not success[0]:
160
+ handler = VNImageRequestHandler.alloc().initWithCGImage_options_(cg_image, None)
161
+ result = handler.performRequests_error_([request], objc.nil)
162
+ # PyObjC may return (bool, error) tuple or just bool depending on version
163
+ success = result[0] if isinstance(result, tuple) else result
164
+ if not success:
150
165
  return OCRResult(text="", confidence=0, word_count=0)
151
166
 
152
167
  results = request.results()
@@ -159,7 +174,8 @@ class VisionOCR:
159
174
 
160
175
  for observation in results:
161
176
  candidate = observation.topCandidates_(1)
162
- if not candidate:
177
+ # PyObjC may return bool instead of list depending on version
178
+ if not candidate or isinstance(candidate, bool):
163
179
  continue
164
180
  text = candidate[0].string()
165
181
  conf = candidate[0].confidence()
@@ -197,8 +213,9 @@ class VisionOCR:
197
213
  class WinOCR:
198
214
  """Windows.Media.Ocr backend via winrt-python (Windows 10+)."""
199
215
 
200
- def __init__(self, language: str = "en", min_confidence: float = 0.5,
201
- enabled: bool = True):
216
+ def __init__(
217
+ self, language: str = "en", min_confidence: float = 0.5, enabled: bool = True
218
+ ):
202
219
  self.language = language
203
220
  self.min_confidence = min_confidence
204
221
  self.enabled = enabled
@@ -209,8 +226,8 @@ class WinOCR:
209
226
  return
210
227
 
211
228
  try:
212
- from winrt.windows.media.ocr import OcrEngine
213
229
  from winrt.windows.globalization import Language
230
+ from winrt.windows.media.ocr import OcrEngine
214
231
 
215
232
  lang = Language(language)
216
233
  if OcrEngine.is_language_supported(lang):
@@ -234,11 +251,15 @@ class WinOCR:
234
251
 
235
252
  def _do_extract(self, image: Image.Image) -> OCRResult:
236
253
  import asyncio
254
+
237
255
  from winrt.windows.graphics.imaging import (
238
- SoftwareBitmap, BitmapPixelFormat, BitmapAlphaMode,
256
+ BitmapAlphaMode,
257
+ BitmapPixelFormat,
258
+ SoftwareBitmap,
239
259
  )
240
260
  from winrt.windows.storage.streams import (
241
- InMemoryRandomAccessStream, DataWriter,
261
+ DataWriter,
262
+ InMemoryRandomAccessStream,
242
263
  )
243
264
 
244
265
  # Convert PIL to BMP bytes and load as SoftwareBitmap
@@ -254,13 +275,15 @@ class WinOCR:
254
275
  stream.seek(0)
255
276
 
256
277
  from winrt.windows.graphics.imaging import BitmapDecoder
278
+
257
279
  decoder = await BitmapDecoder.create_async(stream)
258
280
  bitmap = await decoder.get_software_bitmap_async()
259
281
 
260
282
  # Convert to supported pixel format if needed
261
283
  if bitmap.bitmap_pixel_format != BitmapPixelFormat.BGRA8:
262
- bitmap = SoftwareBitmap.convert(bitmap, BitmapPixelFormat.BGRA8,
263
- BitmapAlphaMode.PREMULTIPLIED)
284
+ bitmap = SoftwareBitmap.convert(
285
+ bitmap, BitmapPixelFormat.BGRA8, BitmapAlphaMode.PREMULTIPLIED
286
+ )
264
287
 
265
288
  result = await self._engine.recognize_async(bitmap)
266
289
  return result
@@ -318,10 +341,15 @@ def create_ocr(config: dict):
318
341
  enabled=enabled,
319
342
  )
320
343
  if vision._available:
321
- print(f"[ocr] using Vision backend (languages={vision.languages})", flush=True)
344
+ print(
345
+ f"[ocr] using Vision backend (languages={vision.languages})", flush=True
346
+ )
322
347
  return vision
323
348
  if backend == "vision":
324
- print("[ocr] Vision requested but unavailable, falling back to Tesseract", flush=True)
349
+ print(
350
+ "[ocr] Vision requested but unavailable, falling back to Tesseract",
351
+ flush=True,
352
+ )
325
353
 
326
354
  # Windows: try Windows.Media.Ocr
327
355
  if sys.platform == "win32" and backend in ("auto", "winocr"):
@@ -332,10 +360,15 @@ def create_ocr(config: dict):
332
360
  enabled=enabled,
333
361
  )
334
362
  if winocr._available:
335
- print(f"[ocr] using WinOCR backend (language={winocr.language})", flush=True)
363
+ print(
364
+ f"[ocr] using WinOCR backend (language={winocr.language})", flush=True
365
+ )
336
366
  return winocr
337
367
  if backend == "winocr":
338
- print("[ocr] WinOCR requested but unavailable, falling back to Tesseract", flush=True)
368
+ print(
369
+ "[ocr] WinOCR requested but unavailable, falling back to Tesseract",
370
+ flush=True,
371
+ )
339
372
 
340
373
  # Fallback to Tesseract (cross-platform)
341
374
  print("[ocr] using Tesseract backend", flush=True)
@@ -51,6 +51,8 @@ class SenseSender:
51
51
  "narrative": event.observation.narrative,
52
52
  "concepts": event.observation.concepts,
53
53
  }
54
+ if event.vision_cost:
55
+ payload["vision_cost"] = event.vision_cost
54
56
 
55
57
  for attempt in range(_MAX_RETRIES):
56
58
  try:
@@ -18,6 +18,7 @@ import json
18
18
  import logging
19
19
  import os
20
20
  import time
21
+ import uuid
21
22
  from abc import ABC, abstractmethod
22
23
  from typing import TYPE_CHECKING, Optional
23
24
 
@@ -27,14 +28,23 @@ if TYPE_CHECKING:
27
28
  logger = logging.getLogger("sinain.vision")
28
29
 
29
30
 
31
+ class VisionResult:
32
+ """Result of a vision call: text + optional cost info."""
33
+ __slots__ = ("text", "cost")
34
+
35
+ def __init__(self, text: Optional[str], cost: Optional[dict] = None):
36
+ self.text = text
37
+ self.cost = cost # {cost, tokens_in, tokens_out, model, cost_id}
38
+
39
+
30
40
  class VisionProvider(ABC):
31
41
  """Abstract base for vision inference backends."""
32
42
 
33
43
  name: str = "unknown"
34
44
 
35
45
  @abstractmethod
36
- def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
37
- """Describe image content. Returns None on failure."""
46
+ def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
47
+ """Describe image content. Returns VisionResult (text may be None on failure)."""
38
48
  ...
39
49
 
40
50
  @abstractmethod
@@ -53,8 +63,8 @@ class OllamaVisionProvider(VisionProvider):
53
63
  timeout=timeout, max_tokens=max_tokens)
54
64
  self.name = f"ollama ({model})"
55
65
 
56
- def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
57
- return self._client.describe(image, prompt)
66
+ def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
67
+ return VisionResult(self._client.describe(image, prompt))
58
68
 
59
69
  def is_available(self) -> bool:
60
70
  return self._client.is_available()
@@ -73,9 +83,9 @@ class OpenRouterVisionProvider(VisionProvider):
73
83
  self._max_tokens = max_tokens
74
84
  self.name = f"openrouter ({model})"
75
85
 
76
- def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> Optional[str]:
86
+ def describe(self, image: "Image.Image", prompt: Optional[str] = None) -> VisionResult:
77
87
  if not self._api_key:
78
- return None
88
+ return VisionResult(None)
79
89
 
80
90
  try:
81
91
  import requests
@@ -83,7 +93,7 @@ class OpenRouterVisionProvider(VisionProvider):
83
93
  # Encode image
84
94
  img_b64 = self._encode(image)
85
95
  if not img_b64:
86
- return None
96
+ return VisionResult(None)
87
97
 
88
98
  prompt_text = prompt or "Describe what's on this screen concisely (2-3 sentences)."
89
99
 
@@ -112,13 +122,23 @@ class OpenRouterVisionProvider(VisionProvider):
112
122
  resp.raise_for_status()
113
123
  data = resp.json()
114
124
  content = data["choices"][0]["message"]["content"].strip()
115
- logger.debug("openrouter vision: model=%s tokens=%s",
116
- self._model, data.get("usage", {}).get("total_tokens", "?"))
117
- return content if content else None
125
+ usage = data.get("usage", {})
126
+ logger.debug("openrouter vision: model=%s tokens=%s cost=%s",
127
+ self._model, usage.get("total_tokens", "?"), usage.get("cost", "?"))
128
+ cost_info = None
129
+ if usage.get("cost") is not None:
130
+ cost_info = {
131
+ "cost": usage["cost"],
132
+ "tokens_in": usage.get("prompt_tokens", 0),
133
+ "tokens_out": usage.get("completion_tokens", 0),
134
+ "model": self._model,
135
+ "cost_id": uuid.uuid4().hex[:16],
136
+ }
137
+ return VisionResult(content if content else None, cost_info)
118
138
 
119
139
  except Exception as e:
120
140
  logger.debug("openrouter vision failed: %s", e)
121
- return None
141
+ return VisionResult(None)
122
142
 
123
143
  def is_available(self) -> bool:
124
144
  return bool(self._api_key)
@@ -47,7 +47,6 @@ When responding to escalations:
47
47
 
48
48
  1. Call `sinain_heartbeat_tick` with a brief session summary
49
49
  2. The tool runs the full pipeline automatically:
50
- - Git backup of memory directory
51
50
  - Signal analysis (detects opportunities from session patterns)
52
51
  - **Session distillation** — fetches new feed items from sinain-core, distills patterns/learnings
53
52
  - **Knowledge integration** — updates playbook (working memory) and knowledge graph (long-term memory)
@@ -78,6 +78,7 @@ invoke_agent() {
78
78
  codex)
79
79
  codex exec -s danger-full-access \
80
80
  --dangerously-bypass-approvals-and-sandbox \
81
+ --skip-git-repo-check \
81
82
  "$prompt"
82
83
  ;;
83
84
  junie)
@@ -271,7 +272,7 @@ while true; do
271
272
  # MCP path: agent runs task with sinain tools available
272
273
  SPAWN_PROMPT="You have a background task to complete. Task: $SPAWN_TASK
273
274
 
274
- Complete this task thoroughly. Use sinain_get_knowledge and sinain_knowledge_query if you need context from past sessions. Use web search, file operations, and code execution as needed. Create end-to-end artifacts. Summarize your findings concisely."
275
+ Complete this task thoroughly. Use sinain_get_knowledge and sinain_knowledge_query if you need context from past sessions. Summarize your findings concisely."
275
276
  SPAWN_RESULT=$(invoke_agent "$SPAWN_PROMPT" "$SPAWN_MAX_TURNS" || echo "ERROR: agent invocation failed")
276
277
  else
277
278
  # Pipe path: agent gets task text directly
@@ -1,13 +1,10 @@
1
- import type { AgentConfig, AgentResult, ContextWindow, RecorderStatus, RecordCommand } from "../types.js";
1
+ import type { AnalysisConfig, AgentResult, ContextWindow, RecorderStatus, RecordCommand } from "../types.js";
2
2
  import { normalizeAppName } from "./context-window.js";
3
3
  import { log, error } from "../log.js";
4
4
  import { levelFor, applyLevel } from "../privacy/index.js";
5
5
 
6
6
  const TAG = "agent";
7
7
 
8
- /** Guard: only one Ollama vision call at a time (latest-wins, skip if busy). */
9
- let ollamaInFlight = false;
10
-
11
8
  /**
12
9
  * Model-specific timeouts in milliseconds.
13
10
  * Only increases timeouts for slow models to avoid false timeouts.
@@ -56,12 +53,13 @@ You produce outputs as JSON.
56
53
  Respond ONLY with valid JSON. No markdown, no code fences, no explanation.
57
54
  Your entire response must be parseable by JSON.parse().
58
55
 
59
- {"hud":"...","digest":"...","record":{"command":"start"|"stop","label":"..."}}
56
+ {"hud":"...","digest":"...","record":{"command":"start"|"stop","label":"..."},"task":"..."}
60
57
 
61
58
  Output fields:
62
59
  - "hud" (required): max 60 words describing what user is doing NOW
63
60
  - "digest" (required): 5-8 sentences with detailed activity description
64
61
  - "record" (optional): control recording — {"command":"start","label":"Meeting name"} or {"command":"stop"}
62
+ - "task" (optional): natural language instruction to spawn a background task
65
63
 
66
64
  When to use "record":
67
65
  - START when user begins a meeting, call, lecture, YouTube video, or important audio content
@@ -69,7 +67,24 @@ When to use "record":
69
67
  - Provide descriptive labels like "Team standup", "Client call", "YouTube: [video title from OCR]"
70
68
  - For YouTube/video content: extract video title from screen OCR for the label
71
69
 
72
- Do NOT set a "task" field — background tasks are spawned by user commands only.
70
+ When to use "task":
71
+ - User explicitly asks for research, lookup, or action
72
+ - Something needs external search or processing that isn't a real-time response
73
+ - Example: "Search for React 19 migration guide", "Find docs for this API"
74
+
75
+ When to spawn "task" for video content:
76
+ - If user watches a YouTube video for 2+ minutes AND no task has been spawned for this video yet, spawn: "Summarize YouTube video: [title or URL from OCR]"
77
+ - ONLY spawn ONCE per video - do not repeat spawn for the same video in subsequent ticks
78
+ - Extract video title or URL from screen OCR to include in the task
79
+
80
+ When to spawn "task" for coding problems:
81
+ - If user is actively working on a coding problem/challenge for 1+ minutes:
82
+ - Spawn: "Solve coding problem: [problem description/title from OCR]"
83
+ - This includes LeetCode, HackerRank, interviews, coding assessments, or any visible coding challenge
84
+ - Look for problem signals: "Input:", "Output:", "Example", "Constraints:", problem titles, test cases
85
+ - Include as much context as possible from the screen OCR (problem description, examples, constraints)
86
+ - ONLY spawn ONCE per distinct problem - do not repeat for the same problem
87
+ - The spawned task should provide a complete solution with code and explanation
73
88
 
74
89
  Audio sources: [\ud83d\udd0a]=system/speaker audio, [\ud83c\udf99]=microphone (user's voice).
75
90
  Treat [\ud83c\udf99] as direct user speech. Treat [\ud83d\udd0a] as external audio.
@@ -193,75 +208,54 @@ function parseTask(parsed: any): string | undefined {
193
208
  */
194
209
  export async function analyzeContext(
195
210
  contextWindow: ContextWindow,
196
- config: AgentConfig,
211
+ config: AnalysisConfig,
197
212
  recorderStatus: RecorderStatus | null = null,
198
213
  traitSystemPrompt?: string,
199
214
  ): Promise<AgentResult> {
200
215
  const userPrompt = buildUserPrompt(contextWindow, recorderStatus);
201
- // Apply privacy gating for images sent to OpenRouter
216
+
217
+ // Apply privacy gating for images based on provider
202
218
  let images = contextWindow.images || [];
219
+ const privacyDest = config.provider === "ollama" ? "local_llm" : "openrouter";
203
220
  try {
204
- const imgLevel = levelFor("screen_images", "openrouter");
205
- if (imgLevel === "none") {
206
- images = [];
207
- }
221
+ if (levelFor("screen_images", privacyDest) === "none") images = [];
208
222
  } catch { /* privacy not initialized, keep images */ }
223
+
209
224
  const systemPrompt = traitSystemPrompt ?? SYSTEM_PROMPT;
210
225
 
211
- // Try local Ollama first when enabled (handles both vision and text-only ticks)
212
- // Guard: skip if a previous Ollama call is still in-flight (avoids "no slots available")
213
- if (config.localVisionEnabled && !ollamaInFlight) {
214
- ollamaInFlight = true;
215
- try {
216
- const result = await callOllamaVision(systemPrompt, userPrompt, images, config);
217
- const mode = images.length > 0 ? "vision" : "text";
218
- log(TAG, `local ollama (${config.localVisionModel}, ${mode}): success`);
219
- return result;
220
- } catch (err: any) {
221
- log(TAG, `local ollama failed: ${err.message || err}, falling back to OpenRouter`);
222
- } finally {
223
- ollamaInFlight = false;
224
- }
226
+ if (config.provider === "ollama") {
227
+ return await callOllama(systemPrompt, userPrompt, images, config);
225
228
  }
226
229
 
227
- // Skip OpenRouter entirely if no API key (local-only mode)
228
- if (!config.openrouterApiKey) {
229
- if (config.localVisionEnabled) {
230
- throw new Error("local ollama failed and no OpenRouter API key — cannot analyze");
231
- }
232
- throw new Error("no OpenRouter API key configured");
230
+ // OpenRouter path: model chain with fallbacks
231
+ if (!config.apiKey) {
232
+ throw new Error("ANALYSIS_API_KEY / OPENROUTER_API_KEY not set");
233
233
  }
234
234
 
235
235
  const models = [config.model, ...config.fallbackModels];
236
-
237
- // Auto-upgrade: use vision model when images are present
238
- if (images.length > 0 && config.visionModel) {
239
- // Insert vision model at the front if not already there
240
- if (!models.includes(config.visionModel)) {
241
- models.unshift(config.visionModel);
242
- }
236
+ // Auto-upgrade to vision model when images are present
237
+ if (images.length > 0 && config.visionModel && !models.includes(config.visionModel)) {
238
+ models.unshift(config.visionModel);
243
239
  }
244
240
 
245
241
  let lastError: Error | null = null;
246
-
247
242
  for (const model of models) {
248
243
  try {
249
- return await callModel(systemPrompt, userPrompt, images, model, config);
244
+ return await callOpenRouter(systemPrompt, userPrompt, images, model, config);
250
245
  } catch (err: any) {
251
246
  lastError = err;
252
247
  log(TAG, `model ${model} failed: ${err.message || err}, trying next...`);
253
248
  }
254
249
  }
255
-
256
250
  throw lastError || new Error("all models failed");
257
251
  }
258
252
 
259
- async function callModel(
253
+ async function callOpenRouter(
260
254
  systemPrompt: string,
261
255
  userPrompt: string,
262
256
  images: ContextWindow["images"],
263
257
  model: string,
264
- config: AgentConfig,
258
+ config: AnalysisConfig,
265
259
  ): Promise<AgentResult> {
266
260
  const start = Date.now();
267
261
  const controller = new AbortController();
@@ -289,10 +283,10 @@ async function callModel(
289
283
 
290
284
  const imageCount = images?.length || 0;
291
285
 
292
- const response = await fetch("https://openrouter.ai/api/v1/chat/completions", {
286
+ const response = await fetch(config.endpoint, {
293
287
  method: "POST",
294
288
  headers: {
295
- "Authorization": `Bearer ${config.openrouterApiKey}`,
289
+ "Authorization": `Bearer ${config.apiKey}`,
296
290
  "Content-Type": "application/json",
297
291
  },
298
292
  body: JSON.stringify({
@@ -324,6 +318,7 @@ async function callModel(
324
318
  try {
325
319
  const jsonStr = raw.replace(/^```\w*\s*\n?/, "").replace(/\n?\s*```\s*$/, "").trim();
326
320
  const parsed = JSON.parse(jsonStr);
321
+ const apiCost = typeof data.usage?.cost === "number" ? data.usage.cost : undefined;
327
322
  return {
328
323
  hud: parsed.hud || "\u2014",
329
324
  digest: parsed.digest || "\u2014",
@@ -334,10 +329,12 @@ async function callModel(
334
329
  tokensOut: data.usage?.completion_tokens || 0,
335
330
  model,
336
331
  parsedOk: true,
332
+ cost: apiCost,
337
333
  };
338
334
  } catch {
339
335
  // Second chance: extract embedded JSON object
340
336
  const match = raw.match(/\{[\s\S]*\}/);
337
+ const apiCost = typeof data.usage?.cost === "number" ? data.usage.cost : undefined;
341
338
  if (match) {
342
339
  try {
343
340
  const parsed = JSON.parse(match[0]);
@@ -352,6 +349,7 @@ async function callModel(
352
349
  tokensOut: data.usage?.completion_tokens || 0,
353
350
  model,
354
351
  parsedOk: true,
352
+ cost: apiCost,
355
353
  };
356
354
  }
357
355
  } catch { /* fall through */ }
@@ -367,6 +365,7 @@ async function callModel(
367
365
  tokensOut: data.usage?.completion_tokens || 0,
368
366
  model,
369
367
  parsedOk: false,
368
+ cost: apiCost,
370
369
  };
371
370
  }
372
371
  } finally {
@@ -375,28 +374,27 @@ async function callModel(
375
374
  }
376
375
 
377
376
  /**
378
- * Call Ollama local vision model for image analysis.
379
- * Uses the /api/chat endpoint with base64 images.
380
- * Falls back to OpenRouter on any failure.
377
+ * Call Ollama local model for context analysis.
378
+ * Uses the /api/chat endpoint with optional base64 images.
381
379
  */
382
- async function callOllamaVision(
380
+ async function callOllama(
383
381
  systemPrompt: string,
384
382
  userPrompt: string,
385
383
  images: ContextWindow["images"],
386
- config: AgentConfig,
384
+ config: AnalysisConfig,
387
385
  ): Promise<AgentResult> {
388
386
  const start = Date.now();
389
387
  const controller = new AbortController();
390
- const timeout = setTimeout(() => controller.abort(), config.localVisionTimeout);
388
+ const timeout = setTimeout(() => controller.abort(), config.timeout);
391
389
 
392
390
  try {
393
391
  const imageB64List = (images || []).map((img) => img.data);
394
392
 
395
- const response = await fetch(`${config.localVisionUrl}/api/chat`, {
393
+ const response = await fetch(`${config.endpoint}/api/chat`, {
396
394
  method: "POST",
397
395
  headers: { "Content-Type": "application/json" },
398
396
  body: JSON.stringify({
399
- model: config.localVisionModel,
397
+ model: config.model,
400
398
  messages: [
401
399
  { role: "system", content: systemPrompt },
402
400
  { role: "user", content: userPrompt, images: imageB64List },
@@ -422,7 +420,7 @@ async function callOllamaVision(
422
420
  const tokensIn = data.prompt_eval_count || 0;
423
421
  const tokensOut = data.eval_count || 0;
424
422
 
425
- log(TAG, `ollama vision: model=${config.localVisionModel} latency=${latencyMs}ms tokens=${tokensIn}+${tokensOut}`);
423
+ log(TAG, `ollama vision: model=${config.model} latency=${latencyMs}ms tokens=${tokensIn}+${tokensOut}`);
426
424
 
427
425
  // Parse the response (same format as OpenRouter)
428
426
  // Parse JSON response (same logic as callOpenRouter)
@@ -436,7 +434,7 @@ async function callOllamaVision(
436
434
  task: parseTask(parsed),
437
435
  latencyMs,
438
436
  tokensIn, tokensOut,
439
- model: config.localVisionModel,
437
+ model: config.model,
440
438
  parsedOk: true,
441
439
  };
442
440
  } catch {
@@ -452,7 +450,7 @@ async function callOllamaVision(
452
450
  task: parseTask(parsed),
453
451
  latencyMs,
454
452
  tokensIn, tokensOut,
455
- model: config.localVisionModel,
453
+ model: config.model,
456
454
  parsedOk: true,
457
455
  };
458
456
  }
@@ -463,7 +461,7 @@ async function callOllamaVision(
463
461
  digest: content || "\u2014",
464
462
  latencyMs,
465
463
  tokensIn, tokensOut,
466
- model: config.localVisionModel,
464
+ model: config.model,
467
465
  parsedOk: false,
468
466
  };
469
467
  }