@geravant/sinain 1.0.19 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/cli.js +176 -0
- package/install.js +11 -2
- package/launcher.js +622 -0
- package/openclaw.plugin.json +4 -0
- package/pack-prepare.js +48 -0
- package/package.json +24 -5
- package/sense_client/README.md +82 -0
- package/sense_client/__init__.py +1 -0
- package/sense_client/__main__.py +462 -0
- package/sense_client/app_detector.py +54 -0
- package/sense_client/app_detector_win.py +83 -0
- package/sense_client/capture.py +215 -0
- package/sense_client/capture_win.py +88 -0
- package/sense_client/change_detector.py +86 -0
- package/sense_client/config.py +64 -0
- package/sense_client/gate.py +145 -0
- package/sense_client/ocr.py +347 -0
- package/sense_client/privacy.py +65 -0
- package/sense_client/requirements.txt +13 -0
- package/sense_client/roi_extractor.py +84 -0
- package/sense_client/sender.py +173 -0
- package/sense_client/tests/__init__.py +0 -0
- package/sense_client/tests/test_stream1_optimizations.py +234 -0
- package/setup-overlay.js +82 -0
- package/sinain-agent/.env.example +17 -0
- package/sinain-agent/CLAUDE.md +80 -0
- package/sinain-agent/mcp-config.json +12 -0
- package/sinain-agent/run.sh +248 -0
- package/sinain-core/.env.example +93 -0
- package/sinain-core/package-lock.json +552 -0
- package/sinain-core/package.json +21 -0
- package/sinain-core/src/agent/analyzer.ts +366 -0
- package/sinain-core/src/agent/context-window.ts +172 -0
- package/sinain-core/src/agent/loop.ts +404 -0
- package/sinain-core/src/agent/situation-writer.ts +187 -0
- package/sinain-core/src/agent/traits.ts +520 -0
- package/sinain-core/src/audio/capture-spawner-macos.ts +44 -0
- package/sinain-core/src/audio/capture-spawner-win.ts +37 -0
- package/sinain-core/src/audio/capture-spawner.ts +14 -0
- package/sinain-core/src/audio/pipeline.ts +335 -0
- package/sinain-core/src/audio/transcription-local.ts +141 -0
- package/sinain-core/src/audio/transcription.ts +278 -0
- package/sinain-core/src/buffers/feed-buffer.ts +71 -0
- package/sinain-core/src/buffers/sense-buffer.ts +425 -0
- package/sinain-core/src/config.ts +245 -0
- package/sinain-core/src/escalation/escalation-slot.ts +136 -0
- package/sinain-core/src/escalation/escalator.ts +812 -0
- package/sinain-core/src/escalation/message-builder.ts +323 -0
- package/sinain-core/src/escalation/openclaw-ws.ts +726 -0
- package/sinain-core/src/escalation/scorer.ts +166 -0
- package/sinain-core/src/index.ts +507 -0
- package/sinain-core/src/learning/feedback-store.ts +253 -0
- package/sinain-core/src/learning/signal-collector.ts +218 -0
- package/sinain-core/src/log.ts +24 -0
- package/sinain-core/src/overlay/commands.ts +126 -0
- package/sinain-core/src/overlay/ws-handler.ts +267 -0
- package/sinain-core/src/privacy/index.ts +18 -0
- package/sinain-core/src/privacy/presets.ts +40 -0
- package/sinain-core/src/privacy/redact.ts +92 -0
- package/sinain-core/src/profiler.ts +181 -0
- package/sinain-core/src/recorder.ts +186 -0
- package/sinain-core/src/server.ts +417 -0
- package/sinain-core/src/trace/trace-store.ts +73 -0
- package/sinain-core/src/trace/tracer.ts +94 -0
- package/sinain-core/src/types.ts +427 -0
- package/sinain-core/src/util/dedup.ts +48 -0
- package/sinain-core/src/util/task-store.ts +84 -0
- package/sinain-core/tsconfig.json +18 -0
- package/sinain-knowledge/data/git-store.ts +2 -0
- package/sinain-mcp-server/index.ts +337 -0
- package/sinain-mcp-server/package.json +19 -0
- package/sinain-mcp-server/tsconfig.json +15 -0
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
import type { AgentConfig, AgentResult, ContextWindow, RecorderStatus, RecordCommand } from "../types.js";
|
|
2
|
+
import { normalizeAppName } from "./context-window.js";
|
|
3
|
+
import { log, error } from "../log.js";
|
|
4
|
+
import { levelFor, applyLevel } from "../privacy/index.js";
|
|
5
|
+
|
|
6
|
+
const TAG = "agent";
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Model-specific timeouts in milliseconds.
|
|
10
|
+
* Only increases timeouts for slow models to avoid false timeouts.
|
|
11
|
+
* Default 15s is kept for fast models.
|
|
12
|
+
*/
|
|
13
|
+
const MODEL_TIMEOUTS: Record<string, number> = {
|
|
14
|
+
'google/gemini-2.5-flash-lite': 15000,
|
|
15
|
+
'google/gemini-2.5-flash': 15000,
|
|
16
|
+
'google/gemini-2.0-flash': 15000,
|
|
17
|
+
'anthropic/claude-3-opus': 60000,
|
|
18
|
+
'anthropic/claude-3.5-sonnet': 30000,
|
|
19
|
+
'anthropic/claude-3-haiku': 15000,
|
|
20
|
+
'default': 15000,
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
/** Get timeout for a specific model. */
|
|
24
|
+
function getModelTimeout(model: string): number {
|
|
25
|
+
return MODEL_TIMEOUTS[model] ?? MODEL_TIMEOUTS['default'];
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
/**
 * Message part for multimodal API calls.
 * OpenAI-style chat content-part shape as accepted by OpenRouter: plain
 * text segments plus base64 data-URL images. Detail is fixed to "low"
 * here — presumably to cap vision token cost; see callModel().
 */
type ContentPart =
  | { type: "text"; text: string }
  | { type: "image_url"; image_url: { url: string; detail: "low" } };
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Build recorder status section for the prompt.
|
|
35
|
+
*/
|
|
36
|
+
function buildRecorderSection(status: RecorderStatus | null): string {
|
|
37
|
+
if (!status) return "";
|
|
38
|
+
if (!status.recording) return "\nRecorder: idle (not recording)";
|
|
39
|
+
|
|
40
|
+
const label = status.label ? ` "${status.label}"` : "";
|
|
41
|
+
const durationSec = Math.round(status.durationMs / 1000);
|
|
42
|
+
return `\nRecorder: RECORDING${label} (${durationSec}s, ${status.segments} segments)`;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/**
 * Static system prompt (cached as module constant).
 * Contains rules, output format, and behavioral instructions.
 * Previously allocated ~3KB per tick; now zero-allocation.
 *
 * The model is instructed to answer with a single JSON object
 * ({hud, digest, record?, task?}); callModel() parses it with JSON.parse
 * and falls back to raw text when parsing fails. The escaped surrogate
 * pairs (\ud83d\udd0a and \ud83c\udf99) are the speaker/mic emoji markers
 * used to tag audio sources in transcripts. This text is part of runtime
 * behavior — do not reword without re-validating model output parsing.
 */
const SYSTEM_PROMPT = `You are an AI monitoring a user's screen and audio in real-time.
You produce outputs as JSON.

Respond ONLY with valid JSON. No markdown, no code fences, no explanation.
Your entire response must be parseable by JSON.parse().

{"hud":"...","digest":"...","record":{"command":"start"|"stop","label":"..."},"task":"..."}

Output fields:
- "hud" (required): max 60 words describing what user is doing NOW
- "digest" (required): 5-8 sentences with detailed activity description
- "record" (optional): control recording — {"command":"start","label":"Meeting name"} or {"command":"stop"}
- "task" (optional): natural language instruction to spawn a background task

When to use "record":
- START when user begins a meeting, call, lecture, YouTube video, or important audio content
- STOP when the content ends or user navigates away
- Provide descriptive labels like "Team standup", "Client call", "YouTube: [video title from OCR]"
- For YouTube/video content: extract video title from screen OCR for the label

When to use "task":
- User explicitly asks for research, lookup, or action
- Something needs external search or processing that isn't a real-time response
- Example: "Search for React 19 migration guide", "Find docs for this API"

When to spawn "task" for video content:
- If user watches a YouTube video for 2+ minutes AND no task has been spawned for this video yet, spawn: "Summarize YouTube video: [title or URL from OCR]"
- ONLY spawn ONCE per video - do not repeat spawn for the same video in subsequent ticks
- Extract video title or URL from screen OCR to include in the task

When to spawn "task" for coding problems:
- If user is actively working on a coding problem/challenge for 1+ minutes:
- Spawn: "Solve coding problem: [problem description/title from OCR]"
- This includes LeetCode, HackerRank, interviews, coding assessments, or any visible coding challenge
- Look for problem signals: "Input:", "Output:", "Example", "Constraints:", problem titles, test cases
- Include as much context as possible from the screen OCR (problem description, examples, constraints)
- ONLY spawn ONCE per distinct problem - do not repeat for the same problem
- The spawned task should provide a complete solution with code and explanation

Audio sources: [\ud83d\udd0a]=system/speaker audio, [\ud83c\udf99]=microphone (user's voice).
Treat [\ud83c\udf99] as direct user speech. Treat [\ud83d\udd0a] as external audio.

Rules:
- "hud" is for a minimal overlay display. Example: "Editing hud-relay.mjs in IDEA"
- "digest" is for an AI assistant to understand the full situation and offer help.
- If nothing is happening, hud="Idle" and digest explains what was last seen.
- Include specific filenames, URLs, error messages, UI text from OCR in digest.
- Do NOT suggest actions in digest — just describe the situation factually.
- Only include "record" or "task" when genuinely appropriate — most responses won't have them.
- CRITICAL: Output ONLY the JSON object, nothing else.`;
|
|
100
|
+
|
|
101
|
+
/**
 * Build the dynamic user prompt (changes every tick).
 * Contains the current context data: screen OCR, audio transcripts, app state.
 *
 * Screen OCR, window titles, and audio text are each redacted through the
 * privacy layer (levelFor/applyLevel) for the "openrouter" destination; if
 * the privacy module throws (not yet initialized), the raw text is used.
 * Images are not embedded here — only a note that screenshots are attached,
 * gated by the "screen_images" privacy level.
 *
 * @param ctx            Context window (screen/audio entries, app history, preset caps).
 * @param recorderStatus Recorder state rendered via buildRecorderSection(); null = omit.
 * @returns Multi-line prompt text consumed as the "user" chat message.
 */
function buildUserPrompt(ctx: ContextWindow, recorderStatus: RecorderStatus | null = null): string {
  const now = Date.now();

  // Privacy gating: check levels for openrouter destination
  let screenLines: string;
  try {
    const ocrLevel = levelFor("screen_ocr", "openrouter");
    const titlesLevel = levelFor("window_titles", "openrouter");
    screenLines = ctx.screen
      .map(e => {
        const app = normalizeAppName(e.meta.app);
        // Age in seconds; a missing timestamp renders as 0s ago.
        const ago = Math.round((now - (e.ts || now)) / 1000);
        // Flatten newlines and cap length per the richness preset before redaction.
        const rawOcr = e.ocr ? e.ocr.replace(/\n/g, " ").slice(0, ctx.preset.maxOcrChars) : "(no text)";
        const ocr = e.ocr ? applyLevel(rawOcr, ocrLevel, "ocr") : "(no text)";
        const title = e.meta.windowTitle ? applyLevel(e.meta.windowTitle, titlesLevel, "titles") : "";
        const titlePart = title ? ` [${title}]` : "";
        // NOTE(review): the `|| "(no text)"` also covers applyLevel returning "" — confirm intended.
        return `[${ago}s ago] [${app}]${titlePart} ${ocr || "(no text)"}`;
      })
      .join("\n");
  } catch {
    // Privacy not yet initialized — use full text
    screenLines = ctx.screen
      .map(e => {
        const app = normalizeAppName(e.meta.app);
        const ago = Math.round((now - (e.ts || now)) / 1000);
        const ocr = e.ocr ? e.ocr.replace(/\n/g, " ").slice(0, ctx.preset.maxOcrChars) : "(no text)";
        return `[${ago}s ago] [${app}] ${ocr}`;
      })
      .join("\n");
  }

  // Audio transcript lines, redacted when the privacy layer is available.
  let audioLines: string;
  try {
    const audioLevel = levelFor("audio_transcript", "openrouter");
    audioLines = ctx.audio
      .map(e => {
        const ago = Math.round((now - (e.ts || now)) / 1000);
        const text = applyLevel(e.text.slice(0, ctx.preset.maxTranscriptChars), audioLevel, "audio");
        return `[${ago}s ago] ${text}`;
      })
      .join("\n");
  } catch {
    // Privacy not yet initialized — use the raw (still length-capped) transcript.
    audioLines = ctx.audio
      .map(e => {
        const ago = Math.round((now - (e.ts || now)) / 1000);
        return `[${ago}s ago] ${e.text.slice(0, ctx.preset.maxTranscriptChars)}`;
      })
      .join("\n");
  }

  // App transition timeline, e.g. "Chrome → IDEA → Terminal".
  const appSwitches = ctx.appHistory
    .map(a => normalizeAppName(a.app))
    .join(" \u2192 ");

  const recorderSection = buildRecorderSection(recorderStatus);

  // Gate images based on privacy level
  let imagesForPrompt = ctx.images;
  try {
    const imgLevel = levelFor("screen_images", "openrouter");
    if (imgLevel === "none") {
      imagesForPrompt = [];
    }
  } catch { /* privacy not initialized, keep images */ }

  // Only a textual note — the actual image parts are attached in callModel().
  const hasImages = imagesForPrompt && imagesForPrompt.length > 0;
  const imageNote = hasImages ? `\n\nScreen screenshots (${imagesForPrompt!.length}) are attached below.` : "";

  return `Active app: ${normalizeAppName(ctx.currentApp)}
App history: ${appSwitches || "(none)"}${recorderSection}

Screen (OCR text, newest first):
${screenLines || "(no screen data)"}

Audio transcript (newest first, \ud83d\udd0a=system, \ud83c\udf99=mic):
${audioLines || "(silence)"}${imageNote}`;
}
|
|
182
|
+
|
|
183
|
+
/**
|
|
184
|
+
* Parse record command from LLM response.
|
|
185
|
+
*/
|
|
186
|
+
function parseRecord(parsed: any): RecordCommand | undefined {
|
|
187
|
+
if (!parsed.record || typeof parsed.record !== "object") return undefined;
|
|
188
|
+
const cmd = parsed.record.command;
|
|
189
|
+
if (cmd !== "start" && cmd !== "stop") return undefined;
|
|
190
|
+
return {
|
|
191
|
+
command: cmd,
|
|
192
|
+
label: typeof parsed.record.label === "string" ? parsed.record.label : undefined,
|
|
193
|
+
};
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* Parse task from LLM response.
|
|
198
|
+
*/
|
|
199
|
+
function parseTask(parsed: any): string | undefined {
|
|
200
|
+
if (typeof parsed.task !== "string" || !parsed.task.trim()) return undefined;
|
|
201
|
+
return parsed.task.trim();
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
/**
|
|
205
|
+
* Call the LLM (OpenRouter) to analyze the context window.
|
|
206
|
+
* Supports model chain: primary + fallbacks.
|
|
207
|
+
* When images are present, auto-upgrades to the vision model.
|
|
208
|
+
*/
|
|
209
|
+
export async function analyzeContext(
|
|
210
|
+
contextWindow: ContextWindow,
|
|
211
|
+
config: AgentConfig,
|
|
212
|
+
recorderStatus: RecorderStatus | null = null,
|
|
213
|
+
traitSystemPrompt?: string,
|
|
214
|
+
): Promise<AgentResult> {
|
|
215
|
+
const userPrompt = buildUserPrompt(contextWindow, recorderStatus);
|
|
216
|
+
// Apply privacy gating for images sent to OpenRouter
|
|
217
|
+
let images = contextWindow.images || [];
|
|
218
|
+
try {
|
|
219
|
+
const imgLevel = levelFor("screen_images", "openrouter");
|
|
220
|
+
if (imgLevel === "none") {
|
|
221
|
+
images = [];
|
|
222
|
+
}
|
|
223
|
+
} catch { /* privacy not initialized, keep images */ }
|
|
224
|
+
const systemPrompt = traitSystemPrompt ?? SYSTEM_PROMPT;
|
|
225
|
+
|
|
226
|
+
const models = [config.model, ...config.fallbackModels];
|
|
227
|
+
|
|
228
|
+
// Auto-upgrade: use vision model when images are present
|
|
229
|
+
if (images.length > 0 && config.visionModel) {
|
|
230
|
+
// Insert vision model at the front if not already there
|
|
231
|
+
if (!models.includes(config.visionModel)) {
|
|
232
|
+
models.unshift(config.visionModel);
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
let lastError: Error | null = null;
|
|
237
|
+
|
|
238
|
+
for (const model of models) {
|
|
239
|
+
try {
|
|
240
|
+
return await callModel(systemPrompt, userPrompt, images, model, config);
|
|
241
|
+
} catch (err: any) {
|
|
242
|
+
lastError = err;
|
|
243
|
+
log(TAG, `model ${model} failed: ${err.message || err}, trying next...`);
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
throw lastError || new Error("all models failed");
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
/**
 * Execute one OpenRouter chat-completion request and parse the result.
 *
 * The request is aborted after a per-model timeout (getModelTimeout).
 * Screenshots, when present, are attached as base64 JPEG data URLs with
 * "low" detail. The reply is parsed in three stages: direct JSON.parse
 * (after stripping markdown code fences), then extracting an embedded
 * {...} object, and finally falling back to the raw text with
 * parsedOk=false.
 *
 * @throws on non-2xx HTTP status, network failure, or timeout (AbortError);
 *         the caller (analyzeContext) treats any throw as "try next model".
 */
async function callModel(
  systemPrompt: string,
  userPrompt: string,
  images: ContextWindow["images"],
  model: string,
  config: AgentConfig,
): Promise<AgentResult> {
  const start = Date.now();
  const controller = new AbortController();
  const timeoutMs = getModelTimeout(model);
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    // Build user message content: text + optional images
    let userContent: string | ContentPart[];
    if (images && images.length > 0) {
      const parts: ContentPart[] = [{ type: "text", text: userPrompt }];
      for (const img of images) {
        parts.push({
          type: "image_url",
          image_url: {
            // img.data is assumed to be base64-encoded JPEG — produced upstream.
            url: `data:image/jpeg;base64,${img.data}`,
            detail: "low",
          },
        });
      }
      userContent = parts;
    } else {
      userContent = userPrompt;
    }

    const imageCount = images?.length || 0;

    const response = await fetch("https://openrouter.ai/api/v1/chat/completions", {
      method: "POST",
      headers: {
        "Authorization": `Bearer ${config.openrouterApiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: userContent },
        ],
        max_tokens: config.maxTokens,
        temperature: config.temperature,
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      // Include a truncated body for diagnostics; body read errors are ignored.
      const body = await response.text().catch(() => "");
      throw new Error(`HTTP ${response.status}: ${body.slice(0, 200)}`);
    }

    const data = await response.json() as any;
    const latencyMs = Date.now() - start;
    const raw = data.choices?.[0]?.message?.content?.trim() || "";

    if (imageCount > 0) {
      log(TAG, `multimodal call: model=${model}, images=${imageCount}`);
    }

    // Parse JSON response — try direct parse, then extract embedded JSON, then fallback
    try {
      // Strip a leading/trailing markdown code fence, if the model added one.
      const jsonStr = raw.replace(/^```\w*\s*\n?/, "").replace(/\n?\s*```\s*$/, "").trim();
      const parsed = JSON.parse(jsonStr);
      return {
        hud: parsed.hud || "\u2014",
        digest: parsed.digest || "\u2014",
        record: parseRecord(parsed),
        task: parseTask(parsed),
        latencyMs,
        tokensIn: data.usage?.prompt_tokens || 0,
        tokensOut: data.usage?.completion_tokens || 0,
        model,
        parsedOk: true,
      };
    } catch {
      // Second chance: extract embedded JSON object
      const match = raw.match(/\{[\s\S]*\}/);
      if (match) {
        try {
          const parsed = JSON.parse(match[0]);
          // Only accept the extracted object when it carries a hud field.
          if (parsed.hud) {
            return {
              hud: parsed.hud,
              digest: parsed.digest || "\u2014",
              record: parseRecord(parsed),
              task: parseTask(parsed),
              latencyMs,
              tokensIn: data.usage?.prompt_tokens || 0,
              tokensOut: data.usage?.completion_tokens || 0,
              model,
              parsedOk: true,
            };
          }
        } catch { /* fall through */ }
      }

      // Final fallback: use raw text
      log(TAG, `JSON parse failed (model=${model}), raw: "${raw.slice(0, 120)}"`);
      return {
        hud: raw.slice(0, 160) || "\u2014",
        digest: raw || "\u2014",
        latencyMs,
        tokensIn: data.usage?.prompt_tokens || 0,
        tokensOut: data.usage?.completion_tokens || 0,
        model,
        parsedOk: false,
      };
    }
  } finally {
    // Always cancel the abort timer, on success and on every throw.
    clearTimeout(timeout);
  }
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import type { FeedBuffer } from "../buffers/feed-buffer.js";
|
|
2
|
+
import type { SenseBuffer } from "../buffers/sense-buffer.js";
|
|
3
|
+
import type { ContextWindow, ContextRichness, RichnessPreset } from "../types.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Track recently sent image hashes to avoid sending duplicates to vision model.
 * Uses simple content hash: length + first 1000 chars.
 *
 * Module-level state shared by every buildContextWindow() call.
 * NOTE(review): eviction is FIFO (push/shift, no refresh on hit), not true
 * LRU as the comment in buildContextWindow suggests — confirm intent.
 */
const recentlySentImageHashes = new Set<string>();
// Upper bound on remembered hashes before the oldest entries are evicted.
const MAX_IMAGE_HASH_CACHE = 20;
// Insertion-ordered hash list used as the eviction queue for the Set above.
let imageHashCacheOrder: string[] = [];
|
|
12
|
+
|
|
13
|
+
/**
 * Richness presets — control how much context goes into agent analysis and escalation.
 *
 * lean: For selective mode. Minimal context, fast + cheap.
 * standard: For focus mode. Moderate detail.
 * rich: Full context. Maximum detail for thorough agent analysis.
 *
 * Each preset caps event counts (screen/audio), per-entry character budgets
 * (OCR/transcript), and how many screenshots may accompany a tick
 * (maxImages: 0 disables vision entirely).
 * NOTE(review): the explicit Record annotation widens the literal, so the
 * trailing `as const` has no effect on the inferred type — harmless, but
 * consider `satisfies Record<...>` if literal types are wanted.
 */
export const RICHNESS_PRESETS: Record<ContextRichness, RichnessPreset> = {
  lean: { maxScreenEvents: 10, maxAudioEntries: 5, maxOcrChars: 400, maxTranscriptChars: 400, maxImages: 0 },
  standard: { maxScreenEvents: 20, maxAudioEntries: 10, maxOcrChars: 1000, maxTranscriptChars: 800, maxImages: 1 },
  rich: { maxScreenEvents: 50, maxAudioEntries: 30, maxOcrChars: 4000, maxTranscriptChars: 2000, maxImages: 2 },
} as const;
|
|
25
|
+
|
|
26
|
+
/** App name normalization map (consistent display names). */
|
|
27
|
+
const APP_NAMES: Record<string, string> = {
|
|
28
|
+
"idea": "IntelliJ IDEA",
|
|
29
|
+
"code": "VS Code",
|
|
30
|
+
"code - insiders": "VS Code Insiders",
|
|
31
|
+
"webstorm": "WebStorm",
|
|
32
|
+
"pycharm": "PyCharm",
|
|
33
|
+
"datagrip": "DataGrip",
|
|
34
|
+
"google chrome": "Chrome",
|
|
35
|
+
"firefox": "Firefox",
|
|
36
|
+
"safari": "Safari",
|
|
37
|
+
"telegram lite": "Telegram",
|
|
38
|
+
"telegram": "Telegram",
|
|
39
|
+
"iterm2": "iTerm",
|
|
40
|
+
"terminal": "Terminal",
|
|
41
|
+
"finder": "Finder",
|
|
42
|
+
"audio midi setup": "Audio MIDI Setup",
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
export function normalizeAppName(app: string): string {
|
|
46
|
+
return APP_NAMES[app.toLowerCase()] || app;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/** Short app names for overlay feed (compact display). */
|
|
50
|
+
const APP_SHORT_NAMES: Record<string, string> = {
|
|
51
|
+
"IntelliJ IDEA": "IDEA",
|
|
52
|
+
"IntelliJ IDEA Ultimate": "IDEA",
|
|
53
|
+
"idea": "IDEA",
|
|
54
|
+
"Google Chrome": "Chrome",
|
|
55
|
+
"Visual Studio Code": "Code",
|
|
56
|
+
"Code - Insiders": "Code",
|
|
57
|
+
"iTerm2": "iTerm",
|
|
58
|
+
"Terminal": "Term",
|
|
59
|
+
"Telegram": "TG",
|
|
60
|
+
"WebStorm": "WS",
|
|
61
|
+
"PyCharm": "PyCharm",
|
|
62
|
+
"DataGrip": "DG",
|
|
63
|
+
"Finder": "Finder",
|
|
64
|
+
};
|
|
65
|
+
|
|
66
|
+
export function shortAppName(app: string): string {
|
|
67
|
+
if (APP_SHORT_NAMES[app]) return APP_SHORT_NAMES[app];
|
|
68
|
+
const lower = app.toLowerCase();
|
|
69
|
+
for (const [key, value] of Object.entries(APP_SHORT_NAMES)) {
|
|
70
|
+
if (key.toLowerCase() === lower) return value;
|
|
71
|
+
}
|
|
72
|
+
return app;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
 * Build a unified context window from in-process buffers.
 * Replaces both relay's buildContextWindow() and bridge's ContextManager.
 *
 * No HTTP round-trips — direct access to feed and sense buffers.
 *
 * Side effect: consumes the module-level image-hash cache, so an identical
 * screenshot is attached at most once across recent calls (the hash cache
 * holds the last MAX_IMAGE_HASH_CACHE hashes, evicted oldest-first).
 *
 * @param feedBuffer Source of audio-transcript feed items.
 * @param senseBuffer Source of screen/OCR sense events and screenshots.
 * @param richness Preset key controlling event counts and image budget.
 * @param maxAgeMs Look-back window; events older than this are excluded.
 */
export function buildContextWindow(
  feedBuffer: FeedBuffer,
  senseBuffer: SenseBuffer,
  richness: ContextRichness = "standard",
  maxAgeMs = 120_000,
): ContextWindow {
  const preset = RICHNESS_PRESETS[richness];
  const cutoff = Date.now() - maxAgeMs;

  // Audio: extract transcript text from feed items tagged as 'audio'
  const audioItems = feedBuffer.queryBySource("audio", cutoff)
    .slice(-preset.maxAudioEntries);

  // Screen: get sense events within the time window
  const screenEvents = senseBuffer.queryByTime(cutoff);

  // Current app — taken from the newest sense event (buffer is assumed
  // chronologically ordered; TODO confirm against SenseBuffer).
  const latestSense = screenEvents[screenEvents.length - 1];
  const currentApp = latestSense?.meta.app || "unknown";

  // Deduplicate OCR text (consecutive identical OCR is noise)
  const dedupedScreen = [];
  let lastOcr = "";
  for (const e of screenEvents) {
    if (e.ocr && e.ocr !== lastOcr) {
      dedupedScreen.push(e);
      lastOcr = e.ocr;
    } else if (!e.ocr && e.type === "context") {
      // Keep OCR-less "context" events (e.g. app-switch markers) as well.
      dedupedScreen.push(e);
    }
  }

  // App transition timeline
  const appHistory = senseBuffer.appHistory(cutoff);

  // Limit to preset maximums, newest first for recency weighting
  const sortedScreen = dedupedScreen.slice(-preset.maxScreenEvents).reverse();

  // Compute newest event timestamp
  const newestEventTs = Math.max(
    audioItems[audioItems.length - 1]?.ts || 0,
    screenEvents[screenEvents.length - 1]?.ts || 0
  );

  // Extract recent images for multimodal vision (with content-based deduplication)
  let images: { data: string; app: string; ts: number }[] | undefined;
  if (preset.maxImages > 0) {
    const rawImages = senseBuffer.recentImages(preset.maxImages);
    images = [];

    for (const e of rawImages) {
      if (!e.imageData) continue;

      // Simple content hash: length + first 1000 chars
      const hash = `${e.imageData.length}:${e.imageData.slice(0, 1000)}`;

      // Skip if recently sent to vision model (avoid duplicate API calls)
      if (recentlySentImageHashes.has(hash)) {
        continue;
      }

      // Track this hash (LRU eviction)
      // NOTE(review): eviction is actually FIFO — hits above don't refresh order.
      recentlySentImageHashes.add(hash);
      imageHashCacheOrder.push(hash);
      while (imageHashCacheOrder.length > MAX_IMAGE_HASH_CACHE) {
        const oldest = imageHashCacheOrder.shift()!;
        recentlySentImageHashes.delete(oldest);
      }

      images.push({
        data: e.imageData,
        app: e.meta.app || "unknown",
        ts: e.ts,
      });
    }

    // All candidates were duplicates — omit the images field entirely.
    if (images.length === 0) images = undefined;
  }

  return {
    audio: audioItems,
    screen: sortedScreen,
    images,
    currentApp,
    appHistory,
    audioCount: audioItems.length,
    screenCount: screenEvents.length,
    windowMs: maxAgeMs,
    newestEventTs,
    preset,
  };
}
|