@geravant/sinain 1.8.0 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -6
- package/cli.js +16 -2
- package/config-shared.js +469 -0
- package/config.js +152 -0
- package/launcher.js +7 -1
- package/onboard.js +345 -0
- package/package.json +8 -2
- package/sense_client/__main__.py +8 -4
- package/sense_client/gate.py +1 -0
- package/sense_client/ocr.py +52 -22
- package/sense_client/sender.py +2 -0
- package/sense_client/vision.py +31 -11
- package/sinain-agent/.env.example +23 -0
- package/sinain-agent/run.sh +7 -12
- package/sinain-core/src/agent/analyzer.ts +25 -2
- package/sinain-core/src/agent/loop.ts +26 -1
- package/sinain-core/src/audio/transcription.ts +20 -5
- package/sinain-core/src/config.ts +3 -2
- package/sinain-core/src/cost/tracker.ts +64 -0
- package/sinain-core/src/escalation/escalator.ts +31 -59
- package/sinain-core/src/index.ts +41 -45
- package/sinain-core/src/overlay/commands.ts +12 -9
- package/sinain-core/src/overlay/ws-handler.ts +27 -3
- package/sinain-core/src/server.ts +41 -0
- package/sinain-core/src/types.ts +33 -1
package/launcher.js
CHANGED
|
@@ -685,7 +685,13 @@ function loadUserEnv() {
|
|
|
685
685
|
const eq = trimmed.indexOf("=");
|
|
686
686
|
if (eq === -1) continue;
|
|
687
687
|
const key = trimmed.slice(0, eq).trim();
|
|
688
|
-
|
|
688
|
+
let val = trimmed.slice(eq + 1).trim();
|
|
689
|
+
if ((val.startsWith('"') && val.endsWith('"')) || (val.startsWith("'") && val.endsWith("'"))) {
|
|
690
|
+
val = val.slice(1, -1);
|
|
691
|
+
} else {
|
|
692
|
+
const ci = val.search(/\s+#/);
|
|
693
|
+
if (ci !== -1) val = val.slice(0, ci).trimEnd();
|
|
694
|
+
}
|
|
689
695
|
// Don't override existing env vars
|
|
690
696
|
if (!process.env[key]) {
|
|
691
697
|
process.env[key] = val;
|
package/onboard.js
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* sinain onboard — interactive setup wizard using @clack/prompts
|
|
4
|
+
* Modeled after `openclaw onboard` for a familiar, polished experience.
|
|
5
|
+
*/
|
|
6
|
+
import * as p from "@clack/prompts";
|
|
7
|
+
import fs from "fs";
|
|
8
|
+
import path from "path";
|
|
9
|
+
import { execFileSync } from "child_process";
|
|
10
|
+
import {
|
|
11
|
+
c, guard, maskKey, readEnv, writeEnv, summarizeConfig, runHealthCheck,
|
|
12
|
+
stepApiKey, stepTranscription, stepGateway, stepPrivacy, stepModel,
|
|
13
|
+
HOME, SINAIN_DIR, ENV_PATH, PKG_DIR, IS_WINDOWS, IS_MAC,
|
|
14
|
+
} from "./config-shared.js";
|
|
15
|
+
|
|
16
|
+
// ── Header ──────────────────────────────────────────────────────────────────
|
|
17
|
+
|
|
18
|
+
/**
 * Print the boxed "SINAIN HUD" banner that opens the onboarding wizard.
 *
 * The banner is framed by blank lines above and below. Box borders are bold,
 * the logo rows are cyan and the tagline is dimmed, all via the `c` color
 * helpers.
 *
 * NOTE(review): the padding inside these literals looks collapsed in this
 * view of the file — confirm the spacing against the published package
 * before relying on the box alignment.
 */
function printHeader() {
  const leftWall = c.bold(" │");
  const rightWall = c.bold("│");
  // Wrap pre-colored inner content between the left and right box walls.
  const boxed = (inner) => leftWall + inner + rightWall;

  const bannerRows = [
    c.bold(" ┌─────────────────────────────────────────┐"),
    c.bold(" │ │"),
    boxed(c.cyan(" ╔═╗╦╔╗╔╔═╗╦╔╗╔ ╦ ╦╦ ╦╔╦╗ ")),
    boxed(c.cyan(" ╚═╗║║║║╠═╣║║║║ ╠═╣║ ║ ║║ ")),
    boxed(c.cyan(" ╚═╝╩╝╚╝╩ ╩╩╝╚╝ ╩ ╩╚═╝═╩╝ ")),
    boxed(c.dim(" Privacy-first AI overlay ")),
    c.bold(" │ │"),
    c.bold(" └─────────────────────────────────────────┘"),
  ];

  console.log();
  for (const row of bannerRows) {
    console.log(row);
  }
  console.log();
}
|
|
30
|
+
|
|
31
|
+
// ── Steps (imported from config-shared.js) ──────────────────────────────────
|
|
32
|
+
// stepApiKey, stepTranscription, stepGateway, stepPrivacy, stepModel
|
|
33
|
+
// are imported above and accept an optional label parameter.
|
|
34
|
+
|
|
35
|
+
/**
 * Wizard step: offer to install the overlay app (pre-built download or build
 * from source), or skip.
 *
 * When an overlay bundle is already present under `SINAIN_DIR/overlay`, the
 * "skip" option becomes the default and its hint names the file that was
 * actually found (SinainHUD.app or sinain_hud.exe).
 *
 * Installation failures are reported through the spinner and a note; they
 * are never thrown, so the wizard continues.
 *
 * @param {object} existing - Previously saved config values. Currently unused
 *   by this step; kept so the call signature matches the other steps.
 * @returns {Promise<void>}
 */
async function stepOverlay(existing) {
  // Known install locations for the overlay binary/bundle.
  const overlayPaths = [
    path.join(SINAIN_DIR, "overlay", "SinainHUD.app"),
    path.join(SINAIN_DIR, "overlay", "sinain_hud.exe"),
  ];
  // `candidate` (not `p`) so the @clack/prompts namespace import is not shadowed.
  const installedPath = overlayPaths.find((candidate) => fs.existsSync(candidate));
  const overlayInstalled = installedPath !== undefined;

  const choice = guard(await p.select({
    message: "Install overlay",
    options: [
      {
        value: "download",
        label: "Download pre-built app (recommended)",
        hint: "No Flutter SDK needed",
      },
      {
        value: "source",
        label: "Build from source",
        hint: "Requires Flutter SDK",
      },
      {
        value: "skip",
        label: overlayInstalled ? "Skip (already installed)" : "Skip for now",
        // Report the file that was really detected instead of always
        // claiming the macOS bundle.
        hint: overlayInstalled
          ? `${path.basename(installedPath)} detected`
          : "Install later: sinain setup-overlay",
      },
    ],
    initialValue: overlayInstalled ? "skip" : "download",
  }));

  if (choice !== "download" && choice !== "source") return;

  const s = p.spinner();
  s.start(choice === "download" ? "Downloading overlay..." : "Building overlay from source...");
  try {
    // setup-overlay.js handles both modes via process.argv. Only append the
    // flag when it is not already present so argv never accumulates duplicates.
    if (choice === "source" && !process.argv.includes("--from-source")) {
      process.argv.push("--from-source");
    }
    await import("./setup-overlay.js");
    s.stop(c.green("Overlay installed."));
  } catch (err) {
    // Anything can be thrown; avoid printing "undefined" for non-Error values.
    const reason = err instanceof Error ? err.message : String(err);
    s.stop(c.yellow(`Failed: ${reason}`));
    p.note("Install manually: sinain setup-overlay", "Overlay");
  }
}
|
|
80
|
+
|
|
81
|
+
// ── Main ────────────────────────────────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
/**
 * Fetch the Whisper model with curl into `modelPath`.
 *
 * The download goes to a sibling `.part` file and is renamed into place only
 * after curl exits successfully, so an interrupted transfer never leaves a
 * truncated file at the final path. `--fail` makes curl exit non-zero on
 * HTTP errors instead of saving the error page as the model.
 *
 * @param {string} modelDir - Directory to create (recursively) for the model.
 * @param {string} modelPath - Final path of the model file.
 * @throws Whatever `mkdirSync`, `execFileSync` or `renameSync` throws.
 */
function downloadWhisperModel(modelDir, modelPath) {
  const partialPath = `${modelPath}.part`;
  fs.mkdirSync(modelDir, { recursive: true });
  try {
    execFileSync("curl", [
      "--fail",
      "-L", "--progress-bar",
      "-o", partialPath,
      "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin",
    ], { stdio: "inherit" });
    fs.renameSync(partialPath, modelPath);
  } catch (err) {
    // Drop any partial download; `force` tolerates the file not existing.
    fs.rmSync(partialPath, { force: true });
    throw err;
  }
}

/**
 * Make sure a local Whisper model is available and record its path in
 * `vars.LOCAL_WHISPER_MODEL`. If the model is missing the user is asked
 * whether to download it; declining or a failed download leaves the variable
 * unset.
 *
 * @param {object} vars - Config being assembled; mutated in place.
 * @returns {Promise<void>}
 */
async function configureLocalWhisper(vars) {
  const modelDir = path.join(HOME, "models");
  const modelPath = path.join(modelDir, "ggml-large-v3-turbo.bin");

  if (fs.existsSync(modelPath)) {
    vars.LOCAL_WHISPER_MODEL = modelPath;
    p.log.info(`Whisper model found: ${c.dim(modelPath)}`);
    return;
  }

  const download = guard(await p.confirm({
    message: "Download Whisper model (~1.5 GB)?",
    initialValue: true,
  }));
  if (!download) return;

  const s = p.spinner();
  s.start("Downloading Whisper model...");
  try {
    downloadWhisperModel(modelDir, modelPath);
    s.stop(c.green("Model downloaded."));
    vars.LOCAL_WHISPER_MODEL = modelPath;
  } catch {
    // Best effort: the wizard continues without a local model.
    s.stop(c.yellow("Download failed. You can download manually later."));
  }
}

/**
 * Run the interactive onboarding wizard.
 *
 * Flow:
 *  1. Print the banner and read any existing config from `ENV_PATH`.
 *  2. If a config exists, let the user keep, update or reset it. "keep" runs
 *     only the overlay step and health check, then returns.
 *  3. Choose QuickStart (API key plus defaults) or Advanced (transcription,
 *     gateway, privacy and model steps).
 *  4. Fill in common defaults, write the config, run the overlay step and
 *     health check, then optionally launch sinain.
 *
 * @param {object} [args] - Parsed CLI flags.
 * @param {"quickstart"|"advanced"} [args.flow] - Skips the flow prompt when set.
 * @returns {Promise<void>}
 */
export async function runOnboard(args = {}) {
  printHeader();
  p.intro("SinainHUD setup");

  const existing = readEnv(ENV_PATH);
  const hasExisting = Object.keys(existing).length > 0;

  // ── Existing config handling ────────────────────────────────────────────

  let configAction = "fresh";
  if (hasExisting) {
    p.note(summarizeConfig(existing).join("\n"), "Existing config detected");

    configAction = guard(await p.select({
      message: "Config handling",
      options: [
        { value: "keep", label: "Use existing values" },
        { value: "update", label: "Update values" },
        { value: "reset", label: "Reset (start fresh)" },
      ],
      initialValue: "keep",
    }));

    if (configAction === "keep") {
      p.log.success("Using existing configuration.");
      await stepOverlay(existing);
      await runHealthCheck();
      printOutro();
      return;
    }

    if (configAction === "reset") {
      fs.unlinkSync(ENV_PATH);
      p.log.info("Config reset.");
    }
  }

  // Only "update" carries old values forward as prompt defaults.
  const base = configAction === "update" ? existing : {};

  // ── Flow selection ──────────────────────────────────────────────────────

  const flow = args.flow || guard(await p.select({
    message: "Setup mode",
    options: [
      {
        value: "quickstart",
        label: "QuickStart",
        hint: "Get running in 2 minutes. Configure details later.",
      },
      {
        value: "advanced",
        label: "Advanced",
        hint: "Full control over privacy, models, and connections.",
      },
    ],
    initialValue: "quickstart",
  }));

  const totalSteps = flow === "quickstart" ? 2 : 5;

  // ── Collect vars ────────────────────────────────────────────────────────

  const vars = { ...base };

  // Step 1: API key (both flows)
  const apiKey = await stepApiKey(base, `[1/${totalSteps}] OpenRouter API key`);
  vars.OPENROUTER_API_KEY = apiKey;
  p.log.success("API key saved.");

  if (flow === "quickstart") {
    // QuickStart: sensible defaults, preferring any carried-over values.
    vars.TRANSCRIPTION_BACKEND = base.TRANSCRIPTION_BACKEND || "openrouter";
    vars.PRIVACY_MODE = base.PRIVACY_MODE || "standard";
    vars.AGENT_MODEL = base.AGENT_MODEL || "google/gemini-2.5-flash-lite";
    vars.ESCALATION_MODE = base.ESCALATION_MODE || "off";
    vars.SINAIN_AGENT = base.SINAIN_AGENT || "claude";
    if (!vars.OPENCLAW_WS_URL) {
      vars.OPENCLAW_WS_URL = "";
      vars.OPENCLAW_HTTP_URL = "";
    }

    p.note(
      [
        `Transcription: ${vars.TRANSCRIPTION_BACKEND}`,
        `Privacy: ${vars.PRIVACY_MODE}`,
        `Model: ${vars.AGENT_MODEL}`,
        `Escalation: ${vars.ESCALATION_MODE}`,
        "",
        `Change later: sinain config`,
      ].join("\n"),
      "QuickStart defaults",
    );
  } else {
    // Advanced flow: steps 2-5
    const transcription = await stepTranscription(base, "[2/5] Audio transcription");
    vars.TRANSCRIPTION_BACKEND = transcription;
    p.log.success(`Using ${transcription === "openrouter" ? "cloud" : "local"} transcription.`);

    if (transcription === "local") {
      await configureLocalWhisper(vars);
    }

    const gatewayVars = await stepGateway(base, "[3/5] OpenClaw gateway");
    Object.assign(vars, gatewayVars);
    if (gatewayVars.ESCALATION_MODE === "off") {
      p.log.info("Standalone mode (no gateway).");
    } else {
      p.log.success("Gateway configured.");
    }

    const privacy = await stepPrivacy(base, "[4/5] Privacy mode");
    vars.PRIVACY_MODE = privacy;
    p.log.success(`Privacy: ${privacy}.`);

    const model = await stepModel(base, "[5/5] AI model for HUD analysis");
    vars.AGENT_MODEL = model;
    p.log.success(`Model: ${model}.`);

    vars.SINAIN_AGENT = base.SINAIN_AGENT || "claude";
  }

  // ── Common defaults ───────────────────────────────────────────────────

  vars.SINAIN_CORE_URL = vars.SINAIN_CORE_URL || "http://localhost:9500";
  vars.SINAIN_POLL_INTERVAL = vars.SINAIN_POLL_INTERVAL || "5";
  vars.SINAIN_HEARTBEAT_INTERVAL = vars.SINAIN_HEARTBEAT_INTERVAL || "900";
  vars.AUDIO_CAPTURE_CMD = vars.AUDIO_CAPTURE_CMD || "screencapturekit";
  vars.AUDIO_AUTO_START = vars.AUDIO_AUTO_START || "true";
  vars.PORT = vars.PORT || "9500";

  // ── Write config ──────────────────────────────────────────────────────

  const s = p.spinner();
  s.start("Writing configuration...");
  writeEnv(vars);
  s.stop(c.green(`Config saved: ${c.dim(ENV_PATH)}`));

  // ── Overlay ───────────────────────────────────────────────────────────

  await stepOverlay(base);

  // ── Health check ──────────────────────────────────────────────────────

  await runHealthCheck();

  // ── What now ──────────────────────────────────────────────────────────

  printOutro();

  // ── Start? ────────────────────────────────────────────────────────────

  const startNow = guard(await p.confirm({
    message: "Start sinain now?",
    initialValue: true,
  }));

  if (startNow) {
    p.outro("Launching sinain...");
    try {
      await import("./launcher.js");
    } catch (err) {
      // Non-Error values may be thrown; avoid printing "undefined".
      const reason = err instanceof Error ? err.message : String(err);
      console.log(c.yellow(` Launch failed: ${reason}`));
      console.log(c.dim(" Try manually: sinain start"));
    }
  } else {
    p.outro("Run when ready: sinain start");
  }
}
|
|
277
|
+
|
|
278
|
+
/**
 * Show the closing "What now" note: the overlay hotkeys, the docs link and
 * how to re-run the wizard. The hotkey modifier is Ctrl+Shift when
 * IS_WINDOWS is set and Cmd+Shift otherwise.
 */
function printOutro() {
  let hotkey = "Cmd+Shift";
  if (IS_WINDOWS) {
    hotkey = "Ctrl+Shift";
  }

  // [final key, description] for each overlay shortcut.
  const shortcuts = [
    ["Space", "Show/hide overlay"],
    ["E", "Cycle tabs (Stream → Agent → Tasks)"],
    ["C", "Toggle click-through"],
    ["M", "Cycle display mode"],
  ];

  const body = ["Hotkeys:"];
  for (const [key, description] of shortcuts) {
    body.push(` ${hotkey}+${key} — ${description}`);
  }
  body.push("");
  body.push("Docs: https://github.com/geravant/sinain-hud");
  body.push("Re-run: sinain onboard (or sinain onboard --advanced)");

  p.note(body.join("\n"), "What now");
}
|
|
294
|
+
|
|
295
|
+
// ── CLI entry point ─────────────────────────────────────────────────────────
|
|
296
|
+
|
|
297
|
+
// Parse the flags this script understands; anything else is ignored.
// Later occurrences of a flag override earlier ones.
const cliArgs = process.argv.slice(2);
const flags = {};
for (const arg of cliArgs) {
  switch (arg) {
    case "--advanced":
      flags.flow = "advanced";
      break;
    case "--quickstart":
      flags.flow = "quickstart";
      break;
    case "--non-interactive":
      flags.nonInteractive = true;
      break;
    case "--reset":
      flags.reset = true;
      break;
    default:
      // "--key=<value>": everything after the 6-character prefix is the key.
      if (arg.startsWith("--key=")) flags.key = arg.slice(6);
  }
}

// --reset removes the saved config before either mode runs.
if (flags.reset && fs.existsSync(ENV_PATH)) {
  fs.unlinkSync(ENV_PATH);
  console.log(c.green(" Config reset."));
}

if (!flags.nonInteractive) {
  // Interactive wizard; any rejection is reported and ends with exit code 1.
  runOnboard(flags).catch((err) => {
    console.error(c.red(` Error: ${err.message}`));
    process.exit(1);
  });
} else {
  // Non-interactive: the key comes from --key or the environment, and every
  // other setting gets a fixed default.
  const apiKey = flags.key || process.env.OPENROUTER_API_KEY || "";
  if (!apiKey) {
    console.error(c.red(" --key=<key> or OPENROUTER_API_KEY required for non-interactive mode."));
    process.exit(1);
  }

  const vars = {
    OPENROUTER_API_KEY: apiKey,
    TRANSCRIPTION_BACKEND: "openrouter",
    PRIVACY_MODE: "standard",
    AGENT_MODEL: "google/gemini-2.5-flash-lite",
    ESCALATION_MODE: "off",
    SINAIN_AGENT: "claude",
    OPENCLAW_WS_URL: "",
    OPENCLAW_HTTP_URL: "",
    PORT: "9500",
    AUDIO_CAPTURE_CMD: "screencapturekit",
    AUDIO_AUTO_START: "true",
    SINAIN_CORE_URL: "http://localhost:9500",
    SINAIN_POLL_INTERVAL: "5",
    SINAIN_HEARTBEAT_INTERVAL: "900",
  };

  writeEnv(vars);
  console.log(c.green(` Config written to ${ENV_PATH}`));
  process.exit(0);
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@geravant/sinain",
|
|
3
|
-
"version": "1.8.0",
|
|
3
|
+
"version": "1.9.0",
|
|
4
4
|
"description": "Ambient intelligence that sees what you see, hears what you hear, and acts on your behalf",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -14,6 +14,9 @@
|
|
|
14
14
|
},
|
|
15
15
|
"files": [
|
|
16
16
|
"cli.js",
|
|
17
|
+
"config.js",
|
|
18
|
+
"config-shared.js",
|
|
19
|
+
"onboard.js",
|
|
17
20
|
"launcher.js",
|
|
18
21
|
"setup-overlay.js",
|
|
19
22
|
"setup-sck-capture.js",
|
|
@@ -47,5 +50,8 @@
|
|
|
47
50
|
"./index.ts"
|
|
48
51
|
]
|
|
49
52
|
},
|
|
50
|
-
"license": "MIT"
|
|
53
|
+
"license": "MIT",
|
|
54
|
+
"dependencies": {
|
|
55
|
+
"@clack/prompts": "^1.1.0"
|
|
56
|
+
}
|
|
51
57
|
}
|
package/sense_client/__main__.py
CHANGED
|
@@ -371,13 +371,17 @@ def main():
|
|
|
371
371
|
try:
|
|
372
372
|
from PIL import Image as PILImage
|
|
373
373
|
pil = PILImage.fromarray(frame) if isinstance(frame, np.ndarray) else frame
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
374
|
+
result = vision_provider.describe(pil, prompt=prompt or None)
|
|
375
|
+
scene = result.text
|
|
376
|
+
v_cost = result.cost
|
|
377
|
+
if scene or v_cost:
|
|
378
|
+
if scene:
|
|
379
|
+
log(f"vision: {scene[:80]}...")
|
|
377
380
|
ctx_ev = SenseEvent(type="context", ts=ts)
|
|
378
|
-
ctx_ev.observation = SenseObservation(scene=scene)
|
|
381
|
+
ctx_ev.observation = SenseObservation(scene=scene or "")
|
|
379
382
|
ctx_ev.meta = meta
|
|
380
383
|
ctx_ev.roi = package_full_frame(frame)
|
|
384
|
+
ctx_ev.vision_cost = v_cost
|
|
381
385
|
sender.send(ctx_ev)
|
|
382
386
|
except Exception as e:
|
|
383
387
|
log(f"vision error: {e}")
|
package/sense_client/gate.py
CHANGED
|
@@ -43,6 +43,7 @@ class SenseEvent:
|
|
|
43
43
|
diff: dict | None = None
|
|
44
44
|
meta: SenseMeta = field(default_factory=SenseMeta)
|
|
45
45
|
observation: SenseObservation = field(default_factory=SenseObservation)
|
|
46
|
+
vision_cost: dict | None = None # {cost, tokens_in, tokens_out, model}
|
|
46
47
|
|
|
47
48
|
|
|
48
49
|
class DecisionGate:
|
package/sense_client/ocr.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""OCR backends for UI text extraction: macOS Vision, Windows.Media.Ocr, and Tesseract."""
|
|
2
|
+
|
|
2
3
|
from __future__ import annotations
|
|
3
4
|
|
|
4
5
|
import io
|
|
@@ -24,8 +25,13 @@ class OCRResult:
|
|
|
24
25
|
class LocalOCR:
|
|
25
26
|
"""Tesseract OCR wrapper for UI text extraction."""
|
|
26
27
|
|
|
27
|
-
def __init__(
|
|
28
|
-
|
|
28
|
+
def __init__(
|
|
29
|
+
self,
|
|
30
|
+
lang: str = "eng",
|
|
31
|
+
psm: int = 11,
|
|
32
|
+
min_confidence: int = 30,
|
|
33
|
+
enabled: bool = True,
|
|
34
|
+
):
|
|
29
35
|
self.lang = lang
|
|
30
36
|
self.psm = psm
|
|
31
37
|
self.min_confidence = min_confidence
|
|
@@ -87,8 +93,12 @@ class LocalOCR:
|
|
|
87
93
|
class VisionOCR:
|
|
88
94
|
"""macOS Vision framework OCR using pyobjc."""
|
|
89
95
|
|
|
90
|
-
def __init__(
|
|
91
|
-
|
|
96
|
+
def __init__(
|
|
97
|
+
self,
|
|
98
|
+
languages: list[str] | None = None,
|
|
99
|
+
min_confidence: float = 0.5,
|
|
100
|
+
enabled: bool = True,
|
|
101
|
+
):
|
|
92
102
|
self.languages = languages or ["en", "ru"]
|
|
93
103
|
self.min_confidence = min_confidence
|
|
94
104
|
self.enabled = enabled
|
|
@@ -101,8 +111,12 @@ class VisionOCR:
|
|
|
101
111
|
import objc # noqa: F401
|
|
102
112
|
import Quartz # noqa: F401
|
|
103
113
|
from Foundation import NSURL, NSData # noqa: F401
|
|
104
|
-
|
|
105
|
-
|
|
114
|
+
|
|
115
|
+
objc.loadBundle(
|
|
116
|
+
"Vision",
|
|
117
|
+
bundle_path="/System/Library/Frameworks/Vision.framework",
|
|
118
|
+
module_globals=globals(),
|
|
119
|
+
)
|
|
106
120
|
self._available = True
|
|
107
121
|
except Exception as e:
|
|
108
122
|
print(f"[ocr] Vision framework unavailable: {e}", flush=True)
|
|
@@ -120,9 +134,8 @@ class VisionOCR:
|
|
|
120
134
|
|
|
121
135
|
def _do_extract(self, image: Image.Image) -> OCRResult:
|
|
122
136
|
import objc
|
|
123
|
-
import Vision
|
|
124
|
-
from Foundation import NSData
|
|
125
137
|
import Quartz
|
|
138
|
+
from Foundation import NSData
|
|
126
139
|
|
|
127
140
|
# Convert PIL Image to CGImage via PNG bytes
|
|
128
141
|
buf = io.BytesIO()
|
|
@@ -138,13 +151,13 @@ class VisionOCR:
|
|
|
138
151
|
return OCRResult(text="", confidence=0, word_count=0)
|
|
139
152
|
|
|
140
153
|
# Create and configure request
|
|
141
|
-
request =
|
|
142
|
-
request.setRecognitionLevel_(
|
|
154
|
+
request = VNRecognizeTextRequest.alloc().init()
|
|
155
|
+
request.setRecognitionLevel_(0) # VNRequestTextRecognitionLevelAccurate
|
|
143
156
|
request.setRecognitionLanguages_(self.languages)
|
|
144
157
|
request.setUsesLanguageCorrection_(True)
|
|
145
158
|
|
|
146
159
|
# Execute
|
|
147
|
-
handler =
|
|
160
|
+
handler = VNImageRequestHandler.alloc().initWithCGImage_options_(cg_image, None)
|
|
148
161
|
success = handler.performRequests_error_([request], objc.nil)
|
|
149
162
|
if not success[0]:
|
|
150
163
|
return OCRResult(text="", confidence=0, word_count=0)
|
|
@@ -197,8 +210,9 @@ class VisionOCR:
|
|
|
197
210
|
class WinOCR:
|
|
198
211
|
"""Windows.Media.Ocr backend via winrt-python (Windows 10+)."""
|
|
199
212
|
|
|
200
|
-
def __init__(
|
|
201
|
-
|
|
213
|
+
def __init__(
|
|
214
|
+
self, language: str = "en", min_confidence: float = 0.5, enabled: bool = True
|
|
215
|
+
):
|
|
202
216
|
self.language = language
|
|
203
217
|
self.min_confidence = min_confidence
|
|
204
218
|
self.enabled = enabled
|
|
@@ -209,8 +223,8 @@ class WinOCR:
|
|
|
209
223
|
return
|
|
210
224
|
|
|
211
225
|
try:
|
|
212
|
-
from winrt.windows.media.ocr import OcrEngine
|
|
213
226
|
from winrt.windows.globalization import Language
|
|
227
|
+
from winrt.windows.media.ocr import OcrEngine
|
|
214
228
|
|
|
215
229
|
lang = Language(language)
|
|
216
230
|
if OcrEngine.is_language_supported(lang):
|
|
@@ -234,11 +248,15 @@ class WinOCR:
|
|
|
234
248
|
|
|
235
249
|
def _do_extract(self, image: Image.Image) -> OCRResult:
|
|
236
250
|
import asyncio
|
|
251
|
+
|
|
237
252
|
from winrt.windows.graphics.imaging import (
|
|
238
|
-
|
|
253
|
+
BitmapAlphaMode,
|
|
254
|
+
BitmapPixelFormat,
|
|
255
|
+
SoftwareBitmap,
|
|
239
256
|
)
|
|
240
257
|
from winrt.windows.storage.streams import (
|
|
241
|
-
|
|
258
|
+
DataWriter,
|
|
259
|
+
InMemoryRandomAccessStream,
|
|
242
260
|
)
|
|
243
261
|
|
|
244
262
|
# Convert PIL to BMP bytes and load as SoftwareBitmap
|
|
@@ -254,13 +272,15 @@ class WinOCR:
|
|
|
254
272
|
stream.seek(0)
|
|
255
273
|
|
|
256
274
|
from winrt.windows.graphics.imaging import BitmapDecoder
|
|
275
|
+
|
|
257
276
|
decoder = await BitmapDecoder.create_async(stream)
|
|
258
277
|
bitmap = await decoder.get_software_bitmap_async()
|
|
259
278
|
|
|
260
279
|
# Convert to supported pixel format if needed
|
|
261
280
|
if bitmap.bitmap_pixel_format != BitmapPixelFormat.BGRA8:
|
|
262
|
-
bitmap = SoftwareBitmap.convert(
|
|
263
|
-
|
|
281
|
+
bitmap = SoftwareBitmap.convert(
|
|
282
|
+
bitmap, BitmapPixelFormat.BGRA8, BitmapAlphaMode.PREMULTIPLIED
|
|
283
|
+
)
|
|
264
284
|
|
|
265
285
|
result = await self._engine.recognize_async(bitmap)
|
|
266
286
|
return result
|
|
@@ -318,10 +338,15 @@ def create_ocr(config: dict):
|
|
|
318
338
|
enabled=enabled,
|
|
319
339
|
)
|
|
320
340
|
if vision._available:
|
|
321
|
-
print(
|
|
341
|
+
print(
|
|
342
|
+
f"[ocr] using Vision backend (languages={vision.languages})", flush=True
|
|
343
|
+
)
|
|
322
344
|
return vision
|
|
323
345
|
if backend == "vision":
|
|
324
|
-
print(
|
|
346
|
+
print(
|
|
347
|
+
"[ocr] Vision requested but unavailable, falling back to Tesseract",
|
|
348
|
+
flush=True,
|
|
349
|
+
)
|
|
325
350
|
|
|
326
351
|
# Windows: try Windows.Media.Ocr
|
|
327
352
|
if sys.platform == "win32" and backend in ("auto", "winocr"):
|
|
@@ -332,10 +357,15 @@ def create_ocr(config: dict):
|
|
|
332
357
|
enabled=enabled,
|
|
333
358
|
)
|
|
334
359
|
if winocr._available:
|
|
335
|
-
print(
|
|
360
|
+
print(
|
|
361
|
+
f"[ocr] using WinOCR backend (language={winocr.language})", flush=True
|
|
362
|
+
)
|
|
336
363
|
return winocr
|
|
337
364
|
if backend == "winocr":
|
|
338
|
-
print(
|
|
365
|
+
print(
|
|
366
|
+
"[ocr] WinOCR requested but unavailable, falling back to Tesseract",
|
|
367
|
+
flush=True,
|
|
368
|
+
)
|
|
339
369
|
|
|
340
370
|
# Fallback to Tesseract (cross-platform)
|
|
341
371
|
print("[ocr] using Tesseract backend", flush=True)
|
package/sense_client/sender.py
CHANGED