@vortex-os/computer-use 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,135 +1,135 @@
1
- // @vortex-os/computer-use — noise filter for the watch layer (design §22.1).
2
- //
3
- // Problem: video / games / scrolling change every frame, so a raw per-frame change threshold
4
- // floods the agent with one event per ripple of the SAME activity (each event costs a capture
5
- // + an LLM look). Measured calibration: a playing video jitters ~2.5–4% frame-to-frame, a real
6
- // scene cut jumps ~16.8%. We want to ignore the steady jitter and report only meaningful,
7
- // settled changes — without ever going completely silent on a screen that keeps moving.
8
- //
9
- // Design: debounce + cooldown COMBINED, with hysteresis. Neither alone works:
10
- // - debounce alone ("report once it goes quiet") starves on video/games that never go quiet.
11
- // - cooldown alone ("report at most once per N s") fires on a half-drawn transition frame (bad quality).
12
- // So they split roles:
13
- // - debounce = QUALITY — emit the frame AFTER motion settles (a clean, stable shot).
14
- // - cooldown = FREQUENCY — never emit more than once per cooldownMs (suppress the ripples).
15
- // - maxWait = ANTI-STARVATION — once WOKEN, if motion never settles, emit anyway every maxWaitMs, so a
16
- // screen with sustained ABOVE-threshold motion still yields periodic snapshots (not silence).
17
- // - hysteresis = two thresholds so ambient jitter never even wakes the filter (no flapping).
18
- // Note on the silence boundary: a screen that changes only BELOW activityThreshold (e.g. steady video jitter
19
- // at 2.5-4%) is intentionally treated as "nothing happening" and produces no events — that IS the goal of
20
- // ignoring the ripple, and it means maxWait only applies once a real (above-threshold) change has woken the
21
- // filter. Lower activityThreshold if you want fainter sustained motion to count as activity.
22
- //
23
- // This module is pure (no I/O, no timers of its own — the caller feeds it samples with an explicit
24
- // `now`), so the state machine is deterministically unit-testable from a synthetic change sequence.
25
-
26
- // Calibrated against the measured numbers above. All tunable per start_watch.
27
- export const FILTER_DEFAULTS = {
28
- // % frame-to-frame change to WAKE the filter (enter an "active" episode). Set well above the
29
- // measured video jitter ceiling (~4%) so steady playback never registers as a new event, yet
30
- // below a scene cut (~16.8%) so real transitions do. The whole-frame metric dilutes tiny local
31
- // changes (a clock, a cursor), so target a meaningful region/window for small UI events.
32
- activityThreshold: 8,
33
- // % below which a frame counts as "still". Hysteresis: must be < activityThreshold so a woken
34
- // episode keeps tracking motion down to a lower floor before it's declared settled.
35
- quietThreshold: 5,
36
- // Motion must stay below quietThreshold this long before we emit a "settled" event (quality gate).
37
- // Keep it a small multiple of the poll interval so a couple of quiet polls confirm the settle.
38
- debounceQuietMs: 900,
39
- // Minimum gap between emitted events (frequency cap). Suppresses the ripples of one activity.
40
- cooldownMs: 6000,
41
- // If an episode never settles (sustained motion: video, ongoing battle), emit anyway this often
42
- // so a continuously-moving screen still produces periodic snapshots instead of going silent.
43
- maxWaitMs: 8000,
44
- };
45
-
46
- const clampNum = (v, lo, hi, dflt) => {
47
- const n = Number(v);
48
- if (!Number.isFinite(n)) return dflt;
49
- return Math.min(hi, Math.max(lo, n));
50
- };
51
-
52
- // Sanitize caller-supplied options into a coherent, safe config. Enforces the hysteresis invariant
53
- // (quietThreshold < activityThreshold) so a misconfiguration can't make the filter flap or never wake.
54
- export function resolveFilterConfig(opts = {}) {
55
- const d = FILTER_DEFAULTS;
56
- let activityThreshold = clampNum(opts.activityThreshold, 0.1, 100, d.activityThreshold);
57
- let quietThreshold = clampNum(opts.quietThreshold, 0, 100, d.quietThreshold);
58
- // Keep the quiet floor strictly below the wake threshold (hysteresis). If a caller inverts them,
59
- // pull the quiet floor down to just under the wake threshold rather than silently swapping intent.
60
- if (quietThreshold >= activityThreshold) quietThreshold = Math.max(0, activityThreshold - 1);
61
- return {
62
- activityThreshold,
63
- quietThreshold,
64
- debounceQuietMs: clampNum(opts.debounceQuietMs, 0, 60000, d.debounceQuietMs),
65
- cooldownMs: clampNum(opts.cooldownMs, 0, 600000, d.cooldownMs),
66
- maxWaitMs: clampNum(opts.maxWaitMs, 100, 600000, d.maxWaitMs),
67
- };
68
- }
69
-
70
- export class NoiseFilter {
71
- constructor(opts = {}) {
72
- this.cfg = resolveFilterConfig(opts);
73
- this.phase = 'idle'; // idle | active | cooldown
74
- this.activeStart = 0; // ts the current episode woke
75
- this.lastMotionTs = 0; // last ts change was >= quietThreshold (i.e. "still moving")
76
- this.lastEmitTs = -Infinity; // ts of the last emitted event
77
- this.peakPct = 0; // strongest change seen during the current episode
78
- this.samples = 0; // total samples fed (diagnostics)
79
- }
80
-
81
- // Feed one polled sample. Returns an emit descriptor { reason, peakPct, activeMs } when this sample
82
- // triggers an event, otherwise null. `now` is the caller's clock (ms); `baseline` true means the
83
- // change metric is not meaningful for this sample (fresh baseline / lost watch state) — reset the episode.
84
- push({ changePct, now, baseline = false }) {
85
- this.samples++;
86
- const c = Number.isFinite(Number(changePct)) ? Number(changePct) : 0;
87
- const cfg = this.cfg;
88
-
89
- if (baseline) {
90
- // A (re)baseline carries no comparable diff — abandon any in-progress episode, stay armed.
91
- this.phase = 'idle';
92
- this.peakPct = 0;
93
- return null;
94
- }
95
-
96
- // Re-arm after cooldown expires, then evaluate this same sample as idle (so a sample that arrives
97
- // already above the wake threshold starts the next episode immediately, no wasted poll).
98
- if (this.phase === 'cooldown') {
99
- if (now - this.lastEmitTs >= cfg.cooldownMs) this.phase = 'idle';
100
- else return null;
101
- }
102
-
103
- if (this.phase === 'idle') {
104
- if (c >= cfg.activityThreshold) {
105
- this.phase = 'active';
106
- this.activeStart = now;
107
- this.lastMotionTs = now;
108
- this.peakPct = c;
109
- }
110
- return null;
111
- }
112
-
113
- // phase === 'active'
114
- if (c > this.peakPct) this.peakPct = c;
115
- if (c >= cfg.quietThreshold) this.lastMotionTs = now; // still moving — push the settle clock back
116
- const quietFor = now - this.lastMotionTs;
117
- const activeFor = now - this.activeStart;
118
- if (quietFor >= cfg.debounceQuietMs) return this._emit('settled', now, activeFor);
119
- if (activeFor >= cfg.maxWaitMs) return this._emit('maxwait', now, activeFor);
120
- return null;
121
- }
122
-
123
- _emit(reason, now, activeMs) {
124
- const peakPct = this.peakPct;
125
- this.lastEmitTs = now;
126
- this.phase = 'cooldown';
127
- this.peakPct = 0;
128
- return { reason, peakPct, activeMs };
129
- }
130
-
131
- // Snapshot for status reporting / tests.
132
- get status() {
133
- return { phase: this.phase, samples: this.samples, peakPct: this.peakPct };
134
- }
135
- }
1
+ // @vortex-os/computer-use — noise filter for the watch layer (design §22.1).
2
+ //
3
+ // Problem: video / games / scrolling change every frame, so a raw per-frame change threshold
4
+ // floods the agent with one event per ripple of the SAME activity (each event costs a capture
5
+ // + an LLM look). Measured calibration: a playing video jitters ~2.5–4% frame-to-frame, a real
6
+ // scene cut jumps ~16.8%. We want to ignore the steady jitter and report only meaningful,
7
+ // settled changes — without ever going completely silent on a screen that keeps moving.
8
+ //
9
+ // Design: debounce + cooldown COMBINED, with hysteresis. Neither alone works:
10
+ // - debounce alone ("report once it goes quiet") starves on video/games that never go quiet.
11
+ // - cooldown alone ("report at most once per N s") fires on a half-drawn transition frame (bad quality).
12
+ // So they split roles:
13
+ // - debounce = QUALITY — emit the frame AFTER motion settles (a clean, stable shot).
14
+ // - cooldown = FREQUENCY — never emit more than once per cooldownMs (suppress the ripples).
15
+ // - maxWait = ANTI-STARVATION — once WOKEN, if motion never settles, emit anyway maxWaitMs after the wake (then cool down), so a
16
+ // screen with sustained ABOVE-threshold motion still yields periodic snapshots (not silence).
17
+ // - hysteresis = two thresholds so ambient jitter never even wakes the filter (no flapping).
18
+ // Note on the silence boundary: a screen that changes only BELOW activityThreshold (e.g. steady video jitter
19
+ // at 2.5-4%) is intentionally treated as "nothing happening" and produces no events — that IS the goal of
20
+ // ignoring the ripple, and it means maxWait only applies once a real (above-threshold) change has woken the
21
+ // filter. Lower activityThreshold if you want fainter sustained motion to count as activity.
22
+ //
23
+ // This module is pure (no I/O, no timers of its own — the caller feeds it samples with an explicit
24
+ // `now`), so the state machine is deterministically unit-testable from a synthetic change sequence.
25
+
26
+ // Calibrated against the measured numbers above. All tunable per start_watch.
27
+ export const FILTER_DEFAULTS = {
28
+ // % frame-to-frame change to WAKE the filter (enter an "active" episode). Set well above the
29
+ // measured video jitter ceiling (~4%) so steady playback never registers as a new event, yet
30
+ // below a scene cut (~16.8%) so real transitions do. The whole-frame metric dilutes tiny local
31
+ // changes (a clock, a cursor), so target a meaningful region/window for small UI events.
32
+ activityThreshold: 8,
33
+ // % below which a frame counts as "still". Hysteresis: must be < activityThreshold so a woken
34
+ // episode keeps tracking motion down to a lower floor before it's declared settled.
35
+ quietThreshold: 5,
36
+ // Motion must stay below quietThreshold this long before we emit a "settled" event (quality gate).
37
+ // Keep it a small multiple of the poll interval so a couple of quiet polls confirm the settle.
38
+ debounceQuietMs: 900,
39
+ // Minimum gap between emitted events (frequency cap). Suppresses the ripples of one activity.
40
+ cooldownMs: 6000,
41
+ // If an episode never settles (sustained motion: video, ongoing battle), emit anyway this often
42
+ // so a continuously-moving screen still produces periodic snapshots instead of going silent.
43
+ maxWaitMs: 8000,
44
+ };
45
+
46
+ const clampNum = (v, lo, hi, dflt) => {
47
+ const n = Number(v);
48
+ if (!Number.isFinite(n)) return dflt;
49
+ return Math.min(hi, Math.max(lo, n));
50
+ };
51
+
52
+ // Sanitize caller-supplied options into a coherent, safe config. Enforces the hysteresis invariant
53
+ // (quietThreshold < activityThreshold) so a misconfiguration can't make the filter flap or never wake.
54
+ export function resolveFilterConfig(opts = {}) {
55
+ const d = FILTER_DEFAULTS;
56
+ let activityThreshold = clampNum(opts.activityThreshold, 0.1, 100, d.activityThreshold);
57
+ let quietThreshold = clampNum(opts.quietThreshold, 0, 100, d.quietThreshold);
58
+ // Keep the quiet floor strictly below the wake threshold (hysteresis). If a caller inverts them,
59
+ // pull the quiet floor down to just under the wake threshold rather than silently swapping intent.
60
+ if (quietThreshold >= activityThreshold) quietThreshold = Math.max(0, activityThreshold - 1);
61
+ return {
62
+ activityThreshold,
63
+ quietThreshold,
64
+ debounceQuietMs: clampNum(opts.debounceQuietMs, 0, 60000, d.debounceQuietMs),
65
+ cooldownMs: clampNum(opts.cooldownMs, 0, 600000, d.cooldownMs),
66
+ maxWaitMs: clampNum(opts.maxWaitMs, 100, 600000, d.maxWaitMs),
67
+ };
68
+ }
69
+
70
+ export class NoiseFilter {
71
+ constructor(opts = {}) {
72
+ this.cfg = resolveFilterConfig(opts);
73
+ this.phase = 'idle'; // idle | active | cooldown
74
+ this.activeStart = 0; // ts the current episode woke
75
+ this.lastMotionTs = 0; // last ts change was >= quietThreshold (i.e. "still moving")
76
+ this.lastEmitTs = -Infinity; // ts of the last emitted event
77
+ this.peakPct = 0; // strongest change seen during the current episode
78
+ this.samples = 0; // total samples fed (diagnostics)
79
+ }
80
+
81
+ // Feed one polled sample. Returns an emit descriptor { reason, peakPct, activeMs } when this sample
82
+ // triggers an event, otherwise null. `now` is the caller's clock (ms); `baseline` true means the
83
+ // change metric is not meaningful for this sample (fresh baseline / lost watch state) — reset the episode.
84
+ push({ changePct, now, baseline = false }) {
85
+ this.samples++;
86
+ const c = Number.isFinite(Number(changePct)) ? Number(changePct) : 0;
87
+ const cfg = this.cfg;
88
+
89
+ if (baseline) {
90
+ // A (re)baseline carries no comparable diff — abandon any in-progress episode, stay armed.
91
+ this.phase = 'idle';
92
+ this.peakPct = 0;
93
+ return null;
94
+ }
95
+
96
+ // Re-arm after cooldown expires, then evaluate this same sample as idle (so a sample that arrives
97
+ // already above the wake threshold starts the next episode immediately, no wasted poll).
98
+ if (this.phase === 'cooldown') {
99
+ if (now - this.lastEmitTs >= cfg.cooldownMs) this.phase = 'idle';
100
+ else return null;
101
+ }
102
+
103
+ if (this.phase === 'idle') {
104
+ if (c >= cfg.activityThreshold) {
105
+ this.phase = 'active';
106
+ this.activeStart = now;
107
+ this.lastMotionTs = now;
108
+ this.peakPct = c;
109
+ }
110
+ return null;
111
+ }
112
+
113
+ // phase === 'active'
114
+ if (c > this.peakPct) this.peakPct = c;
115
+ if (c >= cfg.quietThreshold) this.lastMotionTs = now; // still moving — push the settle clock back
116
+ const quietFor = now - this.lastMotionTs;
117
+ const activeFor = now - this.activeStart;
118
+ if (quietFor >= cfg.debounceQuietMs) return this._emit('settled', now, activeFor);
119
+ if (activeFor >= cfg.maxWaitMs) return this._emit('maxwait', now, activeFor);
120
+ return null;
121
+ }
122
+
123
+ _emit(reason, now, activeMs) {
124
+ const peakPct = this.peakPct;
125
+ this.lastEmitTs = now;
126
+ this.phase = 'cooldown';
127
+ this.peakPct = 0;
128
+ return { reason, peakPct, activeMs };
129
+ }
130
+
131
+ // Snapshot for status reporting / tests.
132
+ get status() {
133
+ return { phase: this.phase, samples: this.samples, peakPct: this.peakPct };
134
+ }
135
+ }
package/scripts/ocr.ps1 CHANGED
@@ -1,92 +1,92 @@
1
- # computer-use — OCR helper (Windows PowerShell 5.1 ONLY; invoked via powershell.exe).
2
- #
3
- # Why a separate helper: the resident worker runs under pwsh 7 (.NET Core), which dropped the WinRT
4
- # projection, so Windows.Media.Ocr (built-in, offline, NO install) cannot be loaded there. pwsh 7
5
- # delegates the OCR step to this script run via the in-box `powershell.exe` (Windows PowerShell 5.1),
6
- # where the WinRT types load. This powers the reflex path: read on-screen text WITHOUT a cloud LLM
7
- # round-trip, so an alert can be spoken in well under a second.
8
- #
9
- # Contract: input -ImagePath <png>; output exactly ONE JSON line on stdout — {ok, text, lang, ms} on
10
- # success, {ok:false, error} on failure (exit 1). Nothing else is written to stdout (clean to parse).
11
- # The PARENT must invoke this with an argument array (never a shell string): powershell.exe -NoProfile
12
- # -NonInteractive -ExecutionPolicy Bypass -File <abs ocr.ps1> -ImagePath <abs temp png>, and must impose
13
- # its OWN hard spawn timeout + temp-PNG cleanup + non-JSON-stdout handling (codex r1).
14
- #
15
- # SECURITY: the recognized text is UNTRUSTED input (it is screen content). It is for narration/alerting
16
- # ONLY — it must never be turned into an action or an instruction the agent follows, and a speaking
17
- # caller MUST add a provenance prefix + shape/cap it before TTS (design §14/§16/§24, codex r1 HIGH).
18
- # Defence-in-depth here: only OCR files under the allowed temp root, bound every WinRT await with a
19
- # timeout, and pre-shape/cap the emitted text so a hung or hostile crop can't stall or flood the caller.
20
- param(
21
- [Parameter(Mandatory = $true)][string]$ImagePath,
22
- [string]$Lang = '', # optional BCP-47 tag (e.g. 'ko', 'en-US'); else user-profile default
23
- [int]$TimeoutMs = 4000, # per-await hard cap so a stuck WinRT call can't block forever
24
- [string]$AllowRoot = '', # only OCR files under this root (default: the user TEMP dir)
25
- [int]$MaxChars = 600 # cap emitted text (a long spoken utterance is itself a risk)
26
- )
27
- $ErrorActionPreference = 'Stop'
28
- try { [Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false) } catch {}
29
- function Emit($o) { [Console]::Out.WriteLine(($o | ConvertTo-Json -Compress)) }
30
-
31
- try {
32
- # Path policy: resolve to a full path and require it under the allowed root (worker-owned temp crops only).
33
- # The real guarantee is caller-side (it passes a denylist-gated, worker-created temp PNG); this is defence-in-depth.
34
- $root = if ($AllowRoot) { $AllowRoot } else { $env:TEMP }
35
- $rootFull = [System.IO.Path]::GetFullPath($root).TrimEnd('\') + '\'
36
- $full = [System.IO.Path]::GetFullPath($ImagePath)
37
- if (-not $full.StartsWith($rootFull, [System.StringComparison]::OrdinalIgnoreCase)) {
38
- Emit @{ ok = $false; error = 'image path is outside the allowed temp root' }; exit 1
39
- }
40
- if (-not (Test-Path -LiteralPath $full -PathType Leaf)) { Emit @{ ok = $false; error = 'image not found' }; exit 1 }
41
-
42
- $null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
43
- Add-Type -AssemblyName System.Runtime.WindowsRuntime
44
- # The single-arg generic AsTask projection (IAsyncOperation<T> -> Task<T>) so we can synchronously await WinRT calls.
45
- $asTask = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
46
- $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1'
47
- })[0]
48
- if (-not $asTask) { Emit @{ ok = $false; error = 'WinRT AsTask projection unavailable' }; exit 1 }
49
- # Bound every await: if a WinRT call hangs, .Wait(timeout) returns false and we fail cleanly instead of blocking.
50
- function Await($op, $t) {
51
- $m = $asTask.MakeGenericMethod($t); $task = $m.Invoke($null, @($op))
52
- if (-not $task.Wait($TimeoutMs)) { throw "OCR step timed out after ${TimeoutMs}ms" }
53
- $task.Result
54
- }
55
-
56
- # Engine: a specific recognizer language if requested and installed, otherwise the user-profile default.
57
- $eng = $null
58
- if ($Lang) {
59
- try {
60
- $L = New-Object Windows.Globalization.Language $Lang
61
- if ([Windows.Media.Ocr.OcrEngine]::IsLanguageSupported($L)) { $eng = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($L) }
62
- } catch {}
63
- }
64
- if (-not $eng) { $eng = [Windows.Media.Ocr.OcrEngine]::TryCreateFromUserProfileLanguages() }
65
- if (-not $eng) { Emit @{ ok = $false; error = 'no OCR recognizer language installed on this machine' }; exit 1 }
66
-
67
- $sw = [System.Diagnostics.Stopwatch]::StartNew()
68
- $file = Await ([Windows.Storage.StorageFile, Windows.Storage, ContentType = WindowsRuntime]::GetFileFromPathAsync($full)) ([Windows.Storage.StorageFile])
69
- $stream = Await ($file.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
70
- $decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder, Windows.Graphics, ContentType = WindowsRuntime]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
71
- $soft = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])
72
- # Windows OCR is picky about pixel format on some decoders — normalize to Bgra8 for robustness (codex r1 low/med).
73
- if ($soft.BitmapPixelFormat -ne [Windows.Graphics.Imaging.BitmapPixelFormat]::Bgra8) {
74
- $soft = [Windows.Graphics.Imaging.SoftwareBitmap]::Convert($soft, [Windows.Graphics.Imaging.BitmapPixelFormat]::Bgra8, [Windows.Graphics.Imaging.BitmapAlphaMode]::Premultiplied)
75
- }
76
- $res = Await ($eng.RecognizeAsync($soft)) ([Windows.Media.Ocr.OcrResult])
77
- $sw.Stop()
78
-
79
- # Pre-shape the recognized text defensively: strip control/format chars (incl. bidi marks), collapse
80
- # whitespace, cap length. The SPEAKING caller still adds a provenance prefix + full sanitization/budget.
81
- $txt = [string]$res.Text
82
- $txt = ($txt -replace '\p{C}', ' ') -replace '\s+', ' '
83
- $txt = $txt.Trim()
84
- $truncated = $false
85
- if ($txt.Length -gt $MaxChars) { $txt = $txt.Substring(0, $MaxChars); $truncated = $true }
86
-
87
- Emit @{ ok = $true; text = $txt; lang = $eng.RecognizerLanguage.LanguageTag; ms = [int]$sw.Elapsed.TotalMilliseconds; truncated = $truncated }
88
- } catch {
89
- # Keep the reason generic: don't surface raw paths / exception internals into cloud-visible agent context (codex r1 low).
90
- Emit @{ ok = $false; error = 'ocr failed' }
91
- exit 1
92
- }
1
+ # computer-use — OCR helper (Windows PowerShell 5.1 ONLY; invoked via powershell.exe).
2
+ #
3
+ # Why a separate helper: the resident worker runs under pwsh 7 (.NET Core), which dropped the WinRT
4
+ # projection, so Windows.Media.Ocr (built-in, offline, NO install) cannot be loaded there. pwsh 7
5
+ # delegates the OCR step to this script run via the in-box `powershell.exe` (Windows PowerShell 5.1),
6
+ # where the WinRT types load. This powers the reflex path: read on-screen text WITHOUT a cloud LLM
7
+ # round-trip, so an alert can be spoken in well under a second.
8
+ #
9
+ # Contract: input -ImagePath <png>; output exactly ONE JSON line on stdout — {ok, text, lang, ms, truncated} on
10
+ # success, {ok:false, error} on failure (exit 1). Nothing else is written to stdout (clean to parse).
11
+ # The PARENT must invoke this with an argument array (never a shell string): powershell.exe -NoProfile
12
+ # -NonInteractive -ExecutionPolicy Bypass -File <abs ocr.ps1> -ImagePath <abs temp png>, and must impose
13
+ # its OWN hard spawn timeout + temp-PNG cleanup + non-JSON-stdout handling (codex r1).
14
+ #
15
+ # SECURITY: the recognized text is UNTRUSTED input (it is screen content). It is for narration/alerting
16
+ # ONLY — it must never be turned into an action or an instruction the agent follows, and a speaking
17
+ # caller MUST add a provenance prefix + shape/cap it before TTS (design §14/§16/§24, codex r1 HIGH).
18
+ # Defence-in-depth here: only OCR files under the allowed temp root, bound every WinRT await with a
19
+ # timeout, and pre-shape/cap the emitted text so a hung or hostile crop can't stall or flood the caller.
20
+ param(
21
+ [Parameter(Mandatory = $true)][string]$ImagePath,
22
+ [string]$Lang = '', # optional BCP-47 tag (e.g. 'ko', 'en-US'); else user-profile default
23
+ [int]$TimeoutMs = 4000, # per-await hard cap so a stuck WinRT call can't block forever
24
+ [string]$AllowRoot = '', # only OCR files under this root (default: the user TEMP dir)
25
+ [int]$MaxChars = 600 # cap emitted text (a long spoken utterance is itself a risk)
26
+ )
27
+ $ErrorActionPreference = 'Stop'
28
+ try { [Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false) } catch {}
29
+ function Emit($o) { [Console]::Out.WriteLine(($o | ConvertTo-Json -Compress)) }
30
+
31
+ try {
32
+ # Path policy: resolve to a full path and require it under the allowed root (worker-owned temp crops only).
33
+ # The real guarantee is caller-side (it passes a denylist-gated, worker-created temp PNG); this is defence-in-depth.
34
+ $root = if ($AllowRoot) { $AllowRoot } else { $env:TEMP }
35
+ $rootFull = [System.IO.Path]::GetFullPath($root).TrimEnd('\') + '\'
36
+ $full = [System.IO.Path]::GetFullPath($ImagePath)
37
+ if (-not $full.StartsWith($rootFull, [System.StringComparison]::OrdinalIgnoreCase)) {
38
+ Emit @{ ok = $false; error = 'image path is outside the allowed temp root' }; exit 1
39
+ }
40
+ if (-not (Test-Path -LiteralPath $full -PathType Leaf)) { Emit @{ ok = $false; error = 'image not found' }; exit 1 }
41
+
42
+ $null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
43
+ Add-Type -AssemblyName System.Runtime.WindowsRuntime
44
+ # The single-arg generic AsTask projection (IAsyncOperation<T> -> Task<T>) so we can synchronously await WinRT calls.
45
+ $asTask = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
46
+ $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1'
47
+ })[0]
48
+ if (-not $asTask) { Emit @{ ok = $false; error = 'WinRT AsTask projection unavailable' }; exit 1 }
49
+ # Bound every await: if a WinRT call hangs, .Wait(timeout) returns false and we fail cleanly instead of blocking.
50
+ function Await($op, $t) {
51
+ $m = $asTask.MakeGenericMethod($t); $task = $m.Invoke($null, @($op))
52
+ if (-not $task.Wait($TimeoutMs)) { throw "OCR step timed out after ${TimeoutMs}ms" }
53
+ $task.Result
54
+ }
55
+
56
+ # Engine: a specific recognizer language if requested and installed, otherwise the user-profile default.
57
+ $eng = $null
58
+ if ($Lang) {
59
+ try {
60
+ $L = New-Object Windows.Globalization.Language $Lang
61
+ if ([Windows.Media.Ocr.OcrEngine]::IsLanguageSupported($L)) { $eng = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($L) }
62
+ } catch {}
63
+ }
64
+ if (-not $eng) { $eng = [Windows.Media.Ocr.OcrEngine]::TryCreateFromUserProfileLanguages() }
65
+ if (-not $eng) { Emit @{ ok = $false; error = 'no OCR recognizer language installed on this machine' }; exit 1 }
66
+
67
+ $sw = [System.Diagnostics.Stopwatch]::StartNew()
68
+ $file = Await ([Windows.Storage.StorageFile, Windows.Storage, ContentType = WindowsRuntime]::GetFileFromPathAsync($full)) ([Windows.Storage.StorageFile])
69
+ $stream = Await ($file.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
70
+ $decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder, Windows.Graphics, ContentType = WindowsRuntime]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
71
+ $soft = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])
72
+ # Windows OCR is picky about pixel format on some decoders — normalize to Bgra8 for robustness (codex r1 low/med).
73
+ if ($soft.BitmapPixelFormat -ne [Windows.Graphics.Imaging.BitmapPixelFormat]::Bgra8) {
74
+ $soft = [Windows.Graphics.Imaging.SoftwareBitmap]::Convert($soft, [Windows.Graphics.Imaging.BitmapPixelFormat]::Bgra8, [Windows.Graphics.Imaging.BitmapAlphaMode]::Premultiplied)
75
+ }
76
+ $res = Await ($eng.RecognizeAsync($soft)) ([Windows.Media.Ocr.OcrResult])
77
+ $sw.Stop()
78
+
79
+ # Pre-shape the recognized text defensively: strip \p{C} chars (control/format incl. bidi marks; also surrogates, so non-BMP text is dropped), collapse
80
+ # whitespace, cap length. The SPEAKING caller still adds a provenance prefix + full sanitization/budget.
81
+ $txt = [string]$res.Text
82
+ $txt = ($txt -replace '\p{C}', ' ') -replace '\s+', ' '
83
+ $txt = $txt.Trim()
84
+ $truncated = $false
85
+ if ($txt.Length -gt $MaxChars) { $txt = $txt.Substring(0, $MaxChars); $truncated = $true }
86
+
87
+ Emit @{ ok = $true; text = $txt; lang = $eng.RecognizerLanguage.LanguageTag; ms = [int]$sw.Elapsed.TotalMilliseconds; truncated = $truncated }
88
+ } catch {
89
+ # Keep the reason generic: don't surface raw paths / exception internals into cloud-visible agent context (codex r1 low).
90
+ Emit @{ ok = $false; error = 'ocr failed' }
91
+ exit 1
92
+ }