@auxiora/screen 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +191 -0
- package/dist/analyzer.d.ts +16 -0
- package/dist/analyzer.d.ts.map +1 -0
- package/dist/analyzer.js +23 -0
- package/dist/analyzer.js.map +1 -0
- package/dist/automation.d.ts +38 -0
- package/dist/automation.d.ts.map +1 -0
- package/dist/automation.js +85 -0
- package/dist/automation.js.map +1 -0
- package/dist/capture.d.ts +19 -0
- package/dist/capture.d.ts.map +1 -0
- package/dist/capture.js +45 -0
- package/dist/capture.js.map +1 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/ocr.d.ts +15 -0
- package/dist/ocr.d.ts.map +1 -0
- package/dist/ocr.js +90 -0
- package/dist/ocr.js.map +1 -0
- package/dist/types.d.ts +99 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +10 -0
- package/dist/types.js.map +1 -0
- package/package.json +26 -0
- package/src/analyzer.ts +27 -0
- package/src/automation.ts +122 -0
- package/src/capture.ts +53 -0
- package/src/index.ts +17 -0
- package/src/ocr.ts +111 -0
- package/src/types.ts +110 -0
- package/tests/screen.test.ts +223 -0
- package/tests/wiring.test.ts +12 -0
- package/tsconfig.json +12 -0
- package/tsconfig.tsbuildinfo +1 -0
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/** Rectangular region on screen. */
|
|
2
|
+
export interface ScreenBounds {
|
|
3
|
+
x: number;
|
|
4
|
+
y: number;
|
|
5
|
+
width: number;
|
|
6
|
+
height: number;
|
|
7
|
+
}
|
|
8
|
+
/** A captured screenshot or region. */
|
|
9
|
+
export interface ScreenCapture {
|
|
10
|
+
/** Raw image data (PNG). */
|
|
11
|
+
image: Buffer;
|
|
12
|
+
/** When the capture was taken. */
|
|
13
|
+
timestamp: number;
|
|
14
|
+
/** Pixel dimensions. */
|
|
15
|
+
dimensions: {
|
|
16
|
+
width: number;
|
|
17
|
+
height: number;
|
|
18
|
+
};
|
|
19
|
+
}
|
|
20
|
+
/** Result of OCR text extraction. */
|
|
21
|
+
export interface OCRResult {
|
|
22
|
+
/** Full extracted text. */
|
|
23
|
+
text: string;
|
|
24
|
+
/** Detected text regions with positions. */
|
|
25
|
+
regions: OCRRegion[];
|
|
26
|
+
/** Overall confidence score (0-1). */
|
|
27
|
+
confidence: number;
|
|
28
|
+
}
|
|
29
|
+
/** A single region of detected text. */
|
|
30
|
+
export interface OCRRegion {
|
|
31
|
+
text: string;
|
|
32
|
+
bounds: ScreenBounds;
|
|
33
|
+
confidence: number;
|
|
34
|
+
}
|
|
35
|
+
/** A UI element detected on screen. */
|
|
36
|
+
export interface ScreenElement {
|
|
37
|
+
/** Element type (button, input, link, text, image, icon, etc.). */
|
|
38
|
+
type: string;
|
|
39
|
+
/** Bounding box on screen. */
|
|
40
|
+
bounds: ScreenBounds;
|
|
41
|
+
/** Visible text or label. */
|
|
42
|
+
text?: string;
|
|
43
|
+
/** Whether the element appears interactable. */
|
|
44
|
+
interactable: boolean;
|
|
45
|
+
}
|
|
46
|
+
/** Types of desktop automation actions. */
|
|
47
|
+
export type DesktopActionType = 'click' | 'type' | 'scroll' | 'keypress';
|
|
48
|
+
/** A desktop automation action to perform. */
|
|
49
|
+
export interface DesktopAction {
|
|
50
|
+
type: DesktopActionType;
|
|
51
|
+
/** Target location or element description. */
|
|
52
|
+
target?: {
|
|
53
|
+
x: number;
|
|
54
|
+
y: number;
|
|
55
|
+
} | string;
|
|
56
|
+
/** Action-specific parameters. */
|
|
57
|
+
params?: {
|
|
58
|
+
/** Text to type (for 'type' action). */
|
|
59
|
+
text?: string;
|
|
60
|
+
/** Key combo to press (for 'keypress' action). */
|
|
61
|
+
key?: string;
|
|
62
|
+
/** Scroll delta (for 'scroll' action). */
|
|
63
|
+
deltaX?: number;
|
|
64
|
+
deltaY?: number;
|
|
65
|
+
/** Mouse button (for 'click' action). */
|
|
66
|
+
button?: 'left' | 'right' | 'middle';
|
|
67
|
+
/** Number of clicks (for 'click' action). */
|
|
68
|
+
clickCount?: number;
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
/** Configuration for the screen system. */
|
|
72
|
+
export interface ScreenConfig {
|
|
73
|
+
/** Whether screen capture is allowed. */
|
|
74
|
+
captureEnabled: boolean;
|
|
75
|
+
/** Whether OCR is enabled. */
|
|
76
|
+
ocrEnabled: boolean;
|
|
77
|
+
/** Whether desktop automation is enabled. */
|
|
78
|
+
automationEnabled: boolean;
|
|
79
|
+
/** Maximum capture width in pixels. */
|
|
80
|
+
maxCaptureWidth: number;
|
|
81
|
+
/** Maximum capture height in pixels. */
|
|
82
|
+
maxCaptureHeight: number;
|
|
83
|
+
/** Trust level required for screen capture. */
|
|
84
|
+
captureRequiredTrust: number;
|
|
85
|
+
/** Trust level required for automation actions. */
|
|
86
|
+
automationRequiredTrust: number;
|
|
87
|
+
}
|
|
88
|
+
export declare const DEFAULT_SCREEN_CONFIG: ScreenConfig;
|
|
89
|
+
/** Interface for capture backends (Tauri bridge, mock, etc.). */
|
|
90
|
+
export interface CaptureBackend {
|
|
91
|
+
captureScreen(): Promise<ScreenCapture>;
|
|
92
|
+
captureRegion(bounds: ScreenBounds): Promise<ScreenCapture>;
|
|
93
|
+
captureWindow(title: string): Promise<ScreenCapture>;
|
|
94
|
+
}
|
|
95
|
+
/** Interface for vision model backend used by OCR and analyzer. */
|
|
96
|
+
export interface VisionBackend {
|
|
97
|
+
analyzeImage(image: Buffer, prompt: string): Promise<string>;
|
|
98
|
+
}
|
|
99
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,oCAAoC;AACpC,MAAM,WAAW,YAAY;IAC3B,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,uCAAuC;AACvC,MAAM,WAAW,aAAa;IAC5B,4BAA4B;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,wBAAwB;IACxB,UAAU,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;CAC/C;AAED,qCAAqC;AACrC,MAAM,WAAW,SAAS;IACxB,2BAA2B;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,4CAA4C;IAC5C,OAAO,EAAE,SAAS,EAAE,CAAC;IACrB,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,wCAAwC;AACxC,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,YAAY,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,uCAAuC;AACvC,MAAM,WAAW,aAAa;IAC5B,mEAAmE;IACnE,IAAI,EAAE,MAAM,CAAC;IACb,8BAA8B;IAC9B,MAAM,EAAE,YAAY,CAAC;IACrB,6BAA6B;IAC7B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,gDAAgD;IAChD,YAAY,EAAE,OAAO,CAAC;CACvB;AAED,2CAA2C;AAC3C,MAAM,MAAM,iBAAiB,GAAG,OAAO,GAAG,MAAM,GAAG,QAAQ,GAAG,UAAU,CAAC;AAEzE,8CAA8C;AAC9C,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,iBAAiB,CAAC;IACxB,8CAA8C;IAC9C,MAAM,CAAC,EAAE;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,MAAM,CAAC;IAC3C,kCAAkC;IAClC,MAAM,CAAC,EAAE;QACP,wCAAwC;QACxC,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,kDAAkD;QAClD,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,0CAA0C;QAC1C,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,yCAAyC;QACzC,MAAM,CAAC,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,CAAC;QACrC,6CAA6C;QAC7C,UAAU,CAAC,EAAE,MAAM,CAAC;KACrB,CAAC;CACH;AAED,2CAA2C;AAC3C,MAAM,WAAW,YAAY;IAC3B,yCAAyC;IACzC,cAAc,EAAE,OAAO,CAAC;IACxB,8BAA8B;IAC9B,UAAU,EAAE,OAAO,CAAC;IACpB,6CAA6C;IAC7C,iBAAiB,EAAE,OAAO,CAAC;IAC3B,uCAAuC;IACvC,eAAe,EAAE,MAAM,CAAC;IACxB,wCAAwC;IACxC,gBAAgB,EAAE,MAAM,CAAC;IACzB,+CAA+C;IAC/C,oBAAoB,EAAE,MAAM,CAAC;IAC7B,mDAAmD;IACnD,uBAAuB,EAAE,MAAM,CAAC;CACjC;AAED,eAAO,MAAM,qBAAqB,EAAE,YAQnC,CAAC;AAEF,iEAAiE;AACjE,MAAM,WAAW,cAAc;IAC7B,aAAa,IAAI,OAAO,CAAC,aAAa,CAAC,CAAC;IACxC,aAAa,CAAC,MAAM,EAAE,YAAY,GAAG,OAAO,CAAC,aAAa,CAAC,CAAC;IAC5D,aAAa,CAAC,KAAK,E
AAE,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,CAAC;CACtD;AAED,mEAAmE;AACnE,MAAM,WAAW,aAAa;IAC5B,YAAY,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CAC9D"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAyFA,MAAM,CAAC,MAAM,qBAAqB,GAAiB;IACjD,cAAc,EAAE,IAAI;IACpB,UAAU,EAAE,IAAI;IAChB,iBAAiB,EAAE,KAAK;IACxB,eAAe,EAAE,IAAI;IACrB,gBAAgB,EAAE,IAAI;IACtB,oBAAoB,EAAE,CAAC;IACvB,uBAAuB,EAAE,CAAC;CAC3B,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@auxiora/screen",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Screen capture, OCR, desktop automation, and screen analysis",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"dependencies": {
|
|
15
|
+
"@auxiora/core": "1.0.0",
|
|
16
|
+
"@auxiora/autonomy": "1.0.0"
|
|
17
|
+
},
|
|
18
|
+
"engines": {
|
|
19
|
+
"node": ">=22.0.0"
|
|
20
|
+
},
|
|
21
|
+
"scripts": {
|
|
22
|
+
"build": "tsc",
|
|
23
|
+
"clean": "rm -rf dist",
|
|
24
|
+
"typecheck": "tsc --noEmit"
|
|
25
|
+
}
|
|
26
|
+
}
|
package/src/analyzer.ts
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { VisionBackend } from './types.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Thin wrapper that forwards a screenshot plus a text prompt to a vision backend.
 */
export class ScreenAnalyzer {
  private vision: VisionBackend;

  /**
   * @param vision - Backend whose `analyzeImage` is called for every analysis.
   */
  constructor(vision: VisionBackend) {
    this.vision = vision;
  }

  /**
   * Ask the vision backend about a screenshot.
   *
   * @param image - Screenshot buffer handed to the backend unchanged.
   * @param question - Optional question; when omitted or empty, a generic
   *   "describe the screen" prompt is used instead.
   * @returns Whatever string the backend resolves with.
   */
  async analyzeScreen(image: Buffer, question?: string): Promise<string> {
    let prompt: string;
    if (question) {
      prompt = `Look at this screenshot and answer: ${question}`;
    } else {
      // Fallback prompt for a general description of the screen.
      prompt = [
        'Describe what is visible on this screen. Include any visible text, UI elements, ',
        'active applications, and notable content. Be concise but thorough.',
      ].join('');
    }

    return this.vision.analyzeImage(image, prompt);
  }
}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import type { TrustGate } from '@auxiora/autonomy';
|
|
2
|
+
import type { DesktopAction, ScreenConfig } from './types.js';
|
|
3
|
+
import { DEFAULT_SCREEN_CONFIG } from './types.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Outcome reported for a single automation action.
 */
export interface AutomationResult {
  /** The action this result refers to. */
  action: DesktopAction;
  /** True when the action ran without a reported failure. */
  success: boolean;
  /** Failure description; only set on unsuccessful results. */
  error?: string;
}
|
|
11
|
+
|
|
12
|
+
/**
 * Contract implemented by whatever actually drives the mouse and keyboard.
 */
export interface AutomationBackend {
  /** Click at the given coordinates with an optional button name and click count. */
  click(x: number, y: number, button?: string, clickCount?: number): Promise<void>;

  /** Scroll by the given horizontal and vertical deltas. */
  scroll(deltaX: number, deltaY: number): Promise<void>;

  /** Type the given text. */
  typeText(text: string): Promise<void>;

  /** Press the given key or key combo. */
  keypress(key: string): Promise<void>;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Desktop automation with trust-gated actions.
 *
 * Every public method funnels through {@link DesktopAutomation.execute}, which
 * checks the `automationEnabled` flag and then asks the TrustGate for
 * permission before calling the backend. Failures (disabled, denied, invalid
 * arguments, backend errors) are reported through the returned
 * {@link AutomationResult}; `execute` does not throw for them.
 */
export class DesktopAutomation {
  private backend: AutomationBackend;
  private gate: TrustGate;
  private config: ScreenConfig;

  /**
   * @param backend - Implementation that performs the input events.
   * @param gate - Trust gate consulted before every action.
   * @param config - Overrides merged on top of `DEFAULT_SCREEN_CONFIG`.
   */
  constructor(backend: AutomationBackend, gate: TrustGate, config?: Partial<ScreenConfig>) {
    this.backend = backend;
    this.gate = gate;
    this.config = { ...DEFAULT_SCREEN_CONFIG, ...config };
  }

  /**
   * Execute a desktop action (trust-gated).
   *
   * @param action - The action to perform.
   * @returns A result echoing `action`, with `success: false` and an `error`
   *   message when automation is disabled, the gate denies the action, the
   *   action's arguments are invalid, or the backend throws.
   */
  async execute(action: DesktopAction): Promise<AutomationResult> {
    const fail = (error: string): AutomationResult => ({ success: false, action, error });

    if (!this.config.automationEnabled) {
      return fail('Desktop automation is disabled');
    }

    const gateResult = this.gate.gate(
      'system',
      `screen:${action.type}`,
      this.config.automationRequiredTrust as 0 | 1 | 2 | 3 | 4,
    );
    if (!gateResult.allowed) {
      return { success: false, action, error: gateResult.message };
    }

    try {
      switch (action.type) {
        case 'click': {
          const target = action.target;
          // String targets (element descriptions) cannot be resolved here.
          if (!target || typeof target === 'string') {
            return fail('Click requires x,y coordinates');
          }
          // NaN/Infinity would otherwise reach the backend unchecked.
          if (!Number.isFinite(target.x) || !Number.isFinite(target.y)) {
            return fail('Click coordinates must be finite numbers');
          }
          const clickCount = action.params?.clickCount ?? 1;
          if (!Number.isInteger(clickCount) || clickCount < 1) {
            return fail('Click count must be a positive integer');
          }
          await this.backend.click(target.x, target.y, action.params?.button ?? 'left', clickCount);
          break;
        }
        case 'type': {
          const text = action.params?.text;
          // An empty string is treated the same as missing text.
          if (!text) {
            return fail('Type action requires text');
          }
          await this.backend.typeText(text);
          break;
        }
        case 'keypress': {
          const key = action.params?.key;
          if (!key) {
            return fail('Keypress action requires key');
          }
          await this.backend.keypress(key);
          break;
        }
        case 'scroll': {
          // Missing deltas default to 0 on that axis.
          await this.backend.scroll(action.params?.deltaX ?? 0, action.params?.deltaY ?? 0);
          break;
        }
        default:
          // Only reachable when a caller bypasses the type system.
          return fail(`Unknown action type: ${String((action as { type: unknown }).type)}`);
      }
      return { success: true, action };
    } catch (error) {
      const message = error instanceof Error ? error.message : 'Unknown error';
      return fail(message);
    }
  }

  /**
   * Click at coordinates (trust-gated).
   *
   * @param x - Horizontal coordinate passed to the backend.
   * @param y - Vertical coordinate passed to the backend.
   * @param button - Mouse button; defaults to `'left'`.
   * @param clickCount - Optional number of clicks (e.g. 2 for a double-click);
   *   when omitted a single click is performed.
   */
  async click(
    x: number,
    y: number,
    button: 'left' | 'right' | 'middle' = 'left',
    clickCount?: number,
  ): Promise<AutomationResult> {
    // Only attach clickCount when given so the echoed action keeps its old shape.
    const params = clickCount === undefined ? { button } : { button, clickCount };
    return this.execute({ type: 'click', target: { x, y }, params });
  }

  /** Type text (trust-gated). */
  async type(text: string): Promise<AutomationResult> {
    return this.execute({ type: 'type', params: { text } });
  }

  /** Press a key combo (trust-gated). */
  async keypress(key: string): Promise<AutomationResult> {
    return this.execute({ type: 'keypress', params: { key } });
  }

  /** Scroll (trust-gated). */
  async scroll(deltaX: number, deltaY: number): Promise<AutomationResult> {
    return this.execute({ type: 'scroll', params: { deltaX, deltaY } });
  }

  /**
   * Submit a click whose target is an element description (trust-gated).
   *
   * NOTE(review): `execute` has no way to resolve a description to
   * coordinates, so once the enabled and gate checks pass this currently
   * always resolves to a failed result with "Click requires x,y coordinates".
   * Resolve the element to coordinates first and call `click` instead.
   */
  async findAndClick(description: string): Promise<AutomationResult> {
    return this.execute({ type: 'click', target: description });
  }
}
|
package/src/capture.ts
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import type { CaptureBackend, ScreenBounds, ScreenCapture, ScreenConfig } from './types.js';
|
|
2
|
+
import { DEFAULT_SCREEN_CONFIG } from './types.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Screen capturer with an injectable backend.
 *
 * The backend performs the actual capture; this class enforces the
 * `captureEnabled` flag and validates arguments before delegating.
 */
export class ScreenCapturer {
  private backend: CaptureBackend;
  private config: ScreenConfig;

  /**
   * @param backend - Implementation that performs the captures.
   * @param config - Overrides merged on top of `DEFAULT_SCREEN_CONFIG`.
   */
  constructor(backend: CaptureBackend, config?: Partial<ScreenConfig>) {
    this.backend = backend;
    this.config = { ...DEFAULT_SCREEN_CONFIG, ...config };
  }

  /**
   * Capture the entire screen.
   *
   * @throws Error if capture is disabled in the config.
   */
  async captureScreen(): Promise<ScreenCapture> {
    this.assertCaptureEnabled();
    return this.backend.captureScreen();
  }

  /**
   * Capture a specific region of the screen.
   *
   * @param bounds - Region to capture. All four values must be finite, and
   *   width/height must be positive and within the configured maximums.
   * @throws Error if capture is disabled, the bounds are malformed, or the
   *   region exceeds `maxCaptureWidth` x `maxCaptureHeight`.
   */
  async captureRegion(bounds: ScreenBounds): Promise<ScreenCapture> {
    this.assertCaptureEnabled();

    const { x, y, width, height } = bounds;
    // NaN compares false against the maximums, so it must be rejected explicitly.
    const allFinite = [x, y, width, height].every((value) => Number.isFinite(value));
    if (!allFinite || width <= 0 || height <= 0) {
      throw new Error('Region bounds must be finite numbers with a positive width and height');
    }
    if (width > this.config.maxCaptureWidth || height > this.config.maxCaptureHeight) {
      throw new Error(
        `Region exceeds max capture dimensions (${this.config.maxCaptureWidth}x${this.config.maxCaptureHeight})`
      );
    }
    return this.backend.captureRegion(bounds);
  }

  /**
   * Capture a window by title.
   *
   * @param title - Window title; must contain at least one non-whitespace character.
   * @throws Error if capture is disabled or the title is empty/blank.
   */
  async captureWindow(title: string): Promise<ScreenCapture> {
    this.assertCaptureEnabled();
    if (!title || title.trim().length === 0) {
      throw new Error('Window title must not be empty');
    }
    return this.backend.captureWindow(title);
  }

  /** Get a shallow copy of the current config. */
  getConfig(): ScreenConfig {
    return { ...this.config };
  }

  /** Throws when the `captureEnabled` flag is off. */
  private assertCaptureEnabled(): void {
    if (!this.config.captureEnabled) {
      throw new Error('Screen capture is disabled');
    }
  }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
export type {
|
|
2
|
+
ScreenBounds,
|
|
3
|
+
ScreenCapture,
|
|
4
|
+
OCRResult,
|
|
5
|
+
OCRRegion,
|
|
6
|
+
ScreenElement,
|
|
7
|
+
DesktopAction,
|
|
8
|
+
DesktopActionType,
|
|
9
|
+
ScreenConfig,
|
|
10
|
+
CaptureBackend,
|
|
11
|
+
VisionBackend,
|
|
12
|
+
} from './types.js';
|
|
13
|
+
export { DEFAULT_SCREEN_CONFIG } from './types.js';
|
|
14
|
+
export { ScreenCapturer } from './capture.js';
|
|
15
|
+
export { OCREngine } from './ocr.js';
|
|
16
|
+
export { DesktopAutomation, type AutomationResult, type AutomationBackend } from './automation.js';
|
|
17
|
+
export { ScreenAnalyzer } from './analyzer.js';
|
package/src/ocr.ts
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import type { OCRResult, ScreenElement, VisionBackend } from './types.js';
|
|
2
|
+
|
|
3
|
+
/**
 * OCR engine that uses a vision model to extract text and detect UI elements.
 *
 * The model is prompted to answer in a line-oriented `|||`-delimited format;
 * lines that do not match the expected shape are skipped silently.
 */
export class OCREngine {
  private vision: VisionBackend;

  /**
   * @param vision - Backend whose `analyzeImage` is called for every request.
   */
  constructor(vision: VisionBackend) {
    this.vision = vision;
  }

  /**
   * Extract text from an image using the vision model.
   *
   * @param image - Image buffer handed to the backend unchanged.
   * @returns Parsed regions, their texts joined by newlines, and the mean
   *   per-region confidence (0 when no region could be parsed).
   */
  async extractText(image: Buffer): Promise<OCRResult> {
    const response = await this.vision.analyzeImage(
      image,
      'Extract all visible text from this screenshot. Return the text content organized by position, ' +
        'from top-left to bottom-right. For each distinct text block, provide the text and its approximate ' +
        'position as a percentage of the image dimensions (x%, y%, width%, height%). ' +
        'Format: TEXT|||x,y,w,h|||confidence\nOne per line.'
    );

    return this.parseTextResponse(response);
  }

  /**
   * Find UI elements in an image using the vision model.
   *
   * @param image - Image buffer handed to the backend unchanged.
   * @returns One entry per well-formed line of the model's answer.
   */
  async findElements(image: Buffer): Promise<ScreenElement[]> {
    const response = await this.vision.analyzeImage(
      image,
      'Identify all UI elements in this screenshot. For each element, provide:\n' +
        '- type (button, input, link, text, image, icon, checkbox, dropdown, menu, tab)\n' +
        '- approximate bounding box as percentages (x%, y%, width%, height%)\n' +
        '- visible text or label\n' +
        '- whether it appears interactable (true/false)\n' +
        'Format: type|||x,y,w,h|||text|||interactable\nOne per line.'
    );

    return this.parseElementsResponse(response);
  }

  /**
   * Parse `TEXT|||x,y,w,h|||confidence` lines.
   *
   * A missing or non-numeric confidence falls back to 0.8; numeric values are
   * clamped to [0, 1]. The overall confidence averages those normalized
   * per-region values, so a single malformed field cannot turn it into NaN.
   */
  private parseTextResponse(response: string): OCRResult {
    const regions: OCRResult['regions'] = [];
    let totalConfidence = 0;

    for (const line of response.split('\n')) {
      if (line.trim().length === 0) continue;

      const parts = line.split('|||').map(s => s.trim());
      if (parts.length < 2) continue;

      const bounds = this.parseBounds(parts[1]);
      if (!bounds) continue;

      const rawConfidence = parts[2] ? parseFloat(parts[2]) : Number.NaN;
      const confidence = Number.isNaN(rawConfidence) ? 0.8 : Math.max(0, Math.min(1, rawConfidence));

      regions.push({ text: parts[0], bounds, confidence });
      // Sum the normalized value, i.e. exactly what was stored on the region.
      totalConfidence += confidence;
    }

    const avgConfidence = regions.length > 0 ? totalConfidence / regions.length : 0;

    return {
      text: regions.map(r => r.text).join('\n'),
      regions,
      confidence: Math.max(0, Math.min(1, avgConfidence)),
    };
  }

  /**
   * Parse `type|||x,y,w,h|||text|||interactable` lines.
   *
   * An empty text field becomes `undefined`. The interactable flag is true
   * only for the word "true", compared case-insensitively because model
   * output capitalization varies.
   */
  private parseElementsResponse(response: string): ScreenElement[] {
    const elements: ScreenElement[] = [];

    for (const line of response.split('\n')) {
      if (line.trim().length === 0) continue;

      const parts = line.split('|||').map(s => s.trim());
      if (parts.length < 4) continue;

      const bounds = this.parseBounds(parts[1]);
      if (!bounds) continue;

      elements.push({
        type: parts[0],
        bounds,
        text: parts[2] || undefined,
        interactable: parts[3].toLowerCase() === 'true',
      });
    }

    return elements;
  }

  /**
   * Parse a comma-separated `x,y,w,h` list into rounded bounds.
   *
   * NOTE(review): the prompts ask the model for percentages, yet the values
   * are stored as-is (rounded) without conversion to pixels; confirm what
   * callers expect.
   *
   * @returns The bounds, or `undefined` when fewer than four values are
   *   present or any value is not a number.
   */
  private parseBounds(boundsStr: string): ScreenElement['bounds'] | undefined {
    const coords = boundsStr.split(',').map(s => parseFloat(s.trim()));
    if (coords.length < 4 || coords.some(n => Number.isNaN(n))) return undefined;

    return {
      x: Math.round(coords[0]),
      y: Math.round(coords[1]),
      width: Math.round(coords[2]),
      height: Math.round(coords[3]),
    };
  }
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
 * A rectangle on screen, given as a position plus a size.
 */
export interface ScreenBounds {
  /** Horizontal coordinate of the region. */
  x: number;
  /** Vertical coordinate of the region. */
  y: number;
  /** Horizontal extent of the region. */
  width: number;
  /** Vertical extent of the region. */
  height: number;
}
|
|
8
|
+
|
|
9
|
+
/**
 * One captured image (full screen, region, or window) with its metadata.
 */
export interface ScreenCapture {
  /** Raw image data (PNG). */
  image: Buffer;
  /** Pixel dimensions. */
  dimensions: {
    width: number;
    height: number;
  };
  /** When the capture was taken. */
  timestamp: number;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Outcome of running text extraction over an image.
 */
export interface OCRResult {
  /** Detected text regions with positions. */
  regions: OCRRegion[];
  /** Full extracted text. */
  text: string;
  /** Overall confidence score (0-1). */
  confidence: number;
}
|
|
28
|
+
|
|
29
|
+
/**
 * One block of detected text together with where it was found.
 */
export interface OCRRegion {
  /** Where the text was detected. */
  bounds: ScreenBounds;
  /** The detected text. */
  text: string;
  /** Confidence reported for this region. */
  confidence: number;
}
|
|
35
|
+
|
|
36
|
+
/**
 * A user-interface element found in a screenshot.
 */
export interface ScreenElement {
  /** Element type (button, input, link, text, image, icon, etc.). */
  type: string;
  /** Visible text or label. */
  text?: string;
  /** Bounding box on screen. */
  bounds: ScreenBounds;
  /** Whether the element appears interactable. */
  interactable: boolean;
}
|
|
47
|
+
|
|
48
|
+
/**
 * The kinds of desktop automation action that can be requested.
 */
export type DesktopActionType =
  | 'click'
  | 'type'
  | 'scroll'
  | 'keypress';
|
|
50
|
+
|
|
51
|
+
/**
 * Description of one desktop automation action to carry out.
 */
export interface DesktopAction {
  /** Which kind of action this is. */
  type: DesktopActionType;

  /** Target location or element description. */
  target?:
    | {
        x: number;
        y: number;
      }
    | string;

  /** Action-specific parameters. */
  params?: {
    /** Mouse button (for 'click' action). */
    button?: 'left' | 'right' | 'middle';
    /** Number of clicks (for 'click' action). */
    clickCount?: number;
    /** Text to type (for 'type' action). */
    text?: string;
    /** Key combo to press (for 'keypress' action). */
    key?: string;
    /** Horizontal scroll delta (for 'scroll' action). */
    deltaX?: number;
    /** Vertical scroll delta (for 'scroll' action). */
    deltaY?: number;
  };
}
|
|
71
|
+
|
|
72
|
+
/**
 * Settings shared by the capture, OCR, and automation parts of the screen system.
 */
export interface ScreenConfig {
  // Feature switches

  /** Whether screen capture is allowed. */
  captureEnabled: boolean;
  /** Whether OCR is enabled. */
  ocrEnabled: boolean;
  /** Whether desktop automation is enabled. */
  automationEnabled: boolean;

  // Capture size limits

  /** Maximum capture width in pixels. */
  maxCaptureWidth: number;
  /** Maximum capture height in pixels. */
  maxCaptureHeight: number;

  // Trust requirements

  /** Trust level required for screen capture. */
  captureRequiredTrust: number;
  /** Trust level required for automation actions. */
  automationRequiredTrust: number;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Defaults applied when a consumer supplies only a partial config.
 *
 * Capture and OCR are on, automation is off, captures are limited to
 * 3840x2160, and capture/automation require trust levels 2 and 3.
 *
 * The object is frozen: it is shared by every consumer that spreads it into
 * its own config, so an accidental write (for example flipping
 * `automationEnabled`) must not be able to change the defaults globally.
 * Spread it into a new object to customize.
 */
export const DEFAULT_SCREEN_CONFIG: ScreenConfig = Object.freeze({
  captureEnabled: true,
  ocrEnabled: true,
  automationEnabled: false,
  maxCaptureWidth: 3840,
  maxCaptureHeight: 2160,
  captureRequiredTrust: 2,
  automationRequiredTrust: 3,
});
|
|
99
|
+
|
|
100
|
+
/**
 * Contract for capture backends (Tauri bridge, mock, etc.).
 */
export interface CaptureBackend {
  /** Capture the window with the given title. */
  captureWindow(title: string): Promise<ScreenCapture>;

  /** Capture the region described by `bounds`. */
  captureRegion(bounds: ScreenBounds): Promise<ScreenCapture>;

  /** Capture the screen. */
  captureScreen(): Promise<ScreenCapture>;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Contract for the vision model backend used by OCR and the analyzer.
 */
export interface VisionBackend {
  /**
   * Run the model over an image with a text prompt.
   *
   * @param image - Image data to analyze.
   * @param prompt - Instruction or question for the model.
   * @returns The model's textual answer.
   */
  analyzeImage(image: Buffer, prompt: string): Promise<string>;
}
|