@acmecloud/core 1.0.8 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/llm/vision.d.ts +2 -2
- package/dist/llm/vision.js +16 -13
- package/dist/tools/browser.d.ts +1 -1
- package/dist/tools/browser.js +24 -23
- package/package.json +1 -1
- package/src/llm/vision.ts +47 -43
- package/src/tools/browser.ts +109 -95
package/dist/llm/vision.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { ModelConfig } from
|
|
1
|
+
import { ModelConfig } from "../config/index.js";
|
|
2
2
|
/**
|
|
3
3
|
* Analyzes an image using a specialized vision model and returns a textual description.
|
|
4
4
|
* This allows non-vision primary models to "see" via delegation.
|
|
5
5
|
*/
|
|
6
6
|
export declare function analyzeImage(imageData: string, // base64
|
|
7
|
-
config: ModelConfig): Promise<string>;
|
|
7
|
+
config: ModelConfig, abortSignal?: AbortSignal): Promise<string>;
|
package/dist/llm/vision.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import { generateText } from
|
|
2
|
-
import { getModel } from
|
|
1
|
+
import { generateText } from "ai";
|
|
2
|
+
import { getModel } from "./provider.js";
|
|
3
3
|
/**
|
|
4
4
|
* Analyzes an image using a specialized vision model and returns a textual description.
|
|
5
5
|
* This allows non-vision primary models to "see" via delegation.
|
|
6
6
|
*/
|
|
7
7
|
export async function analyzeImage(imageData, // base64
|
|
8
|
-
config) {
|
|
8
|
+
config, abortSignal) {
|
|
9
9
|
if (!config.visionProvider || !config.visionModel) {
|
|
10
10
|
return "Error: Vision model not configured. Please set visionProvider and visionModel in config.";
|
|
11
11
|
}
|
|
@@ -13,24 +13,27 @@ config) {
|
|
|
13
13
|
try {
|
|
14
14
|
const { text } = await generateText({
|
|
15
15
|
model,
|
|
16
|
-
abortSignal: AbortSignal.timeout(
|
|
17
|
-
maxRetries:
|
|
16
|
+
abortSignal: abortSignal || AbortSignal.timeout(90000), // 90 seconds timeout
|
|
17
|
+
maxRetries: 2,
|
|
18
18
|
messages: [
|
|
19
19
|
{
|
|
20
|
-
role:
|
|
20
|
+
role: "user",
|
|
21
21
|
content: [
|
|
22
|
-
{
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
22
|
+
{
|
|
23
|
+
type: "text",
|
|
24
|
+
text: "Please describe this screenshot in detail. Focus on the layout, visible text, interactive elements (buttons, inputs), and any apparent errors or status messages. This description will be used by another AI to understand the state of the web application.",
|
|
25
|
+
},
|
|
26
|
+
{ type: "image", image: imageData },
|
|
27
|
+
],
|
|
28
|
+
},
|
|
29
|
+
],
|
|
27
30
|
});
|
|
28
31
|
return `[Vision Model Analysis (${config.visionModel})]:\n${text}`;
|
|
29
32
|
}
|
|
30
33
|
catch (err) {
|
|
31
34
|
let msg = err.message;
|
|
32
|
-
if (err.name ===
|
|
33
|
-
msg = `Analysis timed out after
|
|
35
|
+
if (err.name === "AbortError" || err.message.includes("timeout")) {
|
|
36
|
+
msg = `Analysis timed out after 90s. The image might be too complex or the provider is slow.`;
|
|
34
37
|
}
|
|
35
38
|
return `Error during vision analysis: ${msg}`;
|
|
36
39
|
}
|
package/dist/tools/browser.d.ts
CHANGED
package/dist/tools/browser.js
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { chromium } from
|
|
2
|
-
import { analyzeImage } from
|
|
3
|
-
import { loadModelConfig } from
|
|
4
|
-
import * as fs from
|
|
5
|
-
import { existsSync, mkdirSync } from
|
|
6
|
-
import path from
|
|
1
|
+
import { chromium } from "playwright";
|
|
2
|
+
import { analyzeImage } from "../llm/vision.js";
|
|
3
|
+
import { loadModelConfig } from "../config/index.js";
|
|
4
|
+
import * as fs from "fs/promises";
|
|
5
|
+
import { existsSync, mkdirSync } from "fs";
|
|
6
|
+
import path from "path";
|
|
7
7
|
let browser = null;
|
|
8
8
|
let page = null;
|
|
9
9
|
async function getBrowser() {
|
|
@@ -13,61 +13,62 @@ async function getBrowser() {
|
|
|
13
13
|
}
|
|
14
14
|
if (!page) {
|
|
15
15
|
const context = await browser.newContext({
|
|
16
|
-
viewport: { width: 1280, height: 720 }
|
|
16
|
+
viewport: { width: 1280, height: 720 },
|
|
17
17
|
});
|
|
18
18
|
page = await context.newPage();
|
|
19
19
|
}
|
|
20
20
|
return { browser, page };
|
|
21
21
|
}
|
|
22
22
|
catch (err) {
|
|
23
|
-
if (err.message.includes(
|
|
23
|
+
if (err.message.includes("executable") ||
|
|
24
|
+
err.message.includes("not found")) {
|
|
24
25
|
throw new Error(`Browser not found. Please run: npx playwright install chromium`);
|
|
25
26
|
}
|
|
26
27
|
throw err;
|
|
27
28
|
}
|
|
28
29
|
}
|
|
29
|
-
export async function executeBrowserAction(args) {
|
|
30
|
+
export async function executeBrowserAction(args, abortSignal) {
|
|
30
31
|
const { page } = await getBrowser();
|
|
31
32
|
const config = loadModelConfig();
|
|
32
33
|
try {
|
|
33
34
|
switch (args.action) {
|
|
34
|
-
case
|
|
35
|
+
case "navigate":
|
|
35
36
|
if (!args.url)
|
|
36
|
-
return
|
|
37
|
-
await page.goto(args.url, { waitUntil:
|
|
37
|
+
return "Error: URL is required for navigate action.";
|
|
38
|
+
await page.goto(args.url, { waitUntil: "networkidle" });
|
|
38
39
|
const title = await page.title();
|
|
39
40
|
return `Successfully navigated to ${args.url}. Page title: ${title}`;
|
|
40
|
-
case
|
|
41
|
+
case "screenshot":
|
|
41
42
|
const screenshot = await page.screenshot({ fullPage: false });
|
|
42
|
-
const base64 = screenshot.toString(
|
|
43
|
+
const base64 = screenshot.toString("base64");
|
|
43
44
|
// Save to disk
|
|
44
|
-
const screenshotDir = path.resolve(process.cwd(),
|
|
45
|
+
const screenshotDir = path.resolve(process.cwd(), ".acmecode", "screenshots");
|
|
45
46
|
if (!existsSync(screenshotDir)) {
|
|
46
47
|
mkdirSync(screenshotDir, { recursive: true });
|
|
47
48
|
}
|
|
48
|
-
const filename = `screenshot_${new Date().toISOString().replace(/[:.]/g,
|
|
49
|
+
const filename = `screenshot_${new Date().toISOString().replace(/[:.]/g, "-")}.png`;
|
|
49
50
|
const filePath = path.join(screenshotDir, filename);
|
|
50
51
|
await fs.writeFile(filePath, screenshot);
|
|
51
52
|
const stats = `Screenshot saved to: ${path.relative(process.cwd(), filePath)}`;
|
|
52
53
|
// Delegate to vision model if configured
|
|
53
54
|
if (config.visionModel) {
|
|
54
|
-
const analysis = await analyzeImage(base64, config);
|
|
55
|
+
const analysis = await analyzeImage(base64, config, abortSignal);
|
|
55
56
|
return `${stats}\n\n${analysis}`;
|
|
56
57
|
}
|
|
57
58
|
return stats;
|
|
58
|
-
case
|
|
59
|
+
case "click":
|
|
59
60
|
if (!args.selector)
|
|
60
|
-
return
|
|
61
|
+
return "Error: Selector is required for click action.";
|
|
61
62
|
await page.click(args.selector);
|
|
62
63
|
return `Clicked element: ${args.selector}`;
|
|
63
|
-
case
|
|
64
|
+
case "type":
|
|
64
65
|
if (!args.selector || !args.text)
|
|
65
|
-
return
|
|
66
|
+
return "Error: Selector and text are required for type action.";
|
|
66
67
|
await page.fill(args.selector, args.text);
|
|
67
68
|
return `Typed "${args.text}" into ${args.selector}`;
|
|
68
|
-
case
|
|
69
|
+
case "scroll":
|
|
69
70
|
await page.mouse.wheel(0, 500);
|
|
70
|
-
return
|
|
71
|
+
return "Scrolled down.";
|
|
71
72
|
default:
|
|
72
73
|
return `Error: Unknown action "${args.action}"`;
|
|
73
74
|
}
|
package/package.json
CHANGED
package/src/llm/vision.ts
CHANGED
|
@@ -1,43 +1,47 @@
|
|
|
1
|
-
import { generateText } from
|
|
2
|
-
import { getModel } from
|
|
3
|
-
import { ModelConfig } from
|
|
4
|
-
|
|
5
|
-
/**
|
|
6
|
-
* Analyzes an image using a specialized vision model and returns a textual description.
|
|
7
|
-
* This allows non-vision primary models to "see" via delegation.
|
|
8
|
-
*/
|
|
9
|
-
export async function analyzeImage(
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
1
|
+
import { generateText } from "ai";
|
|
2
|
+
import { getModel } from "./provider.js";
|
|
3
|
+
import { ModelConfig } from "../config/index.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Analyzes an image using a specialized vision model and returns a textual description.
 * This allows non-vision primary models to "see" via delegation.
 *
 * Failures of the model call are reported through the returned string rather
 * than thrown, so the result can be handed straight back to the calling model.
 *
 * @param imageData - Base64-encoded image, sent to the model as an image part.
 * @param config - Model configuration; both `visionProvider` and `visionModel` must be set.
 * @param abortSignal - Optional caller-owned signal. When supplied it is used in
 *   place of the built-in 90-second timeout.
 * @returns The description prefixed with the vision model name, or a string
 *   starting with "Error" when the model is not configured or the call fails.
 */
export async function analyzeImage(
  imageData: string, // base64
  config: ModelConfig,
  abortSignal?: AbortSignal,
): Promise<string> {
  if (!config.visionProvider || !config.visionModel) {
    return "Error: Vision model not configured. Please set visionProvider and visionModel in config.";
  }

  const model = getModel(config.visionProvider as any, config.visionModel);

  try {
    const { text } = await generateText({
      model,
      // The 90-second timeout only applies when the caller did not bring a signal.
      abortSignal: abortSignal ?? AbortSignal.timeout(90000),
      maxRetries: 2,
      messages: [
        {
          role: "user",
          content: [
            {
              type: "text",
              text: "Please describe this screenshot in detail. Focus on the layout, visible text, interactive elements (buttons, inputs), and any apparent errors or status messages. This description will be used by another AI to understand the state of the web application.",
            },
            { type: "image", image: imageData },
          ],
        },
      ],
    });

    return `[Vision Model Analysis (${config.visionModel})]:\n${text}`;
  } catch (err: unknown) {
    const name = err instanceof Error ? err.name : "";
    const message = err instanceof Error ? err.message : String(err);

    // A caller-triggered abort is not a timeout; report it as such instead of
    // blaming the provider or the image.
    if (abortSignal?.aborted) {
      return "Error during vision analysis: Analysis was aborted before it completed.";
    }

    // AbortSignal.timeout() rejects with a "TimeoutError"; "AbortError" and the
    // message substring are kept so previously recognised failures still match.
    const timedOut =
      name === "TimeoutError" ||
      name === "AbortError" ||
      message.includes("timeout");

    const msg = timedOut
      ? `Analysis timed out after 90s. The image might be too complex or the provider is slow.`
      : message;
    return `Error during vision analysis: ${msg}`;
  }
}
|
package/src/tools/browser.ts
CHANGED
|
@@ -1,95 +1,109 @@
|
|
|
1
|
-
import { chromium, Browser, Page } from
|
|
2
|
-
import { analyzeImage } from
|
|
3
|
-
import { loadModelConfig } from
|
|
4
|
-
import * as fs from
|
|
5
|
-
import { existsSync, mkdirSync } from
|
|
6
|
-
import path from
|
|
7
|
-
|
|
8
|
-
let browser: Browser | null = null;
|
|
9
|
-
let page: Page | null = null;
|
|
10
|
-
|
|
11
|
-
async function getBrowser() {
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
await
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
}
|
|
1
|
+
import { chromium, Browser, Page } from "playwright";
|
|
2
|
+
import { analyzeImage } from "../llm/vision.js";
|
|
3
|
+
import { loadModelConfig } from "../config/index.js";
|
|
4
|
+
import * as fs from "fs/promises";
|
|
5
|
+
import { existsSync, mkdirSync } from "fs";
|
|
6
|
+
import path from "path";
|
|
7
|
+
|
|
8
|
+
let browser: Browser | null = null;
|
|
9
|
+
let page: Page | null = null;
|
|
10
|
+
|
|
11
|
+
/**
 * Returns the shared headless Chromium browser and page, creating them on
 * first use and re-creating them when the cached handles are no longer usable.
 *
 * @returns The module-level `browser` and `page` handles.
 * @throws Error with installation instructions when the failure message
 *   mentions "executable" or "not found"; any other failure is rethrown unchanged.
 */
async function getBrowser() {
  try {
    // A disconnected browser cannot serve new contexts, so launch a new one
    // and drop the page that belonged to the old process.
    if (!browser || !browser.isConnected()) {
      browser = await chromium.launch({ headless: true });
      page = null;
    }
    if (!page || page.isClosed()) {
      const context = await browser.newContext({
        viewport: { width: 1280, height: 720 },
      });
      page = await context.newPage();
    }
    return { browser, page };
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    if (message.includes("executable") || message.includes("not found")) {
      throw new Error(
        `Browser not found. Please run: npx playwright install chromium`,
      );
    }
    throw err;
  }
}
|
|
35
|
+
|
|
36
|
+
/**
 * Runs a single browser action against the shared page and reports the
 * outcome as a human-readable string.
 *
 * Supported actions: "navigate" (needs `url`), "screenshot", "click" (needs
 * `selector`), "type" (needs `selector` and `text`) and "scroll".
 *
 * Errors raised while performing the action are returned as
 * `Browser error: ...` strings. Acquiring the browser and loading the model
 * config happen before that handling, so their failures reject the promise.
 *
 * @param args - The action name and its action-specific arguments.
 * @param abortSignal - Optional signal forwarded to the vision analysis that
 *   follows a screenshot.
 * @returns A success message, or a message starting with "Error" /
 *   "Browser error" describing what went wrong.
 */
export async function executeBrowserAction(
  args: { action: string; url?: string; selector?: string; text?: string },
  abortSignal?: AbortSignal,
): Promise<string> {
  const { page } = await getBrowser();
  const config = loadModelConfig();

  try {
    // Each case has its own block so its `const` declarations stay scoped to
    // that case instead of sharing one switch-wide scope.
    switch (args.action) {
      case "navigate": {
        if (!args.url) return "Error: URL is required for navigate action.";
        await page.goto(args.url, { waitUntil: "networkidle" });
        const title = await page.title();
        return `Successfully navigated to ${args.url}. Page title: ${title}`;
      }

      case "screenshot": {
        const screenshot = await page.screenshot({ fullPage: false });
        const base64 = screenshot.toString("base64");

        // Save to disk
        const screenshotDir = path.resolve(
          process.cwd(),
          ".acmecode",
          "screenshots",
        );
        if (!existsSync(screenshotDir)) {
          mkdirSync(screenshotDir, { recursive: true });
        }
        // ':' and '.' in the ISO timestamp are replaced with '-' for the file name.
        const filename = `screenshot_${new Date().toISOString().replace(/[:.]/g, "-")}.png`;
        const filePath = path.join(screenshotDir, filename);
        await fs.writeFile(filePath, screenshot);

        const stats = `Screenshot saved to: ${path.relative(process.cwd(), filePath)}`;

        // Delegate to vision model if configured
        if (config.visionModel) {
          const analysis = await analyzeImage(base64, config, abortSignal);
          return `${stats}\n\n${analysis}`;
        }

        return stats;
      }

      case "click": {
        if (!args.selector)
          return "Error: Selector is required for click action.";
        await page.click(args.selector);
        return `Clicked element: ${args.selector}`;
      }

      case "type": {
        // Only a missing `text` is an error: an empty string is a legitimate
        // value that clears the field.
        if (!args.selector || args.text === undefined)
          return "Error: Selector and text are required for type action.";
        await page.fill(args.selector, args.text);
        return `Typed "${args.text}" into ${args.selector}`;
      }

      case "scroll": {
        await page.mouse.wheel(0, 500);
        return "Scrolled down.";
      }

      default:
        return `Error: Unknown action "${args.action}"`;
    }
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    return `Browser error: ${message}`;
  }
}
|
|
101
|
+
|
|
102
|
+
/**
 * Cleanup function to be called on process exit.
 *
 * Closes the shared browser, if one was launched, and clears the cached
 * browser and page handles. The handles are cleared before closing so that
 * they are dropped even when `close()` rejects, and so that a second call
 * made while the first is still closing does nothing.
 */
export async function closeBrowser(): Promise<void> {
  if (!browser) {
    return;
  }
  const active = browser;
  browser = null;
  page = null;
  await active.close();
}
|