alvin-bot 4.20.2 → 4.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +81 -0
- package/README.md +7 -0
- package/bin/cli.js +64 -15
- package/dist/handlers/commands.js +6 -1
- package/dist/services/embeddings/auto-detect.js +74 -0
- package/dist/services/embeddings/fts5.js +108 -0
- package/dist/services/embeddings/gemini.js +65 -0
- package/dist/services/embeddings/index.js +499 -0
- package/dist/services/embeddings/ollama.js +78 -0
- package/dist/services/embeddings/openai.js +49 -0
- package/dist/services/embeddings/provider.js +22 -0
- package/dist/services/embeddings/vector-base.js +113 -0
- package/dist/services/embeddings.js +6 -502
- package/dist/services/memory-inject-mode.js +43 -0
- package/dist/services/memory-layers.js +24 -15
- package/dist/services/memory.js +19 -13
- package/package.json +1 -1
- package/skills/agent-browser/SKILL.md +183 -0
- package/skills/browse/SKILL.md +8 -0
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
import fs from "fs";
|
|
18
18
|
import path from "path";
|
|
19
19
|
import { IDENTITY_FILE, PREFERENCES_FILE, PROJECTS_MEMORY_DIR, MEMORY_FILE, } from "../paths.js";
|
|
20
|
+
import { getEffectiveInjectMode } from "./memory-inject-mode.js";
|
|
20
21
|
const MAX_L0_L1_CHARS = 5000;
|
|
21
22
|
const MAX_L2_PROJECT_CHARS = 1500;
|
|
22
23
|
const MAX_L2_TOTAL_CHARS = 3000;
|
|
@@ -96,6 +97,9 @@ export function buildLayeredContext(query) {
|
|
|
96
97
|
const layers = loadMemoryLayers();
|
|
97
98
|
const parts = [];
|
|
98
99
|
let l0l1Chars = 0;
|
|
100
|
+
// identity.md (L0) and preferences.md (L1) are ALWAYS plain-text injected,
|
|
101
|
+
// regardless of inject mode. They're tiny, manually curated, and contain
|
|
102
|
+
// always-on rules that semantic search may miss for short / generic queries.
|
|
99
103
|
if (layers.identity) {
|
|
100
104
|
const truncated = layers.identity.length > MAX_L0_L1_CHARS
|
|
101
105
|
? layers.identity.slice(0, MAX_L0_L1_CHARS) + "\n[...truncated]"
|
|
@@ -111,21 +115,26 @@ export function buildLayeredContext(query) {
|
|
|
111
115
|
parts.push("## Preferences (L1)\n" + truncated);
|
|
112
116
|
l0l1Chars += truncated.length;
|
|
113
117
|
}
|
|
114
|
-
//
|
|
115
|
-
//
|
|
116
|
-
//
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
118
|
+
// The monolithic MEMORY.md plain-text inject is gated by the effective
|
|
119
|
+
// inject mode (v4.22):
|
|
120
|
+
// legacy → inject as before (full or secondary, depending on split-file presence)
|
|
121
|
+
// sqlite → skip; the same content lives in the SQLite store and is
|
|
122
|
+
// surfaced on-demand via searchMemory() in personality.ts
|
|
123
|
+
const mode = getEffectiveInjectMode();
|
|
124
|
+
if (mode === "legacy" && layers.longTerm) {
|
|
125
|
+
if (!layers.identity && !layers.preferences) {
|
|
126
|
+
const truncated = layers.longTerm.length > MAX_L0_L1_CHARS
|
|
127
|
+
? layers.longTerm.slice(0, MAX_L0_L1_CHARS) + "\n[...truncated]"
|
|
128
|
+
: layers.longTerm;
|
|
129
|
+
parts.push("## Long-term Memory (L1, monolithic)\n" + truncated);
|
|
130
|
+
}
|
|
131
|
+
else {
|
|
132
|
+
const SECONDARY_CAP = 1500;
|
|
133
|
+
const truncated = layers.longTerm.length > SECONDARY_CAP
|
|
134
|
+
? layers.longTerm.slice(0, SECONDARY_CAP) + "\n[...truncated]"
|
|
135
|
+
: layers.longTerm;
|
|
136
|
+
parts.push("## Long-term Memory (L1, legacy MEMORY.md)\n" + truncated);
|
|
137
|
+
}
|
|
129
138
|
}
|
|
130
139
|
// L2: project-specific, only when a query is provided
|
|
131
140
|
if (query && layers.projects.length > 0) {
|
package/dist/services/memory.js
CHANGED
|
@@ -12,6 +12,7 @@ import { resolve } from "path";
|
|
|
12
12
|
import { MEMORY_DIR, MEMORY_FILE } from "../paths.js";
|
|
13
13
|
import { reindexMemory } from "./embeddings.js";
|
|
14
14
|
import { buildLayeredContext } from "./memory-layers.js";
|
|
15
|
+
import { getEffectiveInjectMode } from "./memory-inject-mode.js";
|
|
15
16
|
// Ensure dirs exist
|
|
16
17
|
if (!fs.existsSync(MEMORY_DIR))
|
|
17
18
|
fs.mkdirSync(MEMORY_DIR, { recursive: true });
|
|
@@ -78,23 +79,28 @@ export function appendDailyLog(entry) {
|
|
|
78
79
|
*/
|
|
79
80
|
export function buildMemoryContext(query) {
|
|
80
81
|
const parts = [];
|
|
81
|
-
|
|
82
|
+
const mode = getEffectiveInjectMode();
|
|
83
|
+
// L0+L1 (+ matched L2 if query) via layered loader. The loader itself
|
|
84
|
+
// respects MEMORY_INJECT_MODE for the monolithic MEMORY.md slice.
|
|
82
85
|
const layered = buildLayeredContext(query);
|
|
83
86
|
if (layered) {
|
|
84
87
|
parts.push(layered);
|
|
85
88
|
}
|
|
86
|
-
//
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
const
|
|
97
|
-
|
|
89
|
+
// Daily logs are bulk-injected only in legacy mode. In sqlite mode they're
|
|
90
|
+
// discoverable via searchMemory() — every log file is indexed individually
|
|
91
|
+
// and surfaced when relevant to the user's query.
|
|
92
|
+
if (mode === "legacy") {
|
|
93
|
+
const todayLog = loadDailyLog();
|
|
94
|
+
if (todayLog) {
|
|
95
|
+
const truncated = todayLog.length > 1500 ? todayLog.slice(-1500) : todayLog;
|
|
96
|
+
parts.push(`## Today's Log\n${truncated}`);
|
|
97
|
+
}
|
|
98
|
+
const yesterday = new Date(Date.now() - 86_400_000).toISOString().slice(0, 10);
|
|
99
|
+
const yesterdayLog = loadDailyLog(yesterday);
|
|
100
|
+
if (yesterdayLog) {
|
|
101
|
+
const truncated = yesterdayLog.length > 500 ? yesterdayLog.slice(-500) : yesterdayLog;
|
|
102
|
+
parts.push(`## Yesterday's Log (summary)\n${truncated}`);
|
|
103
|
+
}
|
|
98
104
|
}
|
|
99
105
|
if (parts.length === 0)
|
|
100
106
|
return "";
|
package/package.json
CHANGED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Agent Browser (Snapshot+Ref)
|
|
3
|
+
description: Token-efficient browser automation via the `agent-browser` CLI (Vercel Labs). Uses accessibility-tree snapshots with @eN refs (~200–400 tokens per page) instead of raw HTML parsing — typically 90%+ cheaper than Playwright/Puppeteer. Use for click-fill-extract on public pages, single-page test flows, structured form submission, and screenshots-with-refs. Optional dependency — only active if `agent-browser` is on the PATH; otherwise the regular Browser Automation skill takes over.
|
|
4
|
+
triggers: snapshot the page, get refs, list interactive elements, click @e, fill @e, agent-browser, click button on, click the button, fill in the field, extract from page, find on page, scrape page interactively, visit and click, open page and click, navigate and fill, semantic locator, accessibility tree, snapshot+ref, schau auf der Seite nach, klicke auf den Button, fülle das Feld, formular ausfüllen
|
|
5
|
+
priority: 9
|
|
6
|
+
category: automation
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Agent Browser — Token-Efficient Snapshot+Ref Workflow
|
|
10
|
+
|
|
11
|
+
Use this skill when interactive browser automation is needed (click, fill,
|
|
12
|
+
extract, screenshot) AND `agent-browser` is installed. The accessibility-tree
|
|
13
|
+
snapshot makes per-page interaction roughly an order of magnitude cheaper in
|
|
14
|
+
tokens than parsing rendered HTML with Playwright.
|
|
15
|
+
|
|
16
|
+
## Pre-flight: is the CLI installed?
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
command -v agent-browser >/dev/null 2>&1 \
|
|
20
|
+
&& echo "agent-browser ok" \
|
|
21
|
+
|| echo "fall back to the Browser Automation skill"
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
If absent: **stop and use the regular Browser Automation skill** (Tier 1
|
|
25
|
+
Stealth / Tier 2 CDP). Don't suggest installing `agent-browser` unless the user asks —
|
|
26
|
+
it's an opt-in tool; see `alvin-bot doctor` for installation hints.
|
|
27
|
+
|
|
28
|
+
## Core loop
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
agent-browser open <url>
|
|
32
|
+
agent-browser snapshot -i # interactive elements, with @e1..@eN refs
|
|
33
|
+
agent-browser click @e3 # act on a ref
|
|
34
|
+
agent-browser snapshot -i # CRITICAL — re-snapshot after every page change
|
|
35
|
+
agent-browser close
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Refs (`@e1`, `@e2`, …) are **assigned fresh every snapshot**. They go stale
|
|
39
|
+
the moment the page changes (click that navigates, form submit, dynamic
|
|
40
|
+
re-render, modal open). Always re-snapshot before the next ref interaction.
|
|
41
|
+
Forgetting this single rule is the most common pitfall.
|
|
42
|
+
|
|
43
|
+
A snapshot looks like:
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
Page: Example - Log in
|
|
47
|
+
URL: https://example.com/login
|
|
48
|
+
|
|
49
|
+
@e1 [heading] "Log in"
|
|
50
|
+
@e2 [form]
|
|
51
|
+
@e3 [input type="email"] placeholder="Email"
|
|
52
|
+
@e4 [input type="password"] placeholder="Password"
|
|
53
|
+
@e5 [button type="submit"] "Continue"
|
|
54
|
+
@e6 [link] "Forgot password?"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Common patterns
|
|
58
|
+
|
|
59
|
+
### Read a page
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
agent-browser snapshot -i # interactive only (preferred)
|
|
63
|
+
agent-browser snapshot -i -u # include href URLs on links
|
|
64
|
+
agent-browser snapshot -i --json # machine-readable
|
|
65
|
+
agent-browser get text @e1 # visible text of an element
|
|
66
|
+
agent-browser get attr @e10 href # any attribute
|
|
67
|
+
agent-browser get url # current URL
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Interact
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
agent-browser click @e1
|
|
74
|
+
agent-browser fill @e2 "user@example.com" # clear + type
|
|
75
|
+
agent-browser type @e2 " more text" # type without clearing
|
|
76
|
+
agent-browser press Enter
|
|
77
|
+
agent-browser select @e4 "option-value"
|
|
78
|
+
agent-browser upload @e5 file.pdf
|
|
79
|
+
agent-browser scroll down 500
|
|
80
|
+
agent-browser screenshot result.png
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Wait for the right thing (most failures come from bad waits)
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
agent-browser wait @e1 # until an element appears
|
|
87
|
+
agent-browser wait --text "Success" # until specific text on the page
|
|
88
|
+
agent-browser wait --url "**/dashboard" # until URL matches glob
|
|
89
|
+
agent-browser wait --load networkidle # post-navigation catch-all
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Avoid bare `wait 2000` except in throwaway debugging. Default timeout: 25 s.
|
|
93
|
+
|
|
94
|
+
### Find by semantics when refs aren't ergonomic
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
agent-browser find role button click --name "Submit"
|
|
98
|
+
agent-browser find text "Sign In" click --exact
|
|
99
|
+
agent-browser find label "Email" fill "user@example.com"
|
|
100
|
+
agent-browser find placeholder "Search" type "query"
|
|
101
|
+
agent-browser find testid "submit-btn" click
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Multiple isolated browser sessions (parallel users)
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
agent-browser --session a open https://app.example.com
|
|
108
|
+
agent-browser --session b open https://app.example.com
|
|
109
|
+
agent-browser --session a fill @e1 "alice@test.com"
|
|
110
|
+
agent-browser --session b fill @e1 "bob@test.com"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Persist login across runs
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Save once after a successful login:
|
|
117
|
+
agent-browser state save ./auth.json
|
|
118
|
+
|
|
119
|
+
# Resume already-logged-in:
|
|
120
|
+
agent-browser --state ./auth.json open https://app.example.com
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Auth vault (don't put passwords in shell history)
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
agent-browser auth save my-app --url https://app.example.com/login \
|
|
127
|
+
--username user@example.com --password-stdin
|
|
128
|
+
# (paste password, Ctrl+D)
|
|
129
|
+
|
|
130
|
+
agent-browser auth login my-app
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Iframes
|
|
134
|
+
|
|
135
|
+
Iframes are inlined in the snapshot — refs work transparently. To scope a
|
|
136
|
+
snapshot to one iframe:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
agent-browser frame @e3
|
|
140
|
+
agent-browser snapshot -i
|
|
141
|
+
agent-browser frame main
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Mock network (testing)
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
agent-browser network route "**/api/users" --body '{"users":[]}'
|
|
148
|
+
agent-browser network route "**/analytics" --abort
|
|
149
|
+
agent-browser network har start /tmp/trace.har
|
|
150
|
+
# ... do stuff ...
|
|
151
|
+
agent-browser network har stop
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## When NOT to use this skill
|
|
155
|
+
|
|
156
|
+
| Situation | Skill |
|
|
157
|
+
|---|---|
|
|
158
|
+
| Bot-protected site (Cloudflare, DataDome) | regular **Browser Automation** skill, Tier 1 Stealth |
|
|
159
|
+
| Logged-in personal account on LinkedIn / Gmail | **Browser Automation**, Tier 2 CDP (`alvin-bot browser …`) |
|
|
160
|
+
| User wants to watch a complex flow live | **Browser Automation**, Tier 3 Extension |
|
|
161
|
+
| Static HTML / public JSON / RSS / API | `curl` / WebFetch — no browser engine needed |
|
|
162
|
+
|
|
163
|
+
agent-browser is great for **task automation on cooperative pages** (your
|
|
164
|
+
own apps, public data sites, form submissions). It is *not* a stealth tool.
|
|
165
|
+
|
|
166
|
+
## Diagnostics
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
agent-browser doctor # full env check
|
|
170
|
+
agent-browser doctor --quick # local-only
|
|
171
|
+
agent-browser dashboard start # observability UI on :4848
|
|
172
|
+
agent-browser skills get core # the upstream tool's own usage guide
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## One-liner sanity test
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
agent-browser open https://example.com \
|
|
179
|
+
&& agent-browser snapshot -i \
|
|
180
|
+
&& agent-browser close
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Expect two `@e` refs (heading + link). If that works, the tool is healthy.
|
package/skills/browse/SKILL.md
CHANGED
|
@@ -15,12 +15,20 @@ Du hast drei Browser-Strategien plus WebFetch. **Wähle die billigste passende S
|
|
|
15
15
|
| Task | Tool | Warum |
|
|
16
16
|
|------|------|-------|
|
|
17
17
|
| Einzelne öffentliche Seite, nur Text | `curl` oder WebFetch | Am schnellsten, keine Browser-Engine |
|
|
18
|
+
| Interaktiv (klicken/füllen/extrahieren) auf kooperativer Seite | **Tier 1.5 agent-browser** *(falls installiert)* | Snapshot+Ref-Workflow ist ~90 % token-günstiger als rohes Playwright. Siehe Skill „Agent Browser“. |
|
|
18
19
|
| Öffentliche Seite mit JS / Cloudflare | **Tier 1 Stealth** | Headless + Fingerprint-Masking |
|
|
19
20
|
| Login-pflichtige Seite (LinkedIn, Gmail, …) | **Tier 2 CDP** | Echtes Chromium, persistente Cookies |
|
|
20
21
|
| Komplexer Multi-Step-Flow, User soll zusehen | **Tier 3 Extension** | Nur in interaktiven CLI-Sessions |
|
|
21
22
|
|
|
22
23
|
**NIEMALS** nacktes `node -e "const {chromium}…"` für externe Seiten — wird sofort geblockt.
|
|
23
24
|
|
|
25
|
+
**Vorab prüfen ob agent-browser verfügbar ist:**
|
|
26
|
+
```bash
|
|
27
|
+
command -v agent-browser >/dev/null 2>&1 && echo "Tier 1.5 verfügbar"
|
|
28
|
+
```
|
|
29
|
+
Falls ja und der Task „klick X, lies Y, fülle Z aus“ lautet → den `agent-browser`-Skill nehmen.
|
|
30
|
+
Falls nein → mit Tier 1/2/3 weitermachen wie unten. Installation auf Wunsch des Users: `npm i -g agent-browser && agent-browser install`.
|
|
31
|
+
|
|
24
32
|
---
|
|
25
33
|
|
|
26
34
|
## Tier 0 — curl / WebFetch (schnellster Pfad)
|