typeclaw 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -15
- package/auth.schema.json +113 -0
- package/package.json +1 -1
- package/secrets.schema.json +113 -0
- package/src/agent/auth.ts +4 -2
- package/src/agent/index.ts +16 -28
- package/src/agent/model-fallback.ts +127 -0
- package/src/agent/session-meta.ts +1 -1
- package/src/agent/session-origin.ts +3 -2
- package/src/agent/tools/curl-impersonate.ts +300 -0
- package/src/agent/tools/ddg.ts +13 -88
- package/src/agent/tools/webfetch/fetch.ts +105 -2
- package/src/agent/tools/webfetch/tool.ts +4 -0
- package/src/bundled-plugins/agent-browser/shim.ts +47 -0
- package/src/bundled-plugins/backup/subagents.ts +2 -0
- package/src/bundled-plugins/memory/README.md +49 -12
- package/src/bundled-plugins/memory/citation-superset.ts +63 -0
- package/src/bundled-plugins/memory/dreaming.ts +105 -17
- package/src/bundled-plugins/memory/index.ts +2 -2
- package/src/bundled-plugins/memory/memory-logger.ts +45 -26
- package/src/bundled-plugins/memory/strength.ts +127 -0
- package/src/bundled-plugins/memory/topics.ts +75 -0
- package/src/bundled-plugins/security/index.ts +88 -43
- package/src/bundled-plugins/security/permissions.ts +36 -0
- package/src/bundled-plugins/security/policies/git-exfil.ts +20 -0
- package/src/bundled-plugins/security/policies/outbound-secret-scan.ts +12 -0
- package/src/bundled-plugins/security/policies/prompt-injection.ts +23 -3
- package/src/bundled-plugins/security/policies/secret-exfil-bash.ts +7 -0
- package/src/bundled-plugins/security/policies/secret-exfil-read.ts +6 -0
- package/src/bundled-plugins/security/policies/session-search-secrets.ts +9 -0
- package/src/bundled-plugins/security/policies/ssrf.ts +6 -0
- package/src/bundled-plugins/security/policies/system-prompt-leak.ts +7 -0
- package/src/channels/adapters/github/auth-app.ts +120 -0
- package/src/channels/adapters/github/auth-pat.ts +50 -0
- package/src/channels/adapters/github/auth.ts +33 -0
- package/src/channels/adapters/github/channel-resolver.ts +30 -0
- package/src/channels/adapters/github/dedup.ts +26 -0
- package/src/channels/adapters/github/event-allowlist.ts +8 -0
- package/src/channels/adapters/github/fetch-attachment.ts +5 -0
- package/src/channels/adapters/github/history.ts +63 -0
- package/src/channels/adapters/github/inbound.ts +286 -0
- package/src/channels/adapters/github/index.ts +370 -0
- package/src/channels/adapters/github/managed-path.ts +54 -0
- package/src/channels/adapters/github/membership.ts +35 -0
- package/src/channels/adapters/github/outbound.ts +145 -0
- package/src/channels/adapters/github/webhook-register.ts +349 -0
- package/src/channels/manager.ts +94 -9
- package/src/channels/router.ts +194 -28
- package/src/channels/schema.ts +31 -1
- package/src/channels/tunnel-bridge.ts +51 -0
- package/src/channels/types.ts +3 -1
- package/src/cli/builtins.ts +28 -0
- package/src/cli/channel.ts +511 -25
- package/src/cli/container-command-client.ts +244 -0
- package/src/cli/cron.ts +173 -0
- package/src/cli/host-command-runner.ts +150 -0
- package/src/cli/index.ts +42 -1
- package/src/cli/init.ts +400 -67
- package/src/cli/model.ts +14 -4
- package/src/cli/oauth-callbacks.ts +49 -0
- package/src/cli/plugin-command-help.ts +49 -0
- package/src/cli/plugin-commands-dispatch.ts +112 -0
- package/src/cli/plugin-commands.ts +118 -0
- package/src/cli/provider.ts +3 -20
- package/src/cli/tui.ts +10 -2
- package/src/cli/tunnel.ts +533 -0
- package/src/cli/ui.ts +8 -3
- package/src/config/config.ts +134 -24
- package/src/config/models-mutation.ts +42 -8
- package/src/config/providers-mutation.ts +12 -8
- package/src/container/start.ts +48 -4
- package/src/cron/bridge.ts +136 -0
- package/src/cron/consumer.ts +174 -48
- package/src/cron/index.ts +19 -2
- package/src/cron/list.ts +105 -0
- package/src/cron/scheduler.ts +12 -3
- package/src/cron/schema.ts +11 -3
- package/src/doctor/checks.ts +0 -50
- package/src/init/dockerfile.ts +165 -13
- package/src/init/ensure-deps.ts +15 -4
- package/src/init/github-webhook-install.ts +109 -0
- package/src/init/hatching.ts +2 -2
- package/src/init/index.ts +519 -12
- package/src/init/oauth-login.ts +17 -3
- package/src/init/run-bun-install.ts +17 -3
- package/src/init/run-owner-claim.ts +11 -2
- package/src/permissions/builtins.ts +29 -2
- package/src/permissions/match-rule.ts +24 -2
- package/src/permissions/permissions.ts +24 -7
- package/src/permissions/resolve.ts +1 -0
- package/src/plugin/define.ts +44 -1
- package/src/plugin/index.ts +18 -3
- package/src/plugin/manager.ts +16 -0
- package/src/plugin/registry.ts +85 -3
- package/src/plugin/types.ts +144 -1
- package/src/plugin/zod-introspect.ts +100 -0
- package/src/role-claim/match-rule.ts +2 -1
- package/src/run/index.ts +112 -4
- package/src/secrets/index.ts +1 -1
- package/src/secrets/schema.ts +21 -0
- package/src/server/command-runner.ts +476 -0
- package/src/server/index.ts +388 -5
- package/src/shared/index.ts +8 -0
- package/src/shared/protocol.ts +80 -1
- package/src/skills/typeclaw-channel-github/SKILL.md +24 -0
- package/src/skills/typeclaw-config/SKILL.md +27 -26
- package/src/skills/typeclaw-cron/SKILL.md +234 -3
- package/src/skills/typeclaw-memory/SKILL.md +25 -15
- package/src/skills/typeclaw-monorepo/SKILL.md +2 -2
- package/src/skills/typeclaw-permissions/SKILL.md +35 -16
- package/src/skills/typeclaw-plugins/SKILL.md +251 -5
- package/src/skills/typeclaw-tunnels/SKILL.md +111 -0
- package/src/test-helpers/wait-for.ts +50 -0
- package/src/tui/index.ts +70 -7
- package/src/tunnels/__fixtures__/cloudflared-quick-stderr.txt +11 -0
- package/src/tunnels/events.ts +14 -0
- package/src/tunnels/index.ts +12 -0
- package/src/tunnels/log-ring.ts +54 -0
- package/src/tunnels/manager.ts +139 -0
- package/src/tunnels/providers/cloudflare-quick.ts +189 -0
- package/src/tunnels/providers/external.ts +53 -0
- package/src/tunnels/quick-url-parser.ts +5 -0
- package/src/tunnels/types.ts +43 -0
- package/src/usage/report.ts +15 -12
- package/typeclaw.schema.json +311 -26
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
// Shared curl-impersonate spawn primitive.
|
|
2
|
+
//
|
|
3
|
+
// Why this exists: by 2026, every non-trivial public site (DDG, Reuters via
|
|
4
|
+
// Akamai, MarketWatch via Cloudflare, etc.) fingerprints incoming traffic at
|
|
5
|
+
// the TLS handshake (JA3/JA4) and HTTP/2 SETTINGS frame BEFORE any HTTP header
|
|
6
|
+
// is read. Bun's native fetch cannot match Chrome's handshake (upstream issue
|
|
7
|
+
// #11368), so outbound requests get gated by anomaly checks regardless of
|
|
8
|
+
// headers, body shape, or pacing. The fix is to shell out to curl-impersonate
|
|
9
|
+
// (lexiforest fork), which replays Chrome's exact TLS handshake, HTTP/2
|
|
10
|
+
// settings, and header ordering. Pinned by the typeclaw Dockerfile at
|
|
11
|
+
// /usr/local/bin/curl_chrome136 — see src/init/dockerfile.ts for the version
|
|
12
|
+
// and SHA pin.
|
|
13
|
+
//
|
|
14
|
+
// AGENTS.md explicitly warns against adding `-H` overrides because the
|
|
15
|
+
// curl_chrome wrapper already sends the full Chrome header set (correct
|
|
16
|
+
// ordering, sec-ch-ua, sec-fetch-*, accept-encoding, etc.) and any custom
|
|
17
|
+
// header corrupts the impersonation. We therefore expose NO header-override
|
|
18
|
+
// surface from this primitive; add one only when a real caller needs it AND
|
|
19
|
+
// the override is something curl_chrome can't be told to send another way.
|
|
20
|
+
|
|
21
|
+
import { randomBytes } from 'node:crypto'
|
|
22
|
+
|
|
23
|
+
import { spawn } from 'bun'
|
|
24
|
+
|
|
25
|
+
export const CURL_IMPERSONATE_BINARY = 'curl_chrome136'
|
|
26
|
+
export const DEFAULT_TIMEOUT_SECONDS = 30
|
|
27
|
+
|
|
28
|
+
let curlBinary: string = CURL_IMPERSONATE_BINARY
|
|
29
|
+
|
|
30
|
+
// Test-only seam: lets *.test.ts point the spawn at a fake `curl_chrome136`
|
|
31
|
+
// script in a tmpdir so we exercise the real Bun.spawn path without depending
|
|
32
|
+
// on a curl-impersonate install on the test host. Production code never calls
|
|
33
|
+
// this — the module-level default above is what production sees.
|
|
34
|
+
/**
 * Test-only hook: redirect subsequent spawns to `binary`, or pass `null` to
 * restore the pinned production wrapper name.
 */
export function _setCurlBinaryForTest(binary: string | null): void {
  if (binary == null) {
    curlBinary = CURL_IMPERSONATE_BINARY
    return
  }
  curlBinary = binary
}
|
|
37
|
+
|
|
38
|
+
/** Input accepted by `curlImpersonate`. */
export type CurlImpersonateRequest = {
  /** Target URL; handed to curl as the single positional argument after `--`. */
  url: string
  /** HTTP method passed via `-X`. Treated as 'GET' when omitted. */
  method?: 'GET' | 'POST'
  /**
   * Form-urlencoded body fields for POST. Every entry becomes its own
   * `--data-urlencode name=value` argument so curl performs the encoding.
   * Supply this when method is 'POST' and a body is wanted.
   */
  formFields?: { name: string; value: string }[]
  /**
   * Hard cap on response bytes, forwarded as `--max-filesize`. The caller
   * still bounds its own buffer; this only makes curl give up early rather
   * than streaming gigabytes.
   */
  maxBytes?: number
  /** Forwarded as `--max-time`; falls back to DEFAULT_TIMEOUT_SECONDS. */
  timeoutSeconds?: number
  /** When this signal aborts, the spawned process group is killed. */
  signal?: AbortSignal
}
|
|
52
|
+
|
|
53
|
+
/** Parsed result of one curl-impersonate invocation. */
export type CurlImpersonateResponse = {
  /** Response body, decoded as UTF-8 (non-fatal: invalid sequences are replaced). */
  body: string
  /** curl's `%{url_effective}` value, trimmed. */
  finalUrl: string
  /** curl's `%{http_code}` value; 0 when it could not be parsed as a number. */
  httpStatus: number
  /** curl's `%{content_type}` value, trimmed and lower-cased ('' when absent). */
  contentType: string
  /** curl's `%{size_download}` value, or the body's byte length when that is 0 or unparsable. */
  bytesIn: number
}
|
|
60
|
+
|
|
61
|
+
// Specific curl exit codes we map to typed errors. The full list is in
|
|
62
|
+
// `man curl` § "EXIT CODES"; these are the only ones we translate at the
|
|
63
|
+
// primitive layer. Everything else surfaces as a generic CurlImpersonateError
|
|
64
|
+
// with stderr attached for caller-side diagnostics.
|
|
65
|
+
export const CURL_EXIT_TIMEOUT = 28
|
|
66
|
+
export const CURL_EXIT_MAX_FILESIZE_PRECHECK = 63
|
|
67
|
+
// Observed empirically (and corroborated by Oracle review): curl returns
|
|
68
|
+
// exit 56 with stderr `Exceeded the maximum allowed file size (...)` when
|
|
69
|
+
// --max-filesize is hit at TRANSFER time (e.g. server omitted Content-Length
|
|
70
|
+
// and curl discovered the overflow mid-stream). The Linux man page lists 56
|
|
71
|
+
// as the more general "Failure in receiving network data," so we additionally
|
|
72
|
+
// gate on a stderr match to avoid mis-classifying real network drops as
|
|
73
|
+
// size-exceeded.
|
|
74
|
+
export const CURL_EXIT_RECV_FAILURE_OR_FILESIZE = 56
|
|
75
|
+
|
|
76
|
+
/**
 * Error raised by the curl-impersonate primitive. Carries the process exit
 * code (null when none is available) and the captured stderr so callers can
 * classify the failure themselves.
 */
export class CurlImpersonateError extends Error {
  public readonly exitCode: number | null
  public readonly stderr: string

  constructor(message: string, exitCode: number | null, stderr: string) {
    super(message)
    this.exitCode = exitCode
    this.stderr = stderr
    this.name = 'CurlImpersonateError'
  }
}
|
|
86
|
+
|
|
87
|
+
/**
 * True when the error means the response exceeded `--max-filesize`: either
 * the dedicated pre-check exit code, or the generic receive-failure exit code
 * accompanied by curl's "maximum ... file size" stderr message.
 */
export function isCurlExitFilesizeExceeded(error: CurlImpersonateError): boolean {
  switch (error.exitCode) {
    case CURL_EXIT_MAX_FILESIZE_PRECHECK:
      return true
    case CURL_EXIT_RECV_FAILURE_OR_FILESIZE:
      // This exit code is shared with ordinary network drops, so stderr decides.
      return /maximum.{0,30}file size/i.test(error.stderr)
    default:
      return false
  }
}
|
|
94
|
+
|
|
95
|
+
/** True when curl exited with its timeout exit code (CURL_EXIT_TIMEOUT). */
export function isCurlExitTimeout(error: CurlImpersonateError): boolean {
  const { exitCode } = error
  return exitCode === CURL_EXIT_TIMEOUT
}
|
|
98
|
+
|
|
99
|
+
/**
 * Run a single HTTP(S) request through the curl-impersonate wrapper and return
 * the decoded body together with the metadata curl reports via `-w`.
 *
 * @param req - URL, method, optional form fields, byte cap, timeout, and abort signal.
 * @returns Parsed body and metadata. HTTP error statuses are returned, not
 *   thrown, because `--fail-with-body` is deliberately not passed.
 * @throws CurlImpersonateError when the signal is already aborted or aborts
 *   mid-flight, when curl exits non-zero, or when the metadata sentinel is
 *   missing from stdout.
 */
export async function curlImpersonate(req: CurlImpersonateRequest): Promise<CurlImpersonateResponse> {
  // An 'abort' listener added to an already-aborted signal never fires, so
  // without this guard a pre-cancelled request would still spawn curl and run
  // to completion (or to --max-time) before reporting 'aborted'. Fail fast.
  if (req.signal?.aborted) {
    throw new CurlImpersonateError('aborted', null, '')
  }

  const timeoutSeconds = req.timeoutSeconds ?? DEFAULT_TIMEOUT_SECONDS
  const method = req.method ?? 'GET'

  // Per-request random sentinel + UTF-8-safe parsing. A static sentinel is a
  // public, fixed string: webfetch reads attacker-controlled pages, and a page
  // could embed the sentinel bytes plus fabricated metadata ahead of the real
  // write-out tail. Per-request randomness (96 bits) makes the sentinel
  // unpredictable, and the parser anchors on the LAST occurrence (curl writes
  // `-w` after the body, so the real metadata block is always last). Both
  // defenses are kept as defense in depth.
  const sentinel = generateSentinel()
  const writeOutTemplate = `${sentinel}%{http_code}\n%{url_effective}\n%{content_type}\n%{size_download}\n`

  const cmd: string[] = [
    curlBinary,
    // `--disable` (alias -q) MUST be the first argument to suppress reading
    // ~/.curlrc and /etc/curlrc. Without it, a user or attacker-controlled
    // curlrc could inject --proxy, --header, --resolve, --no-location, etc.,
    // silently subverting both the Chrome impersonation contract and the
    // protocol restrictions below. Order is load-bearing: curl ignores
    // --disable if it appears after any other flag.
    '--disable',
    '--silent',
    '--show-error',
    // Protocol allowlist. curl-impersonate supports many protocols by default
    // (ftp, file, dict, etc.). normalizeUrl() already rejects non-http(s) at
    // the call-site, but redirects are followed by curl after that gate fires
    // and a 301/302 to ftp://... would otherwise be silently honored. The
    // `=http,https` syntax means "ONLY these two" rather than "add these to
    // defaults." --proto-redir governs the redirect chain specifically.
    '--proto',
    '=http,https',
    '--proto-redir',
    '=http,https',
    // `--fail-with-body` would make curl exit non-zero on >=400 but still
    // write the body. We intentionally DO NOT pass it: callers (webfetch,
    // ddg) want to inspect httpStatus themselves and decide. Curl exits 0
    // on a 404-with-body in this mode, which matches our contract.
    '--compressed',
    '--location',
    '--max-redirs',
    '10',
    '--max-time',
    String(timeoutSeconds),
    '-w',
    writeOutTemplate,
    // NOTE(review): curl's manual says a method set with -X is reused for
    // every request in a --location redirect chain, so a POST stays a POST
    // across 301/302/303 — confirm that is the intended behaviour.
    '-X',
    method,
  ]

  if (req.maxBytes !== undefined) {
    cmd.push('--max-filesize', String(req.maxBytes))
  }

  if (req.formFields) {
    for (const field of req.formFields) {
      cmd.push('--data-urlencode', `${field.name}=${field.value}`)
    }
  }

  // `--` terminates option parsing so a URL beginning with `-` (e.g. an
  // attacker-supplied "-K /etc/passwd" sneaking through normalizeUrl as
  // "https://-K /etc/passwd") cannot be reinterpreted as a curl option.
  cmd.push('--', req.url)

  // Spawn detached so the child becomes the leader of its own process group.
  // The curl-impersonate wrappers (curl_chrome136 et al.) are bash scripts
  // that call the real curl-impersonate binary WITHOUT `exec` — meaning the
  // wrapper is the parent and curl-impersonate is its child. On a plain
  // SIGKILL to the wrapper PID, the curl child becomes orphaned and keeps
  // the stdout pipe open until --max-time fires, turning a 50ms abort into
  // a 30s hang. process.kill(-pid) addresses the negative PID, which signals
  // the entire process group, killing both atomically. detached: true makes
  // the child the pgid leader so -pid is well-defined.
  const proc = spawn({
    cmd,
    stdout: 'pipe',
    stderr: 'pipe',
    detached: true,
  })

  const onAbort = () => {
    try {
      process.kill(-proc.pid, 'SIGKILL')
    } catch {
      // Group kill failed (e.g. the group is already gone); fall back to
      // signalling the wrapper process directly.
      proc.kill('SIGKILL')
    }
  }
  req.signal?.addEventListener('abort', onAbort, { once: true })

  try {
    // Drain both pipes while waiting for exit so a full pipe buffer cannot
    // stall the child.
    const [stdoutBuf, stderr, exitCode] = await Promise.all([
      new Response(proc.stdout).arrayBuffer(),
      new Response(proc.stderr).text(),
      proc.exited,
    ])

    // Checked before the exit code: a killed process exits non-zero, and the
    // caller should see 'aborted' rather than a generic curl failure.
    if (req.signal?.aborted) {
      throw new CurlImpersonateError('aborted', exitCode, stderr)
    }

    if (exitCode !== 0) {
      const detail = stderr.trim() || 'no stderr'
      throw new CurlImpersonateError(`curl-impersonate exited ${exitCode}: ${detail}`, exitCode, stderr)
    }

    return parseCurlOutput(stdoutBuf, sentinel, stderr)
  } finally {
    req.signal?.removeEventListener('abort', onAbort)
  }
}
|
|
216
|
+
|
|
217
|
+
// Generates a per-request sentinel. Format: `\n--TYPECLAW-CURL-META-<hex>--\n`.
|
|
218
|
+
// 24 hex chars = 96 bits of entropy, plenty to defeat any attempt by an
|
|
219
|
+
// attacker-controlled response body to inject a colliding marker. ASCII-only
|
|
220
|
+
// + leading/trailing newlines means it's unambiguous in textual responses
|
|
221
|
+
// and free of NUL bytes (Bun's spawn rejects NULs in argv).
|
|
222
|
+
/**
 * Build the per-request metadata marker: 12 random bytes rendered as 24 hex
 * characters, wrapped in a fixed ASCII prefix/suffix with a newline on each
 * side.
 */
function generateSentinel(): string {
  const token = randomBytes(12).toString('hex')
  return '\n--TYPECLAW-CURL-META-' + token + '--\n'
}
|
|
226
|
+
|
|
227
|
+
/**
 * Split curl's stdout into the response body and the trailing `-w` metadata
 * block, using the LAST occurrence of the sentinel as the boundary.
 *
 * @param buf - Raw stdout bytes (body, then sentinel, then metadata lines).
 * @param sentinel - The marker string that was embedded in the `-w` template.
 * @param stderr - Captured stderr, attached to the error when the marker is absent.
 * @throws CurlImpersonateError (exit code 0) when the sentinel is not found.
 */
function parseCurlOutput(buf: ArrayBuffer, sentinel: string, stderr: string): CurlImpersonateResponse {
  const decodeUtf8 = (chunk: Uint8Array): string => new TextDecoder('utf-8', { fatal: false }).decode(chunk)

  const raw = new Uint8Array(buf)
  const marker = new TextEncoder().encode(sentinel)

  // curl emits the `-w` output strictly after the body, so the genuine
  // metadata block is always the trailing one; searching from the end is
  // defense in depth on top of the random sentinel.
  const markerStart = lastIndexOfBytes(raw, marker)
  if (markerStart < 0) {
    throw new CurlImpersonateError(
      'curl-impersonate produced no metadata block (sentinel missing). Wrapper or output corruption suspected.',
      0,
      stderr,
    )
  }

  const payload = raw.subarray(0, markerStart)
  const trailer = raw.subarray(markerStart + marker.byteLength)

  // Metadata lines, in template order: status, effective URL, content type, size.
  const [statusLine = '0', urlLine = '', typeLine = '', sizeLine = '0'] = decodeUtf8(trailer).split('\n')

  return {
    body: decodeUtf8(payload),
    finalUrl: urlLine.trim(),
    httpStatus: Number(statusLine.trim()) || 0,
    contentType: typeLine.trim().toLowerCase(),
    // Fall back to the measured body length when curl reports 0 or garbage.
    bytesIn: Number(sizeLine.trim()) || payload.byteLength,
  }
}
|
|
262
|
+
|
|
263
|
+
/**
 * Byte-wise equivalent of `String.prototype.lastIndexOf`.
 *
 * @returns The start offset of the last occurrence of `needle` in `haystack`,
 *   `haystack.byteLength` for an empty needle, or -1 when there is no match.
 */
function lastIndexOfBytes(haystack: Uint8Array, needle: Uint8Array): number {
  const needleLength = needle.byteLength
  if (needleLength === 0) return haystack.byteLength

  // Walk candidate start positions from the rightmost possible one down to 0.
  let start = haystack.byteLength - needleLength
  while (start >= 0) {
    let offset = 0
    while (offset < needleLength && haystack[start + offset] === needle[offset]) {
      offset++
    }
    if (offset === needleLength) return start
    start--
  }
  return -1
}
|
|
277
|
+
|
|
278
|
+
// Detect whether curl-impersonate is available on PATH. Used by fetch.ts to
|
|
279
|
+
// decide between the impersonating transport (production: container has the
|
|
280
|
+
// binary pinned in the image) and a Bun.fetch fallback (test/dev: no binary
|
|
281
|
+
// installed). The check is best-effort and cheap — we spawn `--version`
|
|
282
|
+
// and look at exit code. Cached per-process: the binary doesn't appear or
|
|
283
|
+
// disappear at runtime.
|
|
284
|
+
let availabilityCache: boolean | undefined
|
|
285
|
+
|
|
286
|
+
/**
 * Report whether the configured curl-impersonate binary can be run. The
 * answer comes from spawning it with `--version` and checking for exit code
 * 0; a spawn failure counts as "not available". The result is cached at
 * module level until the cache is reset.
 */
export async function isCurlImpersonateAvailable(): Promise<boolean> {
  if (availabilityCache === undefined) {
    let probeSucceeded: boolean
    try {
      const probe = spawn({ cmd: [curlBinary, '--version'], stdout: 'ignore', stderr: 'ignore' })
      probeSucceeded = (await probe.exited) === 0
    } catch {
      // Spawning itself failed (e.g. binary missing): treat as unavailable.
      probeSucceeded = false
    }
    availabilityCache = probeSucceeded
  }
  return availabilityCache
}
|
|
297
|
+
|
|
298
|
+
/**
 * Test-only hook: clear the cached availability result so the next
 * `isCurlImpersonateAvailable()` call probes the binary again.
 */
export function _resetAvailabilityCacheForTest(): void {
  availabilityCache = undefined
}
|
package/src/agent/tools/ddg.ts
CHANGED
|
@@ -7,40 +7,16 @@
|
|
|
7
7
|
// single bad fingerprint match. `lite` exists for non-browser clients (text
|
|
8
8
|
// browsers, accessibility tools) and historically gates less aggressively —
|
|
9
9
|
// but as of 2026 it ALSO fingerprints at the TLS layer (JA3/JA4) and the
|
|
10
|
-
// HTTP/2 SETTINGS frame, well before any HTTP header is read.
|
|
11
|
-
//
|
|
12
|
-
//
|
|
13
|
-
//
|
|
14
|
-
// where real Chromium succeeded continuously while every fetch variant got
|
|
15
|
-
// 202 anomaly-modal or HTTP-200-with-anomaly responses.
|
|
16
|
-
//
|
|
17
|
-
// The fix is to shell out to `curl-impersonate` (lexiforest fork), which
|
|
18
|
-
// replays Chrome's exact TLS handshake + HTTP/2 settings + header ordering.
|
|
19
|
-
// The binary is installed by the typeclaw Dockerfile (see
|
|
20
|
-
// src/init/dockerfile.ts CURL_IMPERSONATE_* constants) at /usr/local/bin/
|
|
21
|
-
// and invoked via the version-pinned wrapper `curl_chrome136`.
|
|
22
|
-
//
|
|
23
|
-
// Why no `-H` overrides: curl_chrome136 already sends the full Chrome 136
|
|
24
|
-
// header set with correct ordering, sec-ch-ua values, etc. Adding our own
|
|
25
|
-
// headers would corrupt the impersonation. The previous code's
|
|
26
|
-
// BROWSER_HEADERS const has been removed for the same reason.
|
|
10
|
+
// HTTP/2 SETTINGS frame, well before any HTTP header is read. The shared
|
|
11
|
+
// curl-impersonate primitive (./curl-impersonate.ts) replays Chrome's exact
|
|
12
|
+
// TLS handshake + HTTP/2 settings + header ordering. See that file's header
|
|
13
|
+
// for the full rationale and AGENTS.md §"Web search" for the original story.
|
|
27
14
|
|
|
28
|
-
import {
|
|
29
|
-
|
|
30
|
-
const DDG_LITE_URL = 'https://lite.duckduckgo.com/lite/'
|
|
31
|
-
const CURL_IMPERSONATE_BINARY = 'curl_chrome136'
|
|
32
|
-
const REQUEST_TIMEOUT_SECONDS = 30
|
|
15
|
+
import { curlImpersonate } from './curl-impersonate'
|
|
33
16
|
|
|
34
|
-
|
|
17
|
+
export { _setCurlBinaryForTest } from './curl-impersonate'
|
|
35
18
|
|
|
36
|
-
|
|
37
|
-
// at a fake `curl_chrome136` script in a tmpdir so we exercise the real
|
|
38
|
-
// Bun.spawn path without depending on a curl-impersonate install on the
|
|
39
|
-
// test host. Production code never calls this — the const-import default
|
|
40
|
-
// above is what production sees.
|
|
41
|
-
export function _setCurlBinaryForTest(binary: string | null): void {
|
|
42
|
-
curlBinary = binary ?? CURL_IMPERSONATE_BINARY
|
|
43
|
-
}
|
|
19
|
+
const DDG_LITE_URL = 'https://lite.duckduckgo.com/lite/'
|
|
44
20
|
|
|
45
21
|
export type DdgResult = {
|
|
46
22
|
title: string
|
|
@@ -64,64 +40,13 @@ export class DdgCaptchaError extends Error {
|
|
|
64
40
|
}
|
|
65
41
|
|
|
66
42
|
export async function fetchDdgHtml(query: string, signal?: AbortSignal): Promise<string> {
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
// the stdout pipe open until --max-time fires (30s default), turning a
|
|
73
|
-
// 50ms abort into a 30s hang. process.kill(-pid) addresses the negative
|
|
74
|
-
// PID, which signals the entire process group, killing both the wrapper
|
|
75
|
-
// and the inner curl atomically. detached: true is what makes the child
|
|
76
|
-
// the pgid leader so -pid is well-defined; without it, the child shares
|
|
77
|
-
// our pgid and we'd nuke our own process.
|
|
78
|
-
const proc = spawn({
|
|
79
|
-
cmd: [
|
|
80
|
-
curlBinary,
|
|
81
|
-
'--silent',
|
|
82
|
-
'--show-error',
|
|
83
|
-
'--fail-with-body',
|
|
84
|
-
'--compressed',
|
|
85
|
-
'--max-time',
|
|
86
|
-
String(REQUEST_TIMEOUT_SECONDS),
|
|
87
|
-
'-X',
|
|
88
|
-
'POST',
|
|
89
|
-
'--data-urlencode',
|
|
90
|
-
`q=${query}`,
|
|
91
|
-
DDG_LITE_URL,
|
|
92
|
-
],
|
|
93
|
-
stdout: 'pipe',
|
|
94
|
-
stderr: 'pipe',
|
|
95
|
-
detached: true,
|
|
43
|
+
const response = await curlImpersonate({
|
|
44
|
+
url: DDG_LITE_URL,
|
|
45
|
+
method: 'POST',
|
|
46
|
+
formFields: [{ name: 'q', value: query }],
|
|
47
|
+
signal,
|
|
96
48
|
})
|
|
97
|
-
|
|
98
|
-
const onAbort = () => {
|
|
99
|
-
try {
|
|
100
|
-
process.kill(-proc.pid, 'SIGKILL')
|
|
101
|
-
} catch {
|
|
102
|
-
proc.kill('SIGKILL')
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
signal?.addEventListener('abort', onAbort, { once: true })
|
|
106
|
-
|
|
107
|
-
try {
|
|
108
|
-
const [stdout, stderr, exitCode] = await Promise.all([
|
|
109
|
-
new Response(proc.stdout).text(),
|
|
110
|
-
new Response(proc.stderr).text(),
|
|
111
|
-
proc.exited,
|
|
112
|
-
])
|
|
113
|
-
|
|
114
|
-
if (signal?.aborted) {
|
|
115
|
-
throw new Error('aborted')
|
|
116
|
-
}
|
|
117
|
-
if (exitCode !== 0) {
|
|
118
|
-
const detail = stderr.trim() || 'no stderr'
|
|
119
|
-
throw new Error(`curl-impersonate exited ${exitCode}: ${detail}`)
|
|
120
|
-
}
|
|
121
|
-
return stdout
|
|
122
|
-
} finally {
|
|
123
|
-
signal?.removeEventListener('abort', onAbort)
|
|
124
|
-
}
|
|
49
|
+
return response.body
|
|
125
50
|
}
|
|
126
51
|
|
|
127
52
|
// The `lite` endpoint's CAPTCHA page is plainer than `html`'s anomaly-modal:
|
|
@@ -1,3 +1,33 @@
|
|
|
1
|
+
// Webfetch's HTTP transport.
|
|
2
|
+
//
|
|
3
|
+
// Production path (container, curl-impersonate available): we shell out to
|
|
4
|
+
// `curl_chrome136` so outbound requests carry Chrome 136's TLS handshake
|
|
5
|
+
// (JA3/JA4), HTTP/2 SETTINGS frame, and full header set. This is what gets
|
|
6
|
+
// us past the modern bot-detection stacks on Cloudflare/Akamai-protected
|
|
7
|
+
// sites (Reuters, MarketWatch, etc.) when the agent is running from the
|
|
8
|
+
// user's home network — the IP is already residential, so impersonating
|
|
9
|
+
// the browser is the only remaining missing piece. See AGENTS.md §"Web
|
|
10
|
+
// search" and src/agent/tools/curl-impersonate.ts for the full story.
|
|
11
|
+
//
|
|
12
|
+
// Test/dev fallback (curl_chrome136 not on PATH): we transparently fall
|
|
13
|
+
// back to Bun's native `fetch()` with a static User-Agent. This keeps unit
|
|
14
|
+
// tests on developer macOS machines working without forcing every contributor
|
|
15
|
+
// to install curl-impersonate locally. Production runs always have the binary
|
|
16
|
+
// because the typeclaw Dockerfile pins it.
|
|
17
|
+
//
|
|
18
|
+
// Best-effort doctrine: this transport does NOT guarantee the fetch succeeds.
|
|
19
|
+
// Bot-detected sites can still serve 403/CAPTCHA pages. We surface what we
|
|
20
|
+
// got (status, body, final URL) and let the caller decide. The webfetch tool
|
|
21
|
+
// translates non-2xx into a tool-level error message that's useful to the
|
|
22
|
+
// model.
|
|
23
|
+
|
|
24
|
+
import {
|
|
25
|
+
CurlImpersonateError,
|
|
26
|
+
curlImpersonate,
|
|
27
|
+
isCurlExitFilesizeExceeded,
|
|
28
|
+
isCurlExitTimeout,
|
|
29
|
+
isCurlImpersonateAvailable,
|
|
30
|
+
} from '../curl-impersonate'
|
|
1
31
|
import { MAX_RESPONSE_BYTES } from './types'
|
|
2
32
|
|
|
3
33
|
export type FetchResult = {
|
|
@@ -15,7 +45,7 @@ export class WebfetchError extends Error {
|
|
|
15
45
|
}
|
|
16
46
|
}
|
|
17
47
|
|
|
18
|
-
const
|
|
48
|
+
const FALLBACK_HEADERS: Record<string, string> = {
|
|
19
49
|
'User-Agent': 'typeclaw/0 (+https://github.com/code-yeongyu/typeclaw)',
|
|
20
50
|
Accept: 'text/html,application/xhtml+xml,application/json;q=0.9,text/plain;q=0.8,*/*;q=0.1',
|
|
21
51
|
'Accept-Language': 'en-US,en;q=0.9',
|
|
@@ -32,10 +62,83 @@ export function normalizeUrl(input: string): string {
|
|
|
32
62
|
return `https://${trimmed}`
|
|
33
63
|
}
|
|
34
64
|
|
|
65
|
+
// Test-only seam: forces fetchWithLimits to use the native-fetch fallback
|
|
66
|
+
// even when curl-impersonate is detected. Used by fetch.test.ts to keep its
|
|
67
|
+
// existing mocked-fetch contract working without the test having to install
|
|
68
|
+
// a fake curl binary. Production code never calls this.
|
|
69
|
+
let forceFallbackForTest = false
|
|
70
|
+
|
|
71
|
+
/**
 * Test-only hook: when set to true, `fetchWithLimits` skips the
 * curl-impersonate probe and always uses the native-fetch transport.
 */
export function _setForceFallbackForTest(value: boolean): void {
  forceFallbackForTest = value
}
|
|
74
|
+
|
|
35
75
|
/**
 * Fetch `url` with the best available transport: curl-impersonate when the
 * binary is detected and the test override is off, otherwise native fetch.
 *
 * @param url - URL to fetch.
 * @param timeoutSeconds - Per-request timeout handed to the chosen transport.
 * @param parentSignal - Optional abort signal forwarded to the transport.
 */
export async function fetchWithLimits(
  url: string,
  timeoutSeconds: number,
  parentSignal?: AbortSignal,
): Promise<FetchResult> {
  // The override is checked first so the availability probe is never run
  // when tests force the fallback.
  if (forceFallbackForTest) {
    return fetchWithBunFetch(url, timeoutSeconds, parentSignal)
  }
  const impersonateAvailable = await isCurlImpersonateAvailable()
  return impersonateAvailable
    ? fetchWithCurlImpersonate(url, timeoutSeconds, parentSignal)
    : fetchWithBunFetch(url, timeoutSeconds, parentSignal)
}
|
|
86
|
+
|
|
87
|
+
/**
 * curl-impersonate transport for webfetch. Issues a GET capped at
 * MAX_RESPONSE_BYTES and converts every failure into a WebfetchError.
 *
 * @throws WebfetchError on abort, timeout, oversize response, any other curl
 *   failure, or a status outside 200-299.
 */
async function fetchWithCurlImpersonate(
  url: string,
  timeoutSeconds: number,
  parentSignal?: AbortSignal,
): Promise<FetchResult> {
  // Translate whatever curlImpersonate threw into the tool-level error type.
  const toWebfetchError = (error: unknown): WebfetchError => {
    // Abort wins over every other classification.
    if (parentSignal?.aborted) {
      return new WebfetchError('Request aborted')
    }
    if (error instanceof CurlImpersonateError) {
      if (isCurlExitTimeout(error)) {
        return new WebfetchError(`Request timed out after ${timeoutSeconds}s`)
      }
      if (isCurlExitFilesizeExceeded(error)) {
        return new WebfetchError(`Response too large (exceeds ${formatBytes(MAX_RESPONSE_BYTES)} limit)`)
      }
    }
    const message = error instanceof Error ? error.message : String(error)
    return new WebfetchError(`Fetch failed: ${message}`)
  }

  let response: Awaited<ReturnType<typeof curlImpersonate>>
  try {
    response = await curlImpersonate({
      url,
      method: 'GET',
      timeoutSeconds,
      maxBytes: MAX_RESPONSE_BYTES,
      signal: parentSignal,
    })
  } catch (error) {
    throw toWebfetchError(error)
  }

  const { httpStatus } = response
  const isSuccess = httpStatus >= 200 && httpStatus < 300
  if (!isSuccess) {
    throw new WebfetchError(`Fetch failed: HTTP ${response.httpStatus}`)
  }

  // Re-measure the decoded body and enforce the cap on this side as well.
  const bodyByteLength = new TextEncoder().encode(response.body).byteLength
  if (bodyByteLength > MAX_RESPONSE_BYTES) {
    throw new WebfetchError(
      `Response too large (${formatBytes(bodyByteLength)} exceeds ${formatBytes(MAX_RESPONSE_BYTES)} limit)`,
    )
  }

  return {
    body: response.body,
    contentType: response.contentType,
    // An empty effective URL from curl falls back to the requested URL.
    finalUrl: response.finalUrl || url,
    httpStatus: response.httpStatus,
    bytesIn: bodyByteLength,
  }
}
|
|
137
|
+
|
|
138
|
+
async function fetchWithBunFetch(
|
|
139
|
+
url: string,
|
|
140
|
+
timeoutSeconds: number,
|
|
141
|
+
parentSignal?: AbortSignal,
|
|
39
142
|
): Promise<FetchResult> {
|
|
40
143
|
const controller = new AbortController()
|
|
41
144
|
const timeout = setTimeout(() => controller.abort(new Error('timeout')), timeoutSeconds * 1000)
|
|
@@ -43,7 +146,7 @@ export async function fetchWithLimits(
|
|
|
43
146
|
parentSignal?.addEventListener('abort', onAbort, { once: true })
|
|
44
147
|
|
|
45
148
|
try {
|
|
46
|
-
const response = await fetch(url, { headers:
|
|
149
|
+
const response = await fetch(url, { headers: FALLBACK_HEADERS, signal: controller.signal, redirect: 'follow' })
|
|
47
150
|
if (!response.ok) {
|
|
48
151
|
throw new WebfetchError(`Fetch failed: HTTP ${response.status} ${response.statusText}`)
|
|
49
152
|
}
|
|
@@ -24,6 +24,10 @@ export const webfetchTool = defineTool({
|
|
|
24
24
|
description:
|
|
25
25
|
'Fetch a single HTTP(S) URL and return the body, optionally compacted by a strategy. ' +
|
|
26
26
|
'Use this when the user references a specific URL or when websearch surfaced a result you need to read in full. ' +
|
|
27
|
+
'Outbound requests impersonate Chrome 136 at the TLS, HTTP/2, and header layers ' +
|
|
28
|
+
'(via curl-impersonate), which helps with TLS/header fingerprint gates on sites behind Cloudflare/Akamai. ' +
|
|
29
|
+
'It does NOT solve JavaScript challenges, behavioural fingerprinting (mouse/scroll/timing), interactive CAPTCHAs, ' +
|
|
30
|
+
'or IP-reputation blocks — a 403 from those layers is expected and unrecoverable from this tool. ' +
|
|
27
31
|
'Strategy guide:\n' +
|
|
28
32
|
'- "readability": extract article content as markdown (blogs, docs, news). Default for HTML.\n' +
|
|
29
33
|
'- "jq": query JSON APIs (npm registry, GitHub API). Pass `query` (e.g. ".items[].name").\n' +
|
|
@@ -17,6 +17,49 @@ import { AGENT_BROWSER_DASHBOARD_UPSTREAM_PORT } from './dashboard-proxy'
|
|
|
17
17
|
|
|
18
18
|
export const REAL_BIN_ENV = 'TYPECLAW_AGENT_BROWSER_REAL_BIN'
|
|
19
19
|
|
|
20
|
+
// Default User-Agent: a recent desktop Chrome on Linux x86_64.
//
// Why Linux: the shim always runs inside the TypeClaw container, which is
// Linux. A macOS or Windows UA would disagree with the TCP fingerprint,
// Accept-Language, and the JS-side platform, and that mismatch is itself a
// bot signal on stricter sites (Cloudflare, Akamai, PerimeterX).
//
// Why x86_64 even on linux/arm64 hosts: Chrome on Linux does not expose ARM
// in its UA string at all (verified against current Chrome 131 releases).
//
// Why override at all: the upstream binary's default UA contains
// "HeadlessChrome" / a stale Chromium build, which is widely fingerprinted as
// a bot and silently triggers CAPTCHAs, 403s, blank pages, and A/B-test
// misrouting.
//
// Maintenance: bump on Chrome major releases — same hygiene as the
// curl-impersonate pin in src/init/dockerfile.ts.
export const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'

// Environment variable through which the User-Agent is passed.
export const USER_AGENT_ENV = 'AGENT_BROWSER_USER_AGENT'
|
|
35
|
+
|
|
36
|
+
// Reports whether the command line already carries an explicit User-Agent.
// Both spellings are recognised: `--user-agent <val>` (flag and value as
// separate tokens) and `--user-agent=<val>` (single token). Only the long
// form is checked because the upstream CLI documents no short alias for it
// today (verified via `agent-browser --help`).
export function hasUserAgentFlag(argv: readonly string[]): boolean {
  return argv.some((token) => token === '--user-agent' || token.startsWith('--user-agent='))
}
|
|
45
|
+
|
|
46
|
+
// Fills in the User-Agent env var with `defaultUa`, mutating `env` in place,
// but only when nothing with higher priority is present. Upstream resolves
// the UA as CLI flag > env > built-in default, so the env slot is written
// only when both of these hold:
//   - the env var is unset or empty (an operator-set value is a per-shell
//     override and must win), and
//   - argv has no explicit `--user-agent` (mobile testing, intentional bot UA).
// `set device "iPhone 14"` is unaffected: it sets the UA via CDP at runtime
// rather than through this env var, so the injection does not fight device
// emulation.
export function injectUserAgentEnv(
  argv: readonly string[],
  env: Record<string, string | undefined>,
  defaultUa: string = DEFAULT_USER_AGENT,
): void {
  const current = env[USER_AGENT_ENV]
  const operatorProvided = current !== undefined && current !== ''
  if (operatorProvided || hasUserAgentFlag(argv)) return
  env[USER_AGENT_ENV] = defaultUa
}
|
|
62
|
+
|
|
20
63
|
export type DashboardIntent = 'start' | 'stop' | 'other'
|
|
21
64
|
|
|
22
65
|
export function classifyDashboardCommand(argv: readonly string[]): DashboardIntent {
|
|
@@ -111,6 +154,7 @@ export type ShimOptions = {
|
|
|
111
154
|
realBin?: string
|
|
112
155
|
upstreamPort?: number
|
|
113
156
|
spawn?: (cmd: string[]) => { exited: Promise<number> }
|
|
157
|
+
env?: Record<string, string | undefined>
|
|
114
158
|
}
|
|
115
159
|
|
|
116
160
|
export async function runShim(opts: ShimOptions = {}): Promise<number> {
|
|
@@ -118,6 +162,9 @@ export async function runShim(opts: ShimOptions = {}): Promise<number> {
|
|
|
118
162
|
const realBin = opts.realBin ?? resolveRealAgentBrowserBin()
|
|
119
163
|
const upstreamPort = opts.upstreamPort ?? AGENT_BROWSER_DASHBOARD_UPSTREAM_PORT
|
|
120
164
|
const spawn = opts.spawn ?? defaultSpawn
|
|
165
|
+
const env = opts.env ?? process.env
|
|
166
|
+
|
|
167
|
+
injectUserAgentEnv(argv, env)
|
|
121
168
|
|
|
122
169
|
const intent = classifyDashboardCommand(argv)
|
|
123
170
|
if (intent !== 'start') {
|
|
@@ -77,6 +77,8 @@ You MAY:
|
|
|
77
77
|
- Set up a missing upstream branch via \`git push -u origin <branch>\` if it's clear that's the only issue.
|
|
78
78
|
- Retry \`git push\` once after fixing a clear, narrow issue.
|
|
79
79
|
|
|
80
|
+
**When you run \`git push\` (either to set upstream or to retry), the security plugin's \`gitExfil\` guard will block unless you pass \`acknowledgeGuards: { gitExfil: true }\` on the bash tool call.** You are running on the operator's behalf as the recovery step of a deterministic backup the operator already approved by enabling \`backup.pushToOrigin\`; the ack is justified for this specific retry path. Do NOT use this ack for any other reason — only the one push retry the rules above allow. If your push retry fails again, write the diagnosis and stop; do not ack a second push.
|
|
81
|
+
|
|
80
82
|
You MUST NOT:
|
|
81
83
|
- Force-push (\`--force\`, \`--force-with-lease\`).
|
|
82
84
|
- Resolve merge conflicts by editing files. If a rebase had conflicts, the runner already aborted it. Leave the repo as-is and explain.
|