typeclaw 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. package/README.md +20 -15
  2. package/auth.schema.json +113 -0
  3. package/package.json +1 -1
  4. package/secrets.schema.json +113 -0
  5. package/src/agent/auth.ts +4 -2
  6. package/src/agent/index.ts +16 -28
  7. package/src/agent/model-fallback.ts +127 -0
  8. package/src/agent/session-meta.ts +1 -1
  9. package/src/agent/session-origin.ts +3 -2
  10. package/src/agent/tools/curl-impersonate.ts +300 -0
  11. package/src/agent/tools/ddg.ts +13 -88
  12. package/src/agent/tools/webfetch/fetch.ts +105 -2
  13. package/src/agent/tools/webfetch/tool.ts +4 -0
  14. package/src/bundled-plugins/agent-browser/shim.ts +47 -0
  15. package/src/bundled-plugins/backup/subagents.ts +2 -0
  16. package/src/bundled-plugins/memory/README.md +49 -12
  17. package/src/bundled-plugins/memory/citation-superset.ts +63 -0
  18. package/src/bundled-plugins/memory/dreaming.ts +105 -17
  19. package/src/bundled-plugins/memory/index.ts +2 -2
  20. package/src/bundled-plugins/memory/memory-logger.ts +45 -26
  21. package/src/bundled-plugins/memory/strength.ts +127 -0
  22. package/src/bundled-plugins/memory/topics.ts +75 -0
  23. package/src/bundled-plugins/security/index.ts +88 -43
  24. package/src/bundled-plugins/security/permissions.ts +36 -0
  25. package/src/bundled-plugins/security/policies/git-exfil.ts +20 -0
  26. package/src/bundled-plugins/security/policies/outbound-secret-scan.ts +12 -0
  27. package/src/bundled-plugins/security/policies/prompt-injection.ts +23 -3
  28. package/src/bundled-plugins/security/policies/secret-exfil-bash.ts +7 -0
  29. package/src/bundled-plugins/security/policies/secret-exfil-read.ts +6 -0
  30. package/src/bundled-plugins/security/policies/session-search-secrets.ts +9 -0
  31. package/src/bundled-plugins/security/policies/ssrf.ts +6 -0
  32. package/src/bundled-plugins/security/policies/system-prompt-leak.ts +7 -0
  33. package/src/channels/adapters/github/auth-app.ts +120 -0
  34. package/src/channels/adapters/github/auth-pat.ts +50 -0
  35. package/src/channels/adapters/github/auth.ts +33 -0
  36. package/src/channels/adapters/github/channel-resolver.ts +30 -0
  37. package/src/channels/adapters/github/dedup.ts +26 -0
  38. package/src/channels/adapters/github/event-allowlist.ts +8 -0
  39. package/src/channels/adapters/github/fetch-attachment.ts +5 -0
  40. package/src/channels/adapters/github/history.ts +63 -0
  41. package/src/channels/adapters/github/inbound.ts +286 -0
  42. package/src/channels/adapters/github/index.ts +370 -0
  43. package/src/channels/adapters/github/managed-path.ts +54 -0
  44. package/src/channels/adapters/github/membership.ts +35 -0
  45. package/src/channels/adapters/github/outbound.ts +145 -0
  46. package/src/channels/adapters/github/webhook-register.ts +349 -0
  47. package/src/channels/manager.ts +94 -9
  48. package/src/channels/router.ts +194 -28
  49. package/src/channels/schema.ts +31 -1
  50. package/src/channels/tunnel-bridge.ts +51 -0
  51. package/src/channels/types.ts +3 -1
  52. package/src/cli/builtins.ts +28 -0
  53. package/src/cli/channel.ts +511 -25
  54. package/src/cli/container-command-client.ts +244 -0
  55. package/src/cli/cron.ts +173 -0
  56. package/src/cli/host-command-runner.ts +150 -0
  57. package/src/cli/index.ts +42 -1
  58. package/src/cli/init.ts +400 -67
  59. package/src/cli/model.ts +14 -4
  60. package/src/cli/oauth-callbacks.ts +49 -0
  61. package/src/cli/plugin-command-help.ts +49 -0
  62. package/src/cli/plugin-commands-dispatch.ts +112 -0
  63. package/src/cli/plugin-commands.ts +118 -0
  64. package/src/cli/provider.ts +3 -20
  65. package/src/cli/tui.ts +10 -2
  66. package/src/cli/tunnel.ts +533 -0
  67. package/src/cli/ui.ts +8 -3
  68. package/src/config/config.ts +134 -24
  69. package/src/config/models-mutation.ts +42 -8
  70. package/src/config/providers-mutation.ts +12 -8
  71. package/src/container/start.ts +48 -4
  72. package/src/cron/bridge.ts +136 -0
  73. package/src/cron/consumer.ts +174 -48
  74. package/src/cron/index.ts +19 -2
  75. package/src/cron/list.ts +105 -0
  76. package/src/cron/scheduler.ts +12 -3
  77. package/src/cron/schema.ts +11 -3
  78. package/src/doctor/checks.ts +0 -50
  79. package/src/init/dockerfile.ts +165 -13
  80. package/src/init/ensure-deps.ts +15 -4
  81. package/src/init/github-webhook-install.ts +109 -0
  82. package/src/init/hatching.ts +2 -2
  83. package/src/init/index.ts +519 -12
  84. package/src/init/oauth-login.ts +17 -3
  85. package/src/init/run-bun-install.ts +17 -3
  86. package/src/init/run-owner-claim.ts +11 -2
  87. package/src/permissions/builtins.ts +29 -2
  88. package/src/permissions/match-rule.ts +24 -2
  89. package/src/permissions/permissions.ts +24 -7
  90. package/src/permissions/resolve.ts +1 -0
  91. package/src/plugin/define.ts +44 -1
  92. package/src/plugin/index.ts +18 -3
  93. package/src/plugin/manager.ts +16 -0
  94. package/src/plugin/registry.ts +85 -3
  95. package/src/plugin/types.ts +144 -1
  96. package/src/plugin/zod-introspect.ts +100 -0
  97. package/src/role-claim/match-rule.ts +2 -1
  98. package/src/run/index.ts +112 -4
  99. package/src/secrets/index.ts +1 -1
  100. package/src/secrets/schema.ts +21 -0
  101. package/src/server/command-runner.ts +476 -0
  102. package/src/server/index.ts +388 -5
  103. package/src/shared/index.ts +8 -0
  104. package/src/shared/protocol.ts +80 -1
  105. package/src/skills/typeclaw-channel-github/SKILL.md +24 -0
  106. package/src/skills/typeclaw-config/SKILL.md +27 -26
  107. package/src/skills/typeclaw-cron/SKILL.md +234 -3
  108. package/src/skills/typeclaw-memory/SKILL.md +25 -15
  109. package/src/skills/typeclaw-monorepo/SKILL.md +2 -2
  110. package/src/skills/typeclaw-permissions/SKILL.md +35 -16
  111. package/src/skills/typeclaw-plugins/SKILL.md +251 -5
  112. package/src/skills/typeclaw-tunnels/SKILL.md +111 -0
  113. package/src/test-helpers/wait-for.ts +50 -0
  114. package/src/tui/index.ts +70 -7
  115. package/src/tunnels/__fixtures__/cloudflared-quick-stderr.txt +11 -0
  116. package/src/tunnels/events.ts +14 -0
  117. package/src/tunnels/index.ts +12 -0
  118. package/src/tunnels/log-ring.ts +54 -0
  119. package/src/tunnels/manager.ts +139 -0
  120. package/src/tunnels/providers/cloudflare-quick.ts +189 -0
  121. package/src/tunnels/providers/external.ts +53 -0
  122. package/src/tunnels/quick-url-parser.ts +5 -0
  123. package/src/tunnels/types.ts +43 -0
  124. package/src/usage/report.ts +15 -12
  125. package/typeclaw.schema.json +311 -26
@@ -0,0 +1,300 @@
1
+ // Shared curl-impersonate spawn primitive.
2
+ //
3
+ // Why this exists: by 2026, every non-trivial public site (DDG, Reuters via
4
+ // Akamai, MarketWatch via Cloudflare, etc.) fingerprints incoming traffic at
5
+ // the TLS handshake (JA3/JA4) and HTTP/2 SETTINGS frame BEFORE any HTTP header
6
+ // is read. Bun's native fetch cannot match Chrome's handshake (upstream issue
7
+ // #11368), so outbound requests get gated by anomaly checks regardless of
8
+ // headers, body shape, or pacing. The fix is to shell out to curl-impersonate
9
+ // (lexiforest fork), which replays Chrome's exact TLS handshake, HTTP/2
10
+ // settings, and header ordering. Pinned by the typeclaw Dockerfile at
11
+ // /usr/local/bin/curl_chrome136 — see src/init/dockerfile.ts for the version
12
+ // and SHA pin.
13
+ //
14
+ // AGENTS.md explicitly warns against adding `-H` overrides because the
15
+ // curl_chrome wrapper already sends the full Chrome header set (correct
16
+ // ordering, sec-ch-ua, sec-fetch-*, accept-encoding, etc.) and any custom
17
+ // header corrupts the impersonation. We therefore expose NO header-override
18
+ // surface from this primitive; add one only when a real caller needs it AND
19
+ // the override is something curl_chrome can't be told to send another way.
20
+
21
+ import { randomBytes } from 'node:crypto'
22
+
23
+ import { spawn } from 'bun'
24
+
25
+ export const CURL_IMPERSONATE_BINARY = 'curl_chrome136'
26
+ export const DEFAULT_TIMEOUT_SECONDS = 30
27
+
28
+ let curlBinary: string = CURL_IMPERSONATE_BINARY
29
+
30
+ // Test-only seam: lets *.test.ts point the spawn at a fake `curl_chrome136`
31
+ // script in a tmpdir so we exercise the real Bun.spawn path without depending
32
+ // on a curl-impersonate install on the test host. Production code never calls
33
+ // this — the module-level default above is what production sees.
34
/**
 * Test-only hook that swaps the binary name used for every subsequent spawn.
 *
 * @param binary - Replacement binary to spawn, or `null` to restore
 *   `CURL_IMPERSONATE_BINARY`.
 */
export function _setCurlBinaryForTest(binary: string | null): void {
  if (binary == null) {
    curlBinary = CURL_IMPERSONATE_BINARY
    return
  }
  curlBinary = binary
}
37
+
38
+ export type CurlImpersonateRequest = {
39
+ url: string
40
+ method?: 'GET' | 'POST'
41
+ // Form-urlencoded body fields for POST. Each entry is passed as a separate
42
+ // --data-urlencode argument so curl handles the encoding. Required if
43
+ // method is 'POST' and you want a body.
44
+ formFields?: Array<{ name: string; value: string }>
45
+ // Hard cap on bytes accepted from the response (passed as --max-filesize).
46
+ // The actual buffer is still bounded by the caller; this just makes curl
47
+ // bail early instead of streaming gigabytes.
48
+ maxBytes?: number
49
+ timeoutSeconds?: number
50
+ signal?: AbortSignal
51
+ }
52
+
53
+ export type CurlImpersonateResponse = {
54
+ body: string
55
+ finalUrl: string
56
+ httpStatus: number
57
+ contentType: string
58
+ bytesIn: number
59
+ }
60
+
61
+ // Specific curl exit codes we map to typed errors. The full list is in
62
+ // `man curl` § "EXIT CODES"; these are the only ones we translate at the
63
+ // primitive layer. Everything else surfaces as a generic CurlImpersonateError
64
+ // with stderr attached for caller-side diagnostics.
65
+ export const CURL_EXIT_TIMEOUT = 28
66
+ export const CURL_EXIT_MAX_FILESIZE_PRECHECK = 63
67
+ // Observed empirically (and corroborated by Oracle review): curl returns
68
+ // exit 56 with stderr `Exceeded the maximum allowed file size (...)` when
69
+ // --max-filesize is hit at TRANSFER time (e.g. server omitted Content-Length
70
+ // and curl discovered the overflow mid-stream). The Linux man page lists 56
71
+ // as the more general "Failure in receiving network data," so we additionally
72
+ // gate on a stderr match to avoid mis-classifying real network drops as
73
+ // size-exceeded.
74
+ export const CURL_EXIT_RECV_FAILURE_OR_FILESIZE = 56
75
+
76
/**
 * Error raised by the curl-impersonate primitive.
 *
 * Carries the child's exit code (`null` when none is available) and the
 * captured stderr so callers can classify the failure themselves.
 */
export class CurlImpersonateError extends Error {
  public readonly exitCode: number | null
  public readonly stderr: string

  constructor(message: string, exitCode: number | null, stderr: string) {
    super(message)
    this.exitCode = exitCode
    this.stderr = stderr
    this.name = 'CurlImpersonateError'
  }
}
86
+
87
/**
 * Reports whether a curl failure means the `--max-filesize` cap was hit.
 *
 * The pre-check exit code is conclusive on its own. The receive-failure exit
 * code only counts when stderr also carries curl's "maximum ... file size"
 * wording, so ordinary network drops are not classified as size overflows.
 */
export function isCurlExitFilesizeExceeded(error: CurlImpersonateError): boolean {
  switch (error.exitCode) {
    case CURL_EXIT_MAX_FILESIZE_PRECHECK:
      return true
    case CURL_EXIT_RECV_FAILURE_OR_FILESIZE:
      return /maximum.{0,30}file size/i.test(error.stderr)
    default:
      return false
  }
}
94
+
95
/** Reports whether the error's exit code is `CURL_EXIT_TIMEOUT`. */
export function isCurlExitTimeout(error: CurlImpersonateError): boolean {
  const { exitCode } = error
  return exitCode === CURL_EXIT_TIMEOUT
}
98
+
99
/**
 * Runs a single HTTP(S) request through the curl-impersonate wrapper and
 * returns the decoded body together with the metadata curl reports via `-w`.
 *
 * @param req - URL, method, optional form fields, size cap, timeout and
 *   abort signal for the request.
 * @returns The parsed response (body, final URL, status, content type, bytes).
 * @throws CurlImpersonateError when the signal is already aborted or aborts
 *   during the transfer, when curl exits non-zero, or when the metadata block
 *   cannot be found in stdout.
 */
export async function curlImpersonate(req: CurlImpersonateRequest): Promise<CurlImpersonateResponse> {
  // An AbortSignal that is ALREADY aborted never dispatches another `abort`
  // event, so the listener registered further down would never fire: the
  // child would be spawned and awaited to completion before the request was
  // finally reported as aborted. Fail fast instead of spawning a process
  // whose result is going to be discarded anyway.
  if (req.signal?.aborted) {
    throw new CurlImpersonateError('aborted', null, '')
  }

  const timeoutSeconds = req.timeoutSeconds ?? DEFAULT_TIMEOUT_SECONDS
  const method = req.method ?? 'GET'

  // Per-request random sentinel + UTF-8-safe parsing. A static sentinel is a
  // public, fixed string: an attacker-controlled page could embed it plus
  // fabricated metadata ahead of the real write-out tail and steer a
  // first-match split. Randomness (96 bits) makes the marker unpredictable,
  // and the parser anchors on the LAST occurrence because curl writes `-w`
  // output after the body. The two defenses are layered on purpose.
  const sentinel = generateSentinel()
  const writeOutTemplate = `${sentinel}%{http_code}\n%{url_effective}\n%{content_type}\n%{size_download}\n`

  const cmd: string[] = [
    curlBinary,
    // `--disable` (alias -q) MUST be the first argument to suppress reading
    // ~/.curlrc and /etc/curlrc. Without it, a user or attacker-controlled
    // curlrc could inject --proxy, --header, --resolve, --no-location, etc.,
    // silently subverting both the Chrome impersonation contract and the
    // protocol restrictions below. Order is load-bearing: curl ignores
    // --disable if it appears after any other flag.
    '--disable',
    '--silent',
    '--show-error',
    // Protocol allowlist. The `=http,https` syntax means "ONLY these two"
    // rather than "add these to defaults"; --proto-redir applies the same
    // restriction to the redirect chain, so a 301/302 to ftp://... is
    // refused instead of silently followed.
    '--proto',
    '=http,https',
    '--proto-redir',
    '=http,https',
    // `--fail-with-body` is intentionally NOT passed: callers inspect
    // httpStatus themselves, so a 404-with-body must still exit 0.
    '--compressed',
    '--location',
    '--max-redirs',
    '10',
    '--max-time',
    String(timeoutSeconds),
    '-w',
    writeOutTemplate,
    '-X',
    method,
  ]

  if (req.maxBytes !== undefined) {
    cmd.push('--max-filesize', String(req.maxBytes))
  }

  if (req.formFields) {
    // One --data-urlencode argument per field so curl performs the encoding.
    for (const field of req.formFields) {
      cmd.push('--data-urlencode', `${field.name}=${field.value}`)
    }
  }

  // `--` terminates option parsing so a URL beginning with `-` cannot be
  // reinterpreted as a curl option.
  cmd.push('--', req.url)

  // Spawn detached so the child becomes the leader of its own process group.
  // The curl-impersonate wrappers are bash scripts that call the real binary
  // WITHOUT `exec`, so killing only the wrapper PID would orphan the inner
  // curl, which keeps the stdout pipe open until --max-time fires.
  // process.kill(-pid) signals the whole group; detached: true makes the
  // child the pgid leader so -pid is well-defined.
  const proc = spawn({
    cmd,
    stdout: 'pipe',
    stderr: 'pipe',
    detached: true,
  })

  const onAbort = () => {
    try {
      process.kill(-proc.pid, 'SIGKILL')
    } catch {
      // Group kill failed; fall back to killing the direct child.
      proc.kill('SIGKILL')
    }
  }
  req.signal?.addEventListener('abort', onAbort, { once: true })

  try {
    // Drain both pipes while waiting for exit so a full pipe buffer cannot
    // stall the child.
    const [stdoutBuf, stderr, exitCode] = await Promise.all([
      new Response(proc.stdout).arrayBuffer(),
      new Response(proc.stderr).text(),
      proc.exited,
    ])

    if (req.signal?.aborted) {
      throw new CurlImpersonateError('aborted', exitCode, stderr)
    }

    if (exitCode !== 0) {
      const detail = stderr.trim() || 'no stderr'
      throw new CurlImpersonateError(`curl-impersonate exited ${exitCode}: ${detail}`, exitCode, stderr)
    }

    return parseCurlOutput(stdoutBuf, sentinel, stderr)
  } finally {
    req.signal?.removeEventListener('abort', onAbort)
  }
}
216
+
217
+ // Generates a per-request sentinel. Format: `\n--TYPECLAW-CURL-META-<hex>--\n`.
218
+ // 24 hex chars = 96 bits of entropy, plenty to defeat any attempt by an
219
+ // attacker-controlled response body to inject a colliding marker. ASCII-only
220
+ // + leading/trailing newlines means it's unambiguous in textual responses
221
+ // and free of NUL bytes (Bun's spawn rejects NULs in argv).
222
/**
 * Builds the per-request metadata marker: a newline, the fixed
 * `--TYPECLAW-CURL-META-` prefix, 24 random hex characters (12 random bytes),
 * `--`, and a trailing newline.
 */
function generateSentinel(): string {
  const randomSuffix = randomBytes(12).toString('hex')
  return '\n--TYPECLAW-CURL-META-' + randomSuffix + '--\n'
}
226
+
227
/**
 * Splits curl's stdout into the response body and the `-w` metadata tail.
 *
 * @param buf - Raw stdout bytes (body followed by sentinel and metadata).
 * @param sentinel - The marker string that precedes the metadata lines.
 * @param stderr - Captured stderr, attached to the error if parsing fails.
 * @returns The decoded body and the parsed status, final URL, content type
 *   and byte count.
 * @throws CurlImpersonateError when the sentinel does not occur in `buf`.
 */
function parseCurlOutput(buf: ArrayBuffer, sentinel: string, stderr: string): CurlImpersonateResponse {
  const raw = new Uint8Array(buf)
  const marker = new TextEncoder().encode(sentinel)

  // Search from the end: the metadata block is written after the body, so
  // the trailing occurrence is the one to split on.
  const markerAt = lastIndexOfBytes(raw, marker)
  if (markerAt < 0) {
    throw new CurlImpersonateError(
      'curl-impersonate produced no metadata block (sentinel missing). Wrapper or output corruption suspected.',
      0,
      stderr,
    )
  }

  const payload = raw.subarray(0, markerAt)
  const tail = raw.subarray(markerAt + marker.byteLength)

  // Metadata lines, in write-out order: status, effective URL, content type,
  // download size. Missing lines fall back to zero or the empty string.
  const tailText = new TextDecoder('utf-8', { fatal: false }).decode(tail)
  const [statusLine, urlLine, typeLine, sizeLine] = tailText.split('\n')

  const httpStatus = Number(statusLine?.trim() ?? '0') || 0
  const finalUrl = (urlLine ?? '').trim()
  const contentType = (typeLine ?? '').trim().toLowerCase()
  // A missing, zero or unparsable size falls back to the measured body length.
  const bytesIn = Number(sizeLine?.trim() ?? '0') || payload.byteLength

  const body = new TextDecoder('utf-8', { fatal: false }).decode(payload)

  return { body, finalUrl, httpStatus, contentType, bytesIn }
}
262
+
263
/**
 * Finds the last position at which `needle` occurs inside `haystack`.
 *
 * @returns The start index of the last match, `haystack.byteLength` when
 *   `needle` is empty, or -1 when there is no match.
 */
function lastIndexOfBytes(haystack: Uint8Array, needle: Uint8Array): number {
  const needleLength = needle.byteLength
  if (needleLength === 0) return haystack.byteLength

  // Walk candidate start positions from the tail towards the head.
  let start = haystack.byteLength - needleLength
  while (start >= 0) {
    let offset = 0
    while (offset < needleLength && haystack[start + offset] === needle[offset]) {
      offset += 1
    }
    if (offset === needleLength) return start
    start -= 1
  }
  return -1
}
277
+
278
+ // Detect whether curl-impersonate is available on PATH. Used by fetch.ts to
279
+ // decide between the impersonating transport (production: container has the
280
+ // binary pinned in the image) and a Bun.fetch fallback (test/dev: no binary
281
+ // installed). The check is best-effort and cheap — we spawn `--version`
282
+ // and look at exit code. Cached per-process: the binary doesn't appear or
283
+ // disappear at runtime.
284
+ let availabilityCache: boolean | undefined
285
+
286
/**
 * Probes whether the configured curl-impersonate binary can be run.
 *
 * Spawns `<binary> --version` with output discarded and treats exit code 0 as
 * "available". A spawn failure counts as "unavailable". The result is
 * memoized in `availabilityCache` and reused by later calls.
 */
export async function isCurlImpersonateAvailable(): Promise<boolean> {
  if (availabilityCache !== undefined) return availabilityCache

  let available: boolean
  try {
    const probe = spawn({ cmd: [curlBinary, '--version'], stdout: 'ignore', stderr: 'ignore' })
    const exitCode = await probe.exited
    available = exitCode === 0
  } catch {
    available = false
  }

  availabilityCache = available
  return available
}
297
+
298
+ export function _resetAvailabilityCacheForTest(): void {
299
+ availabilityCache = undefined
300
+ }
@@ -7,40 +7,16 @@
7
7
  // single bad fingerprint match. `lite` exists for non-browser clients (text
8
8
  // browsers, accessibility tools) and historically gates less aggressively —
9
9
  // but as of 2026 it ALSO fingerprints at the TLS layer (JA3/JA4) and the
10
- // HTTP/2 SETTINGS frame, well before any HTTP header is read. Bun's native
11
- // fetch cannot match Chrome's handshake (upstream issue #11368), so requests
12
- // from `fetch()` get gated regardless of headers, body shape, or pacing
13
- // confirmed empirically over a multi-hour session against a single home IP
14
- // where real Chromium succeeded continuously while every fetch variant got
15
- // 202 anomaly-modal or HTTP-200-with-anomaly responses.
16
- //
17
- // The fix is to shell out to `curl-impersonate` (lexiforest fork), which
18
- // replays Chrome's exact TLS handshake + HTTP/2 settings + header ordering.
19
- // The binary is installed by the typeclaw Dockerfile (see
20
- // src/init/dockerfile.ts CURL_IMPERSONATE_* constants) at /usr/local/bin/
21
- // and invoked via the version-pinned wrapper `curl_chrome136`.
22
- //
23
- // Why no `-H` overrides: curl_chrome136 already sends the full Chrome 136
24
- // header set with correct ordering, sec-ch-ua values, etc. Adding our own
25
- // headers would corrupt the impersonation. The previous code's
26
- // BROWSER_HEADERS const has been removed for the same reason.
10
+ // HTTP/2 SETTINGS frame, well before any HTTP header is read. The shared
11
+ // curl-impersonate primitive (./curl-impersonate.ts) replays Chrome's exact
12
+ // TLS handshake + HTTP/2 settings + header ordering. See that file's header
13
+ // for the full rationale and AGENTS.md §"Web search" for the original story.
27
14
 
28
- import { spawn } from 'bun'
29
-
30
- const DDG_LITE_URL = 'https://lite.duckduckgo.com/lite/'
31
- const CURL_IMPERSONATE_BINARY = 'curl_chrome136'
32
- const REQUEST_TIMEOUT_SECONDS = 30
15
+ import { curlImpersonate } from './curl-impersonate'
33
16
 
34
- let curlBinary = CURL_IMPERSONATE_BINARY
17
+ export { _setCurlBinaryForTest } from './curl-impersonate'
35
18
 
36
- // Test-only seam: lets ddg.test.ts and websearch.test.ts point the spawn
37
- // at a fake `curl_chrome136` script in a tmpdir so we exercise the real
38
- // Bun.spawn path without depending on a curl-impersonate install on the
39
- // test host. Production code never calls this — the const-import default
40
- // above is what production sees.
41
- export function _setCurlBinaryForTest(binary: string | null): void {
42
- curlBinary = binary ?? CURL_IMPERSONATE_BINARY
43
- }
19
+ const DDG_LITE_URL = 'https://lite.duckduckgo.com/lite/'
44
20
 
45
21
  export type DdgResult = {
46
22
  title: string
@@ -64,64 +40,13 @@ export class DdgCaptchaError extends Error {
64
40
  }
65
41
 
66
42
  export async function fetchDdgHtml(query: string, signal?: AbortSignal): Promise<string> {
67
- // Spawn detached so the child becomes the leader of its own process group.
68
- // The curl-impersonate wrappers (curl_chrome136 et al.) are bash scripts
69
- // that call the real curl-impersonate binary WITHOUT `exec` — meaning the
70
- // wrapper is the parent and curl-impersonate is its child. On a plain
71
- // SIGKILL to the wrapper PID, the curl child becomes orphaned and keeps
72
- // the stdout pipe open until --max-time fires (30s default), turning a
73
- // 50ms abort into a 30s hang. process.kill(-pid) addresses the negative
74
- // PID, which signals the entire process group, killing both the wrapper
75
- // and the inner curl atomically. detached: true is what makes the child
76
- // the pgid leader so -pid is well-defined; without it, the child shares
77
- // our pgid and we'd nuke our own process.
78
- const proc = spawn({
79
- cmd: [
80
- curlBinary,
81
- '--silent',
82
- '--show-error',
83
- '--fail-with-body',
84
- '--compressed',
85
- '--max-time',
86
- String(REQUEST_TIMEOUT_SECONDS),
87
- '-X',
88
- 'POST',
89
- '--data-urlencode',
90
- `q=${query}`,
91
- DDG_LITE_URL,
92
- ],
93
- stdout: 'pipe',
94
- stderr: 'pipe',
95
- detached: true,
43
+ const response = await curlImpersonate({
44
+ url: DDG_LITE_URL,
45
+ method: 'POST',
46
+ formFields: [{ name: 'q', value: query }],
47
+ signal,
96
48
  })
97
-
98
- const onAbort = () => {
99
- try {
100
- process.kill(-proc.pid, 'SIGKILL')
101
- } catch {
102
- proc.kill('SIGKILL')
103
- }
104
- }
105
- signal?.addEventListener('abort', onAbort, { once: true })
106
-
107
- try {
108
- const [stdout, stderr, exitCode] = await Promise.all([
109
- new Response(proc.stdout).text(),
110
- new Response(proc.stderr).text(),
111
- proc.exited,
112
- ])
113
-
114
- if (signal?.aborted) {
115
- throw new Error('aborted')
116
- }
117
- if (exitCode !== 0) {
118
- const detail = stderr.trim() || 'no stderr'
119
- throw new Error(`curl-impersonate exited ${exitCode}: ${detail}`)
120
- }
121
- return stdout
122
- } finally {
123
- signal?.removeEventListener('abort', onAbort)
124
- }
49
+ return response.body
125
50
  }
126
51
 
127
52
  // The `lite` endpoint's CAPTCHA page is plainer than `html`'s anomaly-modal:
@@ -1,3 +1,33 @@
1
+ // Webfetch's HTTP transport.
2
+ //
3
+ // Production path (container, curl-impersonate available): we shell out to
4
+ // `curl_chrome136` so outbound requests carry Chrome 136's TLS handshake
5
+ // (JA3/JA4), HTTP/2 SETTINGS frame, and full header set. This is what gets
6
+ // us past the modern bot-detection stacks on Cloudflare/Akamai-protected
7
+ // sites (Reuters, MarketWatch, etc.) when the agent is running from the
8
+ // user's home network — the IP is already residential, so impersonating
9
+ // the browser is the only remaining missing piece. See AGENTS.md §"Web
10
+ // search" and src/agent/tools/curl-impersonate.ts for the full story.
11
+ //
12
+ // Test/dev fallback (curl_chrome136 not on PATH): we transparently fall
13
+ // back to Bun's native `fetch()` with a static User-Agent. This keeps unit
14
+ // tests on developer macOS machines working without forcing every contributor
15
+ // to install curl-impersonate locally. Production runs always have the binary
16
+ // because the typeclaw Dockerfile pins it.
17
+ //
18
+ // Best-effort doctrine: this transport does NOT guarantee the fetch succeeds.
19
+ // Bot-detected sites can still serve 403/CAPTCHA pages. We surface what we
20
+ // got (status, body, final URL) and let the caller decide. The webfetch tool
21
+ // translates non-2xx into a tool-level error message that's useful to the
22
+ // model.
23
+
24
+ import {
25
+ CurlImpersonateError,
26
+ curlImpersonate,
27
+ isCurlExitFilesizeExceeded,
28
+ isCurlExitTimeout,
29
+ isCurlImpersonateAvailable,
30
+ } from '../curl-impersonate'
1
31
  import { MAX_RESPONSE_BYTES } from './types'
2
32
 
3
33
  export type FetchResult = {
@@ -15,7 +45,7 @@ export class WebfetchError extends Error {
15
45
  }
16
46
  }
17
47
 
18
- const DEFAULT_HEADERS: Record<string, string> = {
48
+ const FALLBACK_HEADERS: Record<string, string> = {
19
49
  'User-Agent': 'typeclaw/0 (+https://github.com/code-yeongyu/typeclaw)',
20
50
  Accept: 'text/html,application/xhtml+xml,application/json;q=0.9,text/plain;q=0.8,*/*;q=0.1',
21
51
  'Accept-Language': 'en-US,en;q=0.9',
@@ -32,10 +62,83 @@ export function normalizeUrl(input: string): string {
32
62
  return `https://${trimmed}`
33
63
  }
34
64
 
65
+ // Test-only seam: forces fetchWithLimits to use the native-fetch fallback
66
+ // even when curl-impersonate is detected. Used by fetch.test.ts to keep its
67
+ // existing mocked-fetch contract working without the test having to install
68
+ // a fake curl binary. Production code never calls this.
69
+ let forceFallbackForTest = false
70
+
71
+ export function _setForceFallbackForTest(value: boolean): void {
72
+ forceFallbackForTest = value
73
+ }
74
+
35
75
  export async function fetchWithLimits(
36
76
  url: string,
37
77
  timeoutSeconds: number,
38
78
  parentSignal?: AbortSignal,
79
+ ): Promise<FetchResult> {
80
+ const useImpersonate = !forceFallbackForTest && (await isCurlImpersonateAvailable())
81
+ if (useImpersonate) {
82
+ return fetchWithCurlImpersonate(url, timeoutSeconds, parentSignal)
83
+ }
84
+ return fetchWithBunFetch(url, timeoutSeconds, parentSignal)
85
+ }
86
+
87
/**
 * Fetches `url` through the curl-impersonate transport and maps its failures
 * onto `WebfetchError`.
 *
 * @param url - Target URL.
 * @param timeoutSeconds - Passed to curl as the overall request timeout.
 * @param parentSignal - Optional caller signal; an aborted signal is reported
 *   as "Request aborted" regardless of the underlying error.
 * @returns Body, content type, final URL (falling back to `url`), status and
 *   the UTF-8 byte length of the decoded body.
 * @throws WebfetchError on abort, timeout, oversize response, curl failure or
 *   any non-2xx status.
 */
async function fetchWithCurlImpersonate(
  url: string,
  timeoutSeconds: number,
  parentSignal?: AbortSignal,
): Promise<FetchResult> {
  let result: Awaited<ReturnType<typeof curlImpersonate>>
  try {
    result = await curlImpersonate({
      url,
      method: 'GET',
      timeoutSeconds,
      maxBytes: MAX_RESPONSE_BYTES,
      signal: parentSignal,
    })
  } catch (cause) {
    // Abort wins over every other classification.
    if (parentSignal?.aborted) {
      throw new WebfetchError('Request aborted')
    }
    if (cause instanceof CurlImpersonateError) {
      if (isCurlExitTimeout(cause)) {
        throw new WebfetchError(`Request timed out after ${timeoutSeconds}s`)
      }
      if (isCurlExitFilesizeExceeded(cause)) {
        throw new WebfetchError(`Response too large (exceeds ${formatBytes(MAX_RESPONSE_BYTES)} limit)`)
      }
    }
    // Remaining curl errors and unexpected throwables share one message shape.
    const reason = cause instanceof Error ? cause.message : String(cause)
    throw new WebfetchError(`Fetch failed: ${reason}`)
  }

  const { httpStatus } = result
  if (httpStatus < 200 || httpStatus >= 300) {
    throw new WebfetchError(`Fetch failed: HTTP ${httpStatus}`)
  }

  // Measure the decoded body itself and enforce the cap on that size too.
  const decodedSize = new TextEncoder().encode(result.body).byteLength
  if (decodedSize > MAX_RESPONSE_BYTES) {
    throw new WebfetchError(
      `Response too large (${formatBytes(decodedSize)} exceeds ${formatBytes(MAX_RESPONSE_BYTES)} limit)`,
    )
  }

  return {
    body: result.body,
    contentType: result.contentType,
    finalUrl: result.finalUrl || url,
    httpStatus,
    bytesIn: decodedSize,
  }
}
137
+
138
+ async function fetchWithBunFetch(
139
+ url: string,
140
+ timeoutSeconds: number,
141
+ parentSignal?: AbortSignal,
39
142
  ): Promise<FetchResult> {
40
143
  const controller = new AbortController()
41
144
  const timeout = setTimeout(() => controller.abort(new Error('timeout')), timeoutSeconds * 1000)
@@ -43,7 +146,7 @@ export async function fetchWithLimits(
43
146
  parentSignal?.addEventListener('abort', onAbort, { once: true })
44
147
 
45
148
  try {
46
- const response = await fetch(url, { headers: DEFAULT_HEADERS, signal: controller.signal, redirect: 'follow' })
149
+ const response = await fetch(url, { headers: FALLBACK_HEADERS, signal: controller.signal, redirect: 'follow' })
47
150
  if (!response.ok) {
48
151
  throw new WebfetchError(`Fetch failed: HTTP ${response.status} ${response.statusText}`)
49
152
  }
@@ -24,6 +24,10 @@ export const webfetchTool = defineTool({
24
24
  description:
25
25
  'Fetch a single HTTP(S) URL and return the body, optionally compacted by a strategy. ' +
26
26
  'Use this when the user references a specific URL or when websearch surfaced a result you need to read in full. ' +
27
+ 'Outbound requests impersonate Chrome 136 at the TLS, HTTP/2, and header layers ' +
28
+ '(via curl-impersonate), which helps with TLS/header fingerprint gates on sites behind Cloudflare/Akamai. ' +
29
+ 'It does NOT solve JavaScript challenges, behavioural fingerprinting (mouse/scroll/timing), interactive CAPTCHAs, ' +
30
+ 'or IP-reputation blocks — a 403 from those layers is expected and unrecoverable from this tool. ' +
27
31
  'Strategy guide:\n' +
28
32
  '- "readability": extract article content as markdown (blogs, docs, news). Default for HTML.\n' +
29
33
  '- "jq": query JSON APIs (npm registry, GitHub API). Pass `query` (e.g. ".items[].name").\n' +
@@ -17,6 +17,49 @@ import { AGENT_BROWSER_DASHBOARD_UPSTREAM_PORT } from './dashboard-proxy'
17
17
 
18
18
  export const REAL_BIN_ENV = 'TYPECLAW_AGENT_BROWSER_REAL_BIN'
19
19
 
20
// Default User-Agent handed to the upstream agent-browser binary: a recent
// desktop Chrome on Linux x86_64.
//
// Why Linux: the shim runs inside the TypeClaw container (always Linux), so a
// macOS or Windows UA would mismatch the TCP fingerprint, Accept-Language, and
// JS-side platform — itself a bot signal on stricter sites (Cloudflare,
// Akamai, PerimeterX). `X11; Linux x86_64` is also correct on linux/arm64
// hosts: Chrome on Linux does not expose ARM in the UA string at all
// (verified against current Chrome 131 releases).
//
// Why override at all: the upstream binary defaults to a UA that includes
// "HeadlessChrome" / a stale Chromium build, which is widely fingerprinted as
// a bot and silently triggers CAPTCHAs, 403s, blank pages, and A/B-test
// misrouting.
//
// Maintenance: bump on Chrome major releases — same hygiene as the
// curl-impersonate pin in src/init/dockerfile.ts.
export const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'

// Name of the environment variable through which the user agent is passed to
// the upstream binary.
export const USER_AGENT_ENV = 'AGENT_BROWSER_USER_AGENT'
35
+
36
// Reports whether `argv` already carries an explicit user-agent flag.
//
// Matches both `--user-agent <val>` and `--user-agent=<val>`. The upstream
// CLI does not document a short alias for --user-agent today (verified via
// `agent-browser --help`), so only the long form is checked. Matching is
// exact on the flag name: `--user-agents` or a bare `-u` do not count.
//
// Returns true when any element is `--user-agent` or starts with
// `--user-agent=`; false otherwise (including for an empty argv).
export function hasUserAgentFlag(argv: readonly string[]): boolean {
  return argv.some((arg) => arg === '--user-agent' || arg.startsWith('--user-agent='))
}
45
+
46
// Fills `env[USER_AGENT_ENV]` with `defaultUa` when nothing more specific has
// been requested. Mutates `env` in place and returns nothing.
//
// Upstream's precedence is CLI flag > env > default. The env var is only
// injected when BOTH layers above the default are absent, so:
//   - explicit `--user-agent foo` wins (mobile testing, intentional bot UA)
//   - operator-set AGENT_BROWSER_USER_AGENT wins (per-shell override)
//   - the default UA fills the otherwise-empty slot
// An env value that is undefined or the empty string counts as absent.
//
// `set device "iPhone 14"` is unaffected: it sets UA via CDP at runtime,
// not through this env var, so the injection doesn't fight device emulation.
export function injectUserAgentEnv(
  argv: readonly string[],
  env: Record<string, string | undefined>,
  defaultUa: string = DEFAULT_USER_AGENT,
): void {
  const operatorUa = env[USER_AGENT_ENV]
  const operatorSetUa = operatorUa !== undefined && operatorUa !== ''
  if (operatorSetUa) return
  if (hasUserAgentFlag(argv)) return
  env[USER_AGENT_ENV] = defaultUa
}
62
+
20
63
  export type DashboardIntent = 'start' | 'stop' | 'other'
21
64
 
22
65
  export function classifyDashboardCommand(argv: readonly string[]): DashboardIntent {
@@ -111,6 +154,7 @@ export type ShimOptions = {
111
154
  realBin?: string
112
155
  upstreamPort?: number
113
156
  spawn?: (cmd: string[]) => { exited: Promise<number> }
157
+ env?: Record<string, string | undefined>
114
158
  }
115
159
 
116
160
  export async function runShim(opts: ShimOptions = {}): Promise<number> {
@@ -118,6 +162,9 @@ export async function runShim(opts: ShimOptions = {}): Promise<number> {
118
162
  const realBin = opts.realBin ?? resolveRealAgentBrowserBin()
119
163
  const upstreamPort = opts.upstreamPort ?? AGENT_BROWSER_DASHBOARD_UPSTREAM_PORT
120
164
  const spawn = opts.spawn ?? defaultSpawn
165
+ const env = opts.env ?? process.env
166
+
167
+ injectUserAgentEnv(argv, env)
121
168
 
122
169
  const intent = classifyDashboardCommand(argv)
123
170
  if (intent !== 'start') {
@@ -77,6 +77,8 @@ You MAY:
77
77
  - Set up a missing upstream branch via \`git push -u origin <branch>\` if it's clear that's the only issue.
78
78
  - Retry \`git push\` once after fixing a clear, narrow issue.
79
79
 
80
+ **When you run \`git push\` (either to set upstream or to retry), the security plugin's \`gitExfil\` guard will block unless you pass \`acknowledgeGuards: { gitExfil: true }\` on the bash tool call.** You are running on the operator's behalf as the recovery step of a deterministic backup the operator already approved by enabling \`backup.pushToOrigin\`; the ack is justified for this specific retry path. Do NOT use this ack for any other reason — only the one push retry the rules above allow. If your push retry fails again, write the diagnosis and stop; do not ack a second push.
81
+
80
82
  You MUST NOT:
81
83
  - Force-push (\`--force\`, \`--force-with-lease\`).
82
84
  - Resolve merge conflicts by editing files. If a rebase had conflicts, the runner already aborted it. Leave the repo as-is and explain.