typeclaw 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/package.json +1 -1
  2. package/src/agent/auth.ts +4 -2
  3. package/src/agent/index.ts +16 -28
  4. package/src/agent/model-fallback.ts +127 -0
  5. package/src/agent/tools/curl-impersonate.ts +300 -0
  6. package/src/agent/tools/ddg.ts +13 -88
  7. package/src/agent/tools/webfetch/fetch.ts +105 -2
  8. package/src/agent/tools/webfetch/tool.ts +4 -0
  9. package/src/bundled-plugins/agent-browser/shim.ts +47 -0
  10. package/src/bundled-plugins/backup/subagents.ts +2 -0
  11. package/src/bundled-plugins/memory/README.md +49 -12
  12. package/src/bundled-plugins/memory/citation-superset.ts +63 -0
  13. package/src/bundled-plugins/memory/dreaming.ts +105 -17
  14. package/src/bundled-plugins/memory/index.ts +2 -2
  15. package/src/bundled-plugins/memory/memory-logger.ts +45 -26
  16. package/src/bundled-plugins/memory/strength.ts +127 -0
  17. package/src/bundled-plugins/memory/topics.ts +75 -0
  18. package/src/bundled-plugins/security/index.ts +87 -43
  19. package/src/bundled-plugins/security/permissions.ts +36 -0
  20. package/src/bundled-plugins/security/policies/git-exfil.ts +20 -0
  21. package/src/bundled-plugins/security/policies/outbound-secret-scan.ts +12 -0
  22. package/src/bundled-plugins/security/policies/prompt-injection.ts +23 -3
  23. package/src/bundled-plugins/security/policies/secret-exfil-bash.ts +7 -0
  24. package/src/bundled-plugins/security/policies/secret-exfil-read.ts +6 -0
  25. package/src/bundled-plugins/security/policies/session-search-secrets.ts +9 -0
  26. package/src/bundled-plugins/security/policies/ssrf.ts +6 -0
  27. package/src/bundled-plugins/security/policies/system-prompt-leak.ts +7 -0
  28. package/src/channels/adapters/github/index.ts +87 -3
  29. package/src/channels/router.ts +194 -28
  30. package/src/channels/types.ts +3 -1
  31. package/src/cli/channel.ts +2 -45
  32. package/src/cli/init.ts +148 -87
  33. package/src/cli/model.ts +12 -3
  34. package/src/cli/oauth-callbacks.ts +49 -0
  35. package/src/cli/provider.ts +3 -20
  36. package/src/cli/ui.ts +95 -0
  37. package/src/config/config.ts +59 -24
  38. package/src/config/models-mutation.ts +42 -8
  39. package/src/config/providers-mutation.ts +12 -8
  40. package/src/container/start.ts +18 -1
  41. package/src/cron/consumer.ts +129 -43
  42. package/src/init/dockerfile.ts +221 -3
  43. package/src/init/hatching.ts +2 -2
  44. package/src/init/index.ts +47 -3
  45. package/src/init/oauth-login.ts +17 -3
  46. package/src/permissions/builtins.ts +29 -7
  47. package/src/permissions/permissions.ts +24 -7
  48. package/src/plugin/define.ts +2 -0
  49. package/src/plugin/manager.ts +14 -0
  50. package/src/plugin/types.ts +6 -0
  51. package/src/run/index.ts +2 -1
  52. package/src/skills/typeclaw-memory/SKILL.md +25 -15
  53. package/src/skills/typeclaw-permissions/SKILL.md +35 -17
  54. package/src/tui/index.ts +35 -3
  55. package/src/usage/report.ts +15 -12
  56. package/typeclaw.schema.json +57 -25
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "typeclaw",
3
- "version": "0.4.0",
3
+ "version": "0.5.1",
4
4
  "homepage": "https://github.com/typeclaw/typeclaw#readme",
5
5
  "bugs": {
6
6
  "url": "https://github.com/typeclaw/typeclaw/issues"
package/src/agent/auth.ts CHANGED
@@ -83,8 +83,10 @@ export function getAuthFor(providerId: KnownProviderId): Auth {
83
83
 
84
84
  // Back-compat shim for callers that still want the `default` profile's auth
85
85
  // (the main session path). Equivalent to `getAuthFor(provider-of-default)`.
86
+ // Uses the head of the fallback chain; auth for the rest of the chain is
87
+ // resolved lazily when fallback actually fires.
86
88
  export function getAuth(): Auth {
87
- const defaultRef = getConfig().models.default
89
+ const defaultRef = getConfig().models.default[0]!
88
90
  return getAuthFor(providerForModelRef(defaultRef))
89
91
  }
90
92
 
@@ -98,7 +100,7 @@ function hasAnyCredentialInEnv(apiKeyEnv: string | null): boolean {
98
100
 
99
101
  function missingCredentialMessage(providerId: KnownProviderId): string {
100
102
  const provider = KNOWN_PROVIDERS[providerId]
101
- const defaultRef = getConfig().models.default
103
+ const defaultRef = getConfig().models.default[0]!
102
104
  const defaultProviderId = providerForModelRef(defaultRef)
103
105
  // For the `default` profile, name the model in the error message (matches
104
106
  // pre-multi-model behavior). For any other profile, the user is mixing
@@ -8,7 +8,7 @@ import type { AgentSession, ToolDefinition } from '@mariozechner/pi-coding-agent
8
8
  import { loadMemory } from '@/bundled-plugins/memory/load-memory'
9
9
  import type { ChannelRouter } from '@/channels/router'
10
10
  import { getConfig, resolveModel, resolveProfile } from '@/config'
11
- import { providerForModelRef } from '@/config/providers'
11
+ import { providerForModelRef, type KnownModelRef } from '@/config/providers'
12
12
  import type { PermissionService } from '@/permissions'
13
13
  import type {
14
14
  BuiltinToolRef,
@@ -134,6 +134,12 @@ export type CreateSessionOptions = {
134
134
  // overrides) so different sessions on the same agent can run different
135
135
  // models without per-session config edits.
136
136
  profile?: string
137
+ // Override the resolved ref directly, bypassing `profile` resolution. Used
138
+ // by the model-fallback helper (`promptWithFallback`) to recreate a session
139
+ // pinned to the next ref in the chain after the previous one failed. When
140
+ // set, `profile` is still recorded for the fallback-warning bookkeeping;
141
+ // the profile→refs resolution is skipped.
142
+ refOverride?: KnownModelRef
137
143
  // Defensive ceiling on cumulative bytes of tool-result text per session,
138
144
  // applied to the named tools only. See `src/agent/tool-result-budget.ts`
139
145
  // for the rationale. Intended for subagents that read large files
@@ -161,10 +167,14 @@ export async function createSession(options: CreateSessionOptions = {}): Promise
161
167
 
162
168
  export async function createSessionWithDispose(options: CreateSessionOptions = {}): Promise<CreateSessionResult> {
163
169
  const resolved = resolveProfile(getConfig().models, options.profile)
164
- if (resolved.fellBackToDefault && options.profile !== undefined && options.profile !== 'default') {
165
- warnProfileFallbackOnce(options.profile, resolved.ref)
166
- }
167
- const { authStorage, modelRegistry } = getAuthFor(providerForModelRef(resolved.ref))
170
+ // Unknown profiles silently fall back to `default`. The fallback is by design
171
+ // (see `resolveProfile`) and surfacing a warning here just creates noise on
172
+ // every memory-logger / dreaming subagent spawn for advanced users who know
173
+ // exactly what they're doing.
174
+ // `refOverride` lets the model-fallback helper pin a specific entry from
175
+ // the chain when it recreates a session after the previous ref failed.
176
+ const activeRef: KnownModelRef = options.refOverride ?? resolved.ref
177
+ const { authStorage, modelRegistry } = getAuthFor(providerForModelRef(activeRef))
168
178
 
169
179
  const materializedSkills =
170
180
  options.plugins && options.plugins.registry.skills.length > 0
@@ -279,7 +289,7 @@ export async function createSessionWithDispose(options: CreateSessionOptions = {
279
289
  ? customToolsPreBudget.map((t) => wrapToolDefinitionWithBudget(t, sessionBudget, sessionBudgetState))
280
290
  : customToolsPreBudget
281
291
 
282
- const model = resolveModel(resolved.ref)
292
+ const model = resolveModel(activeRef)
283
293
  const { session } = await createAgentSession({
284
294
  model,
285
295
  sessionManager,
@@ -737,25 +747,3 @@ function resolveRoleContext(
737
747
  export function getBundledSkillsDir(): string {
738
748
  return join(dirname(fileURLToPath(import.meta.url)), '..', 'skills')
739
749
  }
740
-
741
- // Profile-fallback warning is fired once per (profile, ref) pair per process.
742
- // Without rate-limiting, every memory-logger spawn (~every idle event) would
743
- // emit a fresh warning when the user has only `default` configured — tens of
744
- // warnings per channel session is noise the operator will learn to ignore.
745
- // The pair includes `ref` so a config reload that changes `default` re-warns.
746
- const profileFallbackWarned = new Set<string>()
747
-
748
- function warnProfileFallbackOnce(profile: string, ref: string): void {
749
- const key = `${profile}\x00${ref}`
750
- if (profileFallbackWarned.has(key)) return
751
- profileFallbackWarned.add(key)
752
- console.warn(
753
- `[agent] unknown model profile "${profile}"; falling back to "default" (${ref}). Add it under \`models\` in typeclaw.json to remove this warning. (further occurrences suppressed)`,
754
- )
755
- }
756
-
757
- // Test-only: clear the rate-limit cache so a test can assert the warning fires
758
- // once after rate-limit reset.
759
- export function __resetProfileFallbackWarningsForTesting(): void {
760
- profileFallbackWarned.clear()
761
- }
@@ -0,0 +1,127 @@
1
+ import { resolveProfile } from '@/config'
2
+ import type { Models } from '@/config/config'
3
+ import type { KnownModelRef } from '@/config/providers'
4
+
5
+ import type { AgentSession } from './index'
6
+ import { subscribeProviderErrors } from './provider-error'
7
+
8
/**
 * Result of a single fallback-aware prompt run.
 *
 * Callers that keep using the session for subsequent turns store `session` /
 * `dispose` in their own state; callers that tear down per-turn (cron) just
 * call `dispose()` and discard.
 */
export type FallbackPromptResult = {
  /** `true` when the last entry of `attempts` finished the turn cleanly. */
  success: boolean
  /** The ref whose session ultimately handled the turn. */
  refUsed: KnownModelRef
  /**
   * Every ref that was tried, in order, with the failure reason for each
   * attempt that didn't make it through. Always has at least one entry.
   */
  attempts: FallbackAttempt[]
  /**
   * The session that handled the turn, or the session that attempted the
   * final entry on full-chain failure.
   */
  session: AgentSession
  /** Releases `session`. */
  dispose: () => Promise<void>
  /** When `success === false`, this is the error from the final attempt. */
  lastError?: Error
}
26
+
27
/** One entry in the attempt log produced by `promptWithFallback`. */
export type FallbackAttempt = {
  /** The model ref this attempt ran against. */
  ref: KnownModelRef
  /**
   * - `'hard'`: `session.prompt()` threw.
   * - `'soft'`: pi-coding-agent surfaced an upstream error via
   *   `stopReason: 'error'` on the final assistant message.
   * - `'success'`: the turn finished cleanly.
   */
  outcome: 'hard' | 'soft' | 'success'
  /** Message of the error behind a `'hard'` or `'soft'` outcome. */
  errorMessage?: string
}
35
+
36
/**
 * Returns the ordered list of refs to attempt for `profile`, as resolved by
 * `resolveProfile`.
 *
 * A single-ref profile yields a length-1 chain, in which case the fallback
 * path is a no-op in practice: the first attempt either succeeds or its error
 * propagates.
 *
 * Exported so callers can inspect the chain (logs, telemetry) before firing
 * the prompt, e.g. `[cron] ${jobId}: trying chain a → b → c`.
 *
 * @param models - The `models` section of the loaded configuration.
 * @param profile - Profile name to resolve; may be `undefined`.
 * @returns The refs to try, in order.
 */
export function resolveFallbackChain(models: Models, profile: string | undefined): KnownModelRef[] {
  const { refs } = resolveProfile(models, profile)
  return refs
}
45
+
46
/**
 * Drives one `session.prompt(text)` call with full fallback semantics.
 *
 * For each ref in `opts.refs`, in order:
 *  1. Create a session bound to that ref via `createSessionForRef`.
 *  2. Subscribe to provider-error events so soft errors (pi-coding-agent's
 *     `stopReason: 'error'` shape) trigger fallback in addition to throws.
 *  3. Await `session.prompt(text)`.
 *  4. If the prompt threw OR a soft error fired during the turn, dispose the
 *     failed session and advance to the next ref, if there is one.
 *
 * The final failure is intentionally NOT swallowed: once every ref has been
 * exhausted the result carries `success: false` plus `lastError`, so the
 * caller can surface the failure however it already does.
 *
 * @param opts.refs - Non-empty, ordered chain of refs to try.
 * @param opts.text - Prompt text sent to each attempted session.
 * @param opts.createSessionForRef - Factory for a session pinned to one ref.
 *   An error thrown by the factory propagates to the caller unchanged; it is
 *   not treated as a failed attempt.
 * @param opts.onAttemptFailed - Called after each non-final failed attempt so
 *   callers can log it with their own context (sessionId, channel key, job id).
 * @returns The session that handled the turn together with its `dispose`. On
 *   full-chain failure the returned session has already been disposed and the
 *   returned `dispose` is a no-op.
 * @throws Error when `opts.refs` is empty, and rethrows anything raised by
 *   `onAttemptFailed`, the unsubscribe callback, or `dispose`.
 */
export async function promptWithFallback(opts: {
  refs: KnownModelRef[]
  text: string
  createSessionForRef: (ref: KnownModelRef) => Promise<{ session: AgentSession; dispose: () => Promise<void> }>
  onAttemptFailed?: (attempt: FallbackAttempt) => void
}): Promise<FallbackPromptResult> {
  if (opts.refs.length === 0) {
    throw new Error('promptWithFallback: refs[] must be non-empty')
  }
  const attempts: FallbackAttempt[] = []
  for (let i = 0; i < opts.refs.length; i++) {
    const ref = opts.refs[i]!
    const isLast = i === opts.refs.length - 1
    const { session, dispose } = await opts.createSessionForRef(ref)

    // Capture the first soft error per attempt. The listener fires
    // synchronously off the `message_end` event, which lands BEFORE
    // `session.prompt()` resolves, so `softError` is already populated by the
    // time the `await` below returns.
    let softError: Error | undefined
    const unsub = subscribeProviderErrors(session, (err) => {
      if (softError === undefined) softError = new Error(err.message)
    })

    // Run-once teardown. Previously a rejecting `dispose()` in a failure
    // branch was caught by the outer handler, which then disposed the same
    // session a second time and could replace the original error with the
    // second rejection.
    let tornDown = false
    const teardown = async (): Promise<void> => {
      if (tornDown) return
      tornDown = true
      try {
        unsub()
      } finally {
        await dispose()
      }
    }

    try {
      let failure: { outcome: 'hard' | 'soft'; error: Error } | undefined
      try {
        await session.prompt(opts.text)
        if (softError !== undefined) failure = { outcome: 'soft', error: softError }
      } catch (err) {
        failure = { outcome: 'hard', error: err instanceof Error ? err : new Error(String(err)) }
      }

      if (failure === undefined) {
        attempts.push({ ref, outcome: 'success' })
        unsub()
        return { success: true, refUsed: ref, attempts, session, dispose }
      }

      const attempt: FallbackAttempt = { ref, outcome: failure.outcome, errorMessage: failure.error.message }
      attempts.push(attempt)
      if (!isLast) opts.onAttemptFailed?.(attempt)
      await teardown()
      if (isLast) {
        // The session is already disposed; hand back a no-op so a caller that
        // unconditionally calls `dispose()` does not dispose it twice.
        return { success: false, refUsed: ref, attempts, session, dispose: async () => {}, lastError: failure.error }
      }
    } catch (err) {
      await teardown()
      throw err
    }
  }
  throw new Error('promptWithFallback: unreachable — loop terminated without returning')
}
@@ -0,0 +1,300 @@
1
+ // Shared curl-impersonate spawn primitive.
2
+ //
3
+ // Why this exists: by 2026, every non-trivial public site (DDG, Reuters via
4
+ // Akamai, MarketWatch via Cloudflare, etc.) fingerprints incoming traffic at
5
+ // the TLS handshake (JA3/JA4) and HTTP/2 SETTINGS frame BEFORE any HTTP header
6
+ // is read. Bun's native fetch cannot match Chrome's handshake (upstream issue
7
+ // #11368), so outbound requests get gated by anomaly checks regardless of
8
+ // headers, body shape, or pacing. The fix is to shell out to curl-impersonate
9
+ // (lexiforest fork), which replays Chrome's exact TLS handshake, HTTP/2
10
+ // settings, and header ordering. Pinned by the typeclaw Dockerfile at
11
+ // /usr/local/bin/curl_chrome136 — see src/init/dockerfile.ts for the version
12
+ // and SHA pin.
13
+ //
14
+ // AGENTS.md explicitly warns against adding `-H` overrides because the
15
+ // curl_chrome wrapper already sends the full Chrome header set (correct
16
+ // ordering, sec-ch-ua, sec-fetch-*, accept-encoding, etc.) and any custom
17
+ // header corrupts the impersonation. We therefore expose NO header-override
18
+ // surface from this primitive; add one only when a real caller needs it AND
19
+ // the override is something curl_chrome can't be told to send another way.
20
+
21
+ import { randomBytes } from 'node:crypto'
22
+
23
+ import { spawn } from 'bun'
24
+
25
+ export const CURL_IMPERSONATE_BINARY = 'curl_chrome136'
26
+ export const DEFAULT_TIMEOUT_SECONDS = 30
27
+
28
+ let curlBinary: string = CURL_IMPERSONATE_BINARY
29
+
30
/**
 * Test-only seam: lets *.test.ts point the spawn at a fake `curl_chrome136`
 * script in a tmpdir, so the real Bun.spawn path is exercised without needing
 * a curl-impersonate install on the test host. Production code never calls
 * this and therefore always sees the module-level default.
 *
 * @param binary - Replacement binary name/path, or `null` to restore the default.
 */
export function _setCurlBinaryForTest(binary: string | null): void {
  if (binary == null) {
    curlBinary = CURL_IMPERSONATE_BINARY
  } else {
    curlBinary = binary
  }
}
37
+
38
/** Input accepted by `curlImpersonate`. */
export type CurlImpersonateRequest = {
  /** Target URL; passed to curl after a `--` option terminator. */
  url: string
  /** HTTP method; defaults to `'GET'`. */
  method?: 'GET' | 'POST'
  /**
   * Form-urlencoded body fields for POST. Each entry is passed as a separate
   * `--data-urlencode` argument so curl handles the encoding. Required if
   * `method` is `'POST'` and you want a body.
   */
  formFields?: Array<{ name: string; value: string }>
  /**
   * Hard cap on bytes accepted from the response (passed as `--max-filesize`).
   * The actual buffer is still bounded by the caller; this just makes curl
   * bail early instead of streaming gigabytes.
   */
  maxBytes?: number
  /** Value for `--max-time`; defaults to `DEFAULT_TIMEOUT_SECONDS`. */
  timeoutSeconds?: number
  /** Aborting this signal kills the spawned process group. */
  signal?: AbortSignal
}
52
+
53
/** Parsed result of one `curlImpersonate` call. */
export type CurlImpersonateResponse = {
  /** Response body, decoded as UTF-8 (non-fatal). */
  body: string
  /** curl's `%{url_effective}` write-out value. */
  finalUrl: string
  /** curl's `%{http_code}` write-out value; `0` when it could not be parsed. */
  httpStatus: number
  /** curl's `%{content_type}` write-out value, trimmed and lower-cased. */
  contentType: string
  /** curl's `%{size_download}`, or the body's byte length when that is zero or unparseable. */
  bytesIn: number
}
60
+
61
+ // Specific curl exit codes we map to typed errors. The full list is in
62
+ // `man curl` § "EXIT CODES"; these are the only ones we translate at the
63
+ // primitive layer. Everything else surfaces as a generic CurlImpersonateError
64
+ // with stderr attached for caller-side diagnostics.
65
+ export const CURL_EXIT_TIMEOUT = 28
66
+ export const CURL_EXIT_MAX_FILESIZE_PRECHECK = 63
67
+ // Observed empirically (and corroborated by Oracle review): curl returns
68
+ // exit 56 with stderr `Exceeded the maximum allowed file size (...)` when
69
+ // --max-filesize is hit at TRANSFER time (e.g. server omitted Content-Length
70
+ // and curl discovered the overflow mid-stream). The Linux man page lists 56
71
+ // as the more general "Failure in receiving network data," so we additionally
72
+ // gate on a stderr match to avoid mis-classifying real network drops as
73
+ // size-exceeded.
74
+ export const CURL_EXIT_RECV_FAILURE_OR_FILESIZE = 56
75
+
76
/**
 * Error raised by the curl-impersonate primitive. Carries the child's exit
 * code (or `null` when none applies) and its captured stderr so callers can
 * classify the failure.
 */
export class CurlImpersonateError extends Error {
  public readonly exitCode: number | null
  public readonly stderr: string

  constructor(message: string, exitCode: number | null, stderr: string) {
    super(message)
    this.exitCode = exitCode
    this.stderr = stderr
    this.name = 'CurlImpersonateError'
  }
}
86
+
87
/**
 * Reports whether `error` means curl's `--max-filesize` limit was exceeded.
 *
 * The pre-check exit code is matched directly. Exit 56 is only counted when
 * stderr also mentions the maximum file size, so other receive failures that
 * share that code are not misclassified.
 */
export function isCurlExitFilesizeExceeded(error: CurlImpersonateError): boolean {
  const { exitCode, stderr } = error
  const rejectedUpFront = exitCode === CURL_EXIT_MAX_FILESIZE_PRECHECK
  const overflowedMidTransfer =
    exitCode === CURL_EXIT_RECV_FAILURE_OR_FILESIZE && /maximum.{0,30}file size/i.test(stderr)
  return rejectedUpFront || overflowedMidTransfer
}
94
+
95
/** Reports whether `error` carries the curl exit code mapped to a timeout. */
export function isCurlExitTimeout(error: CurlImpersonateError): boolean {
  const { exitCode } = error
  return exitCode === CURL_EXIT_TIMEOUT
}
98
+
99
/**
 * Performs one HTTP(S) request by spawning the curl-impersonate wrapper and
 * parsing its output into a `CurlImpersonateResponse`.
 *
 * curl is run WITHOUT `--fail-with-body`, so a >=400 response with a body
 * still exits 0; callers inspect `httpStatus` themselves.
 *
 * @param req - URL, method, optional form fields, size cap, timeout and abort signal.
 * @returns The decoded body plus the metadata curl reported via `-w`.
 * @throws CurlImpersonateError with message `'aborted'` when `req.signal` is
 *   aborted (before or during the request), with the exit code and stderr when
 *   curl exits non-zero, or when the metadata block is missing from stdout.
 */
export async function curlImpersonate(req: CurlImpersonateRequest): Promise<CurlImpersonateResponse> {
  // A signal that is ALREADY aborted never dispatches another 'abort' event,
  // so the listener registered below would never run and the child would live
  // until it finished or hit --max-time. Fail fast instead of spawning.
  if (req.signal?.aborted) {
    throw new CurlImpersonateError('aborted', null, '')
  }

  const timeoutSeconds = req.timeoutSeconds ?? DEFAULT_TIMEOUT_SECONDS
  const method = req.method ?? 'GET'

  // Per-request random sentinel separating the body from the `-w` metadata.
  // Response bodies are attacker-controlled, so a fixed, public marker could
  // be embedded in a page together with fabricated metadata. Randomness makes
  // the marker unpredictable; the parser additionally anchors on the LAST
  // occurrence because curl writes `-w` output after the body.
  const sentinel = generateSentinel()
  const writeOutTemplate = `${sentinel}%{http_code}\n%{url_effective}\n%{content_type}\n%{size_download}\n`

  const cmd: string[] = [
    curlBinary,
    // `--disable` (alias -q) MUST be the first argument to suppress reading
    // ~/.curlrc and /etc/curlrc; curl ignores it after any other flag. Without
    // it a curlrc could inject --proxy, --header, --resolve, etc., subverting
    // both the impersonation and the protocol restrictions below.
    '--disable',
    '--silent',
    '--show-error',
    // Protocol allowlist. `=http,https` means "ONLY these two" rather than
    // "add these to the defaults". --proto-redir governs the redirect chain,
    // so a 301/302 to ftp://... is refused instead of silently followed.
    '--proto',
    '=http,https',
    '--proto-redir',
    '=http,https',
    // `--fail-with-body` is intentionally NOT passed: callers (webfetch, ddg)
    // want to inspect httpStatus themselves and decide.
    '--compressed',
    '--location',
    '--max-redirs',
    '10',
    '--max-time',
    String(timeoutSeconds),
    '-w',
    writeOutTemplate,
    '-X',
    method,
  ]

  if (req.maxBytes !== undefined) {
    cmd.push('--max-filesize', String(req.maxBytes))
  }

  if (req.formFields) {
    for (const field of req.formFields) {
      cmd.push('--data-urlencode', `${field.name}=${field.value}`)
    }
  }

  // `--` terminates option parsing so a URL beginning with `-` cannot be
  // reinterpreted as a curl option.
  cmd.push('--', req.url)

  // Spawn detached so the child becomes the leader of its own process group.
  // The curl-impersonate wrappers are bash scripts that call the real binary
  // WITHOUT `exec`, so the wrapper is the parent and curl is its child. A
  // plain SIGKILL to the wrapper PID would orphan the curl child, which keeps
  // the stdout pipe open until --max-time fires. Signalling the negative PID
  // kills the whole group; `detached: true` is what makes -pid well-defined.
  const proc = spawn({
    cmd,
    stdout: 'pipe',
    stderr: 'pipe',
    detached: true,
  })

  const onAbort = () => {
    try {
      process.kill(-proc.pid, 'SIGKILL')
    } catch {
      // Group kill failed; fall back to killing the wrapper itself.
      proc.kill('SIGKILL')
    }
  }
  req.signal?.addEventListener('abort', onAbort, { once: true })

  try {
    const [stdoutBuf, stderr, exitCode] = await Promise.all([
      new Response(proc.stdout).arrayBuffer(),
      new Response(proc.stderr).text(),
      proc.exited,
    ])

    // Checked before the exit code so an abort is reported as an abort rather
    // than as whatever exit status the killed child produced.
    if (req.signal?.aborted) {
      throw new CurlImpersonateError('aborted', exitCode, stderr)
    }

    if (exitCode !== 0) {
      const detail = stderr.trim() || 'no stderr'
      throw new CurlImpersonateError(`curl-impersonate exited ${exitCode}: ${detail}`, exitCode, stderr)
    }

    return parseCurlOutput(stdoutBuf, sentinel, stderr)
  } finally {
    req.signal?.removeEventListener('abort', onAbort)
  }
}
216
+
217
/**
 * Builds the per-request sentinel `\n--TYPECLAW-CURL-META-<hex>--\n`.
 *
 * `<hex>` is 24 hex characters (12 random bytes, 96 bits of entropy), so an
 * attacker-controlled response body cannot predict the marker. The value is
 * ASCII-only with leading/trailing newlines and contains no NUL bytes (Bun's
 * spawn rejects NULs in argv).
 */
function generateSentinel(): string {
  const token = randomBytes(12).toString('hex')
  return ['\n--TYPECLAW-CURL-META-', token, '--\n'].join('')
}
226
+
227
/**
 * Splits curl's stdout into the response body and the `-w` metadata block.
 *
 * The split point is the LAST occurrence of `sentinel`: curl writes the `-w`
 * output strictly after the body, so the genuine metadata block is always the
 * trailing one even if the body happens to contain the marker.
 *
 * @param buf - Raw stdout bytes from the curl process.
 * @param sentinel - The marker that was placed at the start of the `-w` template.
 * @param stderr - Captured stderr, attached to the error for diagnostics.
 * @returns Body and metadata; numeric fields that fail to parse fall back to
 *   `0` (status) or the body's byte length (bytesIn).
 * @throws CurlImpersonateError when the sentinel is absent from `buf`.
 */
function parseCurlOutput(buf: ArrayBuffer, sentinel: string, stderr: string): CurlImpersonateResponse {
  const decodeUtf8 = (chunk: Uint8Array): string => new TextDecoder('utf-8', { fatal: false }).decode(chunk)

  const raw = new Uint8Array(buf)
  const marker = new TextEncoder().encode(sentinel)
  const markerAt = lastIndexOfBytes(raw, marker)
  if (markerAt < 0) {
    throw new CurlImpersonateError(
      'curl-impersonate produced no metadata block (sentinel missing). Wrapper or output corruption suspected.',
      0,
      stderr,
    )
  }

  const payload = raw.subarray(0, markerAt)
  const trailer = raw.subarray(markerAt + marker.byteLength)

  // Metadata lines follow the order of the write-out template:
  // http_code, url_effective, content_type, size_download.
  const [statusLine, urlLine, typeLine, sizeLine] = decodeUtf8(trailer).split('\n')

  return {
    body: decodeUtf8(payload),
    finalUrl: (urlLine ?? '').trim(),
    httpStatus: Number(statusLine?.trim() ?? '0') || 0,
    contentType: (typeLine ?? '').trim().toLowerCase(),
    bytesIn: Number(sizeLine?.trim() ?? '0') || payload.byteLength,
  }
}
262
+
263
/**
 * Finds the last position at which `needle` occurs in `haystack`.
 *
 * @returns The start index of the final match, `haystack.byteLength` for an
 *   empty needle, or `-1` when there is no match.
 */
function lastIndexOfBytes(haystack: Uint8Array, needle: Uint8Array): number {
  const needleLength = needle.byteLength
  if (needleLength === 0) return haystack.byteLength

  // Scan candidate start positions from the right so the first hit is the last match.
  let start = haystack.byteLength - needleLength
  while (start >= 0) {
    let offset = 0
    while (offset < needleLength && haystack[start + offset] === needle[offset]) {
      offset++
    }
    if (offset === needleLength) return start
    start--
  }
  return -1
}
277
+
278
+ // Detect whether curl-impersonate is available on PATH. Used by fetch.ts to
279
+ // decide between the impersonating transport (production: container has the
280
+ // binary pinned in the image) and a Bun.fetch fallback (test/dev: no binary
281
+ // installed). The check is best-effort and cheap — we spawn `--version`
282
+ // and look at exit code. Cached per-process: the binary doesn't appear or
283
+ // disappear at runtime.
284
+ let availabilityCache: boolean | undefined
285
+
286
/**
 * Best-effort check for a usable curl-impersonate binary: spawns the
 * configured binary with `--version` and treats exit code 0 as "available".
 * A spawn failure counts as "not available". The answer is cached in
 * `availabilityCache` for the rest of the process.
 */
export async function isCurlImpersonateAvailable(): Promise<boolean> {
  if (availabilityCache === undefined) {
    let found = false
    try {
      const probe = spawn({ cmd: [curlBinary, '--version'], stdout: 'ignore', stderr: 'ignore' })
      found = (await probe.exited) === 0
    } catch {
      found = false
    }
    availabilityCache = found
  }
  return availabilityCache
}
297
+
298
/**
 * Test-only: forgets the cached availability result so the next
 * `isCurlImpersonateAvailable()` call probes the binary again.
 */
export function _resetAvailabilityCacheForTest(): void {
  availabilityCache = undefined
}
@@ -7,40 +7,16 @@
7
7
  // single bad fingerprint match. `lite` exists for non-browser clients (text
8
8
  // browsers, accessibility tools) and historically gates less aggressively —
9
9
  // but as of 2026 it ALSO fingerprints at the TLS layer (JA3/JA4) and the
10
- // HTTP/2 SETTINGS frame, well before any HTTP header is read. Bun's native
11
- // fetch cannot match Chrome's handshake (upstream issue #11368), so requests
12
- // from `fetch()` get gated regardless of headers, body shape, or pacing
13
- // confirmed empirically over a multi-hour session against a single home IP
14
- // where real Chromium succeeded continuously while every fetch variant got
15
- // 202 anomaly-modal or HTTP-200-with-anomaly responses.
16
- //
17
- // The fix is to shell out to `curl-impersonate` (lexiforest fork), which
18
- // replays Chrome's exact TLS handshake + HTTP/2 settings + header ordering.
19
- // The binary is installed by the typeclaw Dockerfile (see
20
- // src/init/dockerfile.ts CURL_IMPERSONATE_* constants) at /usr/local/bin/
21
- // and invoked via the version-pinned wrapper `curl_chrome136`.
22
- //
23
- // Why no `-H` overrides: curl_chrome136 already sends the full Chrome 136
24
- // header set with correct ordering, sec-ch-ua values, etc. Adding our own
25
- // headers would corrupt the impersonation. The previous code's
26
- // BROWSER_HEADERS const has been removed for the same reason.
10
+ // HTTP/2 SETTINGS frame, well before any HTTP header is read. The shared
11
+ // curl-impersonate primitive (./curl-impersonate.ts) replays Chrome's exact
12
+ // TLS handshake + HTTP/2 settings + header ordering. See that file's header
13
+ // for the full rationale and AGENTS.md §"Web search" for the original story.
27
14
 
28
- import { spawn } from 'bun'
29
-
30
- const DDG_LITE_URL = 'https://lite.duckduckgo.com/lite/'
31
- const CURL_IMPERSONATE_BINARY = 'curl_chrome136'
32
- const REQUEST_TIMEOUT_SECONDS = 30
15
+ import { curlImpersonate } from './curl-impersonate'
33
16
 
34
- let curlBinary = CURL_IMPERSONATE_BINARY
17
+ export { _setCurlBinaryForTest } from './curl-impersonate'
35
18
 
36
- // Test-only seam: lets ddg.test.ts and websearch.test.ts point the spawn
37
- // at a fake `curl_chrome136` script in a tmpdir so we exercise the real
38
- // Bun.spawn path without depending on a curl-impersonate install on the
39
- // test host. Production code never calls this — the const-import default
40
- // above is what production sees.
41
- export function _setCurlBinaryForTest(binary: string | null): void {
42
- curlBinary = binary ?? CURL_IMPERSONATE_BINARY
43
- }
19
+ const DDG_LITE_URL = 'https://lite.duckduckgo.com/lite/'
44
20
 
45
21
  export type DdgResult = {
46
22
  title: string
@@ -64,64 +40,13 @@ export class DdgCaptchaError extends Error {
64
40
  }
65
41
 
66
42
  export async function fetchDdgHtml(query: string, signal?: AbortSignal): Promise<string> {
67
- // Spawn detached so the child becomes the leader of its own process group.
68
- // The curl-impersonate wrappers (curl_chrome136 et al.) are bash scripts
69
- // that call the real curl-impersonate binary WITHOUT `exec` — meaning the
70
- // wrapper is the parent and curl-impersonate is its child. On a plain
71
- // SIGKILL to the wrapper PID, the curl child becomes orphaned and keeps
72
- // the stdout pipe open until --max-time fires (30s default), turning a
73
- // 50ms abort into a 30s hang. process.kill(-pid) addresses the negative
74
- // PID, which signals the entire process group, killing both the wrapper
75
- // and the inner curl atomically. detached: true is what makes the child
76
- // the pgid leader so -pid is well-defined; without it, the child shares
77
- // our pgid and we'd nuke our own process.
78
- const proc = spawn({
79
- cmd: [
80
- curlBinary,
81
- '--silent',
82
- '--show-error',
83
- '--fail-with-body',
84
- '--compressed',
85
- '--max-time',
86
- String(REQUEST_TIMEOUT_SECONDS),
87
- '-X',
88
- 'POST',
89
- '--data-urlencode',
90
- `q=${query}`,
91
- DDG_LITE_URL,
92
- ],
93
- stdout: 'pipe',
94
- stderr: 'pipe',
95
- detached: true,
43
+ const response = await curlImpersonate({
44
+ url: DDG_LITE_URL,
45
+ method: 'POST',
46
+ formFields: [{ name: 'q', value: query }],
47
+ signal,
96
48
  })
97
-
98
- const onAbort = () => {
99
- try {
100
- process.kill(-proc.pid, 'SIGKILL')
101
- } catch {
102
- proc.kill('SIGKILL')
103
- }
104
- }
105
- signal?.addEventListener('abort', onAbort, { once: true })
106
-
107
- try {
108
- const [stdout, stderr, exitCode] = await Promise.all([
109
- new Response(proc.stdout).text(),
110
- new Response(proc.stderr).text(),
111
- proc.exited,
112
- ])
113
-
114
- if (signal?.aborted) {
115
- throw new Error('aborted')
116
- }
117
- if (exitCode !== 0) {
118
- const detail = stderr.trim() || 'no stderr'
119
- throw new Error(`curl-impersonate exited ${exitCode}: ${detail}`)
120
- }
121
- return stdout
122
- } finally {
123
- signal?.removeEventListener('abort', onAbort)
124
- }
49
+ return response.body
125
50
  }
126
51
 
127
52
  // The `lite` endpoint's CAPTCHA page is plainer than `html`'s anomaly-modal: