imprint-mcp 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -201
- package/examples/discoverandgo/README.md +1 -1
- package/examples/echo/README.md +1 -1
- package/examples/google-flights/README.md +28 -0
- package/examples/google-flights/_shared/batchexecute.ts +63 -0
- package/examples/google-flights/_shared/flights_request.ts +95 -0
- package/examples/google-flights/_shared/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
- package/examples/google-flights/get_flight_booking_details/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
- package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
- package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
- package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
- package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
- package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
- package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
- package/examples/google-flights/lookup_airport/index.ts +101 -0
- package/examples/google-flights/lookup_airport/package.json +9 -0
- package/examples/google-flights/lookup_airport/parser.ts +66 -0
- package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
- package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
- package/examples/google-flights/lookup_airport/workflow.json +57 -0
- package/examples/google-flights/search_flights/index.ts +219 -0
- package/examples/google-flights/search_flights/package.json +9 -0
- package/examples/google-flights/search_flights/parser.ts +169 -0
- package/examples/google-flights/search_flights/playbook.yaml +184 -0
- package/examples/google-flights/search_flights/request-transform.ts +119 -0
- package/examples/google-flights/search_flights/workflow.json +143 -0
- package/examples/google-hotels/README.md +29 -0
- package/examples/google-hotels/_shared/batchexecute.ts +73 -0
- package/examples/google-hotels/_shared/freq.ts +158 -0
- package/examples/google-hotels/_shared/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
- package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
- package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
- package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
- package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
- package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
- package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
- package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
- package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
- package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
- package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
- package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
- package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
- package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
- package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
- package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
- package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
- package/examples/google-hotels/search_hotels/index.ts +207 -0
- package/examples/google-hotels/search_hotels/package.json +9 -0
- package/examples/google-hotels/search_hotels/parser.ts +260 -0
- package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
- package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
- package/examples/google-hotels/search_hotels/workflow.json +127 -0
- package/package.json +3 -2
- package/prompts/audit-agent.md +71 -0
- package/prompts/build-planning.md +74 -0
- package/prompts/compile-agent.md +132 -28
- package/prompts/prereq-builder.md +64 -0
- package/prompts/prereq-planner.md +34 -0
- package/prompts/tool-planning.md +39 -0
- package/src/cli.ts +111 -4
- package/src/imprint/agent.ts +5 -0
- package/src/imprint/audit.ts +996 -0
- package/src/imprint/backend-ladder.ts +1214 -184
- package/src/imprint/build-plan.ts +1051 -0
- package/src/imprint/cdp-browser-fetch.ts +589 -0
- package/src/imprint/cdp-jar-cache.ts +320 -0
- package/src/imprint/chromium.ts +135 -0
- package/src/imprint/claude-cli-compile.ts +125 -25
- package/src/imprint/codex-cli-compile.ts +26 -23
- package/src/imprint/compile-agent-types.ts +38 -0
- package/src/imprint/compile-agent.ts +65 -27
- package/src/imprint/compile-tools.ts +1656 -64
- package/src/imprint/compile.ts +14 -2
- package/src/imprint/concurrency.ts +87 -0
- package/src/imprint/credential-extract.ts +174 -25
- package/src/imprint/cron.ts +1 -0
- package/src/imprint/doctor.ts +39 -0
- package/src/imprint/emit.ts +85 -0
- package/src/imprint/freeform-redact.ts +5 -4
- package/src/imprint/integrations.ts +2 -2
- package/src/imprint/llm.ts +56 -8
- package/src/imprint/mcp-compile-server.ts +43 -10
- package/src/imprint/mcp-maintenance.ts +9 -101
- package/src/imprint/mcp-server.ts +73 -7
- package/src/imprint/multi-progress.ts +7 -2
- package/src/imprint/param-grounding.ts +367 -0
- package/src/imprint/paths.ts +29 -0
- package/src/imprint/playbook-runner.ts +101 -40
- package/src/imprint/prereq-builder.ts +651 -0
- package/src/imprint/probe-backends.ts +6 -3
- package/src/imprint/record.ts +10 -1
- package/src/imprint/redact.ts +30 -2
- package/src/imprint/replay-capture.ts +19 -18
- package/src/imprint/runtime.ts +19 -10
- package/src/imprint/sensitive-keys.ts +141 -7
- package/src/imprint/session-diff.ts +79 -2
- package/src/imprint/session-merge.ts +9 -5
- package/src/imprint/stealth-chromium.ts +81 -0
- package/src/imprint/stealth-fetch.ts +309 -29
- package/src/imprint/stealth-token-cache.ts +88 -0
- package/src/imprint/teach-plan.ts +251 -0
- package/src/imprint/teach-state.ts +17 -0
- package/src/imprint/teach.ts +582 -147
- package/src/imprint/tool-candidates.ts +72 -14
- package/src/imprint/tool-plan.ts +313 -0
- package/src/imprint/tracing.ts +135 -6
- package/src/imprint/types.ts +61 -3
- package/examples/google-flights/search_google_flights/index.ts +0 -101
- package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
- package/examples/google-flights/search_google_flights/parser.ts +0 -189
- package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
- package/examples/google-flights/search_google_flights/workflow.json +0 -48
- package/examples/google-hotels/search_google_hotels/index.ts +0 -194
- package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
- package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
- package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
- package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
package/src/imprint/compile.ts
CHANGED
|
@@ -19,6 +19,7 @@ import {
|
|
|
19
19
|
import { dirname, join as pathJoin } from 'node:path';
|
|
20
20
|
import type { OnDeadlineReached } from './agent.ts';
|
|
21
21
|
import { inferAppApiHosts } from './app-api-hosts.ts';
|
|
22
|
+
import type { SharedModuleManifestEntry } from './build-plan.ts';
|
|
22
23
|
import { type CompileAgentProgress, compileAgent } from './compile-agent.ts';
|
|
23
24
|
import { isSameRegistrableDomain, registrableDomain } from './etld.ts';
|
|
24
25
|
import { type LLMOptions, extractJsonArray, resolveProvider } from './llm.ts';
|
|
@@ -82,6 +83,14 @@ interface GenerateOptions extends CompileOptions {
|
|
|
82
83
|
classifications?: ClassifiedValue[];
|
|
83
84
|
/** Credential values extracted during teach, passed to integration tests via env var. */
|
|
84
85
|
teachCredentials?: { site: string; values: Record<string, string> };
|
|
86
|
+
/** Absolute path to the multi-tool build plan sidecar (.build-plan.json). */
|
|
87
|
+
buildPlanPath?: string;
|
|
88
|
+
/** Shared-module build manifest for this site (verified flags). */
|
|
89
|
+
sharedModules?: SharedModuleManifestEntry[];
|
|
90
|
+
/** Per-tool implementation plan (param→field mapping, request construction,
|
|
91
|
+
* response parsing, shared-module imports). Injected into the agent's initial
|
|
92
|
+
* message so the compile follows it. */
|
|
93
|
+
toolPlan?: string;
|
|
85
94
|
}
|
|
86
95
|
|
|
87
96
|
interface GenerateResult {
|
|
@@ -122,6 +131,9 @@ export async function generate(opts: GenerateOptions): Promise<GenerateResult> {
|
|
|
122
131
|
sharedContext: opts.sharedContext,
|
|
123
132
|
classifications: opts.classifications,
|
|
124
133
|
teachCredentials: opts.teachCredentials,
|
|
134
|
+
buildPlanPath: opts.buildPlanPath,
|
|
135
|
+
sharedModules: opts.sharedModules,
|
|
136
|
+
toolPlan: opts.toolPlan,
|
|
125
137
|
});
|
|
126
138
|
|
|
127
139
|
setSpanAttributes(span, {
|
|
@@ -145,7 +157,7 @@ export async function generate(opts: GenerateOptions): Promise<GenerateResult> {
|
|
|
145
157
|
];
|
|
146
158
|
if (result.outcome === 'timeout') {
|
|
147
159
|
lines.push(
|
|
148
|
-
'hint: increase the timeout with --timeout (teach) or --max-duration (generate)',
|
|
160
|
+
'hint: most complex tools take 10-15 minutes. increase the timeout with --timeout (teach) or --max-duration (generate)',
|
|
149
161
|
);
|
|
150
162
|
}
|
|
151
163
|
throw new Error(lines.join('\n'));
|
|
@@ -274,7 +286,7 @@ const TRIAGE_RESOURCE_TYPES = new Set(['XHR', 'Fetch', 'Document']);
|
|
|
274
286
|
const HEADER_TRUNCATE_LIMIT = 200;
|
|
275
287
|
// Per-request body cap for triage. Triage only needs enough body to distinguish
|
|
276
288
|
// data-bearing POSTs (search/booking) from telemetry; full bodies on a busy
|
|
277
|
-
// site can total >1MB and blow the 200K-token cap on `claude-opus-4-
|
|
289
|
+
// site can total >1MB and blow the 200K-token cap on `claude-opus-4-8`.
|
|
278
290
|
const TRIAGE_BODY_LIMIT = 500;
|
|
279
291
|
|
|
280
292
|
export interface TriageResult {
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bounded-concurrency fan-out helpers shared across the teach pipeline.
|
|
3
|
+
*
|
|
4
|
+
* Lives in its own module (rather than teach.ts) so leaf modules like
|
|
5
|
+
* teach-plan.ts can reuse it without importing teach.ts, which would create an
|
|
6
|
+
* import cycle (teach.ts → teach-plan.ts → teach.ts). teach.ts re-exports both
|
|
7
|
+
* for backwards compatibility with existing callers + tests.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
 * Run `fn` over `items` with at most `concurrency` calls in flight, preserving
 * input order in the result.
 *
 * @param items Inputs to process; every index is visited, including entries
 *   whose value is `undefined`.
 * @param concurrency Maximum number of simultaneous `fn` calls. Values below 1
 *   (or NaN) are clamped to 1 so the work still runs serially instead of being
 *   silently dropped; fractional values are floored.
 * @param fn Async mapper invoked once per item.
 * @returns Results in the same order as `items`.
 * @throws The first error raised by `fn`. No new items are started after a
 *   failure, and the error is rethrown only once in-flight calls have settled.
 *   Use mapLimitSettled when you need per-item success/failure.
 */
export async function mapLimit<T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T) => Promise<R>,
): Promise<R[]> {
  const results = new Array<R>(items.length);
  // Array.from({ length }) coerces 0 / negatives / NaN to zero workers, which
  // would return an untouched sparse array. Clamp to at least one worker.
  const limit = Number.isNaN(concurrency) ? 1 : Math.max(1, Math.floor(concurrency));
  let next = 0;
  // Track failure with a flag rather than `firstError !== undefined`, so a
  // rejection whose reason is literally `undefined` still propagates.
  let failed = false;
  let firstError: unknown;
  const workers = Array.from({ length: Math.min(limit, items.length) }, async () => {
    while (next < items.length && !failed) {
      const index = next++;
      try {
        // `index < items.length` is guaranteed by the loop condition; the cast
        // only satisfies noUncheckedIndexedAccess-style typing.
        results[index] = await fn(items[index] as T);
      } catch (err) {
        if (!failed) {
          failed = true;
          firstError = err;
        }
      }
    }
  });
  // Workers never reject (errors are captured above); this just waits for
  // in-flight calls to finish before reporting.
  await Promise.allSettled(workers);
  if (failed) throw firstError;
  return results;
}
|
|
37
|
+
|
|
38
|
+
type SettledResult<R> = { ok: true; value: R } | { ok: false; error: unknown };
|
|
39
|
+
|
|
40
|
+
/**
 * Like mapLimit, but never throws: each item resolves to a tagged
 * success/failure entry, preserving input order.
 *
 * @param items Inputs to process; every index is visited, including entries
 *   whose value is `undefined`, so the result never contains holes.
 * @param concurrency Maximum number of simultaneous `fn` calls. Values below 1
 *   (or NaN) are clamped to 1; fractional values are floored.
 * @param fn Async mapper invoked once per item.
 * @returns One `{ ok: true, value }` or `{ ok: false, error }` entry per item.
 */
export async function mapLimitSettled<T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T) => Promise<R>,
): Promise<SettledResult<R>[]> {
  const results = new Array<SettledResult<R>>(items.length);
  // Array.from({ length }) coerces 0 / negatives / NaN to zero workers, which
  // would hand callers an array of holes. Clamp to at least one worker.
  const limit = Number.isNaN(concurrency) ? 1 : Math.max(1, Math.floor(concurrency));
  let next = 0;
  const workers = Array.from({ length: Math.min(limit, items.length) }, async () => {
    while (next < items.length) {
      const index = next++;
      try {
        // `index < items.length` is guaranteed by the loop condition; the cast
        // only satisfies noUncheckedIndexedAccess-style typing.
        results[index] = { ok: true, value: await fn(items[index] as T) };
      } catch (err) {
        results[index] = { ok: false, error: err };
      }
    }
  });
  await Promise.allSettled(workers);
  return results;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Raised by withTimeout when the deadline passes before the awaited work
 * settles. Having a dedicated class lets callers distinguish "took too long"
 * from an ordinary failure via `instanceof` or `err.name`.
 */
export class TimeoutError extends Error {
  /**
   * @param label Human-readable name of the work that timed out.
   * @param ms The timeout that elapsed, in milliseconds (reported in whole
   *   seconds, rounded).
   */
  constructor(label: string, ms: number) {
    const seconds = Math.round(ms / 1000);
    super(`${label} exceeded ${seconds}s timeout`);
    this.name = 'TimeoutError';
  }
}
|
|
73
|
+
|
|
74
|
+
/**
 * Await `work`, but give up after `ms` milliseconds.
 *
 * The underlying work is NOT cancelled when the deadline fires — this only
 * stops waiting for it, leaving the caller to decide how to degrade. The
 * timer is always cleared once the race settles.
 *
 * @param work The promise to wait on.
 * @param ms Deadline in milliseconds.
 * @param label Name used in the TimeoutError message.
 * @returns The value `work` resolves to, if it settles first.
 * @throws TimeoutError when the deadline elapses first; otherwise whatever
 *   `work` rejects with.
 */
export async function withTimeout<T>(work: Promise<T>, ms: number, label: string): Promise<T> {
  let handle: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_resolve, reject) => {
    handle = setTimeout(() => {
      reject(new TimeoutError(label, ms));
    }, ms);
  });
  const cancelDeadline = (): void => {
    if (handle) clearTimeout(handle);
  };
  try {
    const settled = await Promise.race([work, deadline]);
    return settled;
  } finally {
    cancelDeadline();
  }
}
|
|
@@ -12,14 +12,13 @@
|
|
|
12
12
|
* value is visible and lets us confirm which form was the login form.
|
|
13
13
|
*/
|
|
14
14
|
|
|
15
|
-
import { isSensitiveCredentialKey,
|
|
15
|
+
import { isSensitiveCredentialKey, isUsernameLikeKey } from './sensitive-keys.ts';
|
|
16
16
|
import type { CapturedEvent, CapturedRequest, Session } from './types.ts';
|
|
17
17
|
|
|
18
|
-
/**
|
|
19
|
-
* password field.
|
|
20
|
-
*
|
|
21
|
-
const
|
|
22
|
-
/^(user(?:name|id)?|email(?:address)?|login(?:id)?|account|patron(?:number|id)?)$/i;
|
|
18
|
+
/** True when `key` looks like the username/email/login field that partners a
 *  password field. Thin single-argument wrapper around `isUsernameLikeKey`
 *  from sensitive-keys.ts, so the key dictionary lives in one place. */
const isUsernameKey = (candidate: string): boolean => {
  return isUsernameLikeKey(candidate);
};
|
|
23
22
|
|
|
24
23
|
/** Where, within a request, a redactable value lives. */
|
|
25
24
|
export type ReplacementLocation =
|
|
@@ -58,6 +57,29 @@ interface ExtractionResult {
|
|
|
58
57
|
replacements: Replacement[];
|
|
59
58
|
}
|
|
60
59
|
|
|
60
|
+
/** Parsers are tried in this order on every request that has a body. Each
|
|
61
|
+
* one is side-effect-free and returns `null` when its input doesn't fit
|
|
62
|
+
* its expected framing — so trying JSON first on a form body, or form on
|
|
63
|
+
* a JSON body, is safe: only the parser that actually fits will produce a
|
|
64
|
+
* finding.
|
|
65
|
+
*
|
|
66
|
+
* Dispatch is parser-driven, not Content-Type-driven, because real sites
|
|
67
|
+
* routinely mislabel their bodies — the canonical example is the Nextep
|
|
68
|
+
* cafe API (`Content-Type: text/plain` for JSON bodies). Letting the data
|
|
69
|
+
* speak for itself prevents whole classes of silent extraction failures.
|
|
70
|
+
*
|
|
71
|
+
* URL-query parsing runs even on requests without a body (e.g. GET-based
|
|
72
|
+
* logins that pass credentials in the query string). Multipart is checked
|
|
73
|
+
* before generic form-urlencoded because a multipart body still contains
|
|
74
|
+
* `=` characters and would be parsed as a single malformed form pair
|
|
75
|
+
* otherwise. */
|
|
76
|
+
const BODY_PARSERS: Array<(r: CapturedRequest) => BodyFinding | null> = [
|
|
77
|
+
findInJsonBody,
|
|
78
|
+
findInJsonWrappedInForm,
|
|
79
|
+
findInMultipartBody,
|
|
80
|
+
findInFormBody,
|
|
81
|
+
];
|
|
82
|
+
|
|
61
83
|
/** Top-level entry point. */
|
|
62
84
|
export function extractCredentials(session: Session): ExtractionResult {
|
|
63
85
|
const findings: CredentialFinding[] = [];
|
|
@@ -65,13 +87,17 @@ export function extractCredentials(session: Session): ExtractionResult {
|
|
|
65
87
|
const usernamesInDom = collectFormSubmitUsernames(session.events);
|
|
66
88
|
|
|
67
89
|
for (const req of session.requests) {
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
90
|
+
let found: BodyFinding | null = null;
|
|
91
|
+
if (req.body) {
|
|
92
|
+
for (const parse of BODY_PARSERS) {
|
|
93
|
+
found = parse(req);
|
|
94
|
+
if (found) break;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
// Last-resort: credentials in the URL query string (rare but real for
|
|
98
|
+
// some legacy GET-based login endpoints). Tried after body parsers so
|
|
99
|
+
// body-based logins always win when both are present.
|
|
100
|
+
if (!found) found = findInUrlQuery(req);
|
|
75
101
|
if (!found) continue;
|
|
76
102
|
|
|
77
103
|
const confirmedByDom = usernamesInDom.has(found.usernameValue);
|
|
@@ -132,7 +158,7 @@ function findInFormBody(req: CapturedRequest): BodyFinding | null {
|
|
|
132
158
|
|
|
133
159
|
// Second pass: find a username-like key.
|
|
134
160
|
for (const { key, value } of pairs) {
|
|
135
|
-
if (
|
|
161
|
+
if (isUsernameKey(key) && value.length > 0) {
|
|
136
162
|
usernameKey = key;
|
|
137
163
|
usernameValue = value;
|
|
138
164
|
break;
|
|
@@ -163,11 +189,7 @@ function findInJsonBody(req: CapturedRequest): BodyFinding | null {
|
|
|
163
189
|
if (typeof pwdHit.value !== 'string' || pwdHit.value.length === 0) return null;
|
|
164
190
|
|
|
165
191
|
// Look for a username-like key; prefer one in the same parent object.
|
|
166
|
-
const userHit = findFirstByPredicate(
|
|
167
|
-
parsed,
|
|
168
|
-
(k) => USERNAME_KEY_RE.test(normalizeKey(k)),
|
|
169
|
-
pwdHit.parent,
|
|
170
|
-
);
|
|
192
|
+
const userHit = findFirstByPredicate(parsed, isUsernameKey, pwdHit.parent);
|
|
171
193
|
if (!userHit || typeof userHit.value !== 'string' || userHit.value.length === 0) return null;
|
|
172
194
|
|
|
173
195
|
return {
|
|
@@ -178,6 +200,138 @@ function findInJsonBody(req: CapturedRequest): BodyFinding | null {
|
|
|
178
200
|
};
|
|
179
201
|
}
|
|
180
202
|
|
|
203
|
+
/** Covers the legacy framing where a whole JSON document is sent as the value
 *  of one form-encoded field, e.g. `payload={"username":"…","password":"…"}`
 *  (also `data=`, `request=`, `json=`, `body=`).
 *
 *  The body is parsed as a form; for each wrapper-named field whose value
 *  starts with `{` or `[`, the value is handed to findInJsonBody as the body
 *  of a synthetic copy of the request. The first hit is returned with both
 *  locations rewritten to `body-form` on the wrapper key. Returns null when
 *  there is no body, no form pairs, or no wrapper field yields a finding. */
function findInJsonWrappedInForm(req: CapturedRequest): BodyFinding | null {
  const rawBody = req.body;
  if (!rawBody) return null;

  const fields = parseFormBody(rawBody);
  if (fields.length === 0) return null;

  const wrapperNames = new Set(['payload', 'data', 'request', 'json', 'body']);
  for (const field of fields) {
    const looksLikeJson = field.value.startsWith('{') || field.value.startsWith('[');
    if (!wrapperNames.has(field.key.toLowerCase()) || !looksLikeJson) continue;

    // Re-run the JSON parser against a copy of the request whose body is the
    // unwrapped JSON text.
    const innerFinding = findInJsonBody({ ...req, body: field.value });
    if (innerFinding) {
      // Report the wrapper form field as the location of both values rather
      // than the JSON paths inside it.
      return {
        ...innerFinding,
        usernameLocation: { kind: 'body-form', key: field.key },
        passwordLocation: { kind: 'body-form', key: field.key },
      };
    }
  }
  return null;
}
|
|
233
|
+
|
|
234
|
+
/** Parse a multipart/form-data body into {key, value} pairs and pair them the
 *  same way as the form-urlencoded path. Defensive: any malformed part is
 *  skipped.
 *
 *  The boundary is sniffed from the first line (`--<boundary>`) rather than
 *  read from the Content-Type header, because this module deliberately does
 *  not trust Content-Type.
 *
 *  Returns null when there is no body, the first line is not a plausible
 *  boundary (missing `--`, empty, or longer than 200 chars), no named part is
 *  found, or no credential pair is found among the parts. */
function findInMultipartBody(req: CapturedRequest): BodyFinding | null {
  if (!req.body) return null;
  const body = req.body;
  // First line should be `--<boundary>`. If it doesn't start with `--` or
  // there's no following newline, this isn't multipart.
  const firstNewline = body.indexOf('\n');
  if (firstNewline < 0) return null;
  const firstLine = body.slice(0, firstNewline).trimEnd();
  if (!firstLine.startsWith('--')) return null;
  const boundary = firstLine.slice(2);
  if (boundary.length === 0 || boundary.length > 200) return null;
  // Split on the boundary; skip the prologue (empty before first boundary)
  // and stop at the closing `--<boundary>--`.
  const sep = `--${boundary}`;
  const parts = body.split(sep).slice(1);
  const pairs: Array<{ key: string; value: string }> = [];
  for (const partRaw of parts) {
    const part = partRaw.startsWith('\r\n')
      ? partRaw.slice(2)
      : partRaw.startsWith('\n')
        ? partRaw.slice(1)
        : partRaw;
    if (part.startsWith('--')) break; // closing boundary
    // Headers and body are separated by the FIRST blank line, whichever
    // newline convention it uses. Picking the earliest separator keeps a
    // CRLFCRLF inside the value of an LF-delimited part from being mistaken
    // for the end of the headers.
    const crlfEnd = part.indexOf('\r\n\r\n');
    const lfEnd = part.indexOf('\n\n');
    const useCrlf = crlfEnd >= 0 && (lfEnd < 0 || crlfEnd < lfEnd);
    const headerEnd = useCrlf ? crlfEnd : lfEnd;
    if (headerEnd < 0) continue;
    const sepLen = useCrlf ? 4 : 2;
    const headers = part.slice(0, headerEnd);
    // Strip the trailing newline that precedes the next boundary.
    const value = part.slice(headerEnd + sepLen).replace(/\r?\n$/, '');
    // Require a delimiter before `name=` so `filename="…"` is never mistaken
    // for the field name when it appears first in Content-Disposition.
    const nameMatch = headers.match(/(?:^|[;\s])name="([^"]*)"/i);
    if (!nameMatch) continue;
    const key = nameMatch[1] ?? '';
    if (!key) continue;
    pairs.push({ key, value });
  }
  if (pairs.length === 0) return null;
  return pairFromKeyValuePairs(pairs, 'body-form');
}
|
|
281
|
+
|
|
282
|
+
/** Look for credentials carried in the URL query string, e.g.
 *  `GET /login?username=…&password=…`, or a POST with an empty body whose
 *  credentials ride in the URL.
 *
 *  The query string (without its leading `?`) is parsed with the same
 *  form-pair parser used for bodies and paired via pairFromKeyValuePairs.
 *  Returns null when the URL cannot be parsed, has no query string, yields no
 *  pairs, or contains no credential pair. */
function findInUrlQuery(req: CapturedRequest): BodyFinding | null {
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(req.url);
  } catch {
    return null;
  }

  const { search } = parsedUrl;
  const query = search.startsWith('?') ? search.slice(1) : search;
  if (query === '') return null;

  const params = parseFormBody(query);
  return params.length === 0 ? null : pairFromKeyValuePairs(params, 'body-form');
}
|
|
298
|
+
|
|
299
|
+
/** Shared pairing step for every parser that flattens its input into
 *  key/value pairs (multipart, URL query).
 *
 *  Picks the first pair whose key is a sensitive credential key with a
 *  non-empty value as the password, then the first pair whose key is
 *  username-like with a non-empty value as the username. Returns null unless
 *  both are found. `kind` is copied unchanged into both reported locations. */
function pairFromKeyValuePairs(
  pairs: Array<{ key: string; value: string }>,
  kind: 'body-form',
): BodyFinding | null {
  const passwordPair = pairs.find(
    ({ key, value }) => isSensitiveCredentialKey(key) && value.length > 0,
  );
  if (!passwordPair) return null;

  const usernamePair = pairs.find(({ key, value }) => isUsernameKey(key) && value.length > 0);
  if (!usernamePair) return null;

  return {
    usernameValue: usernamePair.value,
    passwordValue: passwordPair.value,
    usernameLocation: { kind, key: usernamePair.key },
    passwordLocation: { kind, key: passwordPair.key },
  };
}
|
|
334
|
+
|
|
181
335
|
interface JsonHit {
|
|
182
336
|
key: string;
|
|
183
337
|
value: unknown;
|
|
@@ -238,12 +392,7 @@ function collectFormSubmitUsernames(events: CapturedEvent[]): Set<string> {
|
|
|
238
392
|
fields?: Array<{ name?: string; type?: string; value?: string }>;
|
|
239
393
|
};
|
|
240
394
|
for (const f of detail.fields ?? []) {
|
|
241
|
-
if (
|
|
242
|
-
f.name &&
|
|
243
|
-
f.value &&
|
|
244
|
-
f.type !== 'password' &&
|
|
245
|
-
USERNAME_KEY_RE.test(normalizeKey(f.name))
|
|
246
|
-
) {
|
|
395
|
+
if (f.name && f.value && f.type !== 'password' && isUsernameKey(f.name)) {
|
|
247
396
|
out.add(f.value);
|
|
248
397
|
}
|
|
249
398
|
}
|
package/src/imprint/cron.ts
CHANGED
|
@@ -242,6 +242,7 @@ async function runCronImpl(opts: RunCronOptions): Promise<void> {
|
|
|
242
242
|
if (
|
|
243
243
|
ladder.includes('fetch') ||
|
|
244
244
|
ladder.includes('fetch-bootstrap') ||
|
|
245
|
+
ladder.includes('cdp-replay') ||
|
|
245
246
|
ladder.includes('stealth-fetch')
|
|
246
247
|
) {
|
|
247
248
|
const validator = buildZodValidator(tool.workflow.parameters);
|
package/src/imprint/doctor.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
/** `imprint doctor` — check that the environment can actually run imprint.
|
|
2
2
|
* Reports pass/fail per prerequisite plus a one-line fix when failed. */
|
|
3
3
|
|
|
4
|
+
import { spawnSync } from 'node:child_process';
|
|
4
5
|
import { existsSync, readFileSync, readdirSync } from 'node:fs';
|
|
5
6
|
import { homedir } from 'node:os';
|
|
6
7
|
import { join as pathJoin } from 'node:path';
|
|
@@ -20,6 +21,7 @@ export function doctor(): CheckResult[] {
|
|
|
20
21
|
checkBun(),
|
|
21
22
|
checkChromium(),
|
|
22
23
|
checkPlaywrightChromium(),
|
|
24
|
+
checkVirtualDisplay(),
|
|
23
25
|
checkLLMProvider(),
|
|
24
26
|
checkPushOptional(),
|
|
25
27
|
checkClaudeCode(),
|
|
@@ -87,6 +89,43 @@ function checkPlaywrightChromium(): CheckResult {
|
|
|
87
89
|
};
|
|
88
90
|
}
|
|
89
91
|
|
|
92
|
+
/** Report whether an `Xvfb` executable is resolvable on PATH, by running
 *  `command -v Xvfb` through `sh` and checking for exit status 0. Any failure
 *  to run the probe counts as "not present". */
function hasXvfbBinary(): boolean {
  try {
    const probe = spawnSync('sh', ['-c', 'command -v Xvfb'], { stdio: 'ignore' });
    return probe.status === 0;
  } catch {
    return false;
  }
}
|
|
99
|
+
|
|
100
|
+
/** Advisory doctor check for a display usable by a HEADED browser replay.
 *
 *  Per the original author's note, the default replay runs Chrome headless
 *  and needs no display; a display only matters as a fallback on GPU-less
 *  Linux hosts, where the replay may need to run headed under Xvfb. The check
 *  therefore never fails (`ok` is always true) and only varies its detail:
 *   - non-Linux platforms: reported as having a native window server;
 *   - Linux with a non-empty `$DISPLAY`: reports its value;
 *   - Linux without `$DISPLAY` but with Xvfb installed: fallback available;
 *   - otherwise: still ok, with a `fix` hint on installing Xvfb. */
function checkVirtualDisplay(): CheckResult {
  const name = 'Display (headed replay)';
  const { platform } = process;

  if (platform !== 'linux') {
    return { name, ok: true, detail: `${platform}: native window server (no Xvfb needed)` };
  }

  const display = process.env.DISPLAY;
  if (display) {
    return { name, ok: true, detail: `$DISPLAY=${display}` };
  }

  if (!hasXvfbBinary()) {
    return {
      name,
      ok: true, // advisory — default replay is headless; Xvfb is only a GPU-less fallback
      detail:
        'Linux, no $DISPLAY and no Xvfb — default replay is headless (fine); install Xvfb only if a GPU-less host gets bot-flagged',
      fix: 'GPU-less host bot-flagged? install the headed-replay fallback: apt-get install xvfb (or export DISPLAY=:0)',
    };
  }

  return {
    name,
    ok: true,
    detail: 'no $DISPLAY; Xvfb present — headed-replay fallback available for GPU-less hosts',
  };
}
|
|
128
|
+
|
|
90
129
|
function checkLLMProvider(): CheckResult {
|
|
91
130
|
const statuses = getProviderStatuses();
|
|
92
131
|
const detected = statuses.filter((s) => s.detected);
|
package/src/imprint/emit.ts
CHANGED
|
@@ -5,6 +5,7 @@ import { existsSync, mkdirSync, writeFileSync } from 'node:fs';
|
|
|
5
5
|
import { basename, dirname, join as pathJoin, resolve as pathResolve } from 'node:path';
|
|
6
6
|
import { loadJsonFile } from './load-json.ts';
|
|
7
7
|
import { ensureImprintRuntimeLink } from './runtime-link.ts';
|
|
8
|
+
import { isLoginFieldKey } from './sensitive-keys.ts';
|
|
8
9
|
import { type Workflow, WorkflowSchema } from './types.ts';
|
|
9
10
|
|
|
10
11
|
interface EmitOptions {
|
|
@@ -36,6 +37,8 @@ export function emit(opts: EmitOptions): EmitResult {
|
|
|
36
37
|
'workflow.json',
|
|
37
38
|
);
|
|
38
39
|
|
|
40
|
+
assertNoCredentialShapedParams(workflow);
|
|
41
|
+
|
|
39
42
|
const outDir = opts.outDir ?? defaultOutDir(opts.workflowPath, workflow);
|
|
40
43
|
|
|
41
44
|
mkdirSync(outDir, { recursive: true });
|
|
@@ -137,6 +140,88 @@ export { WORKFLOW };
|
|
|
137
140
|
`;
|
|
138
141
|
}
|
|
139
142
|
|
|
143
|
+
/** Pre-emit guardrail: refuse to write a workflow whose parameters look
 * like login credentials (`password`, `userid`, `email`, etc., per the
 * shared dictionary in sensitive-keys.ts) but are templated as plain
 * `${param.X}` instead of credential-store references like
 * `${credential.X}`.
 *
 * This catches the failure mode where upstream credential extraction
 * silently failed (e.g. unusual Content-Type, body framing the parser
 * didn't recognise, declined credential-save prompt), so the compile
 * agent had no credential anchor and chose to model the login fields as
 * ordinary callable parameters. The resulting MCP tool would advertise
 * `userid`/`password` as required inputs, forward whatever the caller
 * passed verbatim, and (most often) silently produce empty results when
 * the caller passed empty strings.
 *
 * A parameter is reported only when BOTH hold:
 *   - its name is credential-shaped according to `isLoginFieldKey`, and
 *     at least one request templates `${param.<name>}` in its URL, body,
 *     or header values; AND
 *   - no request in the workflow references the same-named
 *     `${credential.<name>}`. A credential reference under a different
 *     name does not exempt the parameter.
 *
 * A workflow that uses both `${param.X}` and `${credential.X}` is
 * suspicious but not necessarily broken, so it is left to the user.
 *
 * @param workflow Parsed workflow whose `parameters` and `requests` are scanned.
 * @throws Error listing every offending parameter, the requests that use
 *   it, and the remediation steps the user needs to take. */
function assertNoCredentialShapedParams(workflow: Workflow): void {
  const offenders: Array<{ name: string; matches: string[] }> = [];
  for (const param of workflow.parameters) {
    if (!isLoginFieldKey(param.name)) continue;
    // The `$` is escaped so these are the literal template markers that
    // appear inside the workflow, not interpolations performed here.
    const paramRef = `\${param.${param.name}}`;
    const credentialRef = `\${credential.${param.name}}`;
    const requestsUsingParam: string[] = [];
    let coveredByCredentialRef = false;
    // entries() yields each index with its request, so no per-index
    // undefined guard is needed.
    for (const [i, req] of workflow.requests.entries()) {
      // One searchable string per request: URL, body (if any), header values.
      const haystack = `${req.url} ${req.body ?? ''} ${Object.values(req.headers).join(' ')}`;
      if (haystack.includes(credentialRef)) {
        coveredByCredentialRef = true;
      }
      if (haystack.includes(paramRef)) {
        requestsUsingParam.push(`requests[${i}] (${req.method} ${req.url})`);
      }
    }
    // The dangerous case is `${param.X}` with no `${credential.X}` anywhere
    // in the workflow; coverage is deliberately workflow-wide, not per request.
    if (requestsUsingParam.length > 0 && !coveredByCredentialRef) {
      offenders.push({ name: param.name, matches: requestsUsingParam });
    }
  }
  if (offenders.length === 0) return;

  throw new Error(credentialShapedParamsMessage(workflow, offenders));
}

/** Renders the multi-line error text for `assertNoCredentialShapedParams`:
 * a headline, one bullet per offending parameter with the requests that
 * template it, and the numbered remediation steps. */
function credentialShapedParamsMessage(
  workflow: Workflow,
  offenders: Array<{ name: string; matches: string[] }>,
): string {
  const lines = [
    `Workflow ${JSON.stringify(workflow.toolName)} declares ${offenders.length} credential-shaped parameter(s) that are templated as plain \`\${param.X}\` instead of \`\${credential.X}\`:`,
    '',
  ];
  for (const o of offenders) {
    lines.push(` • parameter \`${o.name}\` — used in:`);
    for (const m of o.matches) lines.push(` - ${m}`);
  }
  lines.push(
    '',
    'Credentials MUST be pulled from the credential store via `${credential.<name>}`, never modelled as plain workflow parameters.',
    "This usually means the redact stage failed to extract a username+password pair from the recorded login request — common causes include unusual Content-Type headers, multipart bodies, or login fields the extractor dictionary doesn't yet cover.",
    '',
    'To fix:',
    ` 1. Delete the redacted session: rm ${workflowToolHint(workflow)}/sessions/*.redacted.json (or the relevant one)`,
    ` 2. Re-run from the redact stage: imprint teach ${workflow.site} --from redact`,
    ' 3. Accept the "Save credentials for site to the credential manager?" prompt this time.',
    ' 4. Let teach continue through generate → compile-playbook → emit.',
    '',
    "If the prompt does NOT appear during step 3, the extractor still cannot pair this site's login fields — please file a bug attaching the (redacted!) session.",
  );
  return lines.join('\n');
}
|
|
218
|
+
|
|
219
|
+
/** Builds the per-site directory path shown in the remediation steps of the
 * error thrown by `assertNoCredentialShapedParams`. The conventional
 * `~/.imprint/<site>` location is used directly; IMPRINT_HOME is not in
 * scope here and is not consulted.
 *
 * @param workflow Workflow whose `site` names the directory.
 * @returns The display path `~/.imprint/<site>`. */
function workflowToolHint(workflow: Workflow): string {
  const { site } = workflow;
  return `~/.imprint/${site}`;
}
|
|
224
|
+
|
|
140
225
|
function pascalCase(s: string): string {
|
|
141
226
|
return s
|
|
142
227
|
.split(/[_-]+/)
|
|
@@ -73,10 +73,11 @@ const FREEFORM_POLICIES: PolicyName[] = [
|
|
|
73
73
|
Policies.PGP_PRIVATE_KEY,
|
|
74
74
|
Policies.PASSWORD_ASSIGNMENT,
|
|
75
75
|
Policies.ENVIRONMENT_VARIABLE_SECRET,
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
76
|
+
// NOTE: the GENERIC_* catch-alls (GENERIC_PASSWORD/TOKEN/CREDENTIAL/SECRET) are
|
|
77
|
+
// intentionally omitted — they match on value shape alone and fire on benign
|
|
78
|
+
// data (e.g. `id=1234567890`), corrupting/over-redacting structured payloads.
|
|
79
|
+
// Real secrets are still covered by the keyword-anchored and specific policies
|
|
80
|
+
// above and below (PASSWORD_ASSIGNMENT, OAUTH_*, private keys, cloud tokens, PII).
|
|
80
81
|
Policies.OAUTH_CLIENT_SECRET,
|
|
81
82
|
Policies.OAUTH_REFRESH_TOKEN,
|
|
82
83
|
Policies.OAUTH_ACCESS_TOKEN,
|
|
@@ -87,7 +87,7 @@ export function generatePasteSnippet(opts: {
|
|
|
87
87
|
|
|
88
88
|
switch (platform) {
|
|
89
89
|
case 'claude-code':
|
|
90
|
-
return `Add the ${toolName} tool: run \`${shellCmd}\` to register ${descLower}. Parameters: ${paramList}. The backend ladder handles browser/API state and bot detection automatically (fetch → gated fetch-bootstrap → stealth-fetch → playbook).`;
|
|
90
|
+
return `Add the ${toolName} tool: run \`${shellCmd}\` to register ${descLower}. Parameters: ${paramList}. The backend ladder handles browser/API state and bot detection automatically (fetch → gated fetch-bootstrap → cdp-replay → stealth-fetch → playbook).`;
|
|
91
91
|
|
|
92
92
|
case 'codex':
|
|
93
93
|
return `Add the ${toolName} tool: run \`${shellCmd}\` to register ${descLower}. Parameters: ${paramList}.`;
|
|
@@ -352,7 +352,7 @@ ${yamlStringify(p, { lineWidth: 0 }).trim()}
|
|
|
352
352
|
// Backend ladder explanation.
|
|
353
353
|
const backendBlock = `## Backend Ladder
|
|
354
354
|
|
|
355
|
-
The MCP server automatically escalates from fetch API replay to gated fetch-bootstrap when browser-minted state is declared, then stealth-fetch for bot-defense state, then playbook for full DOM replay.
|
|
355
|
+
The MCP server automatically escalates from fetch API replay to gated fetch-bootstrap when browser-minted state is declared, then cdp-replay (API requests run inside a live trusted Chrome so a protected POST refreshes its anti-bot token between calls), then stealth-fetch for bot-defense state, then playbook for full DOM replay.
|
|
356
356
|
Bot detection is handled transparently.`;
|
|
357
357
|
|
|
358
358
|
// Scheduling block (optional).
|