unbrowse 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +455 -96
- package/dist/index.js +2 -6
- package/dist/mcp.js +695 -46
- package/dist/server.js +25811 -0
- package/package.json +1 -2
- package/vendor/kuri/darwin-arm64/kuri +0 -0
- package/vendor/kuri/darwin-x64/kuri +0 -0
- package/vendor/kuri/linux-arm64/kuri +0 -0
- package/vendor/kuri/linux-x64/kuri +0 -0
- package/vendor/kuri/manifest.json +7 -10
- package/runtime-src/agent-outcome.ts +0 -166
- package/runtime-src/analytics-session.ts +0 -55
- package/runtime-src/api/browse-index.ts +0 -317
- package/runtime-src/api/browse-session.ts +0 -572
- package/runtime-src/api/browse-submit-prereqs.ts +0 -48
- package/runtime-src/api/browse-submit.ts +0 -1184
- package/runtime-src/api/routes.ts +0 -1823
- package/runtime-src/auth/browser-cookies.ts +0 -423
- package/runtime-src/auth/index.ts +0 -535
- package/runtime-src/auth/runtime.ts +0 -116
- package/runtime-src/browser/index.ts +0 -659
- package/runtime-src/browser/types.ts +0 -41
- package/runtime-src/build-info.generated.ts +0 -6
- package/runtime-src/capture/index.ts +0 -1794
- package/runtime-src/capture/prefetch.ts +0 -95
- package/runtime-src/capture/rsc.ts +0 -45
- package/runtime-src/cli/shortcuts.ts +0 -273
- package/runtime-src/cli.ts +0 -1572
- package/runtime-src/client/graph-client.ts +0 -100
- package/runtime-src/client/index.ts +0 -1425
- package/runtime-src/debug-trace.ts +0 -18
- package/runtime-src/domain.ts +0 -38
- package/runtime-src/execution/index.ts +0 -3397
- package/runtime-src/execution/retry.ts +0 -46
- package/runtime-src/execution/robots.ts +0 -167
- package/runtime-src/execution/search-forms.ts +0 -188
- package/runtime-src/extraction/index.ts +0 -1507
- package/runtime-src/foundry/publish-bundle.ts +0 -392
- package/runtime-src/graph/agent-augment.ts +0 -315
- package/runtime-src/graph/index.ts +0 -1524
- package/runtime-src/graph/local-fixtures.ts +0 -393
- package/runtime-src/graph/local-harness.ts +0 -646
- package/runtime-src/graph/planner.ts +0 -411
- package/runtime-src/graph/session.ts +0 -294
- package/runtime-src/graph/trace-store.ts +0 -136
- package/runtime-src/index.ts +0 -24
- package/runtime-src/indexer/index.ts +0 -465
- package/runtime-src/intent-match.ts +0 -1515
- package/runtime-src/kuri/client.ts +0 -1839
- package/runtime-src/logger.ts +0 -30
- package/runtime-src/marketplace/index.ts +0 -103
- package/runtime-src/mcp.ts +0 -1747
- package/runtime-src/orchestrator/browser-agent.ts +0 -374
- package/runtime-src/orchestrator/dag-advisor.ts +0 -59
- package/runtime-src/orchestrator/dag-feedback.ts +0 -257
- package/runtime-src/orchestrator/first-pass-action.ts +0 -403
- package/runtime-src/orchestrator/index.ts +0 -4480
- package/runtime-src/orchestrator/passive-publish.ts +0 -187
- package/runtime-src/orchestrator/timing-economics.ts +0 -80
- package/runtime-src/payments/cascade.ts +0 -137
- package/runtime-src/payments/index.ts +0 -270
- package/runtime-src/payments/lobster-pay.ts +0 -182
- package/runtime-src/payments/wallet.ts +0 -98
- package/runtime-src/publish/review-context.ts +0 -93
- package/runtime-src/publish/sanitize.ts +0 -197
- package/runtime-src/publish/schema-review.ts +0 -192
- package/runtime-src/publish-admission.ts +0 -388
- package/runtime-src/ratelimit/index.ts +0 -23
- package/runtime-src/reverse-engineer/bundle-scanner.ts +0 -127
- package/runtime-src/reverse-engineer/description-prompt.ts +0 -213
- package/runtime-src/reverse-engineer/index.ts +0 -1551
- package/runtime-src/router.ts +0 -17
- package/runtime-src/routing-telemetry.ts +0 -395
- package/runtime-src/runtime/browser-access.ts +0 -11
- package/runtime-src/runtime/browser-auth.ts +0 -12
- package/runtime-src/runtime/browser-host.ts +0 -48
- package/runtime-src/runtime/lifecycle.ts +0 -17
- package/runtime-src/runtime/local-server.ts +0 -311
- package/runtime-src/runtime/paths.ts +0 -99
- package/runtime-src/runtime/setup.ts +0 -251
- package/runtime-src/runtime/supervisor.ts +0 -69
- package/runtime-src/runtime/update-hints.ts +0 -351
- package/runtime-src/server.ts +0 -100
- package/runtime-src/session-logs.ts +0 -142
- package/runtime-src/settings.ts +0 -221
- package/runtime-src/single-binary.ts +0 -143
- package/runtime-src/site-policy.ts +0 -54
- package/runtime-src/stale-cleanup-runner.ts +0 -144
- package/runtime-src/stale-cleanup.ts +0 -133
- package/runtime-src/telemetry-attribution.ts +0 -120
- package/runtime-src/telemetry.ts +0 -253
- package/runtime-src/template-params.ts +0 -141
- package/runtime-src/transform/drift.ts +0 -60
- package/runtime-src/transform/index.ts +0 -277
- package/runtime-src/types/index.ts +0 -1
- package/runtime-src/types/skill.ts +0 -912
- package/runtime-src/vault/index.ts +0 -196
- package/runtime-src/verification/auth-gate.ts +0 -8
- package/runtime-src/verification/candidates.ts +0 -27
- package/runtime-src/verification/index.ts +0 -120
- package/runtime-src/verification/matrix.ts +0 -30
- package/runtime-src/version.ts +0 -148
- package/runtime-src/workflow/artifact.ts +0 -161
- package/runtime-src/workflow/compile.ts +0 -808
- package/runtime-src/workflow/publish.ts +0 -225
- package/runtime-src/workflow/runtime.ts +0 -213
- package/vendor/kuri/win-x64/kuri.exe +0 -0
|
@@ -1,1551 +0,0 @@
|
|
|
1
|
-
import type { RawRequest, CapturedWsMessage } from "../capture/index.js";
|
|
2
|
-
import type { CsrfPlan, EndpointDescriptor, EndpointPathBindingCandidate, WsMessage } from "../types/index.js";
|
|
3
|
-
import { inferSchema } from "../transform/index.js";
|
|
4
|
-
import { getRegistrableDomain } from "../domain.js";
|
|
5
|
-
import { nanoid } from "nanoid";
|
|
6
|
-
import { inferEndpointSemantic, resolveEndpointPathBindings } from "../graph/index.js";
|
|
7
|
-
import { writeDebugTrace } from "../debug-trace.js";
|
|
8
|
-
import { buildQueryBindingMap } from "../template-params.js";
|
|
9
|
-
import { buildDescriptionPrompt, groundedDescription, extractResponseKeys, inferDescriptionParams } from "./description-prompt.js";
|
|
10
|
-
import { isRscPayload, extractRscDataEndpoints } from "../capture/rsc.js";
|
|
11
|
-
// Static-asset detector: matches one of the listed file extensions only when it
// is immediately followed by `?`, `#`, or the end of the string (case-insensitive).
const SKIP_EXTENSIONS = /\.(js|mjs|css|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|map|webp|html|avif)([?#]|$)/i;
|
|
12
|
-
// Matches the path fragments `/boq-`, `/_/mss/`, `/og/_/js/` and `/_/scs/`
// anywhere in the tested string (case-insensitive).
// NOTE(review): presumably script-bundle paths that are never API endpoints — confirm against the caller.
const SKIP_JS_BUNDLES = /\/(boq-|_\/mss\/|og\/_\/js\/|_\/scs\/)/i;
|
|
13
|
-
// Matches framework/CDN asset paths anywhere in the tested string (case-insensitive):
// `/_next/static/`, `/_next/data/`, `/_next/image`, `/static/chunks/`, `/static/media/`, `/cdn-cgi/`.
const SKIP_PATHS = /\/_next\/static\/|\/_next\/data\/|\/_next\/image|\/static\/chunks\/|\/static\/media\/|\/cdn-cgi\//i;
|
|
14
|
-
|
|
15
|
-
// Known infrastructure/auth hosts — never useful as skill endpoints
|
|
16
|
-
// Unanchored, case-insensitive substring match: an alternative hits wherever it
// appears in the tested string, not only at a host boundary.
// NOTE(review): entries such as `github\.com\/login` contain a path separator, so this
// looks like it is meant to be tested against a full URL rather than a bare hostname — confirm.
const SKIP_HOSTS = /(cloudflare\.com|google-analytics\.com|doubleclick\.net|gstatic\.com|accounts\.google\.com|login\.microsoftonline\.com|auth0\.com|cognito-idp\.|appleid\.apple\.com|github\.com\/login|facebook\.com\/login|protechts\.net|demdex\.net|litms|platform-telemetry|datadoghq\.com|fullstory\.com|launchdarkly\.com|intercom\.io|privy\.io|mypinata\.cloud|sentry\.io|segment\.io|amplitude\.com|mixpanel\.com|hotjar\.com|clarity\.ms|googletagmanager\.com|walletconnect\.com|imagedelivery\.net|cloudflareinsights\.com)/i;
|
|
17
|
-
|
|
18
|
-
// Google-specific telemetry, ads, and infrastructure subdomains (BUG-GC-004)
|
|
19
|
-
// Each alternative is a name ending in `-pa` followed by a literal dot; the match is
// unanchored and case-insensitive, so it hits wherever that label appears.
const SKIP_TELEMETRY_HOSTS = /(waa-pa\.|signaler-pa\.|appsgrowthpromo-pa\.|ogads-pa\.|peoplestackwebexperiments-pa\.)/i;
|
|
20
|
-
|
|
21
|
-
// Known telemetry/logging path patterns
|
|
22
|
-
// Matches a whole path segment equal to one of the listed words: the word must be
// preceded by `/` and followed by `/` or the end of the string (case-insensitive).
// A trailing query string (e.g. `/log?x=1`) therefore does NOT match unless it is removed first.
const SKIP_TELEMETRY_PATHS = /\/(log|logging|telemetry|analytics|beacon|ping|heartbeat|metrics)(\/|$)/i;
|
|
23
|
-
|
|
24
|
-
// RPC/API path hints — tightened to avoid false positives (BUG-GC-004)
|
|
25
|
-
// The `$` in the first alternative is escaped on purpose: unescaped it is an
// end-of-input anchor, so `\/$rpc\/` could never match a literal `/$rpc/` path segment.
// All alternatives are unanchored and case-insensitive.
const RPC_HINTS = /(\/\$rpc\/|\/rpc\/|graphql|trending|search|feed|results|batchexecute|\/api\/)/i;
|
|
26
|
-
|
|
27
|
-
// HTTP verbs accepted by this module. Set membership is an exact, case-sensitive
// string comparison, so only these upper-case spellings are recognised.
const ALLOWED_METHODS = new Set(["GET", "POST", "PUT", "PATCH", "DELETE"]);
|
|
28
|
-
|
|
29
|
-
// Headers that must never be stored in skill manifests (BUG-GC-005)
|
|
30
|
-
// Includes session tokens, API keys, and Google-specific credential headers.
|
|
31
|
-
// Exact header names (lower-case) that are removed before a request is persisted.
const STRIP_HEADERS = new Set([
  // Session and credential carriers
  "cookie", "authorization", "x-csrf-token",
  "x-api-key", "api-key", "x-auth-token",
  "x-app-key", "x-app-secret",
  // Per-request framing headers
  "content-length", "host",
  // Google credential headers
  "x-goog-api-key", "x-server-token",
  "x-goog-encode-response-if-executable",
  "x-clientdetails", "x-javascript-user-agent",
]);

// Header-name prefixes (lower-case) that are removed as well, whatever follows the prefix.
const STRIP_HEADER_PREFIXES = [
  "x-goog-auth",
  "x-goog-spatula",
  "x-auth-", // generic auth headers
  "x-amz-security-", // AWS security tokens
  "x-stripe-", // Stripe API headers
  "x-firebase-", // Firebase auth headers
];
|
|
57
|
-
|
|
58
|
-
// Browser-captured headers that are not secrets themselves, but are still
|
|
59
|
-
// required to replay authenticated requests after publish-time header redaction.
|
|
60
|
-
// Header-name prefixes (lower-case) kept for replay.
const REPLAY_HEADER_PREFIXES = ["x-li-"];

// Exact header names (lower-case) kept for replay.
const REPLAY_HEADER_EXACT = new Set([
  "accept", "csrf-token", "origin",
  "x-requested-with", "x-restli-protocol-version",
]);
|
|
70
|
-
|
|
71
|
-
// Headers known to be safe (non-sensitive) — used by the catch-all filter below
|
|
72
|
-
const SAFE_HEADERS = new Set([
  // Content negotiation and caching
  "accept",
  "accept-encoding",
  "accept-language",
  "cache-control",
  "content-type",
  // Request context
  "origin",
  "referer",
  "user-agent",
  "pragma",
  // Conditional / partial requests and connection hints
  "if-none-match",
  "if-modified-since",
  "range",
  "dnt",
  "connection",
  // Client hints
  "sec-ch-ua",
  "sec-ch-ua-mobile",
  "sec-ch-ua-platform",
  // Fetch metadata
  "sec-fetch-dest",
  "sec-fetch-mode",
  "sec-fetch-site",
  // XHR marker
  "x-requested-with",
]);
|
|
80
|
-
|
|
81
|
-
// Patterns that indicate a header contains credentials — catch-all safety net
|
|
82
|
-
// Unanchored, case-insensitive substring test: any tested string that merely
// contains one of these words matches (e.g. "x-session-id", "monkey").
const SENSITIVE_HEADER_PATTERN = /token|key|secret|credential|password|session/i;
|
|
83
|
-
|
|
84
|
-
// Query param names that likely contain credentials and must be stripped from URL templates
|
|
85
|
-
// Anchored with `^…$`: the whole parameter name must equal one of the alternatives
// (case-insensitive). `[_-]?` accepts the `api_key` / `api-key` / `apikey` spellings.
const SENSITIVE_QUERY_PARAMS = /^(api[_-]?key|apikey|access[_-]?token|auth[_-]?token|secret|password|key|token|session[_-]?id|client[_-]?secret|private[_-]?key|bearer)$/i;
|
|
86
|
-
|
|
87
|
-
// Framework-internal query params — noise from Next.js RSC, cache busting, etc.
|
|
88
|
-
// Anchored with `^…$`: the whole parameter name must equal one of the alternatives
// (case-insensitive). `nxtP\[.*\]` matches bracketed names of the form `nxtP[...]`.
const FRAMEWORK_QUERY_PARAMS = /^(_rsc|_next|__next|_t|_hash|__cf_chl_tk|nxtP\[.*\])$/i;
|
|
89
|
-
|
|
90
|
-
// Ad/tracking hosts that slip through the main SKIP_HOSTS filter
|
|
91
|
-
// Unanchored, case-insensitive substring match.
// NOTE(review): `srv\.buysellads\.com` can never match anything `buysellads\.com` does not already match.
const AD_HOSTS = /buysellads\.com|carbonads\.com|ethicalads\.io|srv\.buysellads\.com|facet-futures\./i;
|
|
92
|
-
|
|
93
|
-
// Schema-level ad/tracking detection — if a response body's top-level keys
|
|
94
|
-
// match advertising vocabulary, the endpoint is an ad server regardless of host.
|
|
95
|
-
// Lower-cased advertising vocabulary looked for among a response body's keys.
const AD_SCHEMA_KEYS = new Set([
  "campaignid",
  "creativeid",
  "creativetype",
  "creativecontent",
  "orderid",
  "impressionurl",
  "clickurl",
  "customerid",
  "adunitid",
  "adslot",
  "adsize",
  "lineitemid",
]);

// Minimum number of ad-like keys required before a response is classified as an ad payload.
const AD_SCHEMA_THRESHOLD = 3;
|
|
101
|
-
|
|
102
|
-
/**
 * Heuristically reduce an English plural to a singular form.
 *
 * Rules are tried in order and the first match wins:
 *  1. "...ies" on a word longer than 4 characters becomes "...y".
 *  2. "...ses", "...ges" or "...zes" loses its trailing "es".
 *  3. "...s" (but not "...ss") on a word longer than 3 characters loses the "s".
 * Anything else is returned unchanged.
 *
 * NOTE: the rules are purely suffix-based, so rule 2 also trims e.g. "pages" to "pag".
 */
function singularize(word: string): string {
  const length = word.length;
  if (length > 4 && word.endsWith("ies")) {
    return word.slice(0, -3) + "y";
  }
  const dropsEs = ["ses", "ges", "zes"].some((suffix) => word.endsWith(suffix));
  if (dropsEs) {
    return word.slice(0, -2);
  }
  const dropsS = length > 3 && word.endsWith("s") && !word.endsWith("ss");
  return dropsS ? word.slice(0, -1) : word;
}
|
|
108
|
-
|
|
109
|
-
/**
 * Split `text` on runs of non-alphanumeric characters, upper-case the first
 * character of every non-empty piece, and join the pieces with single spaces.
 * The remainder of each piece is left untouched.
 */
function titleCase(text: string): string {
  const words: string[] = [];
  for (const chunk of text.split(/[^a-zA-Z0-9]+/)) {
    if (!chunk) continue; // leading/trailing separators yield empty pieces
    words.push(chunk.charAt(0).toUpperCase() + chunk.slice(1));
  }
  return words.join(" ");
}
|
|
116
|
-
|
|
117
|
-
/**
 * Produce a size-bounded copy of `value`.
 *
 * - Arrays keep their first 2 elements, objects their first 8 entries; both
 *   recurse into the kept children with `depth + 1`.
 * - Strings longer than 160 characters are cut to 157 characters plus "...".
 * - `null`/`undefined`, and anything reached at a depth greater than 2, is
 *   returned as-is without further trimming.
 *
 * @param value Arbitrary JSON-like value.
 * @param depth Current nesting depth (callers normally omit it).
 */
function compactForSemanticExample(value: unknown, depth = 0): unknown {
  if (value == null || depth > 2) return value;

  const shrink = (child: unknown): unknown => compactForSemanticExample(child, depth + 1);

  if (Array.isArray(value)) {
    return value.slice(0, 2).map((item) => shrink(item));
  }
  if (typeof value === "string") {
    return value.length > 160 ? `${value.slice(0, 157)}...` : value;
  }
  if (typeof value !== "object") {
    return value;
  }
  const kept = Object.entries(value as Record<string, unknown>).slice(0, 8);
  return Object.fromEntries(kept.map(([key, next]) => [key, shrink(next)]));
}
|
|
127
|
-
|
|
128
|
-
/**
 * Flatten a one-level grouped request example into a single record.
 *
 * For every top-level entry of `value`:
 *  - `null`/`undefined` entries are skipped;
 *  - primitives and arrays are copied under their own key;
 *  - plain objects are treated as groups: each nested entry is hoisted to the
 *    top level unless that key already holds a non-nullish value.
 *
 * Returns an empty record when `value` is not a plain object.
 */
function flattenRequestExample(value: unknown): Record<string, unknown> {
  const flat: Record<string, unknown> = {};
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return flat;
  }
  for (const [groupKey, groupValue] of Object.entries(value as Record<string, unknown>)) {
    if (groupValue == null) continue;
    const isGroup = typeof groupValue === "object" && !Array.isArray(groupValue);
    if (!isGroup) {
      flat[groupKey] = groupValue;
      continue;
    }
    for (const [nestedKey, nestedValue] of Object.entries(groupValue as Record<string, unknown>)) {
      // First writer wins; a previously stored null/undefined may still be replaced.
      if (flat[nestedKey] == null) flat[nestedKey] = nestedValue;
    }
  }
  return flat;
}
|
|
143
|
-
|
|
144
|
-
/**
 * Derive a short subject noun and up to six field names from a sample response.
 *
 * - Array sample: subject is "items"; fields come from the first element that is an object.
 * - Non-object sample: subject is "response" with no fields.
 * - Object sample: the first property holding an object or array is preferred
 *   (falling back to the first property, then to "response"); its singularized
 *   name is the subject. Fields come from the preferred value's first element
 *   (non-empty array), from the preferred value itself (object), or otherwise
 *   from the sample as a whole.
 */
function summarizeResponseExample(sample: unknown): { subject: string; fields: string[] } {
  const topFields = (source: unknown): string[] => collectKeysShallow(source).slice(0, 6);

  if (Array.isArray(sample)) {
    const firstObject = sample.find((item) => item && typeof item === "object");
    return { subject: "items", fields: firstObject ? topFields(firstObject) : [] };
  }
  if (!sample || typeof sample !== "object") {
    return { subject: "response", fields: [] };
  }

  const record = sample as Record<string, unknown>;
  const propertyNames = Object.keys(record);
  const containerKey = propertyNames.find((name) => {
    const candidate = record[name];
    return (Array.isArray(candidate) && candidate.length > 0) || (candidate && typeof candidate === "object");
  });
  const preferredKey = containerKey ?? propertyNames[0] ?? "response";
  const preferredValue = record[preferredKey];
  const subject = singularize(preferredKey);

  if (Array.isArray(preferredValue) && preferredValue.length > 0) {
    const head = preferredValue[0];
    return { subject, fields: head && typeof head === "object" ? topFields(head) : [] };
  }
  if (preferredValue && typeof preferredValue === "object") {
    return { subject, fields: topFields(preferredValue) };
  }
  return { subject, fields: topFields(sample) };
}
|
|
168
|
-
|
|
169
|
-
/**
 * Pick a subject noun from a URL path: the last non-empty segment that is not a
 * generic API word (compared case-insensitively), singularized. Falls back to
 * "response" when no such segment exists.
 */
function inferPathSubject(pathname: string): string {
  const generic = new Set(["api", "graphql", "rpc", "search", "query", "v1", "v2", "v3", "rest"]);
  let lastMeaningful: string | undefined;
  for (const segment of pathname.split("/")) {
    if (segment && !generic.has(segment.toLowerCase())) {
      lastMeaningful = segment;
    }
  }
  return singularize(lastMeaningful ?? "response");
}
|
|
174
|
-
|
|
175
|
-
/**
 * Build the deterministic, grounded description for a captured request.
 *
 * @param req            Captured request; `req.url` and `req.method` are read.
 * @param sampleRequest  Flattened request parameters observed at capture time.
 * @param sampleResponse Parsed sample response used to extract response keys.
 * @returns The string produced by `groundedDescription` for the assembled context.
 * @throws TypeError if `req.url` is not a parseable absolute URL (from `new URL`).
 */
function buildEndpointDescription(
  req: RawRequest,
  sampleRequest: Record<string, unknown>,
  sampleResponse: unknown,
): string {
  const url = new URL(req.url);

  // URL parsing percent-encodes `{` and `}` in the path ("/users/{id}" becomes
  // "/users/%7Bid%7D"), so `{key}` placeholders must be searched for in the
  // decoded path; searching `url.pathname` directly can never find them.
  let templatePath = url.pathname;
  try {
    templatePath = decodeURIComponent(url.pathname);
  } catch {
    // Malformed escape sequence in the path: keep the encoded form.
  }

  // Build param descriptors from the flattened sample request so the
  // description is grounded in the actual parameters observed at capture time.
  const locationHints = Object.fromEntries(
    Object.keys(sampleRequest).map((key) => [
      key,
      url.searchParams.has(key) ? "query"
        : templatePath.includes(`{${key}}`) ? "path"
        : "body",
    ]),
  );
  const params = inferDescriptionParams(sampleRequest, locationHints);

  const responseKeys = extractResponseKeys(sampleResponse);
  // Request parameter names plus response keys whose name suggests an identifier-like value.
  const dependencyBindings = Array.from(new Set([
    ...Object.keys(sampleRequest),
    ...responseKeys.filter((key) => /(id|slug|cursor|page|date|token|status|type|name)/i.test(key)),
  ]));
  // De-duplicated path segments, parameter names and response keys, capped at 24 entries.
  const searchTerms = Array.from(new Set([
    ...url.pathname.split("/").filter(Boolean),
    ...Object.keys(sampleRequest),
    ...responseKeys,
  ])).slice(0, 24);

  const ctx = {
    url_template: req.url,
    method: req.method,
    params,
    sample_response_keys: responseKeys.length > 0 ? responseKeys : undefined,
    domain: url.hostname,
    dependency_bindings: dependencyBindings,
    search_terms: searchTerms,
  };

  // Build the grounding prompt (available for optional LLM polish in
  // backend/services/descriptions.ts) and the deterministic description.
  // NOTE(review): the prompt is computed but not used in this function — confirm
  // whether the call is still needed.
  const _prompt = buildDescriptionPrompt(ctx);

  // Use the grounded description builder from description-prompt.ts so
  // every description references real params and response fields.
  return groundedDescription(ctx);
}
|
|
223
|
-
|
|
224
|
-
/**
 * Decide whether a raw response body looks like an ad-server payload.
 *
 * The body is parsed as JSON and its shallow keys are compared
 * (case-insensitively) against `AD_SCHEMA_KEYS`. Only DISTINCT ad-like keys are
 * counted, so a list of ordinary records that repeat a single matching key
 * (e.g. several objects each carrying `orderId`) does not reach the threshold
 * on repetition alone.
 *
 * @returns `true` when at least `AD_SCHEMA_THRESHOLD` distinct ad-like keys are
 *          present; `false` for empty or non-JSON bodies.
 */
function looksLikeAdResponse(body: string | undefined): boolean {
  if (!body) return false;
  try {
    const parsed: unknown = JSON.parse(body);
    const adKeys = new Set<string>();
    for (const key of collectKeysShallow(parsed)) {
      const lowered = key.toLowerCase();
      if (AD_SCHEMA_KEYS.has(lowered)) adKeys.add(lowered);
    }
    return adKeys.size >= AD_SCHEMA_THRESHOLD;
  } catch {
    // Not JSON: cannot be classified by schema.
    return false;
  }
}
|
|
238
|
-
|
|
239
|
-
/**
 * Report whether a response body looks like HTML markup.
 *
 * Returns `false` for missing, blank, or angle-bracket-free bodies. Otherwise
 * returns `true` when the trimmed body contains an HTML doctype or the opening
 * of one of a fixed list of common tags (case-insensitive).
 */
function isHtmlResponseBody(body: string | undefined): boolean {
  const text = body ? body.trim() : "";
  if (text === "") return false;
  if (!/[<>]/.test(text)) return false;
  if (/<!doctype html/i.test(text)) return true;
  return /<(html|body|head|main|article|div|section|a|script|meta|title)\b/i.test(text);
}
|
|
247
|
-
|
|
248
|
-
/**
 * Report whether a response body parses as JSON once `stripJsonPrefix` has been
 * applied to it. Missing or empty bodies are never JSON.
 */
function isJsonResponseBody(body: string | undefined): boolean {
  if (!body) return false;
  let parsedOk = true;
  try {
    JSON.parse(stripJsonPrefix(body));
  } catch {
    parsedOk = false;
  }
  return parsedOk;
}
|
|
257
|
-
|
|
258
|
-
/** A body is admissible when it is JSON or, failing that, looks like HTML. */
function hasAdmissibleParsedBody(body: string | undefined): boolean {
  if (isJsonResponseBody(body)) return true;
  return isHtmlResponseBody(body);
}
|
|
261
|
-
|
|
262
|
-
/**
 * Collect top-level + one-level-nested keys from an object/array.
 *
 * - Object input: its own keys, plus — for every property holding a non-empty
 *   array whose first element is an object — the keys of that first element.
 * - Array input: the same, applied to each of the first three elements that is
 *   a non-array object.
 * - Anything else: an empty list.
 *
 * Each key appears at most once, in first-seen order. Without de-duplication an
 * array of similar records reports the same key once per inspected element,
 * which inflates key counts and wastes slots in truncated field lists.
 */
function collectKeysShallow(obj: unknown): string[] {
  if (!obj || typeof obj !== "object") return [];

  const seen = new Set<string>();
  const items: unknown[] = Array.isArray(obj) ? obj.slice(0, 3) : [obj];
  for (const item of items) {
    if (!item || typeof item !== "object" || Array.isArray(item)) continue;
    for (const [key, val] of Object.entries(item as Record<string, unknown>)) {
      seen.add(key);
      if (Array.isArray(val) && val.length > 0 && typeof val[0] === "object" && val[0]) {
        for (const nestedKey of Object.keys(val[0] as Record<string, unknown>)) {
          seen.add(nestedKey);
        }
      }
    }
  }
  return Array.from(seen);
}
|
|
281
|
-
|
|
282
|
-
/**
 * Insert a space at every word boundary implied by casing or digits so that
 * identifiers such as "userId" or "HTMLParser2" can later be split on spaces.
 * The boundary rules are applied in the listed order.
 */
function normalizeTokenText(text: string): string {
  const boundaries: RegExp[] = [
    /([a-z0-9])([A-Z])/g, // lower-case or digit followed by upper-case
    /([A-Z]+)([A-Z][a-z])/g, // acronym followed by a capitalised word
    /([a-zA-Z])(\d)/g, // letter followed by digit
    /(\d)([a-zA-Z])/g, // digit followed by letter
  ];
  let spaced = text;
  for (const boundary of boundaries) {
    spaced = spaced.replace(boundary, "$1 $2");
  }
  return spaced;
}
|
|
289
|
-
|
|
290
|
-
/**
 * Break text into lower-case alphanumeric tokens. Word boundaries are first
 * made explicit by `normalizeTokenText`; the result is lower-cased and split on
 * every run of characters other than `a-z` and `0-9`. Missing or empty input
 * yields an empty list.
 */
function tokenize(text: string | undefined): string[] {
  if (!text) return [];
  const lowered = normalizeTokenText(text).toLowerCase();
  const tokens: string[] = [];
  for (const piece of lowered.split(/[^a-z0-9]+/)) {
    if (piece) tokens.push(piece);
  }
  return tokens;
}
|
|
294
|
-
|
|
295
|
-
/**
 * Walk a JSON-like value and gather tokens into `out`.
 *
 * - Objects: the first 12 entries are visited; every key is tokenized and the
 *   value is walked recursively.
 * - Arrays: the first 3 elements are walked recursively.
 * - Strings of at most 64 characters are tokenized; longer strings are ignored.
 * - Other primitives, nullish values, and anything deeper than 6 levels add nothing.
 *
 * @param value Value to inspect.
 * @param out   Accumulator; mutated in place and returned.
 * @param depth Current recursion depth (callers normally omit it).
 */
function collectSemanticTokens(value: unknown, out = new Set<string>(), depth = 0): Set<string> {
  if (value == null || depth > 6) return out;

  const absorb = (text: string): void => {
    tokenize(text).forEach((token) => out.add(token));
  };

  if (typeof value === "string") {
    if (value.length <= 64) absorb(value);
    return out;
  }
  if (typeof value !== "object") return out;

  if (Array.isArray(value)) {
    value.slice(0, 3).forEach((item) => {
      collectSemanticTokens(item, out, depth + 1);
    });
    return out;
  }

  Object.entries(value as Record<string, unknown>)
    .slice(0, 12)
    .forEach(([key, next]) => {
      absorb(key);
      collectSemanticTokens(next, out, depth + 1);
    });
  return out;
}
|
|
313
|
-
|
|
314
|
-
/** Entity categories an intent string can be classified into. */
type IntentEntityKind = "comment" | "post" | "person" | "company" | "repository" | "topic" | "channel" | "listing";

/** Action categories an intent string can be classified into; "read" is the fallback. */
type IntentActionKind = "create" | "update" | "delete" | "send" | "read";

/**
 * Classify the entity an intent is about by whole-word keyword matching on the
 * lower-cased text. Categories are tested in a fixed order and the first match
 * wins, so e.g. "comments on a post" is a "comment".
 *
 * @param intent Free-text intent; may be undefined.
 * @returns The matched entity kind, or `null` when no keyword matches.
 */
function inferIntentEntityKind(intent: string | undefined): IntentEntityKind | null {
  const text = intent?.toLowerCase() ?? "";
  if (/\b(comment|comments|reply|replies)\b/.test(text)) return "comment";
  if (/\b(post|posts|status|statuses|tweet|tweets|message|messages|feed|timeline|stream|home)\b/.test(text)) return "post";
  if (/\b(person|people|profile|profiles|member|members|user|users)\b/.test(text)) return "person";
  // `organi[sz]ations?` covers both spellings in singular and plural; the
  // previous list only knew "organization" and "organisations".
  if (/\b(company|companies|organi[sz]ations?|orgs?|business|businesses)\b/.test(text)) return "company";
  if (/\b(repo|repos|repository|repositories|project|projects)\b/.test(text)) return "repository";
  if (/\b(topic|topics|trend|trends|hashtag|hashtags)\b/.test(text)) return "topic";
  if (/\b(channel|channels|thread|threads|conversation|conversations)\b/.test(text)) return "channel";
  if (/\b(listing|listings|product|products|item|items|marketplace)\b/.test(text)) return "listing";
  return null;
}

/**
 * Classify the action an intent asks for by whole-word keyword matching on the
 * lower-cased text. Tested in the order create, update, delete, send; anything
 * else (including a missing intent) is treated as "read".
 */
function inferIntentActionKind(intent: string | undefined): IntentActionKind {
  const text = intent?.toLowerCase() ?? "";
  if (/\b(create|add|new|compose|draft)\b/.test(text)) return "create";
  if (/\b(update|edit|patch|modify)\b/.test(text)) return "update";
  if (/\b(delete|remove|archive)\b/.test(text)) return "delete";
  if (/\b(send|submit|post|publish)\b/.test(text)) return "send";
  return "read";
}
|
|
339
|
-
|
|
340
|
-
/**
 * Parse a `Cookie` request-header value into a name → value record.
 *
 * Segments are split on `;`; a segment without `=` (or with nothing before it)
 * is ignored. Names and values are trimmed. When a name occurs more than once
 * the first occurrence wins.
 *
 * @param header Raw header value; may be undefined.
 * @returns The parsed cookies, or an empty record for a missing/empty header.
 */
function parseCookieHeader(header: string | undefined): Record<string, string> {
  if (!header) return {};
  const out: Record<string, string> = {};
  for (const segment of header.split(";")) {
    const idx = segment.indexOf("=");
    if (idx <= 0) continue;
    const key = segment.slice(0, idx).trim();
    if (!key) continue;
    // Own-property check rather than `in`: `in` also sees names inherited from
    // Object.prototype, which silently dropped cookies called e.g. "constructor"
    // or "toString".
    if (Object.prototype.hasOwnProperty.call(out, key)) continue;
    out[key] = segment.slice(idx + 1).trim();
  }
  return out;
}
|
|
352
|
-
|
|
353
|
-
/**
 * Turn a body path such as "variables.input[0].userId" into a lower-case
 * snake-style binding name ("variables_input_0_userid").
 *
 * The rewrite steps run in order: numeric path parts and bracket indices become
 * `_n`, remaining separators and any other non-identifier characters become
 * `_`, runs of `_` collapse, and leading/trailing `_` are stripped. An empty
 * result falls back to "value".
 */
function normalizeBodyBindingKey(path: string): string {
  const steps: Array<[RegExp, string]> = [
    [/\.(\d+)\./g, "_$1_"],
    [/\[(\d+)\]/g, "_$1"],
    [/[.[\]]+/g, "_"],
    [/[^a-zA-Z0-9_]+/g, "_"],
    [/_+/g, "_"],
    [/^_+|_+$/g, ""],
  ];
  const key = steps
    .reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), path)
    .toLowerCase();
  return key === "" ? "value" : key;
}
|
|
364
|
-
|
|
365
|
-
/**
 * Decide whether a leaf value in a request body should be replaced by a
 * `{binding}` placeholder.
 *
 * - Nullish and boolean values are never templated.
 * - Numbers are templated only when the key path names an id/paging-style field.
 * - Strings are templated when non-blank, at most 280 characters, not a
 *   literal true/false/null, not an ALL_CAPS constant under a non-identifying
 *   key, and one of: the key path names a content/identifier field, the value
 *   looks like a URN or hex identifier, or the value occurs in the page URL
 *   (as a query parameter name or value, or inside the path).
 *
 * @param path    Path of the value inside the body, e.g. "user.name" or "items[0].id".
 * @param value   The leaf value.
 * @param context Optional extraction context; only `pageUrl` is read here.
 */
function shouldTemplateBodyValue(path: string, value: unknown, context?: ExtractionContext): boolean {
  const lowerPath = path.toLowerCase();
  // The key-name rules below use "_" as the word separator, but paths arrive
  // with "." and "[n]" separators; fold those into "_" so nested keys such as
  // "user.name" or "items[0].id" are recognised like their top-level equivalents.
  const keyPath = lowerPath.replace(/[.[\]]+/g, "_");
  if (value == null) return false;
  if (typeof value === "boolean") return false;
  if (typeof value === "number") {
    return /(?:^|_)(id|count|offset|limit|page|cursor|index|position)(?:$|_)/.test(keyPath);
  }
  if (typeof value !== "string") return false;
  const trimmed = value.trim();
  if (!trimmed) return false;
  if (trimmed.length > 280) return false;
  if (/^(true|false|null)$/i.test(trimmed)) return false;
  // ALL_CAPS tokens are kept literal unless the key itself names user-supplied data.
  if (/^[A-Z_]{2,24}$/.test(trimmed) && !/(id|slug|urn|token|email|name|title|query|search|message|text)/.test(lowerPath)) return false;
  if (/(?:^|_)(title|name|description|content|text|message|body|query|search|keyword|email|username|slug|handle|identifier|id|urn|url)(?:$|_)/.test(keyPath)) {
    return true;
  }
  // URN-shaped values, or 8+ hex characters once dashes are removed (UUID-like ids).
  if (/^urn:[\w:-]+$/i.test(trimmed) || /^[0-9a-f]{8,}$/i.test(trimmed.replace(/-/g, ""))) return true;
  if (context?.pageUrl) {
    try {
      const pageUrl = new URL(context.pageUrl);
      if (pageUrl.searchParams.has(trimmed)) return true;
      // Also compare against query parameter VALUES (case-insensitively, like the
      // path check below); `has()` alone only looks at parameter names.
      const needle = trimmed.toLowerCase();
      let matchesQueryValue = false;
      pageUrl.searchParams.forEach((paramValue) => {
        if (paramValue.toLowerCase() === needle) matchesQueryValue = true;
      });
      if (matchesQueryValue) return true;
      if (pageUrl.pathname.toLowerCase().includes(needle)) return true;
    } catch {
      /* unparseable page URL: no URL-based evidence */
    }
  }
  return false;
}
|
|
393
|
-
|
|
394
|
-
/**
 * Return a copy of a request body in which every leaf accepted by
 * `shouldTemplateBodyValue` is replaced with a `{binding}` placeholder.
 *
 * Arrays and objects are rebuilt recursively; child paths are formed as
 * `parent[index]` and `parent.key`. The first value seen for each binding name
 * is recorded in `bodyParams`.
 *
 * @param value      Body (or sub-tree) to process.
 * @param context    Optional extraction context forwarded to the leaf check.
 * @param path       Path of `value` within the body ("" at the root).
 * @param bodyParams Accumulator of binding name → original value; mutated in place.
 * @returns The templatized copy; a root-level primitive is returned unchanged.
 */
function templatizeBodyObject(
  value: unknown,
  context?: ExtractionContext,
  path = "",
  bodyParams: Record<string, unknown> = {},
): unknown {
  if (Array.isArray(value)) {
    return value.map((entry, index) => templatizeBodyObject(entry, context, `${path}[${index}]`, bodyParams));
  }

  if (value && typeof value === "object") {
    const rebuilt = Object.entries(value as Record<string, unknown>).map(
      ([key, next]): [string, unknown] => {
        const childPath = path ? `${path}.${key}` : key;
        return [key, templatizeBodyObject(next, context, childPath, bodyParams)];
      },
    );
    return Object.fromEntries(rebuilt);
  }

  // Leaf value: a root-level primitive has no path and is never templated.
  if (path === "" || !shouldTemplateBodyValue(path, value, context)) {
    return value;
  }
  const binding = normalizeBodyBindingKey(path);
  if (!(binding in bodyParams)) {
    bodyParams[binding] = value;
  }
  return `{${binding}}`;
}
|
|
416
|
-
|
|
417
|
-
/**
 * Infer how a captured request obtained its CSRF token.
 *
 * A plan is produced only when the request's Cookie header contains at least
 * one cookie with a known CSRF-style name. If the request also carries a
 * non-empty CSRF header, the plan's source is "cookie" and `param_name` is that
 * header; otherwise, if the parsed body is a plain object with a CSRF-style
 * field, the source is "form" and `param_name` is that field.
 *
 * @param req        Captured request; `request_headers` is read.
 * @param parsedBody Optional parsed request body.
 * @returns The inferred plan, or `undefined` when no CSRF mechanism is recognised.
 */
function inferCsrfPlan(req: RawRequest, parsedBody?: unknown): CsrfPlan | undefined {
  const headers = Object.fromEntries(
    Object.entries(req.request_headers).map(([name, headerValue]) => [name.toLowerCase(), headerValue]),
  );
  const cookieJar = parseCookieHeader(headers["cookie"]);
  const csrfCookieNames = Object.keys(cookieJar).filter((cookieName) =>
    /^(ct0|csrf_token|_csrf|csrftoken|xsrf-token|_xsrf|jsessionid)$/i.test(cookieName),
  );
  // Both plan shapes need a CSRF cookie to extract from.
  if (csrfCookieNames.length === 0) return undefined;

  const planFor = (source: "cookie" | "form", paramName: string): CsrfPlan => ({
    source,
    param_name: paramName,
    refresh_on_401: true,
    extractor_sequence: csrfCookieNames,
  });

  for (const candidate of ["x-csrf-token", "x-xsrf-token", "x-csrftoken", "csrf-token"]) {
    const headerValue = headers[candidate];
    if (typeof headerValue === "string" && headerValue.length > 0) {
      return planFor("cookie", candidate);
    }
  }

  if (!parsedBody || typeof parsedBody !== "object" || Array.isArray(parsedBody)) {
    return undefined;
  }
  const formField = Object.keys(parsedBody as Record<string, unknown>).find((fieldName) =>
    /^(csrf|csrf_token|_csrf|authenticity_token|xsrf)$/i.test(fieldName),
  );
  return formField ? planFor("form", formField) : undefined;
}
|
|
445
|
-
|
|
446
|
-
/**
 * Return the matching vocabulary for an entity kind.
 *
 * - `strong` / `weak`: token lists for the kind.
 * - `negative`: case-insensitive pattern for the kind.
 * - `negativeSignals`: optional token list; only some kinds define it.
 *
 * A fresh rule set is built on every call.
 */
function getIntentEntityRules(kind: IntentEntityKind): { strong: string[]; weak: string[]; negative: RegExp; negativeSignals?: string[] } {
  type EntityRules = { strong: string[]; weak: string[]; negative: RegExp; negativeSignals?: string[] };

  const rulesByKind: Record<IntentEntityKind, EntityRules> = {
    comment: {
      strong: ["comment", "comments", "body", "bodyhtml", "author", "replies", "reply", "parentid", "permalink", "score"],
      weak: ["text", "content", "created", "subreddit", "depth", "children"],
      negative: /(subreddits?(\/|$)|communities|communityinfo|about(\.json)?$|accounts(\/|$)|people|profiles|instance|custom_emojis)/i,
      negativeSignals: ["displayname", "subscribers", "communityicon", "activeusercount", "subreddittype", "bannerimg"],
    },
    post: {
      strong: ["status", "statuses", "post", "posts", "feed", "timeline", "update", "updates", "content", "text", "title", "author", "actor", "commentary", "permalink", "score", "numcomments", "num_comments", "selftext", "reblog", "spoiler", "socialdetail", "socialactivitycounts"],
      weak: ["blog", "body", "reply", "replies", "favourites", "favourited", "published", "visibility", "subreddit", "created", "activity", "activities", "element", "elements", "reshare", "reaction", "reactions"],
      negative: /(subreddits?(\/|$)|communityinfo|about(\.json)?$|trends\/tags|custom_emojis|instance|filters|accounts(\/|$)|reports(\/|$)|packs\/assets)/i,
      negativeSignals: ["displayname", "display_name", "subscribers", "communityicon", "community_icon", "activeusercount", "active_user_count", "subreddittype", "subreddit_type", "bannerimg", "banner_img"],
    },
    person: {
      strong: ["publicidentifier", "firstname", "lastname", "headline", "displayname", "fullname", "occupation", "username", "acct", "screen", "followers", "following", "bio", "avatar", "verified"],
      weak: ["person", "people", "profile", "profiles", "member", "members", "actor", "name", "title", "description", "viewer", "user", "users"],
      negative: /(policy\/notices|globalalerts|badging|notification|messaging|mailbox|launchpad|identitymodule|globalnav|feeddash|topics|realtime|tracking|tracko11y|allowlist|preload|presence)/i,
    },
    company: {
      strong: ["company", "organization", "organisation", "org", "staffcount", "employees", "industry", "industries", "tagline", "overview", "about", "headquarters", "websiteurl", "companyname"],
      weak: ["name", "followers", "location", "specialties", "logo", "description"],
      negative: /(people|profiles|members|globalnav|launchpad|messaging|mailbox|tracking|notification|preload|presence|metadata$)/i,
      negativeSignals: ["navigationcontext", "trackingid", "tracking", "globalnav", "mailbox", "notification"],
    },
    repository: {
      strong: ["repository", "repositories", "fullname", "stargazers", "forks", "owner", "language", "license", "defaultbranch"],
      weak: ["repo", "repos", "topic", "topics", "description", "watchers", "openissues"],
      negative: /(notifications|sponsors|settings|sessions|codespaces|copilot|marketplace)/i,
    },
    topic: {
      strong: ["topic", "topics", "trend", "trends", "hashtag", "hashtags", "tag", "tags"],
      weak: ["name", "volume", "url"],
      negative: /(accounts|people|profiles|messages|mailbox|notifications)/i,
    },
    channel: {
      strong: ["channel", "channels", "thread", "threads", "conversation", "conversations", "guild", "room"],
      weak: ["message", "messages", "name", "topic"],
      negative: /(experiments|affinities|promotions|settings|notifications|status)/i,
    },
    listing: {
      strong: ["listing", "listings", "price", "seller", "currency", "product"],
      weak: ["title", "bed", "bath", "address", "location"],
      negative: /(tracking|ads|telemetry|config|status|auth)/i,
    },
  };

  return rulesByKind[kind];
}
|
|
501
|
-
|
|
502
|
-
/**
 * Semantic admission gate: decides whether a captured request/response pair
 * looks relevant to the entity kind and action inferred from the user's intent.
 *
 * Decision order:
 *  1. No entity kind inferred: "read" intents always pass; any other action
 *     passes only when request + response expose at least two semantic tokens.
 *  2. Non-JSON HTML bodies pass only when the request path equals the captured
 *     page path (an unparseable URL rejects).
 *  3. Otherwise the URL is checked against the kind's negative pattern, then
 *     strong / weak / negative token hits are counted over the response, the
 *     request example and the tokenized URL.
 *
 * @param req            Raw captured request (URL and response body are read).
 * @param sampleResponse Parsed response example.
 * @param sampleRequest  Flattened request example.
 * @param context        Optional capture context supplying intent and page URL.
 * @returns `ok` plus a machine-readable `reason` string used in debug traces.
 */
function isSemanticallyAdmissibleResponse(
  req: RawRequest,
  sampleResponse: unknown,
  sampleRequest: Record<string, unknown>,
  context?: ExtractionContext,
): { ok: boolean; reason: string } {
  const accept = (reason: string): { ok: boolean; reason: string } => ({ ok: true, reason });
  const reject = (reason: string): { ok: boolean; reason: string } => ({ ok: false, reason });

  const kind = inferIntentEntityKind(context?.intent);
  const action = inferIntentActionKind(context?.intent);

  // Without an entity kind there are no rules to apply; fall back to a density check.
  if (!kind) {
    if (action === "read") return accept("semantic_gate_not_applicable");
    const requestTokenCount = collectSemanticTokens(sampleRequest).size;
    const responseTokenCount = collectSemanticTokens(sampleResponse).size;
    if (requestTokenCount + responseTokenCount >= 2) {
      return accept("semantic_action_request_match");
    }
    return reject("semantic_action_sparse");
  }

  // HTML documents are only admissible when they are the captured page itself.
  if (!isJsonResponseBody(req.response_body) && isHtmlResponseBody(req.response_body)) {
    try {
      const requestPath = new URL(req.url).pathname;
      const capturedPagePath = context?.pageUrl ? new URL(context.pageUrl).pathname : "";
      if (requestPath === capturedPagePath) return accept("semantic_html_page_candidate");
      return reject("semantic_html_not_page");
    } catch {
      return reject("semantic_html_bad_url");
    }
  }

  const {
    strong: strongTokens,
    weak: weakTokens,
    negative: negativeUrlPattern,
    negativeSignals: negativeTokens = [],
  } = getIntentEntityRules(kind);
  if (negativeUrlPattern.test(req.url)) return reject("semantic_negative_url");

  // Pool every token visible in the response, the request example and the URL.
  const signals = collectSemanticTokens(sampleResponse);
  collectSemanticTokens(sampleRequest, signals);
  for (const urlToken of tokenize(req.url)) signals.add(urlToken);

  const countPresent = (tokens: readonly string[]): number =>
    tokens.reduce((hits, token) => (signals.has(token) ? hits + 1 : hits), 0);
  const strongHits = countPresent(strongTokens);
  const weakHits = countPresent(weakTokens);
  const negativeHits = countPresent(negativeTokens);

  // Negative payload evidence wins unless the strong evidence is at least as heavy.
  if (negativeHits >= 2 && strongHits < 2) return reject("semantic_negative_payload");
  // Non-read actions get a lower bar: two weak hits are enough.
  if (action !== "read" && strongHits === 0 && weakHits >= 2) {
    return accept("semantic_action_request_match");
  }
  if (strongHits >= 1 || weakHits >= 3) return accept("semantic_match");
  return reject("semantic_entity_mismatch");
}
|
|
561
|
-
|
|
562
|
-
/**
 * Path fragments that identify same-origin noise: framework plumbing, auth,
 * tracking and ad traffic served from the site's own domain, which SKIP_HOSTS
 * therefore cannot catch.
 *
 * Each alternative must directly follow a "/" and is matched case-insensitively.
 * The alternatives are not anchored on the right, so e.g. "/collect" also
 * matches "/collections" and "/badge" also matches "/badges".
 */
const ON_DOMAIN_NOISE = /\/(recaptcha|captcha|update-recaptcha|csrf|consent|data-protection|badge|drawer|header-action|geolocation|onboarding|wana\/bids|prebid|bids\/request|ads\/|pixel|beacon|collect|impression|click-tracking|heartbeat|webConfig|config\.json|manifest\.json|service-worker|sw\.js|favicon|robots\.txt|sitemap|opensearch|partial\/[a-zA-Z]+\/mod-|logging|csp-report|gen_204|generate_204|sodar|__|devvit-|user-drawer|action-item)/i;
|
|
565
|
-
|
|
566
|
-
/**
 * Scores a captured request: higher means more likely to be a real data API
 * (BUG-GC-004). The score is a plain sum of independent rewards and penalties,
 * so the order of the checks below does not affect the result.
 *
 * Never throws on a malformed `req.url`: path-based checks are skipped and only
 * the raw-URL length fallback is applied.
 *
 * @param req Raw captured request; method, URL, response headers and response
 *            body are read, nothing is mutated.
 * @returns The heuristic score (may be negative).
 */
function scoreRequest(req: RawRequest): number {
  let score = 0;

  // GET is preferred — safe, idempotent, more useful for data retrieval.
  // Compared case-insensitively, consistent with isApiLike's method check.
  if (req.method.toUpperCase() === "GET") score += 2;
  if (RPC_HINTS.test(req.url)) score += 3;
  if (SKIP_JS_BUNDLES.test(req.url)) score -= 10;

  const ct = req.response_headers?.["content-type"] ?? "";
  const body = req.response_body;

  // Parse the body once; both the content-type fallback and the richness
  // rewards below depend on whether it is valid JSON.
  let parsedBody: unknown;
  let bodyIsJson = false;
  if (body) {
    try {
      parsedBody = JSON.parse(stripJsonPrefix(body));
      bodyIsJson = true;
    } catch {
      /* not JSON */
    }
  }

  if (ct.includes("application/json") && !ct.includes("protobuf")) {
    score += 4;
  } else if (!ct && bodyIsJson) {
    // Fallback: response_headers is often empty for tracked requests, so a
    // JSON-parseable body earns the same reward as a JSON content-type.
    score += 4;
  }
  // Protobuf responses are not parseable — they get neither reward nor penalty (BUG-GC-006).

  // Parse the URL once. A malformed URL must not abort scoring.
  let pathname: string | undefined;
  try {
    pathname = new URL(req.url).pathname;
  } catch {
    pathname = undefined;
  }

  if (pathname !== undefined) {
    // Penalise long URLs — but only the path, not query params (GraphQL endpoints
    // have long variables/features query strings that inflate the URL length).
    if (pathname.length > 200) score -= 5;
    // Penalise telemetry paths even if they passed the host filter.
    if (SKIP_TELEMETRY_PATHS.test(pathname)) score -= 8;
    // Penalise on-domain noise (framework plumbing, recaptcha, consent, ad bids).
    if (ON_DOMAIN_NOISE.test(pathname)) score -= 15;
  } else if (req.url.length > 500) {
    score -= 5;
  }

  // Penalise Next.js RSC navigation requests — framework wire format, not data.
  if (req.url.includes("_rsc=")) score -= 3;
  if (ct.includes("text/x-component")) score -= 10; // RSC wire format
  // #227: Structural RSC body detection — catches payloads without URL/content-type hints.
  if (isRscPayload(body ?? "")) score -= 15;

  // Reward rich JSON responses. Length is measured on the raw body string.
  if (body && bodyIsJson) {
    if (body.length > 500) score += 3;
    if (body.length > 2000) score += 2;
    // Array responses are usually data listings.
    if (Array.isArray(parsedBody) && parsedBody.length > 0) score += 3;
  }

  return score;
}
|
|
607
|
-
|
|
608
|
-
/**
 * Optional capture context handed to endpoint extraction. Every field may be
 * absent (for example in passive captures with no known page).
 */
export interface ExtractionContext {
  /** URL of the captured page; used to spot entity values inside API paths. */
  pageUrl?: string;
  /** URL reached after following redirects (e.g. lu.ma → luma.com). */
  finalUrl?: string;
  /** Free-text intent string supplied by the user. */
  intent?: string;
}
|
|
616
|
-
|
|
617
|
-
/**
 * Converts captured HTTP requests (and optionally WebSocket messages) into
 * endpoint descriptors.
 *
 * Pipeline:
 *  1. Candidate selection — each request is scored and filtered (API-likeness,
 *     positive score, parseable body or API-looking URL, not an RSC payload,
 *     same registrable domain as the page when a page URL is known).
 *  2. Descriptor construction — candidates are visited best score first,
 *     de-duplicated by `METHOD:normalizedUrl`, filtered again (ad responses,
 *     Cloudflare challenges, unparseable protobuf, invalid templates, semantic
 *     gate) and turned into `EndpointDescriptor`s.
 *  3. Sibling endpoints are collapsed into templatized ones.
 *  4. One `WS` endpoint is appended per distinct WebSocket URL.
 *  5. Every keep/drop decision is written to the "generation" debug trace.
 *
 * Side effect: a request admitted on the strength of its URL alone (no
 * admissible parsed body) has its `response_body` and `response_headers`
 * overwritten in place with a synthetic JSON stub.
 *
 * @param requests   Raw captured requests.
 * @param wsMessages Optional captured WebSocket messages.
 * @param context    Optional capture context (page URL, final URL, intent).
 * @returns The accepted endpoint descriptors (HTTP first, then WebSocket).
 */
export function extractEndpoints(requests: RawRequest[], wsMessages?: CapturedWsMessage[], context?: ExtractionContext): EndpointDescriptor[] {
  const seen = new Set<string>();
  const endpoints: EndpointDescriptor[] = [];
  const traceRows: Array<Record<string, unknown>> = [];

  // Extract the registrable domain(s) for affinity filtering.
  // Include both pageUrl and finalUrl domains to handle redirects
  // (e.g. lu.ma → luma.com where API lives on api2.luma.com).
  const affinityDomains = new Set<string>();
  for (const u of [context?.pageUrl, context?.finalUrl]) {
    if (!u) continue;
    try { affinityDomains.add(getRegistrableDomain(new URL(u).hostname)); } catch { /* bad url */ }
  }

  // ── Phase 1: candidate selection ──────────────────────────────────────────
  const scored: Array<{ req: RawRequest; score: number }> = [];
  for (const req of requests) {
    const score = scoreRequest(req);
    if (!isApiLike(req)) {
      traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "not_api_like" });
      continue;
    }
    if (score <= 0) {
      traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "score_non_positive" });
      continue;
    }
    if (!hasAdmissibleParsedBody(req.response_body)) {
      // API endpoints may have large/truncated/missing response bodies.
      // Admit them anyway if the URL pattern is clearly an API endpoint.
      const urlPath = (() => { try { return new URL(req.url).pathname; } catch { return ""; } })();
      const isJsonFileUrl = /\.(json)(\?|$)/.test(req.url);
      const isApiUrl = /\/(api|graphql)\b/i.test(urlPath) || isJsonFileUrl;
      if (!isApiUrl) {
        traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "body_not_json_or_html" });
        continue;
      }

      // For GraphQL: extract operationName from request body or URL
      let graphqlOpName: string | undefined;
      if (/graphql/i.test(req.url)) {
        if (req.request_body) {
          try {
            const body = JSON.parse(req.request_body);
            graphqlOpName = body.operationName ?? body.query?.match(/(?:query|mutation)\s+(\w+)/)?.[1];
          } catch { /* not JSON */ }
        }
        // Also try extracting from URL path (GET GraphQL endpoints may encode the operation name there)
        if (!graphqlOpName) {
          const urlMatch = req.url.match(/\/graphql\/\w+\/(\w+)/);
          if (urlMatch) graphqlOpName = urlMatch[1];
        }
      }

      // For .json endpoints: use the last path segment as description
      const jsonEndpointName = isJsonFileUrl ? urlPath.split("/").pop()?.replace(".json", "") : undefined;

      // Inject a synthetic response body so downstream processing works.
      // This mutates the caller's request object.
      const syntheticName = graphqlOpName ?? jsonEndpointName ?? "api_endpoint";
      req.response_body = JSON.stringify({ data: { __typename: syntheticName } });
      req.response_headers = { ...req.response_headers, "content-type": "application/json" };
    }
    // #227: Reject React Server Components wire format payloads — they are framework
    // rendering wire format, not data APIs. Use the proper RSC parser instead of
    // relying solely on URL heuristics (_rsc=) or content-type (text/x-component).
    if (isRscPayload(req.response_body ?? "")) {
      const rscUrls = extractRscDataEndpoints(req.response_body ?? "");
      traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "rsc_payload", rsc_embedded_urls: rscUrls.length > 0 ? rscUrls : undefined });
      continue;
    }
    if (affinityDomains.size > 0) {
      try {
        const reqHost = new URL(req.url).hostname;
        const reqDomain = getRegistrableDomain(reqHost);
        if (!affinityDomains.has(reqDomain)) {
          traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "domain_mismatch" });
          continue;
        }
      } catch {
        traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "bad_url" });
        continue;
      }
    }
    traceRows.push({ url: req.url, method: req.method, score, kept: true, reason: "candidate" });
    scored.push({ req, score });
  }
  scored.sort((a, b) => b.score - a.score);

  // For passive captures (no context page URL), pre-compute path templates across
  // all candidate paths so individual endpoints can be annotated without needing
  // collapseEndpoints' sibling grouping.
  const minedTemplateMap = !context?.pageUrl
    ? minePathTemplates(scored.map(({ req }) => {
        try { return new URL(req.url).pathname; } catch { return ""; }
      }).filter(Boolean))
    : new Map<string, string>();

  // ── Phase 2: descriptor construction ──────────────────────────────────────
  for (const { req } of scored) {
    // isApiLike admits methods in any letter case, so normalise once here and
    // use the canonical form for dedup, GET detection and the emitted method.
    const method = req.method.toUpperCase();
    const normalized = normalizeUrl(req.url);
    const key = `${method}:${normalized}`;
    if (seen.has(key)) continue;
    seen.add(key);

    // Schema-level ad detection: skip endpoints whose response body looks like ad-server data
    if (looksLikeAdResponse(req.response_body)) {
      traceRows.push({ url: req.url, method: req.method, kept: false, reason: "ad_response" });
      continue;
    }

    // BUG-008: Detect Cloudflare challenge responses — exclude from skill
    if (isCloudflareChallenge(req.response_body)) {
      traceRows.push({ url: req.url, method: req.method, kept: false, reason: "cloudflare_challenge" });
      continue;
    }

    // BUG-GC-006: Skip protobuf-only endpoints — we can't parse their bodies
    const ct = req.response_headers?.["content-type"] ?? "";
    if ((ct.includes("x-protobuf") || ct.includes("json+protobuf")) && !isJsonParseable(req.response_body)) {
      traceRows.push({ url: req.url, method: req.method, kept: false, reason: "protobuf_unparseable" });
      continue;
    }

    const isGet = method === "GET";

    // Infer response schema from captured body
    let response_schema = undefined;
    if (req.response_body) {
      try {
        const cleaned = stripJsonPrefix(req.response_body);
        const parsed = JSON.parse(cleaned);
        response_schema = inferSchema([parsed]);
      } catch {
        // not valid JSON — skip schema inference
      }
    }

    // BUG-008: mark endpoints with no response body as potentially CF-blocked
    const verificationStatus = req.response_body ? "unverified" as const : "pending" as const;

    // Skip endpoints with invalid URL templates
    if (!normalized.startsWith("http://") && !normalized.startsWith("https://")) {
      traceRows.push({ url: req.url, method: req.method, kept: false, reason: "normalized_url_invalid" });
      continue;
    }

    // Build url_template with templatized query params so callers know what to pass.
    // normalizeUrl strips the query string; we rebuild it with {param} placeholders.
    // endpoint.query stores the captured defaults for execution-time fallback.
    // NOTE(review): normalizeUrl keeps `?queryId=…` on GraphQL paths, so for GET
    // requests the template built below may already contain a query string before
    // `?${qTemplateStr}` is appended — confirm how templatizePathSegments and the
    // executor treat that case.
    const sanitizedQParams = isGet ? sanitizeQueryParams(extractQueryParams(req.url)) : undefined;
    let pathTemplate = sanitizeUrlTemplate(normalized);
    const qBindings = sanitizedQParams ? buildQueryBindingMap(Object.keys(sanitizedQParams)) : {};
    const qTemplateStr = sanitizedQParams && Object.keys(sanitizedQParams).length > 0
      ? Object.keys(sanitizedQParams).map((k) => `${encodeURIComponent(k)}={${qBindings[k] ?? k}}`).join("&")
      : null;

    // BUG-006: Parameterize dynamic path segments (comma lists, page URL entities)
    const { url: templatizedPath, pathParams, pathBindingCandidates } = templatizePathSegments(pathTemplate, req.url, context);
    pathTemplate = templatizedPath;

    const parsedRequestBody = !isGet && req.request_body ? tryParseBody(req.request_body) : undefined;
    const bodyParams: Record<string, unknown> = {};
    const templatedRequestBody = !isGet && parsedRequestBody && typeof parsedRequestBody === "object" && !Array.isArray(parsedRequestBody)
      ? templatizeBodyObject(parsedRequestBody, context, "", bodyParams) as Record<string, unknown>
      : parsedRequestBody;
    const sampleResponse = req.response_body ? tryParseBody(req.response_body) : undefined;
    const sampleRequest = flattenRequestExample({
      path_params: Object.keys(pathParams).length > 0 ? pathParams : undefined,
      query: sanitizedQParams,
      body: templatedRequestBody,
    });
    const csrfPlan = inferCsrfPlan(req, parsedRequestBody);

    let endpoint: EndpointDescriptor = {
      endpoint_id: nanoid(),
      method: method as EndpointDescriptor["method"],
      url_template: qTemplateStr ? `${pathTemplate}?${qTemplateStr}` : pathTemplate,
      description: buildEndpointDescription(req, sampleRequest, sampleResponse),
      headers_template: sanitizeHeaders(req.request_headers),
      query: sanitizedQParams,
      path_params: Object.keys(pathParams).length > 0 ? pathParams : undefined,
      ...(Object.keys(bodyParams).length > 0 ? { body_params: bodyParams } : {}),
      ...(templatedRequestBody && typeof templatedRequestBody === "object" && !Array.isArray(templatedRequestBody) ? { body: templatedRequestBody as Record<string, unknown> } : {}),
      ...(csrfPlan ? { csrf_plan: csrfPlan } : {}),
      idempotency: isGet ? "safe" : "unsafe",
      verification_status: verificationStatus,
      reliability_score: 0.5,
      response_schema,
      // Record which page triggered this API call — used for trigger-and-intercept execution
      trigger_url: context?.pageUrl,
      ...(pathBindingCandidates.length > 0 ? { _path_binding_candidates: pathBindingCandidates } : {}),
    };
    endpoint = resolveEndpointPathBindings(endpoint);
    endpoint.semantic = inferEndpointSemantic(endpoint, {
      sampleResponse: compactForSemanticExample(sampleResponse),
      sampleRequest,
      observedAt: req.timestamp,
      sampleRequestUrl: req.url,
    });
    // A CSRF plan implies an authenticated endpoint; fill in semantic defaults
    // for any fields the inference left unset.
    if (csrfPlan) {
      endpoint.semantic = {
        ...(endpoint.semantic ?? {}),
        action_kind: endpoint.semantic?.action_kind ?? (isGet ? "detail" : "create"),
        resource_kind: endpoint.semantic?.resource_kind ?? "resource",
        auth_required: true,
      };
    }
    endpoint.description = endpoint.semantic?.description_out ?? endpoint.description;

    const admission = isSemanticallyAdmissibleResponse(req, sampleResponse, sampleRequest, context);
    if (!admission.ok) {
      traceRows.push({
        url: req.url,
        method: req.method,
        kept: false,
        reason: admission.reason,
      });
      continue;
    }
    traceRows.push({
      url: req.url,
      method: req.method,
      kept: true,
      reason: admission.reason === "semantic_match" ? "accepted_endpoint" : admission.reason,
      endpoint_id: endpoint.endpoint_id,
      description: endpoint.description,
      action_kind: endpoint.semantic?.action_kind,
      resource_kind: endpoint.semantic?.resource_kind,
    });
    // Annotate with mined template when available (passive capture, no page context)
    try {
      const pathname = new URL(req.url).pathname;
      const mined = minedTemplateMap.get(pathname);
      if (mined) endpoint._minedTemplate = mined;
    } catch { /* ignore bad URLs */ }
    endpoints.push(endpoint);
  }

  // Collapse sibling endpoints into templatized ones
  // e.g. /ticker-sentiment/MSFT + /ticker-sentiment/NVDA → /ticker-sentiment/{ticker}
  const deduped = collapseEndpoints(endpoints);
  endpoints.length = 0;
  endpoints.push(...deduped);

  // ── Phase 3: WebSocket endpoints, one per distinct socket URL ─────────────
  if (wsMessages && wsMessages.length > 0) {
    const wsByUrl = new Map<string, CapturedWsMessage[]>();
    for (const msg of wsMessages) {
      const arr = wsByUrl.get(msg.url) ?? [];
      arr.push(msg);
      wsByUrl.set(msg.url, arr);
    }

    for (const [wsUrl, msgs] of wsByUrl) {
      const received = msgs.filter((m) => m.direction === "received");
      const wsMsgList: WsMessage[] = msgs.map((m) => ({
        direction: m.direction,
        data: m.data,
        timestamp: m.timestamp,
      }));

      // Try to infer response schema from first few received JSON messages
      let response_schema = undefined;
      const jsonSamples: unknown[] = [];
      for (const m of received.slice(0, 5)) {
        try {
          jsonSamples.push(JSON.parse(m.data));
        } catch { /* not JSON */ }
      }
      if (jsonSamples.length > 0) {
        response_schema = inferSchema(jsonSamples);
      }

      const endpoint: EndpointDescriptor = {
        endpoint_id: nanoid(),
        method: "WS",
        url_template: wsUrl,
        idempotency: "safe",
        verification_status: "unverified",
        // Sockets that produced at least one JSON frame are rated higher.
        reliability_score: jsonSamples.length > 0 ? 0.7 : 0.3,
        response_schema,
        ws_messages: wsMsgList,
      };
      endpoint.semantic = inferEndpointSemantic(endpoint, {
        sampleResponse: jsonSamples[0],
        observedAt: msgs[0]?.timestamp,
        sampleRequestUrl: wsUrl,
      });
      endpoint.description = endpoint.semantic?.description_out ?? endpoint.description;
      endpoints.push(endpoint);
    }
  }

  writeDebugTrace("generation", {
    page_url: context?.pageUrl ?? null,
    final_url: context?.finalUrl ?? null,
    intent: context?.intent ?? null,
    candidate_count: scored.length,
    accepted_count: endpoints.length,
    decisions: traceRows,
    accepted_endpoints: endpoints.map((endpoint) => ({
      endpoint_id: endpoint.endpoint_id,
      method: endpoint.method,
      url_template: endpoint.url_template,
      description: endpoint.description,
      action_kind: endpoint.semantic?.action_kind,
      resource_kind: endpoint.semantic?.resource_kind,
    })),
  });

  return endpoints;
}
|
|
926
|
-
|
|
927
|
-
/**
 * Hard filter applied before scoring results are used: returns false for any
 * request that can be ruled out as a data API from its method, URL, host, path
 * or a tiny response body.
 *
 * @param req Raw captured request.
 * @returns true when nothing rules the request out.
 */
function isApiLike(req: RawRequest): boolean {
  if (!ALLOWED_METHODS.has(req.method.toUpperCase())) return false;

  // Whole-URL rejections (static assets, JS bundles, skipped paths).
  const rejectedByUrl =
    SKIP_EXTENSIONS.test(req.url) || SKIP_JS_BUNDLES.test(req.url) || SKIP_PATHS.test(req.url);
  if (rejectedByUrl) return false;

  try {
    const { hostname, pathname } = new URL(req.url);

    if (SKIP_HOSTS.test(hostname) || SKIP_TELEMETRY_HOSTS.test(hostname)) return false; // BUG-GC-004
    if (SKIP_TELEMETRY_PATHS.test(pathname)) return false; // BUG-GC-004
    if (AD_HOSTS.test(hostname)) return false;

    // play.google.com/log is telemetry, not calendar data
    const isPlayLog = hostname === "play.google.com" && pathname.startsWith("/log");
    if (isPlayLog) return false;

    // Skip image CDN paths (coin images, avatars, etc.)
    const isImagePath = /\/(coin-image|avatar|profile-image)\//.test(pathname);
    if (isImagePath) return false;

    // Hard-skip on-domain noise that's never useful data
    const isHardNoise = /\/(recaptcha|update-recaptcha|captcha|wana\/bids|prebid|bids\/request|pixel[s]?\/|beacon\/|csp-report|service-worker|sw\.js$|favicon|robots\.txt$|sitemap|opensearch)/.test(pathname);
    if (isHardNoise) return false;
  } catch {
    // An unparseable URL can never be replayed.
    return false;
  }

  // Skip tiny responses — config/status/empty endpoints, not data.
  // A missing or empty body does not disqualify the request.
  const body = req.response_body;
  return !(body && body.length < 20);
}
|
|
951
|
-
|
|
952
|
-
/**
 * Reduces a concrete URL to a canonical form for de-duplication and as the base
 * of a URL template.
 *
 * - Path segments that look like identifiers are replaced by placeholders:
 *   UUIDs, runs of 4+ digits and hex strings of 24+ chars become `{id}`;
 *   `urn:a:b…` segments become `{urn}`; comma-separated lists become `{list}`.
 * - The query string is dropped, except that a `queryId` parameter is kept when
 *   the path contains "graphql", so distinct GraphQL queries stay distinct.
 *   The kept value is percent-encoded so reserved characters (`&`, `#`, spaces)
 *   in a decoded id cannot corrupt the resulting URL.
 *
 * @param rawUrl Absolute URL as captured.
 * @returns `origin + normalized path` (plus `?queryId=…` for GraphQL), or
 *          `rawUrl` unchanged when it cannot be parsed.
 */
function normalizeUrl(rawUrl: string): string {
  try {
    const u = new URL(rawUrl);
    const path = u.pathname
      .split("/")
      .map((segment) => {
        if (!segment) return segment;
        // UUID
        if (/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test(segment)) return "{id}";
        // Numeric id (4+ digits, so short numbers such as versions are kept)
        if (/^\d{4,}$/.test(segment)) return "{id}";
        // Long hex string
        if (/^[a-f0-9]{24,}$/i.test(segment)) return "{id}";
        // URN with at least two colon-separated parts after "urn"
        if (/^urn:[a-zA-Z0-9._-]+(?::[a-zA-Z0-9._-]+)+$/.test(segment)) return "{urn}";
        // Comma-separated list of simple tokens
        if (/^[A-Za-z0-9_-]+(?:,[A-Za-z0-9_-]+)+$/.test(segment)) return "{list}";
        return segment;
      })
      .join("/");
    // Preserve queryId param for GraphQL endpoints so different queries aren't deduplicated.
    // searchParams.get() returns the decoded value, so it must be re-encoded here.
    const queryId = u.searchParams.get("queryId");
    if (queryId && path.includes("graphql")) {
      return `${u.origin}${path}?queryId=${encodeURIComponent(queryId)}`;
    }
    return `${u.origin}${path}`;
  } catch {
    return rawUrl;
  }
}
|
|
977
|
-
|
|
978
|
-
/**
 * Reads the query string of a URL into a plain object.
 * When a parameter name repeats, the last value wins.
 *
 * @param rawUrl Absolute URL.
 * @returns name → value map, or an empty object when the URL cannot be parsed.
 */
function extractQueryParams(rawUrl: string): Record<string, string> {
  let parsed: URL;
  try {
    parsed = new URL(rawUrl);
  } catch {
    return {};
  }
  const collected: Record<string, string> = {};
  for (const [name, value] of parsed.searchParams) {
    collected[name] = value;
  }
  return collected;
}
|
|
988
|
-
|
|
989
|
-
/**
 * Tells whether a header is sensitive and must be stripped from skill manifests.
 *
 * `cookie`, `content-length` and `host` always return false here because the
 * callers deal with them separately.
 *
 * @param name Header name in any letter case.
 */
function isSensitiveHeader(name: string): boolean {
  const header = name.toLowerCase();

  // handled separately
  const handledElsewhere = header === "cookie" || header === "content-length" || header === "host";
  if (handledElsewhere) return false;

  return (
    STRIP_HEADERS.has(header) ||
    STRIP_HEADER_PREFIXES.some((prefix) => header.startsWith(prefix)) ||
    header.startsWith("x-goog-api") ||
    header.startsWith("x-server-") ||
    // Pattern match is a last resort and never applies to explicitly safe headers.
    (!SAFE_HEADERS.has(header) && SENSITIVE_HEADER_PATTERN.test(header))
  );
}
|
|
1000
|
-
|
|
1001
|
-
/**
 * Tells whether a header has to be replayed for the request to work.
 *
 * A header qualifies when its lower-cased name is in REPLAY_HEADER_EXACT or
 * starts with one of REPLAY_HEADER_PREFIXES. `accept` is a special case among
 * the exact names: it only counts when its value carries a vendor media type
 * (`application/vnd.`).
 *
 * @param name  Header name in any letter case.
 * @param value Header value (only inspected for `accept`).
 */
function isReplayCriticalHeader(name: string, value: string): boolean {
  const header = name.toLowerCase();

  if (!REPLAY_HEADER_EXACT.has(header)) {
    return REPLAY_HEADER_PREFIXES.some((prefix) => header.startsWith(prefix));
  }
  return header === "accept" ? /application\/vnd\./i.test(value) : true;
}
|
|
1009
|
-
|
|
1010
|
-
/**
 * Builds the header template stored in a skill manifest: every captured header
 * except `cookie`, `content-length`, `host` and anything isSensitiveHeader flags.
 *
 * @param headers Captured request headers; a nullish value is treated as empty.
 * @returns A new object holding only the headers that are safe to publish.
 */
function sanitizeHeaders(headers: Record<string, string>): Record<string, string> {
  const publishable = Object.entries(headers ?? {}).filter(([name]) => {
    const header = name.toLowerCase();
    // isSensitiveHeader deliberately returns false for these three, so they
    // have to be excluded explicitly.
    const transportOnly = header === "cookie" || header === "content-length" || header === "host";
    return !transportOnly && !isSensitiveHeader(name);
  });
  return Object.fromEntries(publishable);
}
|
|
1019
|
-
|
|
1020
|
-
/**
 * Extract auth-sensitive headers from captured requests — the inverse of sanitizeHeaders.
 * These are stored in the vault (not the skill manifest) so server-fetch can reconstruct
 * the full header set without launching a browser. This is what makes the 2nd call fast.
 *
 * A header is collected when it is sensitive (isSensitiveHeader) or replay-critical
 * (isReplayCriticalHeader). `cookie`, `content-length` and `host` are never collected.
 * Keys are lower-cased; the first non-empty value seen across the requests wins.
 *
 * @param requests Raw captured requests. A request without `request_headers` is
 *                 skipped rather than raising, mirroring sanitizeHeaders.
 * @returns Lower-cased header name → value.
 */
export function extractAuthHeaders(requests: RawRequest[]): Record<string, string> {
  const authHeaders: Record<string, string> = {};
  for (const req of requests) {
    // Captures may lack request headers entirely; treat that as "no headers".
    for (const [k, v] of Object.entries(req.request_headers ?? {})) {
      const lower = k.toLowerCase();
      if (lower === "cookie" || lower === "content-length" || lower === "host") continue;
      if ((isSensitiveHeader(k) || isReplayCriticalHeader(k, v)) && !authHeaders[lower]) {
        authHeaders[lower] = v;
      }
    }
  }
  return authHeaders;
}
|
|
1038
|
-
|
|
1039
|
-
/**
 * Drops query parameters whose names match SENSITIVE_QUERY_PARAMS or
 * FRAMEWORK_QUERY_PARAMS.
 *
 * @param params Captured query parameters.
 * @returns A new object with the remaining parameters.
 */
function sanitizeQueryParams(params: Record<string, string>): Record<string, string> {
  const isDropped = (name: string): boolean =>
    SENSITIVE_QUERY_PARAMS.test(name) || FRAMEWORK_QUERY_PARAMS.test(name);

  const retained = Object.entries(params).filter(([name]) => !isDropped(name));
  return Object.fromEntries(retained);
}
|
|
1046
|
-
|
|
1047
|
-
/**
 * Removes sensitive and framework query parameters from a URL template.
 *
 * The path is taken from the raw input string rather than from the parsed URL
 * so that `{template}` braces are not percent-encoded.
 *
 * @param url URL or URL template.
 * @returns The input unchanged when it has no query string or cannot be parsed;
 *          otherwise origin + raw path + the surviving query parameters.
 */
function sanitizeUrlTemplate(url: string): string {
  try {
    const parsed = new URL(url);
    // "" or a bare "?" — nothing to clean.
    if (parsed.search.length <= 1) return url;

    const kept = new URLSearchParams();
    parsed.searchParams.forEach((value, name) => {
      const dropped = SENSITIVE_QUERY_PARAMS.test(name) || FRAMEWORK_QUERY_PARAMS.test(name);
      if (!dropped) kept.set(name, value);
    });

    // Use the raw URL path (not parsed.pathname) to preserve {template} braces
    const rawPath = url.match(/^https?:\/\/[^/]+(\/[^?]*)/)?.[1] ?? parsed.pathname;
    const base = `${parsed.origin}${rawPath}`;
    const query = kept.toString();
    if (query) return `${base}?${query}`;
    return base;
  } catch {
    return url;
  }
}
|
|
1066
|
-
|
|
1067
|
-
// ── BUG-006: Path segment parameterization ──────────────────────────────────
|
|
1068
|
-
|
|
1069
|
-
/**
 * Collects lower-cased path segments of the captured page URL that could show
 * up as entity values inside API paths.
 *
 * Structural segments (locale codes, `api`, `v1`, `static`, …) and segments
 * shorter than 2 or longer than 40 characters are ignored. For a segment with a
 * file extension, the extension-less base name is added as well.
 *
 * @param context Optional capture context; only `pageUrl` is read.
 * @returns The hint set (empty when there is no page URL or it cannot be parsed).
 */
function extractEntityHints(context?: ExtractionContext): Set<string> {
  const hints = new Set<string>();
  const pageUrl = context?.pageUrl;
  if (!pageUrl) return hints;

  try {
    const segments = new URL(pageUrl).pathname.split("/").filter(Boolean);
    segments.forEach((segment) => {
      // Skip structural path parts
      const structural = /^(en|es|fr|de|ja|zh|ko|api|v\d+|www|static|assets|public|pages|app)$/i.test(segment);
      if (structural) return;
      if (segment.length > 40 || segment.length < 2) return;

      hints.add(segment.toLowerCase());

      const baseName = stripFileExtension(segment);
      const hadExtension = baseName !== segment;
      if (hadExtension && baseName.length >= 2) hints.add(baseName.toLowerCase());
    });
  } catch {
    /* skip */
  }
  return hints;
}
|
|
1086
|
-
|
|
1087
|
-
/**
 * Removes one trailing file extension: a dot followed by 1–8 alphanumeric
 * characters at the very end of the segment. Only the last extension is removed.
 */
function stripFileExtension(segment: string): string {
  const trailingExtension = /\.[a-z0-9]{1,8}$/i;
  return segment.replace(trailingExtension, "");
}
|
|
1090
|
-
|
|
1091
|
-
/** True when the whole segment has the form `YYYY-YYYY` (four digits, a hyphen, four digits). */
function looksLikeAcademicYear(segment: string): boolean {
  const yearRange = /^\d{4}-\d{4}$/;
  return yearRange.test(segment);
}
|
|
1094
|
-
|
|
1095
|
-
/**
 * True for code-style identifiers such as `CS101`: two or more letters, then a
 * digit, then any mix of letters and digits.
 *
 * The shape check is case-insensitive, but the segment must also contain at
 * least one upper-case letter, so an all-lower-case `cs101` is rejected.
 */
function looksLikeCodeIdentifier(segment: string): boolean {
  const hasCodeShape = /^[A-Z]{2,}\d[A-Z0-9]*$/i.test(segment);
  if (!hasCodeShape) return false;
  if (!/[A-Z]/.test(segment)) return false;
  return /\d/.test(segment);
}
|
|
1098
|
-
|
|
1099
|
-
/**
 * Derives a readable parameter name for the path segment at `index` from the
 * segment before it, e.g. /quote/{?} → {quote}, /coins/{?} → {coin},
 * /price_charts/{?} → {price_chart}.
 *
 * The previous segment is used when it exists, is longer than one character and
 * is not itself a `{placeholder}`; a trailing "s" is dropped from names longer
 * than three characters, other non-identifier characters become "_", and the
 * result is lower-cased. Otherwise `fallback` is used.
 *
 * @param segments  Path segments of the template.
 * @param index     Position of the segment being parameterized.
 * @param fallback  Name to use when the previous segment is unusable.
 * @param usedNames Names already taken; the returned name is added to this set.
 * @returns A name not previously in `usedNames` (suffixed `_2`, `_3`, … on clash).
 */
function inferParamName(
  segments: string[],
  index: number,
  fallback: string,
  usedNames: Set<string>,
): string {
  const previous = segments[index - 1];

  let stem = fallback;
  if (previous && previous.length > 1 && !previous.startsWith("{")) {
    // Naive singularize: "coins" → "coin", "charts" → "chart"
    const singular = previous.length > 3 && previous.endsWith("s") ? previous.slice(0, -1) : previous;
    stem = singular.replace(/[^a-zA-Z0-9_]/g, "_").toLowerCase();
  }

  // Ensure uniqueness by appending the first free numeric suffix.
  let candidate = stem;
  for (let suffix = 2; usedNames.has(candidate); suffix++) {
    candidate = `${stem}_${suffix}`;
  }
  usedNames.add(candidate);
  return candidate;
}
|
|
1125
|
-
|
|
1126
|
-
/**
 * Produces a positional placeholder name `path_<index>` that is not yet in
 * `usedNames`, appending `_2`, `_3`, … on clash, and records it in the set.
 *
 * @param index     Segment position used in the name.
 * @param usedNames Names already taken; the returned name is added to this set.
 */
function nextPathPlaceholder(index: number, usedNames: Set<string>): string {
  const stem = `path_${index}`;
  let candidate = stem;
  for (let suffix = 2; usedNames.has(candidate); suffix++) {
    candidate = `${stem}_${suffix}`;
  }
  usedNames.add(candidate);
  return candidate;
}
|
|
1134
|
-
|
|
1135
|
-
/**
 * BUG-006: Parameterize dynamic path segments in API URL templates.
 *
 * Walks the template path segment by segment and applies the first rule that matches:
 *   1. Segment is already a `{placeholder}` (produced by normalizeUrl) — rename it to a
 *      positional `path_<i>` placeholder and capture the original value as default.
 *   2. Segment has a file extension and its basename (taken from the original URL) looks
 *      like a code identifier or an academic year — templatize the basename, keep the suffix.
 *   3. Segment is an academic year, or a 1-2 digit number following "semester(s)".
 *   4. Segment (or its basename) matches an entity hint extracted from the context.
 *   5. Context-diff: the page URL has a different value at the same position while a
 *      neighbouring segment is identical in both URLs.
 *
 * Segments that contain a dot, start with "{", or are well-known structural names
 * (api, v1, www, locale codes, latest, dex, search, me/@me/self) are skipped by rules 3-5.
 *
 * @param templateUrl - Normalized URL template; expected to start with http(s):// and to
 *   carry no query string.
 * @param originalUrl - The concrete URL that was observed; parsed with `new URL()`.
 * @param context - Optional extraction context; `pageUrl` is used for the context-diff rule.
 * @returns The templatized URL, a map of param names → captured default values, and one
 *   binding candidate per templatized segment. On any failure (template does not match the
 *   expected shape, `originalUrl` is not parseable, …) the template is returned unchanged
 *   together with whatever had been collected so far.
 *
 * NOTE: Avoids `new URL()` on the template since it would percent-encode curly braces.
 */
function templatizePathSegments(
  templateUrl: string,
  originalUrl: string,
  context?: ExtractionContext,
): { url: string; pathParams: Record<string, string>; pathBindingCandidates: EndpointPathBindingCandidate[] } {
  const pathParams: Record<string, string> = {};
  const pathBindingCandidates: EndpointPathBindingCandidate[] = [];

  try {
    // Parse templateUrl manually to avoid encoding {braces}
    // Format: "https://host:port/path/segments" (query already stripped by normalizeUrl)
    const tMatch = templateUrl.match(/^(https?:\/\/[^/]+)(\/.*)?$/);
    // Always return the full result shape, including the (empty) candidate list.
    if (!tMatch) return { url: templateUrl, pathParams, pathBindingCandidates };
    const tOrigin = tMatch[1];
    const tPath = tMatch[2] ?? "/";

    const oPath = new URL(originalUrl).pathname;

    const tSegments = tPath.split("/");
    const oSegments = oPath.split("/");
    const hints = extractEntityHints(context);
    const usedNames = new Set<string>();

    for (let i = 0; i < tSegments.length; i++) {
      const tSeg = tSegments[i];
      // Fall back to the template segment when the original URL has fewer segments.
      const oSeg = oSegments[i] ?? tSeg;
      // Reflects any placeholder already written at i - 1 by an earlier iteration.
      const prevSeg = tSegments[i - 1];
      const fileBase = stripFileExtension(tSeg);
      const originalFileBase = stripFileExtension(oSeg);

      // Empty segments come from the leading slash or from "//" in the path.
      if (!tSeg) continue;

      // Pattern 1: Already parameterized by normalizeUrl — capture defaults & rename
      const placeholderMatch = tSeg.match(/^\{([^}]+)\}$/);
      if (placeholderMatch) {
        const placeholder = nextPathPlaceholder(i, usedNames);
        tSegments[i] = `{${placeholder}}`;
        pathParams[placeholder] = oSeg;
        pathBindingCandidates.push({
          placeholder,
          observed_value: oSeg,
          segment_index: i,
          source: "normalized_placeholder",
          placeholder_hint: placeholderMatch[1] || "value",
          preceding_segment: prevSeg,
        });
        continue;
      }

      // Pattern 2: "<basename>.<ext>" where the observed basename looks like an identifier.
      const shouldTemplatizeFileBase =
        fileBase !== tSeg &&
        (
          looksLikeCodeIdentifier(originalFileBase) ||
          looksLikeAcademicYear(originalFileBase)
        );

      if (shouldTemplatizeFileBase) {
        const placeholder = nextPathPlaceholder(i, usedNames);
        // Keep the extension (e.g. ".json") as a literal suffix after the placeholder.
        const suffix = tSeg.slice(fileBase.length);
        tSegments[i] = `{${placeholder}}${suffix}`;
        pathParams[placeholder] = originalFileBase;
        pathBindingCandidates.push({
          placeholder,
          observed_value: originalFileBase,
          segment_index: i,
          source: "file_basename",
          preceding_segment: prevSeg,
          filename_suffix: suffix,
          matched_page_hint: hints.has(fileBase.toLowerCase()),
        });
        continue;
      }

      // Skip segments that are already template vars, file extensions, or structural
      if (tSeg.startsWith("{")) continue;
      if (tSeg.includes(".")) continue; // e.g. "24_hours.json"
      if (/^(api|v\d+|www|en|es|fr|de|latest|dex|search)$/i.test(tSeg)) continue;
      if (/^@?me$/i.test(tSeg) || /^self$/i.test(tSeg)) continue;

      // Pattern 3: academic year, or a short number directly after "semester(s)".
      if (looksLikeAcademicYear(oSeg) || (/^(semesters?)$/i.test(prevSeg ?? "") && /^\d{1,2}$/.test(oSeg))) {
        const placeholder = nextPathPlaceholder(i, usedNames);
        tSegments[i] = `{${placeholder}}`;
        pathParams[placeholder] = oSeg;
        pathBindingCandidates.push({
          placeholder,
          observed_value: oSeg,
          segment_index: i,
          source: "segment_pattern",
          preceding_segment: prevSeg,
        });
        continue;
      }

      // Pattern 4: Segment matches a page URL entity hint (case-insensitive)
      if (hints.size > 0 && (hints.has(tSeg.toLowerCase()) || hints.has(fileBase.toLowerCase()))) {
        const paramName = inferParamName(tSegments, i, "slug", usedNames);
        tSegments[i] = `{${paramName}}`;
        pathParams[paramName] = oSeg;
        pathBindingCandidates.push({
          placeholder: paramName,
          observed_value: oSeg,
          segment_index: i,
          source: "page_hint",
          preceding_segment: prevSeg,
          matched_page_hint: true,
        });
        continue;
      }

      // Pattern 5: Context-diff — endpoint URL segment differs from page URL at same position.
      // If the page URL has a different value at this position, this segment is likely an entity
      // that should be parameterized. e.g. page=/r/singularity/ endpoint=/r/programming/.json
      if (context?.pageUrl) {
        try {
          const contextSegments = new URL(context.pageUrl).pathname.split("/");
          const contextSeg = contextSegments[i];
          // Same value as prevSeg above, normalized to "" so the comparisons stay string-only.
          const prevTemplateSeg = tSegments[i - 1] ?? "";
          const prevContextSeg = contextSegments[i - 1] ?? "";
          const nextSeg = tSegments[i + 1] ?? "";
          const nextContextSeg = contextSegments[i + 1] ?? "";
          // Require an identical non-empty neighbour so unrelated paths are not diffed.
          const hasStructuralNeighborMatch =
            (!!prevTemplateSeg && !!prevContextSeg && prevTemplateSeg === prevContextSeg) ||
            (!!nextSeg && !!nextContextSeg && nextSeg === nextContextSeg);
          if (contextSeg && contextSeg !== tSeg &&
              hasStructuralNeighborMatch &&
              !contextSeg.includes(".") &&
              contextSeg.length >= 2 && contextSeg.length <= 40 &&
              !/^(api|v\d+|www|en|es|fr|de|latest|search|i)$/i.test(contextSeg)) {
            const paramName = inferParamName(tSegments, i, "slug", usedNames);
            tSegments[i] = `{${paramName}}`;
            pathParams[paramName] = contextSeg; // use context URL value as default
            pathBindingCandidates.push({
              placeholder: paramName,
              observed_value: contextSeg,
              segment_index: i,
              source: "context_diff",
              preceding_segment: prevTemplateSeg,
            });
            continue;
          }
        } catch { /* skip: pageUrl is not a parseable URL */ }
      }
    }

    return { url: `${tOrigin}${tSegments.join("/")}`, pathParams, pathBindingCandidates };
  } catch {
    // Best effort: fall back to the untouched template.
    return { url: templateUrl, pathParams, pathBindingCandidates };
  }
}
|
|
1294
|
-
|
|
1295
|
-
/**
 * Report whether `body` parses as JSON once a leading anti-hijacking prefix
 * has been removed by `stripJsonPrefix`. Missing or empty bodies are not
 * considered parseable.
 */
function isJsonParseable(body?: string): boolean {
  if (!body) {
    return false;
  }
  try {
    JSON.parse(stripJsonPrefix(body));
  } catch {
    return false;
  }
  return true;
}
|
|
1299
|
-
|
|
1300
|
-
/**
 * Strip Google/common API JSON prefixes like )]}'\n or )]}\n.
 *
 * The prefix is matched only at the start of the body: optional `)`, `]`, `}`
 * and `'` characters (in that order), optional whitespace, then a newline.
 * Bodies that do not start this way are returned unchanged.
 */
function stripJsonPrefix(body: string): string {
  const prefix = /^\)?\]?\}?'?\s*\n/.exec(body);
  return prefix === null ? body : body.slice(prefix[0].length);
}
|
|
1304
|
-
|
|
1305
|
-
/**
 * Parse a request body into a key/value object.
 *
 * Tries JSON first, then URL-encoded form data (BUG-GC-008: calendar sync
 * endpoints use x-www-form-urlencoded).
 *
 * @param body - Raw body text.
 * @returns The parsed JSON value when it is an object (arrays included), the
 *   decoded form fields when the body is a non-empty form, otherwise
 *   `undefined`. Valid JSON that is not an object (`null`, numbers, strings,
 *   booleans) yields `undefined` rather than being passed off as a record.
 */
function tryParseBody(body: string): Record<string, unknown> | undefined {
  // Try JSON first
  let parsed: unknown;
  let isJson = true;
  try {
    parsed = JSON.parse(body);
  } catch {
    isJson = false;
  }
  if (isJson) {
    // Only objects satisfy the declared return type; a bare primitive is
    // valid JSON but carries no fields, and must not be re-read as a form.
    return typeof parsed === "object" && parsed !== null
      ? (parsed as Record<string, unknown>)
      : undefined;
  }

  // Try URL-encoded form data; later duplicates of a key overwrite earlier ones.
  try {
    const params = new URLSearchParams(body);
    const result: Record<string, unknown> = {};
    params.forEach((v, k) => { result[k] = v; });
    if (Object.keys(result).length > 0) return result;
  } catch {
    // Not decodable as form data — fall through to undefined.
  }

  return undefined;
}
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
/**
|
|
1324
|
-
* Determine whether a URL path segment looks like a variable entity ID
|
|
1325
|
-
* (UUID, numeric ID, hash, ticker symbol) vs. a fixed action/resource name
|
|
1326
|
-
* (camelCase, English word, REST resource).
|
|
1327
|
-
*
|
|
1328
|
-
* Used by collapseEndpoints to avoid merging distinct API actions
|
|
1329
|
-
* like /relationships/connectionsSummary + /relationships/invitationsSummary.
|
|
1330
|
-
*/
|
|
1331
|
-
/**
 * Compute Shannon entropy (bits per character) for a string.
 *
 * Characters are counted by Unicode code point, and probabilities are taken
 * over the number of code points (not `s.length`, which counts UTF-16 units
 * and would under-weight characters outside the BMP).
 *
 * @param s - Input text.
 * @returns Entropy in bits per character; 0 for the empty string and for
 *   strings made of a single repeated character.
 */
function computeEntropy(s: string): number {
  const freq = new Map<string, number>();
  let total = 0;
  // String iteration yields whole code points, so surrogate pairs stay together.
  for (const ch of s) {
    freq.set(ch, (freq.get(ch) ?? 0) + 1);
    total += 1;
  }
  let h = 0;
  for (const count of freq.values()) {
    const p = count / total;
    h -= p * Math.log2(p);
  }
  return h;
}
|
|
1342
|
-
|
|
1343
|
-
/**
 * Heuristically classify a URL path segment as a variable entity ID (true)
 * or a fixed action/resource name (false).
 *
 * Checks run in this order:
 *   1. Explicit ID shapes — placeholder, UUID, digits, 8+ hex chars, URN,
 *      short uppercase ticker, comma list, padded base64 → true.
 *   2. High entropy (> 3.5 bits/char) with more than 5 chars → true.
 *   3. Name shapes — camelCase, snake/kebab multi-word, lowercase word of
 *      3+ letters → false.
 *   4. Low entropy (< 2.5 bits/char) with more than 3 chars → false.
 *   5. Anything else is treated as an ID.
 */
function looksLikeEntityId(segment: string): boolean {
  // Cheap prefix/substring signals: template placeholder, URN, comma-separated list.
  if (segment.startsWith("{") || segment.startsWith("urn:") || segment.includes(",")) {
    return true;
  }

  const idShapes: RegExp[] = [
    // UUID (with or without dashes)
    /^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$/i,
    // Pure numeric
    /^\d+$/,
    // Long hex string (hash, object ID) — 8+ hex chars
    /^[0-9a-f]{8,}$/i,
    // Short uppercase stock tickers (1-5 uppercase letters, possibly with dots like BRK.B)
    /^[A-Z]{1,5}(\.[A-Z])?$/,
    // Base64-encoded IDs: mixed case with = padding
    /^[A-Za-z0-9+/]{6,}={1,2}$/,
  ];
  if (idShapes.some((shape) => shape.test(segment))) {
    return true;
  }

  // High-entropy strings are likely encoded IDs (tokens, hashes, opaque cursors, etc.)
  const entropy = computeEntropy(segment);
  if (segment.length > 5 && entropy > 3.5) {
    return true;
  }

  // === NOT an entity ID — these are action/resource names ===
  const nameShapes: RegExp[] = [
    // camelCase: lowercase letter followed by uppercase (e.g., connectionsSummary)
    /[a-z][A-Z]/,
    // snake_case or kebab-case multi-word
    /[a-z][_-][a-z]/i,
    // Pure lowercase alphabetic word 3+ chars (REST resource: "connections", "settings")
    /^[a-z]{3,}$/,
  ];
  if (nameShapes.some((shape) => shape.test(segment))) {
    return false;
  }

  // Low-entropy strings are likely readable names, not IDs; everything else
  // is ambiguous and is allowed to collapse.
  const readsLikeName = segment.length > 3 && entropy < 2.5;
  return !readsLikeName;
}
|
|
1378
|
-
|
|
1379
|
-
/**
|
|
1380
|
-
* Collapse sibling endpoints that share the same base path into a single
|
|
1381
|
-
* templatized endpoint. e.g.:
|
|
1382
|
-
* GET /sentiment/MSFT + GET /sentiment/NVDA + GET /sentiment/HIMS
|
|
1383
|
-
* → GET /sentiment/{sentiment} (the placeholder is named after the preceding path segment)
|
|
1384
|
-
*
|
|
1385
|
-
* Strategy: group endpoints by (method, origin, pathPrefix) where pathPrefix is
|
|
1386
|
-
* all path segments except the last. If a group has 3+ members whose last
|
|
1387
|
-
* segment varies, replace the last segment with a template variable.
|
|
1388
|
-
* Keep the first endpoint's metadata (headers, schema, etc.) as representative.
|
|
1389
|
-
*
|
|
1390
|
-
* Only collapses when the majority (>50%) of varying segments look like entity
|
|
1391
|
-
* IDs, NOT distinct action/resource names (camelCase, REST words).
|
|
1392
|
-
*/
|
|
1393
|
-
|
|
1394
|
-
/**
 * Mine path templates from a batch of URL paths that lack a context page URL.
 *
 * For every path prefix, the distinct segments that directly follow it are
 * collected. A prefix becomes a wildcard position when it has at least
 * `maxChildren` distinct children and more than half of them look like entity
 * IDs; every segment at such a position is then replaced with `{id}`.
 *
 * @param paths - Array of URL pathnames (e.g. "/api/users/123/posts")
 * @param maxChildren - Minimum distinct values at a position to trigger wildcarding (default 4)
 * @returns Map from original path to templated path (only paths that changed are included)
 */
export function minePathTemplates(
  paths: string[],
  maxChildren = 4,
): Map<string, string> {
  const splitPath = (path: string): string[] => path.split("/").filter(Boolean);
  // Prefix of the segment at `depth`: "/" for the first segment, "/a/b" for the third.
  const prefixAt = (segments: string[], depth: number): string =>
    "/" + segments.slice(0, depth).join("/");

  // prefix → (child segment → number of occurrences)
  const childrenByPrefix = new Map<string, Map<string, number>>();
  for (const path of paths) {
    const segments = splitPath(path);
    for (const [depth, segment] of segments.entries()) {
      const prefix = prefixAt(segments, depth);
      let children = childrenByPrefix.get(prefix);
      if (children === undefined) {
        children = new Map<string, number>();
        childrenByPrefix.set(prefix, children);
      }
      children.set(segment, (children.get(segment) ?? 0) + 1);
    }
  }

  // A prefix is a wildcard position when it fans out widely enough and the
  // majority of its children look like entity IDs.
  const wildcardPrefixes = new Set<string>();
  for (const [prefix, children] of childrenByPrefix) {
    if (children.size < maxChildren) continue;
    let entityLike = 0;
    for (const segment of children.keys()) {
      if (looksLikeEntityId(segment)) entityLike++;
    }
    if (entityLike / children.size > 0.5) {
      wildcardPrefixes.add(prefix);
    }
  }

  const templates = new Map<string, string>();
  if (wildcardPrefixes.size === 0) {
    return templates;
  }

  // Rewrite each path; prefixes are always computed from the ORIGINAL segments.
  for (const path of paths) {
    const segments = splitPath(path);
    const rewritten: string[] = [];
    let touched = false;
    for (const [depth, segment] of segments.entries()) {
      if (wildcardPrefixes.has(prefixAt(segments, depth))) {
        rewritten.push("{id}");
        touched = true;
      } else {
        rewritten.push(segment);
      }
    }
    if (touched) {
      templates.set(path, "/" + rewritten.join("/"));
    }
  }

  return templates;
}
|
|
1457
|
-
|
|
1458
|
-
/**
 * Collapse sibling endpoints that differ only in their last path segment into
 * a single templatized endpoint.
 *
 * Endpoints are grouped by method + origin + all-but-last path segments. A
 * group is collapsed only when it has at least 3 members, at least 3 distinct
 * last segments, and more than half of the last segments look like entity IDs.
 * The first endpoint of the group is kept as representative with its
 * `url_template` rewritten to `<origin>/<prefix>/{<name>}`, where the name is
 * inferred from the preceding path segment (fallback "id").
 *
 * Endpoints whose template is not a parseable URL, or whose path has fewer
 * than two segments, are passed through unchanged (and are emitted first).
 *
 * @param endpoints - Endpoint descriptors to collapse; not mutated.
 * @returns Pass-through endpoints followed by kept/collapsed groups.
 */
function collapseEndpoints(endpoints: EndpointDescriptor[]): EndpointDescriptor[] {
  // `new URL()` percent-encodes "{" and "}" in the pathname. Restore them so
  // placeholders already present in a template survive the round trip and are
  // recognised as placeholders. (A literal %7B/%7D in a path is decoded too.)
  const pathSegmentsOf = (u: URL): string[] =>
    u.pathname
      .replace(/%7B/gi, "{")
      .replace(/%7D/gi, "}")
      .split("/")
      .filter(Boolean);

  // Group by method + origin + all-but-last path segment
  const groups = new Map<string, EndpointDescriptor[]>();
  const ungrouped: EndpointDescriptor[] = [];

  for (const ep of endpoints) {
    try {
      const u = new URL(ep.url_template);
      const segments = pathSegmentsOf(u);
      if (segments.length < 2) {
        // Root or single-segment paths can't be collapsed
        ungrouped.push(ep);
        continue;
      }
      const prefix = segments.slice(0, -1).join("/");
      const key = `${ep.method}:${u.origin}/${prefix}`;
      const arr = groups.get(key) ?? [];
      arr.push(ep);
      groups.set(key, arr);
    } catch {
      // Unparseable template — keep the endpoint as-is.
      ungrouped.push(ep);
    }
  }

  const result: EndpointDescriptor[] = [...ungrouped];

  for (const group of groups.values()) {
    if (group.length < 3) {
      // Not enough siblings to justify templatizing — keep as-is
      result.push(...group);
      continue;
    }

    // Check that the last segments actually vary (not all identical)
    const lastSegments = group.map((ep) => {
      const segs = pathSegmentsOf(new URL(ep.url_template));
      return segs[segs.length - 1];
    });
    const unique = new Set(lastSegments);
    if (unique.size < 3) {
      // Last segments don't vary enough — keep as-is
      result.push(...group);
      continue;
    }

    // Only collapse if the varying segments look like entity IDs (UUIDs, numbers,
    // tickers, hashes), NOT distinct action/resource names (camelCase, English words).
    const entityLikeCount = lastSegments.filter((s) => looksLikeEntityId(s)).length;
    if (entityLikeCount / lastSegments.length <= 0.5) {
      result.push(...group);
      continue;
    }

    // Infer a template variable name from the path prefix
    const u = new URL(group[0].url_template);
    const prefix = pathSegmentsOf(u).slice(0, -1);
    // Reserve placeholder names already used in the prefix so the new one
    // cannot duplicate them (e.g. /items/{id}/{id}).
    const usedNames = new Set<string>();
    for (const segment of prefix) {
      for (const match of segment.matchAll(/\{([^}]+)\}/g)) {
        usedNames.add(match[1]);
      }
    }
    const paramName = inferParamName(prefix, prefix.length, "id", usedNames);
    const templatizedPath = "/" + [...prefix, `{${paramName}}`].join("/");

    // Keep the first endpoint as representative, update its URL template
    const representative = { ...group[0] };
    representative.url_template = `${u.origin}${templatizedPath}`;
    // Give the representative its own shallow copy of the query object
    // ({} when absent) so it does not alias the original endpoint's.
    representative.query = {
      ...(representative.query || {}),
    };

    result.push(representative);
  }

  return result;
}
|
|
1532
|
-
|
|
1533
|
-
/**
 * BUG-008: Detect Cloudflare challenge/block responses.
 *
 * Returns true when the body contains any of a fixed set of Cloudflare
 * challenge markers, compared case-insensitively. A missing or empty body is
 * never a challenge.
 */
function isCloudflareChallenge(responseBody?: string): boolean {
  if (!responseBody) {
    return false;
  }

  const markers = [
    "cf-error",
    "challenge-platform",
    "cf-chl-bypass",
    "Checking if the site connection is secure",
    "Enable JavaScript and cookies to continue",
    "cf_chl_opt",
    "jschl-answer",
    "_cf_chl_tk",
  ];

  const haystack = responseBody.toLowerCase();
  for (const marker of markers) {
    if (haystack.includes(marker.toLowerCase())) {
      return true;
    }
  }
  return false;
}
|