crawlio-browser 1.5.9 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
  import {
  PKG_VERSION
- } from "./chunk-RGSCESM6.js";
+ } from "./chunk-OIW6FN2G.js";

  // src/mcp-server/init.ts
  import { execFileSync, spawn } from "child_process";
- import { existsSync, mkdirSync, writeFileSync, readFileSync, readdirSync, copyFileSync, chmodSync } from "fs";
+ import { existsSync, mkdirSync, writeFileSync, readFileSync, readdirSync, copyFileSync, chmodSync, renameSync } from "fs";
  import { join, resolve, dirname, sep, basename } from "path";
  import { homedir, platform } from "os";
  import { createServer as createNetServer } from "net";
@@ -34,6 +34,21 @@ var LOGO_GRADIENT = [
  "\x1B[38;5;56m"
  // deep blue
  ];
+ function atomicWriteSync(filePath, data) {
+ const tmpPath = filePath + ".tmp";
+ writeFileSync(tmpPath, data);
+ renameSync(tmpPath, filePath);
+ }
+ function escapeToml(value) {
+ return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
+ }
+ function escapeYaml(value) {
+ if (/[:#\[\]{*&]/.test(value)) return `"${value.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`;
+ return value;
+ }
+ function escapeShellSingleQuote(s) {
+ return "'" + s.replace(/'/g, "'\\''") + "'";
+ }
  function parseFlags(argv) {
  const opts = {
  portal: false,
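The four helpers added in this hunk carry the release's main theme: configs are now written via temp-file-plus-rename, and values are escaped before being interpolated into TOML, YAML, or shell syntax. `atomicWriteSync` relies on rename being atomic within a single filesystem on POSIX, so a crash mid-write leaves at worst a stale `.tmp` sibling rather than a truncated config. A quick behavioral sketch of the escapers (inputs are illustrative, not from the package):

```js
// Illustrative inputs; the printed output follows from the regexes above.
console.log(escapeToml('say "hi"'));            // say \"hi\"
console.log(escapeToml("C:\\tools\\node"));     // C:\\tools\\node  (each backslash doubled)
console.log(escapeYaml("--remote-port:9222"));  // "--remote-port:9222"  (quoted: contains ':')
console.log(escapeYaml("--headless"));          // --headless            (safe, returned as-is)
console.log(escapeShellSingleQuote("it's"));    // 'it'\''s'
```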
@@ -174,8 +189,9 @@ function configureClient(client, entry, dryRun) {
  if (existsSync(client.configPath)) {
  try {
  config = JSON.parse(readFileSync(client.configPath, "utf-8"));
- } catch {
- config = {};
+ } catch (err) {
+ console.log(` ${yellow("!")} Corrupt JSON in ${client.configPath}: ${err instanceof Error ? err.message : String(err)}`);
+ return "error";
  }
  }
  const section = config[client.serverKey] || {};
@@ -183,8 +199,13 @@ function configureClient(client, entry, dryRun) {
  if (dryRun) return "configured";
  section["crawlio-browser"] = finalEntry;
  config[client.serverKey] = section;
- mkdirSync(dirname(client.configPath), { recursive: true });
- writeFileSync(client.configPath, JSON.stringify(config, null, 2) + "\n");
+ try {
+ mkdirSync(dirname(client.configPath), { recursive: true });
+ atomicWriteSync(client.configPath, JSON.stringify(config, null, 2) + "\n");
+ } catch (err) {
+ console.log(` ${yellow("!")} Failed to write ${client.configPath}: ${err instanceof Error ? err.message : String(err)}`);
+ return "error";
+ }
  return "configured";
  }
  if (client.format === "toml") {
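Two behavior changes land here for JSON clients. In 1.5.9 a corrupt config was silently replaced with `{}`, so the subsequent write could clobber every other MCP server the user had configured; 1.6.1 reports the parse error and returns `"error"` without touching the file. Write failures likewise surface as a status instead of throwing. A condensed sketch of the new control flow (not a verbatim excerpt; `atomicWriteSync` is the helper added above):

```js
// Condensed sketch of the 1.6.1 flow in configureClient for JSON configs.
import { existsSync, readFileSync, mkdirSync } from "fs";
import { dirname } from "path";

function writeJsonClientConfig(client, finalEntry) {
  let config = {};
  if (existsSync(client.configPath)) {
    try {
      config = JSON.parse(readFileSync(client.configPath, "utf-8"));
    } catch {
      return "error"; // 1.5.9 instead did config = {} and kept going
    }
  }
  (config[client.serverKey] ||= {})["crawlio-browser"] = finalEntry;
  try {
    mkdirSync(dirname(client.configPath), { recursive: true });
    atomicWriteSync(client.configPath, JSON.stringify(config, null, 2) + "\n");
  } catch {
    return "error"; // write failures now surface as a status, not a throw
  }
  return "configured";
}
```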
@@ -197,14 +218,19 @@ function configureClient(client, entry, dryRun) {
  }
  if (dryRun) return "configured";
  const e = entry;
- const argsStr = (e.args || []).map((a) => `"${a}"`).join(", ");
+ const argsStr = (e.args || []).map((a) => `"${escapeToml(a)}"`).join(", ");
  const block = `
  [mcp_servers.crawlio-browser]
- command = "${e.command}"
+ command = "${escapeToml(e.command)}"
  args = [${argsStr}]
  `;
- mkdirSync(dirname(client.configPath), { recursive: true });
- writeFileSync(client.configPath, content + block);
+ try {
+ mkdirSync(dirname(client.configPath), { recursive: true });
+ atomicWriteSync(client.configPath, content + block);
+ } catch (err) {
+ console.log(` ${yellow("!")} Failed to write ${client.configPath}: ${err instanceof Error ? err.message : String(err)}`);
+ return "error";
+ }
  return "configured";
  }
  if (client.format === "yaml") {
@@ -217,26 +243,31 @@ args = [${argsStr}]
  }
  if (dryRun) return "configured";
  const e = entry;
- const argsYaml = (e.args || []).map((a) => ` - ${a}`).join("\n");
+ const argsYaml = (e.args || []).map((a) => ` - ${escapeYaml(a)}`).join("\n");
  const block = `
  crawlio-browser:
  name: crawlio-browser
  type: stdio
- cmd: ${e.command}
+ cmd: ${escapeYaml(e.command)}
  args:
  ${argsYaml}
  `;
  if (!content.includes("extensions:")) {
  content += "\nextensions:\n";
  }
- mkdirSync(dirname(client.configPath), { recursive: true });
- writeFileSync(client.configPath, content + block);
+ try {
+ mkdirSync(dirname(client.configPath), { recursive: true });
+ atomicWriteSync(client.configPath, content + block);
+ } catch (err) {
+ console.log(` ${yellow("!")} Failed to write ${client.configPath}: ${err instanceof Error ? err.message : String(err)}`);
+ return "error";
+ }
  return "configured";
  }
  return "error";
  }
  function configureAllClients(options) {
- const entry = options.portal ? buildPortalEntry() : buildStdioEntry({ full: options.full });
+ const entry = options.portal ? buildPortalEntry() : buildStdioEntry({ full: options.full, dryRun: options.dryRun });
  const candidates = options.agents.length > 0 ? CLIENT_REGISTRY.filter((c) => options.agents.some((a) => c.name.toLowerCase().includes(a.toLowerCase()))) : CLIENT_REGISTRY.filter((c) => c.detect());
  if (candidates.length === 0) {
  console.log(` ${dim(" No MCP clients detected on this machine")}`);
@@ -274,7 +305,7 @@ function printManualInstructions(entry) {
  console.log("");
  }
  function buildStdioEntry(options) {
- if (platform() === "darwin") {
+ if (platform() === "darwin" && !options?.dryRun) {
  const serverPath2 = getServerEntryPath();
  const wrapperPath = createAppWrapper(serverPath2);
  if (wrapperPath) {
@@ -451,7 +482,7 @@ function createAppWrapper(serverEntryPath) {
  }
  const nodePath = resolveNodePath();
  const script = `#!/bin/bash
- exec "${nodePath}" "${serverEntryPath}" "$@"
+ exec ${escapeShellSingleQuote(nodePath)} ${escapeShellSingleQuote(serverEntryPath)} "$@"
  `;
  try {
  writeFileSync(wrapperBin, script);
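The wrapper script previously interpolated paths inside double quotes, where bash still expands `$`, backticks, and backslashes; single-quoting closes that hole, and the `'\''` sequence handles literal single quotes by closing the quote, emitting one, and reopening it. A sketch of the resulting script for deliberately awkward paths (both paths below are hypothetical):

```js
// Hypothetical paths, chosen to show why single-quote escaping matters.
const nodePath = "/Users/o'brien/Node Versions/bin/node";        // hypothetical
const serverEntryPath = "/opt/crawlio/dist/mcp-server/index.js"; // hypothetical
const script = `#!/bin/bash
exec ${escapeShellSingleQuote(nodePath)} ${escapeShellSingleQuote(serverEntryPath)} "$@"
`;
// script is now:
// #!/bin/bash
// exec '/Users/o'\''brien/Node Versions/bin/node' '/opt/crawlio/dist/mcp-server/index.js' "$@"
```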
@@ -811,12 +842,12 @@ async function cloudflareFlow(options) {
  delete mcpConfig.config.mcpServers["cloudflare-builds"];
  }
  mcpConfig.config.mcpServers["cloudflare"] = entry;
- writeFileSync(mcpConfig.path, JSON.stringify(mcpConfig.config, null, 2) + "\n");
+ atomicWriteSync(mcpConfig.path, JSON.stringify(mcpConfig.config, null, 2) + "\n");
  console.log(` ${green("+")} Added cloudflare to ${mcpConfig.path}`);
  } else {
  const configPath = join(process.cwd(), ".mcp.json");
  const config = { mcpServers: { cloudflare: entry } };
- writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n");
+ atomicWriteSync(configPath, JSON.stringify(config, null, 2) + "\n");
  console.log(` ${green("+")} Created ${configPath} with cloudflare`);
  }
  console.log(` ${green("+")} 89 Cloudflare tools ready (Workers, KV, D1, R2, Queues, AI)`);
@@ -849,14 +880,14 @@ async function configureMetaMcp(found, options) {
  return;
  }
  }
- const entry = options.portal ? buildPortalEntry() : buildStdioEntry({ full: options.full });
+ const entry = options.portal ? buildPortalEntry() : buildStdioEntry({ full: options.full, dryRun: options.dryRun });
  if (options.dryRun) {
  console.log(` ${dim("~")} Would add to ${found.path}:`);
  console.log(` ${dim("~")} "crawlio-browser": ${JSON.stringify(entry)}`);
  return;
  }
  found.config.mcpServers["crawlio-browser"] = entry;
- writeFileSync(found.path, JSON.stringify(found.config, null, 2) + "\n");
+ atomicWriteSync(found.path, JSON.stringify(found.config, null, 2) + "\n");
  console.log(` ${green("+")} Added crawlio-browser to ${found.path}`);
  }
  function configureStdioClients(options) {
@@ -925,7 +956,7 @@ async function printSummary(options) {
  }
  } else {
  const modeLabel = options.full ? "Full mode" : "Code mode";
- const countLabel = options.full ? "(100 tools)" : "(3 tools, 133 commands)";
+ const countLabel = options.full ? "(114 tools)" : "(3 tools, 147 commands)";
  statusLines.push(`${green("+")} Mode ${modeLabel} ${countLabel}`);
  }
  statusLines.push(`${green("+")} Skill Browser automation installed`);
@@ -995,6 +1026,8 @@ export {
  configureAllClients,
  configureClient,
  createAppWrapper,
+ escapeToml,
+ escapeYaml,
  extractSkillName,
  findConflictingConfigs,
  findMcpConfig,
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "crawlio-browser",
- "version": "1.5.9",
- "description": "MCP server with 100 CDP-backed tools for browser automation — screenshots, DOM, network capture, framework detection, cookies, storage, session recording, structured data extraction, performance metrics via Chrome",
+ "version": "1.6.1",
+ "description": "MCP server with 114 CDP-backed tools for browser automation — screenshots, DOM, network capture, framework detection, cookies, storage, session recording, structured data extraction, tracking analysis, SEO auditing, technographic fingerprinting, performance metrics via Chrome",
  "type": "module",
  "main": "dist/mcp-server/index.js",
  "bin": {
@@ -57,6 +57,7 @@
  "dependencies": {
  "@modelcontextprotocol/sdk": "^1.8.0",
  "express-rate-limit": "^8.2.1",
+ "idb": "^8.0.3",
  "ws": "^8.18.1",
  "zod": "^3.24.2"
  },
@@ -64,9 +65,11 @@
  "@types/chrome": "^0.0.287",
  "@types/ws": "^8.18.0",
  "@vitest/coverage-v8": "^4.0.18",
- "sharp": "^0.34.5",
  "tsup": "^8.4.0",
  "typescript": "^5.6.2",
  "vitest": "^4.0.18"
+ },
+ "optionalDependencies": {
+ "sharp": "^0.34.5"
  }
  }
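Moving sharp out of devDependencies into optionalDependencies means it is now installed for end users where its native build succeeds, while a failed build no longer breaks `npm install`. The usual consumer pattern is a guarded dynamic import; a minimal sketch of that pattern (an assumed convention, not code from this package):

```js
// Assumed consumer pattern for an optionalDependency, not package code.
let sharp = null;
try {
  ({ default: sharp } = await import("sharp")); // top-level await is fine: "type": "module"
} catch {
  // native build failed or platform unsupported; degrade gracefully
}
if (sharp) {
  // e.g. image post-processing: await sharp(png).resize(800).toBuffer()
}
```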
@@ -0,0 +1,103 @@
+ ---
+ name: clone
+ description: "Clone a site — capture design tokens, component tree, assets, and compile a replayable skill"
+ allowed-tools: Agent
+ argument-hint: <url>
+ context: fork
+ agent: crawlio-investigator
+ ---
+
+ # Clone Investigation
+
+ You are running a **clone** investigation. Your goal is to capture the design system, component structure, and assets of a target URL, then compile the investigation into a replayable skill.
+
+ ## Loop Definition
+
+ Read `loops/clone.json` to understand the phase sequence. The clone loop has 5 phases:
+
+ 1. **crawl** — Spawn `crawlio-crawler` to capture the target URL. Record the `EVIDENCE_ID`.
+ 2. **analyze** — Spawn `crawlio-analyzer` with the crawl evidence ID. Identifies framework, rendering mode, component patterns.
+ 3. **extract-design** — Spawn `crawlio-extractor` with the crawl evidence ID and `what: "design"`. Extracts design tokens (colors, typography, spacing, breakpoints).
+ 4. **compile** (optional) — Spawn `crawlio-recorder` to compile the investigation into a replayable SKILL.md.
+ 5. **synthesize** — Spawn `crawlio-synthesizer` with all phase evidence to produce the final `CloneBlueprint`.
+
+ ## Execution
+
+ 1. Read `loops/clone.json` to confirm phase order.
+ 2. Parse the user's argument: `<url>`.
+ 3. Spawn `crawlio-crawler` to capture the page:
+ ```
+ Crawl <url> and write PageEvidence to .crawlio/evidence/.
+ ```
+ Record `EVIDENCE_ID=<crawlId>`.
+
+ 4. Spawn `crawlio-analyzer` with the crawl evidence:
+ ```
+ Read PageEvidence from .crawlio/evidence/<crawlId>.json.
+ Analyze framework, rendering mode, and component patterns.
+ Write FrameworkEvidence to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<analyzeId>`.
+
+ 5. Spawn `crawlio-extractor` for design token extraction:
+ ```
+ Read PageEvidence from .crawlio/evidence/<crawlId>.json.
+ Extract "design" data — colors, typography, spacing, breakpoints.
+ Write DesignTokens evidence to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<designId>`.
+
+ 6. Spawn `crawlio-recorder` to compile the investigation:
+ ```
+ Read evidence chain: <crawlId>, <analyzeId>, <designId>.
+ Compile into a replayable SKILL.md.
+ ```
+ Record the skill path.
+
+ 7. Spawn `crawlio-synthesizer` to produce the CloneBlueprint:
+ ```
+ Read all evidence: <crawlId>, <analyzeId>, <designId>.
+ Produce a CloneBlueprint with design tokens, component tree, assets, and compiled skill path.
+ Write to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<blueprintId>`.
+
+ 8. Read the CloneBlueprint evidence and summarize results for the user.
+
+ ## Output Format
+
+ ```
+ ## Clone: <url>
+
+ ### Design Tokens
+ - Colors: [count] tokens extracted
+ - Typography: [count] font stacks
+ - Spacing: [count] spacing values
+ - Breakpoints: [count] responsive breakpoints
+
+ ### Component Tree
+ - Root: <root component>
+ - Components: [count] total
+ - Types: [breakdown by type]
+
+ ### Assets
+ - [count] total assets ([breakdown by type])
+
+ ### Compiled Skill
+ - Path: <skill path or "not compiled">
+
+ ### Evidence Chain
+ - Crawler: <crawlId> (quality: ...)
+ - Analyzer: <analyzeId> (quality: ...)
+ - Design: <designId> (quality: ...)
+ - Blueprint: <blueprintId> (quality: ...)
+
+ ### Coverage Gaps
+ - [Any gaps from the investigation]
+
+ ### Confidence
+ - Overall: high/medium/low
+ ```
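The new skill files in this release (this one and the three below) share an evidence-chain convention: every phase writes a typed envelope under `.crawlio/evidence/` and hands its `EVIDENCE_ID` to later phases. A hypothetical envelope shape, inferred from the skill text (field names beyond the type, target URL, quality, and coverage gaps mentioned in the skills are assumptions):

```js
// Hypothetical envelope — inferred from the skill text, not the package's schema.
const envelope = {
  evidenceId: "crawl-2024-abc123",  // the EVIDENCE_ID each phase records
  type: "PageEvidence",             // or FrameworkEvidence, DesignTokens, CloneBlueprint, ...
  targetUrl: "https://example.com",
  quality: "high",                  // surfaced as "(quality: ...)" in the summaries
  coverageGaps: [],                 // rolled up into the "Coverage Gaps" section
  data: {},                         // the typed payload for this evidence type
};
```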
@@ -0,0 +1,104 @@
+ ---
+ name: compare
+ description: "Compare two URLs side-by-side across 10 typed dimensions"
+ allowed-tools: Agent
+ argument-hint: <urlA> <urlB>
+ context: fork
+ agent: crawlio-investigator
+ ---
+
+ # Compare Investigation
+
+ You are running a **compare** investigation. Your goal is to capture two URLs, analyze their frameworks, and produce a `ComparisonReport` with typed findings across 10 dimensions.
+
+ ## The 10 Dimensions
+
+ | # | Dimension | What It Measures |
+ |---|-----------|------------------|
+ | 1 | Framework | Technology stack, versions, SSR mode |
+ | 2 | Performance | Web Vitals, load metrics, bottlenecks |
+ | 3 | Security | TLS, headers, cookies, mixed content |
+ | 4 | SEO | Meta tags, structured data, heading hierarchy |
+ | 5 | Accessibility | ARIA, semantic HTML, keyboard nav, contrast |
+ | 6 | Error Surface | Console errors, network failures, JS exceptions |
+ | 7 | Third-Party Load | External scripts, tracking, CDN, SDK risk |
+ | 8 | Architecture | SSR vs CSR, routing, data fetching, state management |
+ | 9 | Content Delivery | Caching, compression, asset optimization |
+ | 10 | Mobile Readiness | Viewport, responsive signals, device emulation |
+
+ ## Loop Definition
+
+ Read `loops/compare.json` to understand the phase sequence. The compare loop has 6 phases:
+
+ 1. **crawl-a** — Spawn `crawlio-crawler` to capture URL A. Record the `EVIDENCE_ID`.
+ 2. **crawl-b** — Spawn `crawlio-crawler` to capture URL B. Record the `EVIDENCE_ID`.
+ 3. **analyze-a** (optional) — Spawn `crawlio-analyzer` with crawl-a evidence to identify frameworks.
+ 4. **analyze-b** (optional) — Spawn `crawlio-analyzer` with crawl-b evidence to identify frameworks.
+ 5. **compare** — Spawn `crawlio-comparator` with all evidence IDs. It reads both URLs' evidence, compares across 10 dimensions, and writes an `EvidenceEnvelope<ComparisonReport>`.
+ 6. **synthesize** (optional) — Spawn `crawlio-synthesizer` if a full blueprint is useful.
+
+ ## Execution
+
+ 1. Read `loops/compare.json` to confirm phase order.
+ 2. Parse the user's arguments: `<urlA>` and `<urlB>`.
+ 3. Spawn `crawlio-crawler` for URL A:
+ ```
+ Crawl <urlA> and write PageEvidence to .crawlio/evidence/.
+ ```
+ Record `EVIDENCE_ID=<crawlAId>`.
+
+ 4. Spawn `crawlio-crawler` for URL B:
+ ```
+ Crawl <urlB> and write PageEvidence to .crawlio/evidence/.
+ ```
+ Record `EVIDENCE_ID=<crawlBId>`.
+
+ 5. Spawn `crawlio-analyzer` for URL A (optional):
+ ```
+ Analyze page evidence <crawlAId> for <urlA>. Read from .crawlio/evidence/. Write FrameworkEvidence to .crawlio/evidence/.
+ ```
+ Record `EVIDENCE_ID=<analyzeAId>`.
+
+ 6. Spawn `crawlio-analyzer` for URL B (optional):
+ ```
+ Analyze page evidence <crawlBId> for <urlB>. Read from .crawlio/evidence/. Write FrameworkEvidence to .crawlio/evidence/.
+ ```
+ Record `EVIDENCE_ID=<analyzeBId>`.
+
+ 7. Spawn `crawlio-comparator` with all evidence:
+ ```
+ Compare URL A (<urlA>) against URL B (<urlB>).
+ Evidence IDs — crawl-a: <crawlAId>, crawl-b: <crawlBId>, analyze-a: <analyzeAId>, analyze-b: <analyzeBId>.
+ Read all evidence from .crawlio/evidence/. Write EvidenceEnvelope<ComparisonReport> to .crawlio/evidence/.
+ ```
+ Record `EVIDENCE_ID=<compareId>`.
+
+ 8. Read the ComparisonReport evidence and summarize for the user.
+
+ ## Output Format
+
+ ```
+ ## Compare: <urlA> vs <urlB>
+
+ ### Winner: <A|B|Tie|Inconclusive>
+ <winnerReason>
+
+ ### Dimension Results
+ | Dimension | Verdict | Confidence | Key Differences |
+ |-----------|---------|------------|-----------------|
+ | [per-dimension rows] |
+
+ ### Summary
+ - Total differences: N
+ - Critical differences: N
+
+ ### Evidence Chain
+ - Crawl A: <crawlAId> (quality: ...)
+ - Crawl B: <crawlBId> (quality: ...)
+ - Analyze A: <analyzeAId> (quality: ...)
+ - Analyze B: <analyzeBId> (quality: ...)
+ - Compare: <compareId> (quality: ...)
+
+ ### Confidence
+ - Overall: high/medium/low
+ ```
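The output format reduces ten per-dimension verdicts to a single Winner line. One plausible reduction, shown only to make the A/B/Tie/Inconclusive semantics concrete (illustrative, not the comparator's actual logic):

```js
// Illustrative reduction of dimension verdicts to an overall winner.
function overallWinner(dimensions /* [{ dimension, verdict: "A"|"B"|"Tie"|"Inconclusive" }] */) {
  let a = 0, b = 0;
  for (const d of dimensions) {
    if (d.verdict === "A") a++;
    else if (d.verdict === "B") b++;
  }
  if (a === 0 && b === 0) return "Inconclusive";
  return a === b ? "Tie" : a > b ? "A" : "B";
}
```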
@@ -0,0 +1,148 @@
+ ---
+ name: dossier
+ description: "Competitive dossier — orchestrate investigate + test + extract into a unified analysis"
+ allowed-tools: Agent
+ argument-hint: <url>
+ context: fork
+ agent: crawlio-investigator
+ ---
+
+ # Dossier Investigation
+
+ You are running a **compose** investigation. Your goal is to orchestrate multiple investigation families (investigate, test, extract) into a unified `CompetitiveDossier` for a target URL.
+
+ ## Loop Definition
+
+ Read `loops/compose.json` to understand the phase sequence. The compose loop has 8 phases:
+
+ 1. **crawl** — Spawn `crawlio-crawler` to capture the target URL. Record the `EVIDENCE_ID`.
+ 2. **analyze** — Spawn `crawlio-analyzer` with the crawl evidence ID. Identifies framework and rendering mode.
+ 3. **network** (optional) — Spawn `crawlio-network` with the crawl evidence ID. Discovers API endpoints, auth, third-party services.
+ 4. **synthesize** — Spawn `crawlio-synthesizer` with all evidence to produce a `TechBlueprint`.
+ 5. **audit** (optional) — Spawn `crawlio-auditor` with the crawl evidence ID. Runs accessibility, performance, security, SEO, and best-practices audits.
+ 6. **extract-design** (optional) — Spawn `crawlio-extractor` to extract design tokens.
+ 7. **extract-api** (optional) — Spawn `crawlio-extractor` to extract API surface data.
+ 8. **compile-dossier** — Spawn `crawlio-composer` with all accumulated evidence IDs. Produces the final `CompetitiveDossier`.
+
+ ## Execution
+
+ 1. Read `loops/compose.json` to confirm phase order.
+ 2. Parse the user's argument: `<url>`.
+ 3. Spawn `crawlio-crawler` to capture the page:
+ ```
+ Crawl <url> and write PageEvidence to .crawlio/evidence/.
+ ```
+ Record `EVIDENCE_ID=<crawlId>`.
+
+ 4. Spawn `crawlio-analyzer` with the crawl evidence:
+ ```
+ Read PageEvidence from .crawlio/evidence/<crawlId>.json.
+ Analyze framework, rendering mode, and component patterns.
+ Write FrameworkEvidence to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<analyzeId>`.
+
+ 5. Spawn `crawlio-network` to discover API surface (optional):
+ ```
+ Read PageEvidence from .crawlio/evidence/<crawlId>.json.
+ Discover API endpoints, authentication patterns, rate limiting, third-party integrations.
+ Write APIMap to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<networkId>`.
+
+ 6. Spawn `crawlio-synthesizer` to produce a TechBlueprint:
+ ```
+ Read all evidence: <crawlId>, <analyzeId>, <networkId>.
+ Produce a TechBlueprint with typed findings.
+ Write to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<blueprintId>`.
+
+ 7. Spawn `crawlio-auditor` to run audits (optional):
+ ```
+ Read PageEvidence from .crawlio/evidence/<crawlId>.json.
+ Run accessibility, performance, security, SEO, and best-practices audits.
+ Write TestSuite to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<auditId>`.
+
+ 8. Spawn `crawlio-extractor` to extract design tokens (optional):
+ ```
+ Read PageEvidence from .crawlio/evidence/<crawlId>.json.
+ Extract "design" data — colors, typography, spacing, breakpoints.
+ Write DesignTokens to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<designId>`.
+
+ 9. Spawn `crawlio-extractor` to extract API surface (optional):
+ ```
+ Read PageEvidence from .crawlio/evidence/<crawlId>.json.
+ Extract "api" data — endpoints, auth, third-party services.
+ Write APIMap to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<apiExtractId>`.
+
+ 10. Spawn `crawlio-composer` with all accumulated evidence:
+ ```
+ Read all evidence from prior phases. Evidence IDs:
+ - crawl: <crawlId>
+ - analyze: <analyzeId>
+ - network: <networkId> (if available)
+ - blueprint: <blueprintId>
+ - audit: <auditId> (if available)
+ - design: <designId> (if available)
+ - api-extract: <apiExtractId> (if available)
+ Compile a CompetitiveDossier with strengths, weaknesses, opportunities, and recommendations.
+ Write to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<dossierId>`.
+
+ 11. Read the CompetitiveDossier evidence and summarize for the user.
+
+ ## Output Format
+
+ ```
+ ## Dossier: <url>
+
+ ### Executive Summary
+ <executiveSummary>
+
+ ### Strengths
+ - [bullet list of strengths with confidence levels]
+
+ ### Weaknesses
+ - [bullet list of weaknesses with confidence levels]
+
+ ### Opportunities
+ - [bullet list of opportunities]
+
+ ### Recommendations
+ | Priority | Category | Action |
+ |----------|----------|--------|
+ | [per-recommendation rows, sorted by priority] |
+
+ ### Families Executed
+ - [list of families that contributed evidence]
+
+ ### Evidence Chain
+ - Crawler: <crawlId> (quality: ...)
+ - Analyzer: <analyzeId> (quality: ...)
+ - Network: <networkId> (quality: ...)
+ - Blueprint: <blueprintId> (quality: ...)
+ - Auditor: <auditId> (quality: ...)
+ - Design: <designId> (quality: ...)
+ - Dossier: <dossierId> (quality: ...)
+
+ ### Coverage Gaps
+ - [Aggregated gaps from all phases]
+
+ ### Confidence
+ - Overall: high/medium/low
+ ```
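Since five of the dossier phases are optional, the composer prompt in step 10 only lists evidence IDs that actually exist. A small sketch of assembling that map (illustrative only; the investigator agent does this in prose, not code):

```js
// Illustrative: build the composer's evidence map, dropping phases that didn't run.
function dossierEvidence({ crawlId, analyzeId, networkId, blueprintId, auditId, designId, apiExtractId }) {
  const all = {
    crawl: crawlId,
    analyze: analyzeId,
    network: networkId,          // optional
    blueprint: blueprintId,
    audit: auditId,              // optional
    design: designId,            // optional
    "api-extract": apiExtractId, // optional
  };
  return Object.fromEntries(Object.entries(all).filter(([, id]) => id != null));
}
```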
@@ -0,0 +1,69 @@
+ ---
+ name: extract
+ description: "Extract structured data from a URL — tables, API surface, design tokens, auth flows"
+ allowed-tools: Agent
+ argument-hint: <url> <what>
+ context: fork
+ agent: crawlio-investigator
+ ---
+
+ # Extract Investigation
+
+ You are running an **extract** investigation. Your goal is to capture a page and extract specific structured data from it based on the `what` parameter.
+
+ ## Extraction Targets
+
+ | `what` | Evidence Type | What It Extracts |
+ |--------|---------------|------------------|
+ | `tables` | `TableExtraction` | Tabular data from DOM patterns |
+ | `data` | `DataExtraction` | All structured data (tables + JSON-LD) |
+ | `api` | `APIMap` | API endpoints, auth, third-party services |
+ | `design` | `DesignTokens` | Colors, typography, spacing, breakpoints |
+ | `auth` | `AuthFlow` | Login flows, token storage, CSRF, OAuth |
+
+ ## Loop Definition
+
+ Read `loops/extract.json` to understand the phase sequence. The extract loop has 3 phases:
+
+ 1. **crawl** — Spawn `crawlio-crawler` to capture the target URL. Record the `EVIDENCE_ID`.
+ 2. **extract** — Spawn `crawlio-extractor` with the crawl evidence ID and the `what` parameter. It reads the `EvidenceEnvelope<PageEvidence>`, runs the appropriate extraction strategy, and writes a typed evidence envelope.
+ 3. **synthesize** (optional) — Spawn `crawlio-synthesizer` if a full blueprint is useful.
+
+ ## Execution
+
+ 1. Read `loops/extract.json` to confirm phase order.
+ 2. Parse the user's arguments: `<url>` and `<what>` (one of: tables, data, api, design, auth).
+ 3. Spawn `crawlio-crawler` to capture the page:
+ ```
+ Crawl <url> and write PageEvidence to .crawlio/evidence/.
+ ```
+ Record `EVIDENCE_ID=<crawlId>`.
+
+ 4. Spawn `crawlio-extractor` with the crawl evidence and extraction target:
+ ```
+ Read PageEvidence from .crawlio/evidence/<crawlId>.json.
+ Extract "<what>" data and write the appropriate typed evidence to .crawlio/evidence/.
+ Target URL: <url>
+ ```
+ Record `EVIDENCE_ID=<extractId>`.
+
+ 5. Read the extraction evidence and summarize results for the user.
+
+ ## Output Format
+
+ ```
+ ## Extract: <what> from <url>
+
+ ### Results
+ - [Key findings from the extraction]
+
+ ### Evidence Chain
+ - Crawler: <crawlId> (quality: ...)
+ - Extractor: <extractId> (quality: ...)
+
+ ### Coverage Gaps
+ - [Any gaps from extraction]
+
+ ### Confidence
+ - Overall: high/medium/low
+ ```
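The Extraction Targets table maps cleanly onto a lookup, e.g. for validating `<what>` before spawning the extractor. A sketch (the mapping is taken from the table above; the validation helper itself is hypothetical):

```js
// Mapping from the Extraction Targets table; evidenceTypeFor() is hypothetical.
const EXTRACTION_TARGETS = {
  tables: "TableExtraction",
  data: "DataExtraction",
  api: "APIMap",
  design: "DesignTokens",
  auth: "AuthFlow",
};

function evidenceTypeFor(what) {
  const type = EXTRACTION_TARGETS[what];
  if (!type) {
    throw new Error(`unknown extraction target: ${what} (expected one of ${Object.keys(EXTRACTION_TARGETS).join(", ")})`);
  }
  return type;
}
```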