@render-harness/cap-scrape-firecrawl 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -0
- package/dist/index.d.ts +31 -0
- package/dist/index.js +174 -0
- package/dist/index.js.map +1 -0
- package/package.json +50 -0
- package/skills/firecrawl-scrape.md +24 -0
package/README.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
## @render-harness/cap-scrape-firecrawl
|
|
2
|
+
|
|
3
|
+
Wires Firecrawl into a Render harness entry, with optional Postgres persistence:
|
|
4
|
+
|
|
5
|
+
```yaml
|
|
6
|
+
capabilities:
|
|
7
|
+
- pack: "@render-harness/cap-scrape-firecrawl"
|
|
8
|
+
config:
|
|
9
|
+
persist: true # default
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Set `FIRECRAWL_API_KEY` in the entry's environment.
|
|
13
|
+
|
|
14
|
+
The pack contributes:
|
|
15
|
+
|
|
16
|
+
- An MCP server `cap-scrape-firecrawl.firecrawl` (stdio transport, runs `npx -y firecrawl-mcp`).
|
|
17
|
+
- A `LocalToolHandler` `cap-scrape-firecrawl.scrape_and_store` that calls Firecrawl's REST API and persists the rendered markdown and the raw JSON response to a `firecrawl_scrapes` table. It is only contributed when `persist` is `true` (the default).
|
|
18
|
+
- A skill (`firecrawl-scrape`).
|
|
19
|
+
- A `FIRECRAWL_API_KEY` entry in the effective env schema.
|
|
20
|
+
|
|
21
|
+
The Postgres table is created on first call via `CREATE TABLE IF NOT EXISTS`, so no separate migration step is needed.
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import * as _render_harness_registry from '@render-harness/registry';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* cap-scrape-firecrawl — wires Firecrawl's MCP server AND adds a
|
|
5
|
+
* `scrape_and_store` LocalToolHandler that calls Firecrawl's REST API
|
|
6
|
+
* directly and persists the result in Postgres for later retrieval.
|
|
7
|
+
*
|
|
8
|
+
* Usage in render-harness.yaml:
|
|
9
|
+
*
|
|
10
|
+
* capabilities:
|
|
11
|
+
* - pack: "@render-harness/cap-scrape-firecrawl"
|
|
12
|
+
* config:
|
|
13
|
+
* persist: true # default
|
|
14
|
+
*
|
|
15
|
+
* Surfaces:
|
|
16
|
+
* - One MCP server `firecrawl` (stdio transport).
|
|
17
|
+
* - One LocalToolHandler `scrape_and_store` (when `persist: true`).
|
|
18
|
+
* - One skill (skills/firecrawl-scrape.md).
|
|
19
|
+
* - envSchema entry for FIRECRAWL_API_KEY.
|
|
20
|
+
*
|
|
21
|
+
* Config keys:
|
|
22
|
+
* - `apiKeyEnv` (string, default "FIRECRAWL_API_KEY")
|
|
23
|
+
* - `persist` (boolean, default true) — disable to skip the Postgres
|
|
24
|
+
* bootstrap and the scrape_and_store tool. Useful when the entry
|
|
25
|
+
* wants Firecrawl tools but already has its own storage.
|
|
26
|
+
* - `apiBase` (string, default "https://api.firecrawl.dev") — the
|
|
27
|
+
* REST base URL. Override for self-hosted Firecrawl.
|
|
28
|
+
*/
|
|
29
|
+
declare const pack: _render_harness_registry.CapabilityPack;
|
|
30
|
+
|
|
31
|
+
export { pack as default };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
import { dirname, join } from 'path';
|
|
2
|
+
import { fileURLToPath } from 'url';
|
|
3
|
+
import { getPool } from '@render-harness/core';
|
|
4
|
+
import { definePack } from '@render-harness/registry';
|
|
5
|
+
|
|
6
|
+
// src/index.ts
|
|
7
|
+
// Filesystem path of this compiled module, derived from its file:// URL.
const MODULE_FILE = fileURLToPath(import.meta.url);
// Directory that contains this module.
const HERE = dirname(MODULE_FILE);
// The skills/ directory is resolved one level above this module's directory.
const SKILLS_DIR = join(HERE, "..", "skills");
|
|
9
|
+
/**
 * Resolve the pack configuration, filling in defaults for missing keys.
 *
 * @param {{ config?: { apiKeyEnv?: string, persist?: boolean, apiBase?: string } | null }} ctx
 *   Pack context; only `ctx.config` is read.
 * @returns {{ apiKeyEnv: string, persist: boolean, apiBase: string }}
 *   `apiKeyEnv` defaults to "FIRECRAWL_API_KEY", `persist` to true and
 *   `apiBase` to "https://api.firecrawl.dev".
 */
function readConfig(ctx) {
  // A pack entry may carry no config at all; treat that as "all defaults".
  const cfg = ctx.config ?? {};
  const apiBase = cfg.apiBase ?? "https://api.firecrawl.dev";
  return {
    apiKeyEnv: cfg.apiKeyEnv ?? "FIRECRAWL_API_KEY",
    persist: cfg.persist ?? true,
    // Callers append "/v1/scrape", so drop trailing slashes to avoid "//v1/scrape".
    apiBase: typeof apiBase === "string" ? apiBase.replace(/\/+$/, "") : apiBase
  };
}
|
|
17
|
+
// DDL executed by ensureSchema(): creates the firecrawl_scrapes table and its
// run_id index. Both statements use IF NOT EXISTS, so re-running is harmless.
const SCHEMA_BOOTSTRAP_SQL = [
  "",
  "CREATE TABLE IF NOT EXISTS firecrawl_scrapes (",
  "id BIGSERIAL PRIMARY KEY,",
  "run_id UUID NOT NULL,",
  "url TEXT NOT NULL,",
  "fetched_at TIMESTAMPTZ NOT NULL DEFAULT now(),",
  "status_code INTEGER,",
  "markdown TEXT,",
  "raw JSONB,",
  "CONSTRAINT firecrawl_scrapes_url_run UNIQUE (run_id, url)",
  ");",
  "CREATE INDEX IF NOT EXISTS firecrawl_scrapes_run_idx ON firecrawl_scrapes (run_id);",
  ""
].join("\n");
|
|
30
|
+
// In-flight or completed schema bootstrap; null until the first call and
// reset to null when a bootstrap attempt fails so the next call retries.
let schemaBootstrap = null;

/**
 * Run SCHEMA_BOOTSTRAP_SQL against the shared pool at most once per process.
 *
 * Concurrent callers share a single in-flight query instead of each issuing
 * the DDL, which a plain "done" flag set after the await cannot guarantee.
 *
 * @returns {Promise<void>} Resolves once the bootstrap query has succeeded.
 * @throws Whatever `getPool()` or `pool.query()` throws/rejects with.
 */
async function ensureSchema() {
  if (schemaBootstrap === null) {
    schemaBootstrap = Promise.resolve(getPool().query(SCHEMA_BOOTSTRAP_SQL)).catch((err) => {
      // Forget the failed attempt so a later call can try again.
      schemaBootstrap = null;
      throw err;
    });
  }
  await schemaBootstrap;
}
|
|
37
|
+
/**
 * Capability pack that wires Firecrawl into a Render harness entry.
 *
 * Contributes:
 *  - an MCP server named `firecrawl` (stdio, launched via `npx -y firecrawl-mcp`);
 *  - a `scrape_and_store` local tool, unless `persist` is false or the API key
 *    env var is unset;
 *  - the `firecrawl-scrape` skill;
 *  - an env schema entry for FIRECRAWL_API_KEY.
 */
const pack = definePack({
  name: "cap-scrape-firecrawl",
  // Matches the "version" field published in package.json.
  version: "0.1.1",
  // NOTE(review): this entry is always named FIRECRAWL_API_KEY, even when the
  // `apiKeyEnv` config key points at a different variable — confirm how the
  // harness treats `required` in that case.
  envSchema: [
    {
      name: "FIRECRAWL_API_KEY",
      required: true,
      secret: true,
      description: "API key for Firecrawl (https://firecrawl.dev)."
    }
  ],

  /**
   * Describe the Firecrawl MCP server to launch.
   *
   * @param ctx Pack context; `ctx.config` and `ctx.env(name)` are read.
   * @returns A single stdio server config running `npx -y firecrawl-mcp`.
   * @throws {Error} When the configured API-key env var is not set.
   */
  mcpServers(ctx) {
    const cfg = readConfig(ctx);
    const apiKey = ctx.env(cfg.apiKeyEnv);
    if (!apiKey) {
      throw new Error(
        `cap-scrape-firecrawl: env var ${cfg.apiKeyEnv} is not set. Set it before building or starting the agent.`
      );
    }
    const env = { FIRECRAWL_API_KEY: apiKey };
    // Keep the MCP server on the same Firecrawl instance as the REST tool:
    // firecrawl-mcp reads FIRECRAWL_API_URL for self-hosted deployments.
    // The default (cloud) base is left implicit so default behaviour is unchanged.
    const apiBase = String(cfg.apiBase).replace(/\/+$/, "");
    if (apiBase !== "https://api.firecrawl.dev") {
      env.FIRECRAWL_API_URL = apiBase;
    }
    return [
      {
        name: "firecrawl",
        transport: "stdio",
        command: "npx",
        args: ["-y", "firecrawl-mcp"],
        env
      }
    ];
  },

  /**
   * Build the `scrape_and_store` local tool.
   *
   * @param ctx Pack context; `ctx.config` and `ctx.env(name)` are read.
   * @returns `[]` when `persist` is false or the API key is unset (unlike
   *   `mcpServers`, a missing key does not throw here); otherwise a
   *   one-element array with the tool handler.
   */
  localTools(ctx) {
    const cfg = readConfig(ctx);
    if (!cfg.persist) return [];
    const apiKey = ctx.env(cfg.apiKeyEnv);
    if (!apiKey) return [];

    // Trailing slashes are dropped so the request URL never contains "//v1/scrape".
    const scrapeUrl = `${String(cfg.apiBase).replace(/\/+$/, "")}/v1/scrape`;
    const MIN_TIMEOUT_MS = 1000;
    const MAX_TIMEOUT_MS = 300000;
    const DEFAULT_TIMEOUT_MS = 60000;

    const scrapeAndStore = {
      definition: {
        name: "scrape_and_store",
        description: "Scrape a URL via Firecrawl and persist the rendered markdown + raw JSON to Postgres. Returns the row id and a short excerpt. Use this when you need to come back to the page later in the run.",
        source: "pack:cap-scrape-firecrawl",
        inputSchema: {
          type: "object",
          additionalProperties: false,
          properties: {
            url: { type: "string", description: "The URL to scrape." },
            timeout_ms: {
              type: "integer",
              description: "Per-request timeout in milliseconds. Defaults to 60000.",
              minimum: MIN_TIMEOUT_MS,
              maximum: MAX_TIMEOUT_MS
            }
          },
          required: ["url"]
        }
      },

      /**
       * Scrape `input.url` through Firecrawl and upsert the result into
       * `firecrawl_scrapes`, keyed on (run_id, url).
       *
       * Every failure is reported as `{ content, isError: true }`; the
       * handler does not throw for bad input, HTTP errors, timeouts,
       * cancellation or database errors.
       *
       * @returns {Promise<{ content: string, isError?: boolean }>}
       */
      async handler({ input, runId, signal, logger }) {
        const args = input ?? {};
        if (!args.url || typeof args.url !== "string") {
          return { content: "scrape_and_store: missing or invalid `url`", isError: true };
        }
        const timeoutMs = args.timeout_ms ?? DEFAULT_TIMEOUT_MS;
        // Enforce the bounds declared in inputSchema; a non-numeric delay
        // would otherwise make setTimeout fire almost immediately.
        if (!Number.isInteger(timeoutMs) || timeoutMs < MIN_TIMEOUT_MS || timeoutMs > MAX_TIMEOUT_MS) {
          return {
            content: `scrape_and_store: \`timeout_ms\` must be an integer between ${MIN_TIMEOUT_MS} and ${MAX_TIMEOUT_MS}`,
            isError: true
          };
        }
        // An already-aborted signal never dispatches another "abort" event,
        // so it has to be checked explicitly.
        if (signal?.aborted) {
          return { content: "scrape_and_store: aborted", isError: true };
        }

        const ctrl = new AbortController();
        let timedOut = false;
        let timer;
        const onUpstreamAbort = () => ctrl.abort();
        signal?.addEventListener("abort", onUpstreamAbort, { once: true });
        try {
          // Inside the try so a failed bootstrap is reported like any other error.
          await ensureSchema();
          timer = setTimeout(() => {
            timedOut = true;
            ctrl.abort();
          }, timeoutMs);
          const res = await fetch(scrapeUrl, {
            method: "POST",
            headers: {
              Authorization: `Bearer ${apiKey}`,
              "content-type": "application/json"
            },
            body: JSON.stringify({ url: args.url, formats: ["markdown"] }),
            signal: ctrl.signal
          });
          if (!res.ok) {
            const body = await res.text().catch(() => "");
            logger.warn({ status: res.status, body }, "firecrawl scrape failed");
            return {
              content: `scrape_and_store: firecrawl returned ${res.status}: ${body.slice(0, 500)}`,
              isError: true
            };
          }
          const json = await res.json();
          const rawMarkdown = json?.data?.markdown;
          const markdown = typeof rawMarkdown === "string" ? rawMarkdown : "";
          const statusCode = json?.data?.metadata?.statusCode ?? null;
          const pool = getPool();
          // Re-scraping the same URL within a run refreshes the existing row.
          const insert = await pool.query(
            `INSERT INTO firecrawl_scrapes (run_id, url, status_code, markdown, raw)
             VALUES ($1, $2, $3, $4, $5)
             ON CONFLICT (run_id, url) DO UPDATE SET
               fetched_at = now(),
               status_code = EXCLUDED.status_code,
               markdown = EXCLUDED.markdown,
               raw = EXCLUDED.raw
             RETURNING id`,
            [runId, args.url, statusCode, markdown, json]
          );
          const rowId = insert.rows[0]?.id ?? "?";
          const excerpt = markdown.slice(0, 1000);
          return {
            content: `Stored as firecrawl_scrapes.id=${rowId}. status=${statusCode ?? "?"}. excerpt:\n\n${excerpt}`
          };
        } catch (err) {
          if (err?.name === "AbortError") {
            // Tell our own timeout apart from an upstream cancellation.
            return {
              content: timedOut ? "scrape_and_store: timed out" : "scrape_and_store: aborted",
              isError: true
            };
          }
          const message = err instanceof Error ? err.message : String(err);
          return {
            content: `scrape_and_store: ${message}`,
            isError: true
          };
        } finally {
          clearTimeout(timer);
          signal?.removeEventListener("abort", onUpstreamAbort);
        }
      }
    };
    return [scrapeAndStore];
  },

  /**
   * Describe the skill shipped with this pack.
   *
   * @returns One skill entry whose content lives in SKILLS_DIR/firecrawl-scrape.md.
   */
  skills(_ctx) {
    return [
      {
        name: "firecrawl-scrape",
        description: "Use Firecrawl to render a URL into clean markdown and store it.",
        whenToUse: "When the user gives you a URL whose content you'll need later in the run, OR you've already searched and want to read the top result. Prefer scrape_and_store over the raw MCP tool when persistence matters.",
        contentPath: join(SKILLS_DIR, "firecrawl-scrape.md")
      }
    ];
  }
});
|
|
170
|
+
var src_default = pack;
|
|
171
|
+
|
|
172
|
+
export { src_default as default };
|
|
173
|
+
//# sourceMappingURL=index.js.map
|
|
174
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/index.ts"],"names":[],"mappings":";;;;;;AAqCA,IAAM,IAAA,GAAO,OAAA,CAAQ,aAAA,CAAc,MAAA,CAAA,IAAA,CAAY,GAAG,CAAC,CAAA;AACnD,IAAM,UAAA,GAAa,IAAA,CAAK,IAAA,EAAM,IAAA,EAAM,QAAQ,CAAA;AAQ5C,SAAS,WAAW,GAAA,EAAkB;AACpC,EAAA,MAAM,MAAM,GAAA,CAAI,MAAA;AAChB,EAAA,OAAO;AAAA,IACL,SAAA,EAAW,IAAI,SAAA,IAAa,mBAAA;AAAA,IAC5B,OAAA,EAAS,IAAI,OAAA,IAAW,IAAA;AAAA,IACxB,OAAA,EAAS,IAAI,OAAA,IAAW;AAAA,GAC1B;AACF;AAEA,IAAM,oBAAA,GAAuB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,CAAA;AAc7B,IAAI,YAAA,GAAe,KAAA;AACnB,eAAe,YAAA,GAA8B;AAC3C,EAAA,IAAI,YAAA,EAAc;AAClB,EAAA,MAAM,OAAO,OAAA,EAAQ;AACrB,EAAA,MAAM,IAAA,CAAK,MAAM,oBAAoB,CAAA;AACrC,EAAA,YAAA,GAAe,IAAA;AACjB;AAEA,IAAM,OAAO,UAAA,CAAW;AAAA,EACtB,IAAA,EAAM,sBAAA;AAAA,EACN,OAAA,EAAS,OAAA;AAAA,EACT,SAAA,EAAW;AAAA,IACT;AAAA,MACE,IAAA,EAAM,mBAAA;AAAA,MACN,QAAA,EAAU,IAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,WAAA,EAAa;AAAA;AACf,GACF;AAAA,EACA,WAAW,GAAA,EAAqC;AAC9C,IAAA,MAAM,GAAA,GAAM,WAAW,GAAG,CAAA;AAC1B,IAAA,MAAM,MAAA,GAAS,GAAA,CAAI,GAAA,CAAI,GAAA,CAAI,SAAS,CAAA;AACpC,IAAA,IAAI,CAAC,MAAA,EAAQ;AACX,MAAA,MAAM,IAAI,KAAA;AAAA,QACR,CAAA,8BAAA,EAAiC,IAAI,SAAS,CAAA,0DAAA;AAAA,OAChD;AAAA,IACF;AACA,IAAA,OAAO;AAAA,MACL;AAAA,QACE,IAAA,EAAM,WAAA;AAAA,QACN,SAAA,EAAW,OAAA;AAAA,QACX,OAAA,EAAS,KAAA;AAAA,QACT,IAAA,EAAM,CAAC,IAAA,EAAM,eAAe,CAAA;AAAA,QAC5B,GAAA,EAAK,EAAE,iBAAA,EAAmB,MAAA;AAAO;AACnC,KACF;AAAA,EACF,CAAA;AAAA,EACA,WAAW,GAAA,EAAsC;AAC/C,IAAA,MAAM,GAAA,GAAM,WAAW,GAAG,CAAA;AAC1B,IAAA,IAAI,CAAC,GAAA,CAAI,OAAA,EAAS,OAAO,EAAC;AAC1B,IAAA,MAAM,MAAA,GAAS,GAAA,CAAI,GAAA,CAAI,GAAA,CAAI,SAAS,CAAA;AACpC,IAAA,IAAI,CAAC,MAAA,EAAQ,OAAO,EAAC;AACrB,IAAA,MAAM,UAAU,GAAA,CAAI,OAAA;AAEpB,IAAA,MAAM,cAAA,GAAmC;AAAA,MACvC,UAAA,EAAY;AAAA,QACV,IAAA,EAAM,kBAAA;AAAA,QACN,WAAA,EACE,gMAAA;AAAA,QACF,MAAA,EAAQ,2BAAA;AAAA,QACR,WAAA,EAAa;AAAA,UACX,IAAA,EAAM,QAAA;AAAA,UACN,oBAAA,EAAsB,KAAA;AAAA,UACtB,UAAA,EAAY;AAAA,YACV,GAAA,EAAK,EAAE,IAAA,EAAM,QAAA,EAAU,aAAa,oBAAA,EAAqB;AAAA,YACzD,UAAA,EAAY;AAAA,cACV,IAAA,EAAM,SAAA;AAAA,cACN,WAAA,EAAa,yDAAA;AA
AA,cACb,OAAA,EAAS,GAAA;AAAA,cACT,OAAA,EAAS;AAAA;AACX,WACF;AAAA,UACA,QAAA,EAAU,CAAC,KAAK;AAAA;AAClB,OACF;AAAA,MACA,MAAM,OAAA,CAAQ,EAAE,OAAO,KAAA,EAAO,MAAA,EAAQ,QAAO,EAAG;AAC9C,QAAA,MAAM,IAAA,GAAQ,SAAS,EAAC;AACxB,QAAA,IAAI,CAAC,IAAA,CAAK,GAAA,IAAO,OAAO,IAAA,CAAK,QAAQ,QAAA,EAAU;AAC7C,UAAA,OAAO,EAAE,OAAA,EAAS,4CAAA,EAA8C,OAAA,EAAS,IAAA,EAAK;AAAA,QAChF;AACA,QAAA,MAAM,YAAA,EAAa;AACnB,QAAA,MAAM,SAAA,GAAY,KAAK,UAAA,IAAc,GAAA;AACrC,QAAA,MAAM,IAAA,GAAO,IAAI,eAAA,EAAgB;AACjC,QAAA,MAAM,QAAQ,UAAA,CAAW,MAAM,IAAA,CAAK,KAAA,IAAS,SAAS,CAAA;AACtD,QAAA,MAAM,eAAA,GAAkB,MAAM,IAAA,CAAK,KAAA,EAAM;AACzC,QAAA,MAAA,CAAO,iBAAiB,OAAA,EAAS,eAAA,EAAiB,EAAE,IAAA,EAAM,MAAM,CAAA;AAChE,QAAA,IAAI;AACF,UAAA,MAAM,GAAA,GAAM,MAAM,KAAA,CAAM,CAAA,EAAG,OAAO,CAAA,UAAA,CAAA,EAAc;AAAA,YAC9C,MAAA,EAAQ,MAAA;AAAA,YACR,OAAA,EAAS;AAAA,cACP,aAAA,EAAe,UAAU,MAAM,CAAA,CAAA;AAAA,cAC/B,cAAA,EAAgB;AAAA,aAClB;AAAA,YACA,IAAA,EAAM,IAAA,CAAK,SAAA,CAAU,EAAE,GAAA,EAAK,IAAA,CAAK,GAAA,EAAK,OAAA,EAAS,CAAC,UAAU,CAAA,EAAG,CAAA;AAAA,YAC7D,QAAQ,IAAA,CAAK;AAAA,WACd,CAAA;AACD,UAAA,IAAI,CAAC,IAAI,EAAA,EAAI;AACX,YAAA,MAAM,OAAO,MAAM,GAAA,CAAI,MAAK,CAAE,KAAA,CAAM,MAAM,EAAE,CAAA;AAC5C,YAAA,MAAA,CAAO,KAAK,EAAE,MAAA,EAAQ,IAAI,MAAA,EAAQ,IAAA,IAAQ,yBAAyB,CAAA;AACnE,YAAA,OAAO;AAAA,cACL,OAAA,EAAS,wCAAwC,GAAA,CAAI,MAAM,KAAK,IAAA,CAAK,KAAA,CAAM,CAAA,EAAG,GAAG,CAAC,CAAA,CAAA;AAAA,cAClF,OAAA,EAAS;AAAA,aACX;AAAA,UACF;AACA,UAAA,MAAM,IAAA,GAAQ,MAAM,GAAA,CAAI,IAAA,EAAK;AAG7B,UAAA,MAAM,QAAA,GAAW,IAAA,CAAK,IAAA,EAAM,QAAA,IAAY,EAAA;AACxC,UAAA,MAAM,UAAA,GAAa,IAAA,CAAK,IAAA,EAAM,QAAA,EAAU,UAAA,IAAc,IAAA;AACtD,UAAA,MAAM,OAAO,OAAA,EAAQ;AACrB,UAAA,MAAM,MAAA,GAAS,MAAM,IAAA,CAAK,KAAA;AAAA,YACxB,CAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,yBAAA,CAAA;AAAA,YAQA,CAAC,KAAA,EAAO,IAAA,CAAK,GAAA,EAAK,UAAA,EAAY,UAAU,IAAI;AAAA,WAC9C;AACA,UAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,IAAA,CAAK,CAAC,GAAG,EAAA,IAAM,GAAA;AACpC,UAAA,MAAM,OAAA,GAAU,QAAA,CAAS,KAAA,CAAM,CAAA,EAAG,GAAI,CAAA;AACtC,UAAA,OAAO;AAAA,YACL,OAAA,EAAS,CAAA,+BAAA,EAAkC,KAAK,CAAA,SAAA,EAAY,cAAc,GAAG,CAAA;;AAAA,EAAiB,OAAO,CAAA;AAAA
,WACvG;AAAA,QACF,SAAS,GAAA,EAAK;AACZ,UAAA,IAAK,GAAA,CAAc,SAAS,YAAA,EAAc;AACxC,YAAA,OAAO,EAAE,OAAA,EAAS,6BAAA,EAA+B,OAAA,EAAS,IAAA,EAAK;AAAA,UACjE;AACA,UAAA,OAAO;AAAA,YACL,OAAA,EAAS,CAAA,kBAAA,EAAsB,GAAA,CAAc,OAAO,CAAA,CAAA;AAAA,YACpD,OAAA,EAAS;AAAA,WACX;AAAA,QACF,CAAA,SAAE;AACA,UAAA,YAAA,CAAa,KAAK,CAAA;AAClB,UAAA,MAAA,CAAO,mBAAA,CAAoB,SAAS,eAAe,CAAA;AAAA,QACrD;AAAA,MACF;AAAA,KACF;AACA,IAAA,OAAO,CAAC,cAAc,CAAA;AAAA,EACxB,CAAA;AAAA,EACA,OAAO,IAAA,EAAoC;AACzC,IAAA,OAAO;AAAA,MACL;AAAA,QACE,IAAA,EAAM,kBAAA;AAAA,QACN,WAAA,EAAa,iEAAA;AAAA,QACb,SAAA,EACE,+MAAA;AAAA,QACF,WAAA,EAAa,IAAA,CAAK,UAAA,EAAY,qBAAqB;AAAA;AACrD,KACF;AAAA,EACF;AACF,CAAC,CAAA;AAED,IAAO,WAAA,GAAQ","file":"index.js","sourcesContent":["/**\n * cap-scrape-firecrawl — wires Firecrawl's MCP server AND adds a\n * `scrape_and_store` LocalToolHandler that calls Firecrawl's REST API\n * directly and persists the result in Postgres for later retrieval.\n *\n * Usage in render-harness.yaml:\n *\n * capabilities:\n * - pack: \"@render-harness/cap-scrape-firecrawl\"\n * config:\n * persist: true # default\n *\n * Surfaces:\n * - One MCP server `firecrawl` (stdio transport).\n * - One LocalToolHandler `scrape_and_store` (when `persist: true`).\n * - One skill (skills/firecrawl-scrape.md).\n * - envSchema entry for FIRECRAWL_API_KEY.\n *\n * Config keys:\n * - `apiKeyEnv` (string, default \"FIRECRAWL_API_KEY\")\n * - `persist` (boolean, default true) — disable to skip the Postgres\n * bootstrap and the scrape_and_store tool. Useful when the entry\n * wants Firecrawl tools but already has its own storage.\n * - `apiBase` (string, default \"https://api.firecrawl.dev\") — the\n * REST base URL. 
Override for self-hosted Firecrawl.\n */\n\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport {\n type LocalToolHandler,\n type McpServerConfig,\n type SkillMetadata,\n getPool,\n} from \"@render-harness/core\";\nimport { definePack, type PackContext } from \"@render-harness/registry\";\n\nconst HERE = dirname(fileURLToPath(import.meta.url));\nconst SKILLS_DIR = join(HERE, \"..\", \"skills\");\n\ninterface FirecrawlConfig {\n apiKeyEnv?: string;\n persist?: boolean;\n apiBase?: string;\n}\n\nfunction readConfig(ctx: PackContext) {\n const cfg = ctx.config as FirecrawlConfig;\n return {\n apiKeyEnv: cfg.apiKeyEnv ?? \"FIRECRAWL_API_KEY\",\n persist: cfg.persist ?? true,\n apiBase: cfg.apiBase ?? \"https://api.firecrawl.dev\",\n };\n}\n\nconst SCHEMA_BOOTSTRAP_SQL = `\nCREATE TABLE IF NOT EXISTS firecrawl_scrapes (\n id BIGSERIAL PRIMARY KEY,\n run_id UUID NOT NULL,\n url TEXT NOT NULL,\n fetched_at TIMESTAMPTZ NOT NULL DEFAULT now(),\n status_code INTEGER,\n markdown TEXT,\n raw JSONB,\n CONSTRAINT firecrawl_scrapes_url_run UNIQUE (run_id, url)\n);\nCREATE INDEX IF NOT EXISTS firecrawl_scrapes_run_idx ON firecrawl_scrapes (run_id);\n`;\n\nlet bootstrapped = false;\nasync function ensureSchema(): Promise<void> {\n if (bootstrapped) return;\n const pool = getPool();\n await pool.query(SCHEMA_BOOTSTRAP_SQL);\n bootstrapped = true;\n}\n\nconst pack = definePack({\n name: \"cap-scrape-firecrawl\",\n version: \"0.1.0\",\n envSchema: [\n {\n name: \"FIRECRAWL_API_KEY\",\n required: true,\n secret: true,\n description: \"API key for Firecrawl (https://firecrawl.dev).\",\n },\n ],\n mcpServers(ctx: PackContext): McpServerConfig[] {\n const cfg = readConfig(ctx);\n const apiKey = ctx.env(cfg.apiKeyEnv);\n if (!apiKey) {\n throw new Error(\n `cap-scrape-firecrawl: env var ${cfg.apiKeyEnv} is not set. 
Set it before building or starting the agent.`,\n );\n }\n return [\n {\n name: \"firecrawl\",\n transport: \"stdio\",\n command: \"npx\",\n args: [\"-y\", \"firecrawl-mcp\"],\n env: { FIRECRAWL_API_KEY: apiKey },\n },\n ];\n },\n localTools(ctx: PackContext): LocalToolHandler[] {\n const cfg = readConfig(ctx);\n if (!cfg.persist) return [];\n const apiKey = ctx.env(cfg.apiKeyEnv);\n if (!apiKey) return [];\n const apiBase = cfg.apiBase;\n\n const scrapeAndStore: LocalToolHandler = {\n definition: {\n name: \"scrape_and_store\",\n description:\n \"Scrape a URL via Firecrawl and persist the rendered markdown + raw JSON to Postgres. Returns the row id and a short excerpt. Use this when you need to come back to the page later in the run.\",\n source: \"pack:cap-scrape-firecrawl\",\n inputSchema: {\n type: \"object\",\n additionalProperties: false,\n properties: {\n url: { type: \"string\", description: \"The URL to scrape.\" },\n timeout_ms: {\n type: \"integer\",\n description: \"Per-request timeout in milliseconds. Defaults to 60000.\",\n minimum: 1000,\n maximum: 300000,\n },\n },\n required: [\"url\"],\n },\n },\n async handler({ input, runId, signal, logger }) {\n const args = (input ?? {}) as { url?: string; timeout_ms?: number };\n if (!args.url || typeof args.url !== \"string\") {\n return { content: \"scrape_and_store: missing or invalid `url`\", isError: true };\n }\n await ensureSchema();\n const timeoutMs = args.timeout_ms ?? 
60_000;\n const ctrl = new AbortController();\n const timer = setTimeout(() => ctrl.abort(), timeoutMs);\n const onUpstreamAbort = () => ctrl.abort();\n signal.addEventListener(\"abort\", onUpstreamAbort, { once: true });\n try {\n const res = await fetch(`${apiBase}/v1/scrape`, {\n method: \"POST\",\n headers: {\n Authorization: `Bearer ${apiKey}`,\n \"content-type\": \"application/json\",\n },\n body: JSON.stringify({ url: args.url, formats: [\"markdown\"] }),\n signal: ctrl.signal,\n });\n if (!res.ok) {\n const body = await res.text().catch(() => \"\");\n logger.warn({ status: res.status, body }, \"firecrawl scrape failed\");\n return {\n content: `scrape_and_store: firecrawl returned ${res.status}: ${body.slice(0, 500)}`,\n isError: true,\n };\n }\n const json = (await res.json()) as {\n data?: { markdown?: string; metadata?: { statusCode?: number } };\n };\n const markdown = json.data?.markdown ?? \"\";\n const statusCode = json.data?.metadata?.statusCode ?? null;\n const pool = getPool();\n const insert = await pool.query<{ id: string }>(\n `INSERT INTO firecrawl_scrapes (run_id, url, status_code, markdown, raw)\n VALUES ($1, $2, $3, $4, $5)\n ON CONFLICT (run_id, url) DO UPDATE SET\n fetched_at = now(),\n status_code = EXCLUDED.status_code,\n markdown = EXCLUDED.markdown,\n raw = EXCLUDED.raw\n RETURNING id`,\n [runId, args.url, statusCode, markdown, json],\n );\n const rowId = insert.rows[0]?.id ?? \"?\";\n const excerpt = markdown.slice(0, 1000);\n return {\n content: `Stored as firecrawl_scrapes.id=${rowId}. status=${statusCode ?? \"?\"}. 
excerpt:\\n\\n${excerpt}`,\n };\n } catch (err) {\n if ((err as Error).name === \"AbortError\") {\n return { content: \"scrape_and_store: timed out\", isError: true };\n }\n return {\n content: `scrape_and_store: ${(err as Error).message}`,\n isError: true,\n };\n } finally {\n clearTimeout(timer);\n signal.removeEventListener(\"abort\", onUpstreamAbort);\n }\n },\n };\n return [scrapeAndStore];\n },\n skills(_ctx: PackContext): SkillMetadata[] {\n return [\n {\n name: \"firecrawl-scrape\",\n description: \"Use Firecrawl to render a URL into clean markdown and store it.\",\n whenToUse:\n \"When the user gives you a URL whose content you'll need later in the run, OR you've already searched and want to read the top result. Prefer scrape_and_store over the raw MCP tool when persistence matters.\",\n contentPath: join(SKILLS_DIR, \"firecrawl-scrape.md\"),\n },\n ];\n },\n});\n\nexport default pack;\n"]}
|
package/package.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@render-harness/cap-scrape-firecrawl",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "Firecrawl scrape capability pack with optional Postgres persistence.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"main": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"types": "./dist/index.d.ts",
|
|
12
|
+
"import": "./dist/index.js"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"dist",
|
|
17
|
+
"skills"
|
|
18
|
+
],
|
|
19
|
+
"keywords": [
|
|
20
|
+
"render-harness-cap",
|
|
21
|
+
"render-harness",
|
|
22
|
+
"firecrawl",
|
|
23
|
+
"scrape",
|
|
24
|
+
"mcp"
|
|
25
|
+
],
|
|
26
|
+
"renderHarness": {
|
|
27
|
+
"gallery": {
|
|
28
|
+
"label": "Firecrawl page scrape",
|
|
29
|
+
"envHint": "FIRECRAWL_API_KEY"
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
"scripts": {
|
|
33
|
+
"build": "tsup",
|
|
34
|
+
"typecheck": "tsc --noEmit",
|
|
35
|
+
"test": "vitest run --passWithNoTests"
|
|
36
|
+
},
|
|
37
|
+
"dependencies": {
|
|
38
|
+
"@render-harness/core": "workspace:*",
|
|
39
|
+
"@render-harness/registry": "workspace:*"
|
|
40
|
+
},
|
|
41
|
+
"devDependencies": {
|
|
42
|
+
"@types/node": "^25.6.2",
|
|
43
|
+
"tsup": "^8.5.1",
|
|
44
|
+
"typescript": "^6.0.3",
|
|
45
|
+
"vitest": "^4.1.5"
|
|
46
|
+
},
|
|
47
|
+
"publishConfig": {
|
|
48
|
+
"access": "public"
|
|
49
|
+
}
|
|
50
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: firecrawl-scrape
|
|
3
|
+
description: Use Firecrawl to render a URL into clean markdown and store it.
|
|
4
|
+
when_to_use: When the user gives you a URL whose content you'll need later in the run, OR you've already searched and want to read the top result. Prefer scrape_and_store over the raw MCP tool when persistence matters.
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Scraping with Firecrawl
|
|
8
|
+
|
|
9
|
+
Two ways to use Firecrawl from this entry:
|
|
10
|
+
|
|
11
|
+
1. The `cap-scrape-firecrawl.firecrawl` MCP — exposes Firecrawl's full tool set: `firecrawl_scrape`, `firecrawl_crawl`, `firecrawl_search`, etc. Use this for one-off lookups whose output you don't need to revisit.
|
|
12
|
+
|
|
13
|
+
2. The `scrape_and_store` local tool — same scrape semantics, but the rendered markdown + raw JSON are inserted into Postgres (`firecrawl_scrapes` table) keyed on `(run_id, url)`. The tool returns the row id plus a 1k-character excerpt. Use this when you'll come back to the page later in the run, or when you want a durable record of what you saw.
|
|
14
|
+
|
|
15
|
+
## When to use scrape_and_store vs the MCP
|
|
16
|
+
|
|
17
|
+
- The page is large (>~5k tokens) and you only need an excerpt now: scrape_and_store; later, query the row id directly.
|
|
18
|
+
- You're scraping many pages: scrape_and_store gives you a stable row id you can reference instead of re-passing the markdown through the context.
|
|
19
|
+
- Quick one-shot answer where you'll never come back: use the MCP.
|
|
20
|
+
|
|
21
|
+
## Hard rules
|
|
22
|
+
|
|
23
|
+
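- Always prefer the page's canonical URL over a query-string variant. The unique constraint is on `(run_id, url)`, so every URL variant is stored as a separate row, while re-scraping the exact same URL within a run overwrites the existing row.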
|
|
24
|
+
- If a scrape fails, surface the error in your final answer; don't keep retrying with the same URL.
|