@robot-resources/scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +228 -0
- package/bin/setup.js +135 -0
- package/dist/index.cjs +1002 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +373 -0
- package/dist/index.d.ts +373 -0
- package/dist/index.js +976 -0
- package/dist/index.js.map +1 -0
- package/package.json +93 -0
package/README.md
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
[](https://github.com/robot-resources/scraper/actions/workflows/ci.yml)
|
|
2
|
+
[](https://www.npmjs.com/package/@robot-resources/scraper)
|
|
3
|
+
[](https://github.com/robot-resources/scraper/blob/main/LICENSE)
|
|
4
|
+
[](https://www.npmjs.com/package/@robot-resources/scraper)
|
|
5
|
+
[](https://codecov.io/gh/robot-resources/scraper)
|
|
6
|
+
[](https://bundlephobia.com/package/@robot-resources/scraper)
|
|
7
|
+
|
|
8
|
+
# @robot-resources/scraper
|
|
9
|
+
|
|
10
|
+
> Context compression for AI agents. Fetch → Extract → Convert pipeline without LLM dependency.
|
|
11
|
+
|
|
12
|
+
Reduces web page tokens by 70-80% for AI agent consumption. 3-tier fetch with auto-fallback, BFS multi-page crawl, robots.txt compliance.
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
npm install @robot-resources/scraper
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
**Optional peer dependencies** (install only what you need):
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npm install impit # Stealth mode — TLS fingerprint impersonation
|
|
24
|
+
npm install playwright # Render mode — headless browser for JS-rendered pages
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
```typescript
|
|
30
|
+
import { scrape } from '@robot-resources/scraper';
|
|
31
|
+
|
|
32
|
+
const result = await scrape('https://example.com/article');
|
|
33
|
+
|
|
34
|
+
console.log(result.markdown); // Compressed content
|
|
35
|
+
console.log(result.tokenCount); // Estimated tokens
|
|
36
|
+
console.log(result.title); // Page title
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Fetch Modes
|
|
40
|
+
|
|
41
|
+
Control how pages are fetched with the `mode` option:
|
|
42
|
+
|
|
43
|
+
| Mode | How | When to use |
|
|
44
|
+
|------|-----|-------------|
|
|
45
|
+
| `'fast'` | Plain HTTP fetch | Static sites, APIs, docs |
|
|
46
|
+
| `'stealth'` | TLS fingerprint impersonation (impit) | Sites with anti-bot protection |
|
|
47
|
+
| `'render'` | Headless Playwright browser | JS-rendered SPAs, dynamic content |
|
|
48
|
+
| `'auto'` | Fast first, falls back to stealth on 403/challenge | **Default** — best for unknown sites |
|
|
49
|
+
|
|
50
|
+
```typescript
|
|
51
|
+
// Explicit stealth for a protected site
|
|
52
|
+
const result = await scrape('https://protected-site.com', { mode: 'stealth' });
|
|
53
|
+
|
|
54
|
+
// Auto mode (default) — tries fast, falls back to stealth if blocked
|
|
55
|
+
const result = await scrape('https://unknown-site.com');
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Crawling Multiple Pages
|
|
59
|
+
|
|
60
|
+
```typescript
|
|
61
|
+
import { crawl } from '@robot-resources/scraper';
|
|
62
|
+
|
|
63
|
+
const result = await crawl({
|
|
64
|
+
url: 'https://docs.example.com',
|
|
65
|
+
depth: 2, // Max link depth (default: 2)
|
|
66
|
+
limit: 20, // Max pages (default: 50)
|
|
67
|
+
mode: 'auto', // Fetch mode per page
|
|
68
|
+
concurrency: 3, // Parallel fetches (default: 3)
|
|
69
|
+
respectRobots: true, // Obey robots.txt (default: true)
|
|
70
|
+
include: ['**/docs/**'], // Only crawl docs paths
|
|
71
|
+
exclude: ['**/archive/**'], // Skip archive
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
console.log(`Crawled ${result.totalCrawled} pages in ${result.duration}ms`);
|
|
75
|
+
|
|
76
|
+
for (const page of result.pages) {
|
|
77
|
+
console.log(`[depth ${page.depth}] ${page.title}: ${page.tokenCount} tokens`);
|
|
78
|
+
}
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
The crawler uses BFS link discovery, seeds from sitemap.xml when available, and respects crawl-delay from robots.txt.
|
|
82
|
+
|
|
83
|
+
## API
|
|
84
|
+
|
|
85
|
+
### `scrape(url, options?)`
|
|
86
|
+
|
|
87
|
+
Fetch a URL and return compressed markdown.
|
|
88
|
+
|
|
89
|
+
```typescript
|
|
90
|
+
const result = await scrape('https://example.com', {
|
|
91
|
+
mode: 'auto', // Fetch mode (default: 'auto')
|
|
92
|
+
timeout: 5000, // Request timeout ms (default: 10000)
|
|
93
|
+
maxRetries: 2, // Retry attempts (default: 3)
|
|
94
|
+
userAgent: '...', // Custom user agent
|
|
95
|
+
respectRobots: false, // Check robots.txt (default: false)
|
|
96
|
+
});
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**Returns:** `ScrapeResult`
|
|
100
|
+
|
|
101
|
+
```typescript
|
|
102
|
+
interface ScrapeResult {
|
|
103
|
+
markdown: string; // Compressed content
|
|
104
|
+
tokenCount: number; // Estimated token count
|
|
105
|
+
title?: string; // Page title
|
|
106
|
+
author?: string; // Author if found
|
|
107
|
+
siteName?: string; // Site name if found
|
|
108
|
+
publishedAt?: string; // Publish date if found
|
|
109
|
+
url: string; // Final URL after redirects
|
|
110
|
+
}
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### `crawl(options)`
|
|
114
|
+
|
|
115
|
+
BFS multi-page crawl from a starting URL.
|
|
116
|
+
|
|
117
|
+
```typescript
|
|
118
|
+
const result = await crawl({
|
|
119
|
+
url: 'https://example.com', // Starting URL (required)
|
|
120
|
+
depth: 2, // Max depth (default: 2)
|
|
121
|
+
limit: 50, // Max pages (default: 50)
|
|
122
|
+
mode: 'auto', // Fetch mode (default: 'auto')
|
|
123
|
+
include: ['**/blog/**'], // Include patterns (glob)
|
|
124
|
+
exclude: ['**/admin/**'], // Exclude patterns (glob)
|
|
125
|
+
timeout: 10000, // Per-page timeout ms
|
|
126
|
+
concurrency: 3, // Parallel fetches (default: 3)
|
|
127
|
+
respectRobots: true, // Obey robots.txt (default: true)
|
|
128
|
+
});
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**Returns:** `CrawlResult`
|
|
132
|
+
|
|
133
|
+
```typescript
|
|
134
|
+
interface CrawlResult {
|
|
135
|
+
pages: CrawlPageResult[]; // Scraped pages (extends ScrapeResult + depth)
|
|
136
|
+
totalDiscovered: number; // Total URLs found
|
|
137
|
+
totalCrawled: number; // Successfully scraped
|
|
138
|
+
totalSkipped: number; // Skipped (robots, filter, limit)
|
|
139
|
+
errors: CrawlError[]; // Per-URL errors
|
|
140
|
+
duration: number; // Total ms
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Individual Layers
|
|
145
|
+
|
|
146
|
+
For advanced usage, use the pipeline layers directly:
|
|
147
|
+
|
|
148
|
+
```typescript
|
|
149
|
+
import {
|
|
150
|
+
fetchUrl,
|
|
151
|
+
fetchStealth,
|
|
152
|
+
fetchRender,
|
|
153
|
+
extractContent,
|
|
154
|
+
convertToMarkdown,
|
|
155
|
+
estimateTokens,
|
|
156
|
+
} from '@robot-resources/scraper';
|
|
157
|
+
|
|
158
|
+
// Layer 1: Fetch HTML (choose your tier)
|
|
159
|
+
const fetched = await fetchUrl('https://example.com');
|
|
160
|
+
// or: await fetchStealth(url, options)
|
|
161
|
+
// or: await fetchRender(url, options)
|
|
162
|
+
|
|
163
|
+
// Layer 2: Extract main content
|
|
164
|
+
const extracted = await extractContent(fetched);
|
|
165
|
+
|
|
166
|
+
// Layer 3: Convert to markdown
|
|
167
|
+
const converted = await convertToMarkdown(extracted);
|
|
168
|
+
|
|
169
|
+
// Token estimation
|
|
170
|
+
const htmlTokens = estimateTokens(fetched.html);
|
|
171
|
+
console.log(`Compressed ${htmlTokens} → ${converted.tokenCount} tokens`);
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Robots & Sitemap
|
|
175
|
+
|
|
176
|
+
```typescript
|
|
177
|
+
import {
|
|
178
|
+
isAllowedByRobots,
|
|
179
|
+
getCrawlDelay,
|
|
180
|
+
getSitemapUrls,
|
|
181
|
+
parseSitemap,
|
|
182
|
+
} from '@robot-resources/scraper';
|
|
183
|
+
|
|
184
|
+
const allowed = await isAllowedByRobots('https://example.com/page');
|
|
185
|
+
const delay = await getCrawlDelay('https://example.com');
|
|
186
|
+
const entries = await parseSitemap('https://example.com/sitemap.xml');
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### Error Handling
|
|
190
|
+
|
|
191
|
+
```typescript
|
|
192
|
+
import { scrape, FetchError, ExtractionError } from '@robot-resources/scraper';
|
|
193
|
+
|
|
194
|
+
try {
|
|
195
|
+
const result = await scrape(url);
|
|
196
|
+
} catch (error) {
|
|
197
|
+
if (error instanceof FetchError) {
|
|
198
|
+
console.log('Fetch failed:', error.statusCode, error.retryable);
|
|
199
|
+
}
|
|
200
|
+
if (error instanceof ExtractionError) {
|
|
201
|
+
console.log('Extraction failed:', error.code);
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Token Reduction
|
|
207
|
+
|
|
208
|
+
| Page Type | HTML Tokens | Markdown Tokens | Reduction |
|
|
209
|
+
|-----------|-------------|-----------------|-----------|
|
|
210
|
+
| News article | ~15,000 | ~3,000 | 80% |
|
|
211
|
+
| Documentation | ~12,000 | ~2,500 | 79% |
|
|
212
|
+
| Blog post | ~8,000 | ~1,800 | 77% |
|
|
213
|
+
|
|
214
|
+
## Requirements
|
|
215
|
+
|
|
216
|
+
- Node.js 18+
|
|
217
|
+
- ESM or CommonJS
|
|
218
|
+
|
|
219
|
+
## Related
|
|
220
|
+
|
|
221
|
+
- [@robot-resources/scraper-mcp](https://npm.im/@robot-resources/scraper-mcp) - MCP server for AI agents
|
|
222
|
+
- [@robot-resources/scraper-tracking](https://npm.im/@robot-resources/scraper-tracking) - Usage tracking
|
|
223
|
+
- [scraper.robotresources.ai](https://scraper.robotresources.ai) - Hosted API
|
|
224
|
+
- [Robot Resources](https://robotresources.ai) - Human Resources, but for your AI agents
|
|
225
|
+
|
|
226
|
+
## License
|
|
227
|
+
|
|
228
|
+
MIT
|
package/bin/setup.js
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* robot-resources-scraper — Setup wizard for @robot-resources/scraper.
|
|
5
|
+
*
|
|
6
|
+
* Triggered via `npx @robot-resources/scraper`.
|
|
7
|
+
* Offers optional GitHub login and shows usage instructions.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { readFileSync, writeFileSync, copyFileSync, mkdirSync, existsSync } from "node:fs";
import { homedir } from "node:os";
import { dirname, join } from "node:path";
|
|
13
|
+
|
|
14
|
+
// ─── ANSI helpers ────────────────────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
const c = {
|
|
17
|
+
reset: "\x1b[0m",
|
|
18
|
+
bold: "\x1b[1m",
|
|
19
|
+
dim: "\x1b[2m",
|
|
20
|
+
green: "\x1b[32m",
|
|
21
|
+
yellow: "\x1b[33m",
|
|
22
|
+
cyan: "\x1b[36m",
|
|
23
|
+
blue: "\x1b[34m",
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
function success(msg) { console.log(` ${c.green}✓${c.reset} ${msg}`); }
|
|
27
|
+
function step(msg) { console.log(` ${c.cyan}→${c.reset} ${msg}`); }
|
|
28
|
+
function info(msg) { console.log(` ${c.dim}${msg}${c.reset}`); }
|
|
29
|
+
function warn(msg) { console.log(` ${c.yellow}!${c.reset} ${msg}`); }
|
|
30
|
+
|
|
31
|
+
// ─── Config helpers (inline — no external deps for this bin) ─────────────────
|
|
32
|
+
|
|
33
|
+
const CONFIG_DIR = join(homedir(), ".robot-resources");
|
|
34
|
+
const CONFIG_FILE = join(CONFIG_DIR, "config.json");
|
|
35
|
+
|
|
36
|
+
function readConfig() {
|
|
37
|
+
try { return JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); }
|
|
38
|
+
catch { return {}; }
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// ─── MCP auto-config ─────────────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
const MCP_KEY = "robot-resources-scraper";
|
|
44
|
+
const MCP_ENTRY = { command: "npx", args: ["-y", "@robot-resources/scraper-mcp"] };
|
|
45
|
+
|
|
46
|
+
function detectAgents() {
|
|
47
|
+
const home = homedir();
|
|
48
|
+
const agents = [
|
|
49
|
+
{
|
|
50
|
+
name: "Claude Desktop",
|
|
51
|
+
configPath: process.platform === "darwin"
|
|
52
|
+
? join(home, "Library", "Application Support", "Claude", "claude_desktop_config.json")
|
|
53
|
+
: join(home, ".config", "Claude", "claude_desktop_config.json"),
|
|
54
|
+
},
|
|
55
|
+
{ name: "Cursor", configPath: join(home, ".cursor", "mcp.json") },
|
|
56
|
+
];
|
|
57
|
+
return agents.filter((a) => existsSync(a.configPath) || existsSync(join(a.configPath, "..")));
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
function configureAgentMCP() {
|
|
61
|
+
const agents = detectAgents();
|
|
62
|
+
const results = [];
|
|
63
|
+
|
|
64
|
+
for (const agent of agents) {
|
|
65
|
+
try {
|
|
66
|
+
let config;
|
|
67
|
+
try { config = JSON.parse(readFileSync(agent.configPath, "utf-8")); }
|
|
68
|
+
catch { config = {}; }
|
|
69
|
+
|
|
70
|
+
config.mcpServers = config.mcpServers || {};
|
|
71
|
+
if (config.mcpServers[MCP_KEY]) {
|
|
72
|
+
results.push({ name: agent.name, action: "exists" });
|
|
73
|
+
continue;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
// Backup before modifying
|
|
77
|
+
if (existsSync(agent.configPath)) {
|
|
78
|
+
copyFileSync(agent.configPath, `${agent.configPath}.bak`);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
config.mcpServers[MCP_KEY] = MCP_ENTRY;
|
|
82
|
+
mkdirSync(join(agent.configPath, ".."), { recursive: true });
|
|
83
|
+
writeFileSync(agent.configPath, JSON.stringify(config, null, 2) + "\n");
|
|
84
|
+
results.push({ name: agent.name, action: "added" });
|
|
85
|
+
} catch (err) {
|
|
86
|
+
results.push({ name: agent.name, action: "error", reason: err.message });
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
return results;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// ─── Main ────────────────────────────────────────────────────────────────────
|
|
93
|
+
|
|
94
|
+
async function main() {
|
|
95
|
+
console.log(`\n ${c.blue}${c.bold}██ Robot Resources — Scraper Setup${c.reset}\n`);
|
|
96
|
+
|
|
97
|
+
// Step 1: Auth status
|
|
98
|
+
const config = readConfig();
|
|
99
|
+
if (config.api_key) {
|
|
100
|
+
success(`Logged in as ${config.user_name || config.user_email || "unknown"}`);
|
|
101
|
+
} else {
|
|
102
|
+
info("Not logged in. Scraper works without login.");
|
|
103
|
+
info(`To enable telemetry, run: ${c.cyan}npx robot-resources${c.reset}`);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// Step 2: MCP auto-config
|
|
107
|
+
console.log("");
|
|
108
|
+
step("Configuring MCP in detected agents...");
|
|
109
|
+
|
|
110
|
+
const mcpResults = configureAgentMCP();
|
|
111
|
+
if (mcpResults.length === 0) {
|
|
112
|
+
info("No supported agents detected (Claude Desktop, Cursor)");
|
|
113
|
+
} else {
|
|
114
|
+
for (const r of mcpResults) {
|
|
115
|
+
if (r.action === "added") success(`${r.name}: scraper MCP configured`);
|
|
116
|
+
else if (r.action === "exists") success(`${r.name}: already configured`);
|
|
117
|
+
else warn(`${r.name}: ${r.reason || r.action}`);
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// Step 3: Usage
|
|
122
|
+
console.log(`\n ${c.blue}${c.bold}── Ready ──${c.reset}\n`);
|
|
123
|
+
console.log(" Use as a library:");
|
|
124
|
+
console.log(` ${c.dim}import { scrape } from '@robot-resources/scraper';${c.reset}`);
|
|
125
|
+
console.log(` ${c.dim}const result = await scrape('https://example.com');${c.reset}`);
|
|
126
|
+
console.log("");
|
|
127
|
+
console.log(" Use via MCP (already configured above):");
|
|
128
|
+
console.log(` ${c.dim}Your agent can call scraper_compress_url(url)${c.reset}`);
|
|
129
|
+
console.log("");
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
main().catch((err) => {
|
|
133
|
+
console.error(`\n Setup failed: ${err.message}\n`);
|
|
134
|
+
process.exit(1);
|
|
135
|
+
});
|