arcfetch 1.2.0 → 1.2.2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +157 -158
- package/index.ts +1 -1
- package/package.json +1 -1
- package/src/core/fetch-links.ts +1 -1
- package/src/core/pipeline.ts +1 -0
- package/src/core/playwright/local.ts +21 -13
- package/src/core/playwright/manager.ts +55 -5
package/README.md
CHANGED
|
@@ -12,6 +12,7 @@ Perfect for AI workflows, research, and documentation. Fetches URLs, extracts ar
|
|
|
12
12
|
| Problem | Solution |
|
|
13
13
|
|---------|----------|
|
|
14
14
|
| **JS-heavy sites return blank** | Auto-detects and retries with Playwright |
|
|
15
|
+
| **Login walls / error pages scored as content** | Boilerplate detection (22 patterns) catches them |
|
|
15
16
|
| **Too much HTML clutter** | Mozilla Readability extracts just the article |
|
|
16
17
|
| **High token costs for LLMs** | 90-95% token reduction vs raw HTML |
|
|
17
18
|
| **No good caching story** | Temp → Docs workflow for easy curation |
|
|
@@ -20,12 +21,14 @@ Perfect for AI workflows, research, and documentation. Fetches URLs, extracts ar
|
|
|
20
21
|
## Features
|
|
21
22
|
|
|
22
23
|
- **Smart Fetching**: Simple HTTP first, automatic Playwright fallback for JS-heavy sites
|
|
23
|
-
- **Quality Gates**:
|
|
24
|
+
- **Quality Gates**: Scoring (0-100) with boilerplate, login wall, paywall, and error page detection
|
|
25
|
+
- **Content-to-Source Ratio**: Catches JS-rendered or gated content by comparing extracted text to source HTML size
|
|
26
|
+
- **Anti-Bot Detection**: Stealth plugin, viewport/timezone/locale rotation, realistic headers, navigator overrides
|
|
24
27
|
- **Clean Markdown**: Mozilla Readability + Turndown for 90-95% token reduction
|
|
25
28
|
- **Temp → Docs Workflow**: Cache to temp folder, promote to docs when ready
|
|
26
|
-
- **
|
|
29
|
+
- **Link Extraction**: Extract and batch-fetch all links from a cached reference
|
|
30
|
+
- **CLI & MCP**: Available as command-line tool and MCP server (6 tools)
|
|
27
31
|
- **Multiple Output Formats**: Plain text, JSON, filepath, or summary
|
|
28
|
-
- **Configurable Thresholds**: Set quality minimums and retry strategies
|
|
29
32
|
|
|
30
33
|
## Quick Start
|
|
31
34
|
|
|
@@ -54,7 +57,7 @@ arcfetch fetch https://example.com/article
|
|
|
54
57
|
### Development
|
|
55
58
|
|
|
56
59
|
```bash
|
|
57
|
-
git clone https://github.com/
|
|
60
|
+
git clone https://github.com/briansunter/arcfetch.git
|
|
58
61
|
cd arcfetch
|
|
59
62
|
bun install
|
|
60
63
|
bun run cli.ts fetch https://example.com
|
|
@@ -62,23 +65,32 @@ bun run cli.ts fetch https://example.com
|
|
|
62
65
|
|
|
63
66
|
## CLI Usage
|
|
64
67
|
|
|
65
|
-
###
|
|
68
|
+
### Commands
|
|
66
69
|
|
|
67
70
|
```bash
|
|
68
|
-
# Fetch and
|
|
71
|
+
# Fetch a URL and save to temp folder
|
|
69
72
|
arcfetch fetch https://example.com/article
|
|
70
73
|
|
|
71
74
|
# List all cached references
|
|
72
75
|
arcfetch list
|
|
73
76
|
|
|
77
|
+
# Extract links from a cached reference
|
|
78
|
+
arcfetch links my-article
|
|
79
|
+
|
|
80
|
+
# Fetch all links from a cached reference (parallel)
|
|
81
|
+
arcfetch fetch-links my-article
|
|
82
|
+
|
|
74
83
|
# Promote from temp to permanent docs
|
|
75
|
-
arcfetch promote
|
|
84
|
+
arcfetch promote my-article
|
|
76
85
|
|
|
77
86
|
# Delete a cached reference
|
|
78
|
-
arcfetch delete
|
|
87
|
+
arcfetch delete my-article
|
|
79
88
|
|
|
80
89
|
# Show current configuration
|
|
81
90
|
arcfetch config
|
|
91
|
+
|
|
92
|
+
# Start MCP server
|
|
93
|
+
arcfetch mcp
|
|
82
94
|
```
|
|
83
95
|
|
|
84
96
|
### Output Formats
|
|
@@ -90,7 +102,7 @@ arcfetch fetch https://example.com -o text
|
|
|
90
102
|
# Just the filepath (for scripts)
|
|
91
103
|
arcfetch fetch https://example.com -o path
|
|
92
104
|
|
|
93
|
-
# Summary:
|
|
105
|
+
# Summary: slug|filepath
|
|
94
106
|
arcfetch fetch https://example.com -o summary
|
|
95
107
|
|
|
96
108
|
# Structured JSON
|
|
@@ -115,6 +127,9 @@ arcfetch fetch https://example.com --force-playwright
|
|
|
115
127
|
# Use faster wait strategy for simple sites
|
|
116
128
|
arcfetch fetch https://example.com --wait-strategy load
|
|
117
129
|
|
|
130
|
+
# Re-fetch even if URL already cached
|
|
131
|
+
arcfetch fetch https://example.com --refetch
|
|
132
|
+
|
|
118
133
|
# Custom directories
|
|
119
134
|
arcfetch fetch https://example.com --temp-dir .cache --docs-dir content
|
|
120
135
|
|
|
@@ -126,7 +141,7 @@ arcfetch fetch https://example.com -v
|
|
|
126
141
|
|
|
127
142
|
### Installation (Recommended: npx/bunx)
|
|
128
143
|
|
|
129
|
-
Add to your Claude Code MCP configuration
|
|
144
|
+
Add to your Claude Code MCP configuration:
|
|
130
145
|
|
|
131
146
|
```json
|
|
132
147
|
{
|
|
@@ -170,16 +185,12 @@ Or using bunx (faster):
|
|
|
170
185
|
|
|
171
186
|
| Tool | Parameters | Description |
|
|
172
187
|
|------|------------|-------------|
|
|
173
|
-
| `fetch_url` | `url`, `query?`, `minQuality?`, `
|
|
174
|
-
| `list_cached` |
|
|
175
|
-
| `promote_reference` | `refId` | Move from temp to docs folder |
|
|
188
|
+
| `fetch_url` | `url`, `query?`, `minQuality?`, `refetch?`, `outputFormat?` | Fetch URL with auto JS fallback |
|
|
189
|
+
| `list_cached` | `tempDir?` | List all cached references |
|
|
190
|
+
| `promote_reference` | `refId`, `docsDir?` | Move from temp to docs folder |
|
|
176
191
|
| `delete_cached` | `refId` | Delete a cached reference |
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
```
|
|
180
|
-
User: Fetch https://example.com/article for me
|
|
181
|
-
Claude: [Calls fetch_url tool]
|
|
182
|
-
```
|
|
192
|
+
| `extract_links` | `refId`, `outputFormat?` | Extract links from a cached reference |
|
|
193
|
+
| `fetch_links` | `refId`, `refetch?`, `outputFormat?` | Fetch all links from a cached reference |
|
|
183
194
|
|
|
184
195
|
## Configuration
|
|
185
196
|
|
|
@@ -204,34 +215,71 @@ Create `arcfetch.config.json` in your project root:
|
|
|
204
215
|
}
|
|
205
216
|
```
|
|
206
217
|
|
|
218
|
+
Config files checked (in order): `arcfetch.config.json`, `.arcfetchrc`, `.arcfetchrc.json`
|
|
219
|
+
|
|
207
220
|
### Environment Variables
|
|
208
221
|
|
|
209
222
|
```bash
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
223
|
+
SOFETCH_MIN_SCORE=60
|
|
224
|
+
SOFETCH_JS_RETRY_THRESHOLD=85
|
|
225
|
+
SOFETCH_TEMP_DIR=.tmp/arcfetch
|
|
226
|
+
SOFETCH_DOCS_DIR=docs/ai/references
|
|
213
227
|
```
|
|
214
228
|
|
|
229
|
+
### Priority Order
|
|
230
|
+
|
|
231
|
+
CLI arguments > Environment variables > Config file > Built-in defaults
|
|
232
|
+
|
|
215
233
|
## Quality Pipeline
|
|
216
234
|
|
|
217
235
|
```
|
|
218
|
-
URL → Simple Fetch → Quality
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
236
|
+
URL → Simple Fetch → Extract → Quality Score (0-100)
|
|
237
|
+
│
|
|
238
|
+
┌──────────────────┼──────────────────┐
|
|
239
|
+
▼ ▼ ▼
|
|
240
|
+
Score >= 85 60 - 84 < 60
|
|
241
|
+
│ │ │
|
|
242
|
+
▼ ▼ ▼
|
|
243
|
+
Save Try Playwright Try Playwright
|
|
244
|
+
pick best score (required)
|
|
245
|
+
│
|
|
246
|
+
▼
|
|
247
|
+
Score >= 60?
|
|
248
|
+
Yes → Save
|
|
249
|
+
No → Error
|
|
232
250
|
```
|
|
233
251
|
|
|
234
|
-
|
|
252
|
+
### Quality Scoring
|
|
253
|
+
|
|
254
|
+
Score starts at 100, deductions apply:
|
|
255
|
+
|
|
256
|
+
| Check | Deduction |
|
|
257
|
+
|-------|-----------|
|
|
258
|
+
| Blank content | Score = 0 |
|
|
259
|
+
| Content < 50 chars | -50 |
|
|
260
|
+
| Content < 300 chars | -15 |
|
|
261
|
+
| HTML tags > 100 | -40 |
|
|
262
|
+
| HTML tags > 50 | -20 |
|
|
263
|
+
| HTML ratio > 30% | -25 |
|
|
264
|
+
| Extraction ratio < 0.5% (large page) | -35 |
|
|
265
|
+
| Extraction ratio < 2% (large page) | -20 |
|
|
266
|
+
| Boilerplate detected | -40 |
|
|
267
|
+
| Script/style tags | -10 to -15 |
|
|
268
|
+
|
|
269
|
+
### Boilerplate Detection
|
|
270
|
+
|
|
271
|
+
On short content (< 2000 chars), 22 patterns are checked:
|
|
272
|
+
|
|
273
|
+
- **Error pages**: "something went wrong", "an error occurred"
|
|
274
|
+
- **404 pages**: "page not found"
|
|
275
|
+
- **Login walls**: "log in to continue", "please log in", "sign in to continue"
|
|
276
|
+
- **Paywalls**: "subscribe to continue reading"
|
|
277
|
+
- **Bot detection**: "are you a robot", "complete the captcha"
|
|
278
|
+
- **Access denied**, **JS-required**, **unsupported browser**
|
|
279
|
+
|
|
280
|
+
Long articles (>= 2000 chars) are not checked for boilerplate to avoid false positives.
|
|
281
|
+
|
|
282
|
+
### Playwright Wait Strategies
|
|
235
283
|
|
|
236
284
|
| Strategy | Speed | Reliability | Best For |
|
|
237
285
|
|----------|-------|-------------|----------|
|
|
@@ -239,25 +287,29 @@ URL → Simple Fetch → Quality Check
|
|
|
239
287
|
| `domcontentloaded` | Medium | Medium | Most SPAs, modern sites |
|
|
240
288
|
| `load` | Fastest | Basic | Static sites, simple pages |
|
|
241
289
|
|
|
242
|
-
##
|
|
290
|
+
## File Format
|
|
243
291
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
292
|
+
Cached files use markdown with YAML frontmatter:
|
|
293
|
+
|
|
294
|
+
```markdown
|
|
295
|
+
---
|
|
296
|
+
title: "Article Title"
|
|
297
|
+
source_url: https://example.com/article
|
|
298
|
+
fetched_date: 2026-02-06
|
|
299
|
+
type: web
|
|
300
|
+
status: temporary
|
|
301
|
+
query: "optional search query"
|
|
302
|
+
---
|
|
303
|
+
|
|
304
|
+
# Article Title
|
|
305
|
+
|
|
306
|
+
Extracted markdown content...
|
|
256
307
|
```
|
|
257
308
|
|
|
258
|
-
**
|
|
259
|
-
-
|
|
260
|
-
-
|
|
309
|
+
- **Ref IDs** are slugified titles (e.g., `how-to-build-react-apps`)
|
|
310
|
+
- **Temp storage**: `.tmp/arcfetch/<slug>.md` (status: temporary)
|
|
311
|
+
- **Permanent storage**: `docs/ai/references/<slug>.md` (status: permanent, after promote)
|
|
312
|
+
- **Duplicate detection**: re-fetching the same URL returns the existing ref unless `--refetch` is passed
|
|
261
313
|
|
|
262
314
|
## Real-World Examples
|
|
263
315
|
|
|
@@ -265,97 +317,74 @@ URL → Simple Fetch → Quality Check (0-100)
|
|
|
265
317
|
|
|
266
318
|
```bash
|
|
267
319
|
# Fetch multiple articles for research
|
|
268
|
-
|
|
269
|
-
|
|
320
|
+
arcfetch fetch https://arxiv.org/abs/2301.00001 -q "LLM research"
|
|
321
|
+
arcfetch fetch https://openai.com/research/gpt-4 -q "GPT-4"
|
|
322
|
+
|
|
323
|
+
# Review all cached references
|
|
324
|
+
arcfetch list --pretty
|
|
325
|
+
|
|
326
|
+
# Promote the good ones to docs
|
|
327
|
+
arcfetch promote llm-research-paper
|
|
328
|
+
arcfetch promote gpt-4-technical-report
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
### Link Crawling Workflow
|
|
332
|
+
|
|
333
|
+
```bash
|
|
334
|
+
# Fetch a page with lots of links
|
|
335
|
+
arcfetch fetch https://example.com/resources --pretty
|
|
270
336
|
|
|
271
|
-
#
|
|
272
|
-
|
|
337
|
+
# See what links it contains
|
|
338
|
+
arcfetch links resources --pretty
|
|
273
339
|
|
|
274
|
-
#
|
|
275
|
-
|
|
276
|
-
npx arcfetch promote REF-002
|
|
340
|
+
# Fetch all of them in parallel
|
|
341
|
+
arcfetch fetch-links resources --pretty
|
|
277
342
|
```
|
|
278
343
|
|
|
279
344
|
### Script Integration
|
|
280
345
|
|
|
281
346
|
```bash
|
|
282
347
|
#!/bin/bash
|
|
283
|
-
# fetch-and-process.sh
|
|
284
|
-
|
|
285
348
|
# Fetch and get filepath
|
|
286
|
-
filepath=$(
|
|
349
|
+
filepath=$(arcfetch fetch https://example.com -o path)
|
|
287
350
|
|
|
288
351
|
# Process with other tools
|
|
289
352
|
cat "$filepath" | other-tool
|
|
290
353
|
|
|
291
|
-
# Or get
|
|
292
|
-
|
|
293
|
-
ref_id=$(echo "$summary" | cut -d'|' -f1)
|
|
294
|
-
|
|
295
|
-
# Promote if it meets quality standards
|
|
296
|
-
if npx arcfetch promote "$ref_id"; then
|
|
297
|
-
echo "Successfully promoted $ref_id"
|
|
298
|
-
fi
|
|
354
|
+
# Or get JSON for structured processing
|
|
355
|
+
arcfetch fetch https://example.com -o json | jq '.quality'
|
|
299
356
|
```
|
|
300
357
|
|
|
301
358
|
### Handling JS-Heavy Sites
|
|
302
359
|
|
|
303
360
|
```bash
|
|
304
361
|
# Modern React/Vue/Angular apps
|
|
305
|
-
arcfetch fetch https://spa-example.com --force-playwright
|
|
362
|
+
arcfetch fetch https://spa-example.com --force-playwright
|
|
306
363
|
|
|
307
364
|
# Simple blogs (use faster strategy)
|
|
308
365
|
arcfetch fetch https://blog.example.com --wait-strategy load
|
|
309
366
|
|
|
310
|
-
# Unknown site (let arcfetch decide)
|
|
311
|
-
arcfetch fetch https://unknown-site.com
|
|
312
|
-
```
|
|
313
|
-
|
|
314
|
-
### Bulk Fetching with JSON Output
|
|
315
|
-
|
|
316
|
-
```bash
|
|
317
|
-
# Fetch multiple URLs and parse JSON
|
|
318
|
-
for url in "${urls[@]}"; do
|
|
319
|
-
arcfetch fetch "$url" -o json >> results.json
|
|
320
|
-
done
|
|
321
|
-
|
|
322
|
-
# Or use jq to extract specific fields
|
|
323
|
-
arcfetch fetch https://example.com -o json | jq '.filepath'
|
|
367
|
+
# Unknown site (let arcfetch decide automatically)
|
|
368
|
+
arcfetch fetch https://unknown-site.com -v
|
|
324
369
|
```
|
|
325
370
|
|
|
326
371
|
## Troubleshooting
|
|
327
372
|
|
|
328
|
-
### "Playwright not found" Error
|
|
329
|
-
|
|
330
|
-
**Problem:** Playwright fails to launch
|
|
331
|
-
|
|
332
|
-
**Solution:**
|
|
333
|
-
```bash
|
|
334
|
-
# If using npm globally
|
|
335
|
-
npm install -g playwright
|
|
336
|
-
|
|
337
|
-
# If using npx (auto-installed)
|
|
338
|
-
npx arcfetch fetch https://example.com --force-playwright
|
|
339
|
-
```
|
|
340
|
-
|
|
341
373
|
### Low Quality Score
|
|
342
374
|
|
|
343
|
-
**Problem:** Content is rejected due to low quality
|
|
344
|
-
|
|
345
|
-
**Solution:**
|
|
346
375
|
```bash
|
|
347
376
|
# Lower the threshold temporarily
|
|
348
377
|
arcfetch fetch https://example.com --min-quality 40
|
|
349
378
|
|
|
350
379
|
# Or force Playwright (often produces better results)
|
|
351
380
|
arcfetch fetch https://example.com --force-playwright
|
|
381
|
+
|
|
382
|
+
# Check what's happening with verbose mode
|
|
383
|
+
arcfetch fetch https://example.com -v
|
|
352
384
|
```
|
|
353
385
|
|
|
354
386
|
### Timeout on Slow Sites
|
|
355
387
|
|
|
356
|
-
**Problem:** Site takes too long to load
|
|
357
|
-
|
|
358
|
-
**Solution:**
|
|
359
388
|
```bash
|
|
360
389
|
# Use faster wait strategy
|
|
361
390
|
arcfetch fetch https://example.com --wait-strategy load
|
|
@@ -364,70 +393,40 @@ arcfetch fetch https://example.com --wait-strategy load
|
|
|
364
393
|
arcfetch fetch https://example.com --force-playwright --wait-strategy domcontentloaded
|
|
365
394
|
```
|
|
366
395
|
|
|
367
|
-
### MCP Server Not Connecting
|
|
368
|
-
|
|
369
|
-
**Problem:** Claude Code can't connect to MCP server
|
|
370
|
-
|
|
371
|
-
**Solution:**
|
|
372
|
-
```bash
|
|
373
|
-
# Test if the MCP server works manually
|
|
374
|
-
npx arcfetch fetch https://example.com
|
|
375
|
-
|
|
376
|
-
# Check your MCP config path
|
|
377
|
-
# macOS: ~/.config/claude-code/mcp_config.json
|
|
378
|
-
# Linux: ~/.config/claude-code/mcp_config.json
|
|
379
|
-
# Windows: %APPDATA%\claude-code\mcp_config.json
|
|
380
|
-
```
|
|
381
|
-
|
|
382
|
-
## Comparison
|
|
383
|
-
|
|
384
|
-
| Feature | arcfetch | html-to-markdown | url-to-markdown | playwright-extra |
|
|
385
|
-
|---------|----------|------------------|-----------------|------------------|
|
|
386
|
-
| Auto JS fallback | ✅ | ❌ | ❌ | Manual |
|
|
387
|
-
| Quality scoring | ✅ | ❌ | ❌ | ❌ |
|
|
388
|
-
| Temp → Docs workflow | ✅ | ❌ | ❌ | ❌ |
|
|
389
|
-
| MCP server | ✅ | ❌ | ❌ | ❌ |
|
|
390
|
-
| Multiple output formats | ✅ | ❌ | Some | ❌ |
|
|
391
|
-
| Zero-config | ✅ | ✅ | ✅ | ❌ |
|
|
392
|
-
| Playwright included | ✅ | ❌ | ❌ | Manual setup |
|
|
393
|
-
|
|
394
396
|
## Architecture
|
|
395
397
|
|
|
396
398
|
```
|
|
397
|
-
|
|
398
|
-
│
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
│
|
|
404
|
-
│ 1. Simple HTTP Fetch
|
|
405
|
-
│ 2. Extract with Readability + Turndown
|
|
406
|
-
│ 3.
|
|
407
|
-
│ 4. Conditional Playwright Retry
|
|
408
|
-
│ 5. Cache with Frontmatter
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
399
|
+
┌──────────────────────────────────────────────────────┐
|
|
400
|
+
│ CLI / MCP Interface │
|
|
401
|
+
└──────────────────────────────────────────────────────┘
|
|
402
|
+
│
|
|
403
|
+
▼
|
|
404
|
+
┌──────────────────────────────────────────────────────┐
|
|
405
|
+
│ Core Pipeline │
|
|
406
|
+
│ 1. Simple HTTP Fetch (browser-like UA) │
|
|
407
|
+
│ 2. Extract with Readability + Turndown │
|
|
408
|
+
│ 3. Quality Score + Boilerplate Detection │
|
|
409
|
+
│ 4. Conditional Playwright Retry (with stealth) │
|
|
410
|
+
│ 5. Cache with YAML Frontmatter │
|
|
411
|
+
└──────────────────────────────────────────────────────┘
|
|
412
|
+
│
|
|
413
|
+
┌───────────────┼───────────────┐
|
|
414
|
+
▼ ▼ ▼
|
|
415
|
+
┌───────────┐ ┌───────────┐ ┌───────────┐
|
|
416
|
+
│ Cache │ │ Playwright│ │ Quality │
|
|
417
|
+
│ Manager │ │ Manager │ │ Validator │
|
|
418
|
+
└───────────┘ └───────────┘ └───────────┘
|
|
417
419
|
```
|
|
418
420
|
|
|
419
421
|
## Contributing
|
|
420
422
|
|
|
421
|
-
Contributions welcome! Please read our contributing guidelines and submit pull requests to the main branch.
|
|
422
|
-
|
|
423
|
-
### Development Setup
|
|
424
|
-
|
|
425
423
|
```bash
|
|
426
|
-
git clone https://github.com/
|
|
424
|
+
git clone https://github.com/briansunter/arcfetch.git
|
|
427
425
|
cd arcfetch
|
|
428
426
|
bun install
|
|
429
|
-
bun test # Run tests
|
|
427
|
+
bun test # Run tests (199 tests)
|
|
430
428
|
bun run typecheck # Type checking
|
|
429
|
+
bun run check # Lint + format check
|
|
431
430
|
```
|
|
432
431
|
|
|
433
432
|
## License
|
package/index.ts
CHANGED
|
@@ -146,7 +146,7 @@ Returns summary with title, author, excerpt. Use Read tool to access full conten
|
|
|
146
146
|
{
|
|
147
147
|
name: 'fetch_links',
|
|
148
148
|
description:
|
|
149
|
-
'Fetch all links from a cached reference. Extracts links and fetches each one, caching as new references. Uses parallel fetching (max
|
|
149
|
+
'Fetch all links from a cached reference. Extracts links and fetches each one, caching as new references. Uses parallel fetching (max 3 concurrent).',
|
|
150
150
|
inputSchema: {
|
|
151
151
|
type: 'object',
|
|
152
152
|
properties: {
|
package/package.json
CHANGED
package/src/core/fetch-links.ts
CHANGED
|
@@ -31,7 +31,7 @@ export async function fetchLinksFromRef(
|
|
|
31
31
|
}
|
|
32
32
|
|
|
33
33
|
const results: FetchLinkResult[] = [];
|
|
34
|
-
const concurrency =
|
|
34
|
+
const concurrency = 3;
|
|
35
35
|
const urls = linksResult.links.map((l) => l.href);
|
|
36
36
|
const verbose = options?.verbose ?? false;
|
|
37
37
|
const refetch = options?.refetch ?? false;
|
package/src/core/pipeline.ts
CHANGED
|
@@ -30,6 +30,7 @@ async function simpleFetch(url: string, verbose = false): Promise<SimpleFetchRes
|
|
|
30
30
|
|
|
31
31
|
const response = await fetch(url, {
|
|
32
32
|
redirect: 'follow',
|
|
33
|
+
signal: AbortSignal.timeout(30_000),
|
|
33
34
|
headers: {
|
|
34
35
|
'User-Agent':
|
|
35
36
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
@@ -17,19 +17,27 @@ export class LocalBrowserManager implements BrowserManager {
|
|
|
17
17
|
|
|
18
18
|
async getBrowser(): Promise<Browser> {
|
|
19
19
|
if (!browserInstance) {
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
20
|
+
try {
|
|
21
|
+
browserInstance = await chromium.launch({
|
|
22
|
+
headless: true,
|
|
23
|
+
timeout: this.config.timeout,
|
|
24
|
+
args: [
|
|
25
|
+
'--disable-blink-features=AutomationControlled',
|
|
26
|
+
'--disable-features=IsolateOrigins,site-per-process',
|
|
27
|
+
'--disable-infobars',
|
|
28
|
+
'--no-first-run',
|
|
29
|
+
'--no-default-browser-check',
|
|
30
|
+
'--disable-background-networking',
|
|
31
|
+
'--disable-dev-shm-usage',
|
|
32
|
+
],
|
|
33
|
+
});
|
|
34
|
+
} catch (error) {
|
|
35
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
36
|
+
if (message.includes('Executable') || message.includes('browserType.launch')) {
|
|
37
|
+
throw new Error('Playwright browsers are not installed. Run: npx playwright install chromium');
|
|
38
|
+
}
|
|
39
|
+
throw error;
|
|
40
|
+
}
|
|
33
41
|
}
|
|
34
42
|
return browserInstance;
|
|
35
43
|
}
|
|
@@ -3,6 +3,7 @@ import { LocalBrowserManager } from './local';
|
|
|
3
3
|
import type { BrowserManager, FetchWithBrowserResult } from './types';
|
|
4
4
|
|
|
5
5
|
let currentManager: BrowserManager | null = null;
|
|
6
|
+
let activeContexts = 0;
|
|
6
7
|
|
|
7
8
|
export async function getBrowserManager(config: PlaywrightConfig): Promise<BrowserManager> {
|
|
8
9
|
if (currentManager) {
|
|
@@ -31,10 +32,50 @@ function pick<T>(arr: T[]): T {
|
|
|
31
32
|
return arr[Math.floor(Math.random() * arr.length)];
|
|
32
33
|
}
|
|
33
34
|
|
|
35
|
+
/** Hard timeout for the entire browser fetch operation (browser launch + navigation + content extraction) */
|
|
36
|
+
const BROWSER_FETCH_TIMEOUT = 45_000;
|
|
37
|
+
|
|
38
|
+
function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
|
|
39
|
+
return new Promise<T>((resolve, reject) => {
|
|
40
|
+
const timer = setTimeout(() => reject(new Error(`${label} timed out after ${ms}ms`)), ms);
|
|
41
|
+
promise.then(
|
|
42
|
+
(val) => {
|
|
43
|
+
clearTimeout(timer);
|
|
44
|
+
resolve(val);
|
|
45
|
+
},
|
|
46
|
+
(err) => {
|
|
47
|
+
clearTimeout(timer);
|
|
48
|
+
reject(err);
|
|
49
|
+
}
|
|
50
|
+
);
|
|
51
|
+
});
|
|
52
|
+
}
|
|
53
|
+
|
|
34
54
|
export async function fetchWithBrowser(
|
|
35
55
|
url: string,
|
|
36
56
|
config: PlaywrightConfig,
|
|
37
57
|
verbose = false
|
|
58
|
+
): Promise<FetchWithBrowserResult> {
|
|
59
|
+
activeContexts++;
|
|
60
|
+
|
|
61
|
+
try {
|
|
62
|
+
return await withTimeout(
|
|
63
|
+
doFetchWithBrowser(url, config, verbose),
|
|
64
|
+
BROWSER_FETCH_TIMEOUT,
|
|
65
|
+
`Playwright fetch ${url}`
|
|
66
|
+
);
|
|
67
|
+
} catch (error) {
|
|
68
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
69
|
+
return { html: '', error: message };
|
|
70
|
+
} finally {
|
|
71
|
+
activeContexts--;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
async function doFetchWithBrowser(
|
|
76
|
+
url: string,
|
|
77
|
+
config: PlaywrightConfig,
|
|
78
|
+
verbose: boolean
|
|
38
79
|
): Promise<FetchWithBrowserResult> {
|
|
39
80
|
const manager = await getBrowserManager(config);
|
|
40
81
|
const browser = await manager.getBrowser();
|
|
@@ -120,14 +161,23 @@ export async function fetchWithBrowser(
|
|
|
120
161
|
const message = error instanceof Error ? error.message : String(error);
|
|
121
162
|
return { html: '', error: message };
|
|
122
163
|
} finally {
|
|
123
|
-
await page.close();
|
|
124
|
-
await context.close();
|
|
164
|
+
await page.close().catch(() => {});
|
|
165
|
+
await context.close().catch(() => {});
|
|
125
166
|
}
|
|
126
167
|
}
|
|
127
168
|
|
|
128
169
|
export async function closeBrowser(): Promise<void> {
|
|
129
|
-
if (currentManager)
|
|
130
|
-
|
|
131
|
-
|
|
170
|
+
if (!currentManager) return;
|
|
171
|
+
|
|
172
|
+
// Don't close if other contexts are still active
|
|
173
|
+
if (activeContexts > 0) {
|
|
174
|
+
return;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
try {
|
|
178
|
+
await withTimeout(currentManager.closeBrowser(), 5_000, 'closeBrowser');
|
|
179
|
+
} catch {
|
|
180
|
+
// Force-clear even if close times out
|
|
132
181
|
}
|
|
182
|
+
currentManager = null;
|
|
133
183
|
}
|