imperium-crawl 2.3.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +49 -0
- package/README.md +69 -11
- package/dist/cli-explore.d.ts +30 -0
- package/dist/cli-explore.d.ts.map +1 -0
- package/dist/cli-explore.js +427 -0
- package/dist/cli-explore.js.map +1 -0
- package/dist/cli-recorder.d.ts +44 -0
- package/dist/cli-recorder.d.ts.map +1 -0
- package/dist/cli-recorder.js +67 -0
- package/dist/cli-recorder.js.map +1 -0
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +28 -0
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +3 -0
- package/dist/config.js.map +1 -1
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +1 -1
- package/dist/knowledge/index.d.ts +1 -0
- package/dist/knowledge/index.d.ts.map +1 -1
- package/dist/knowledge/index.js +1 -0
- package/dist/knowledge/index.js.map +1 -1
- package/dist/knowledge/record-browser.d.ts +17 -0
- package/dist/knowledge/record-browser.d.ts.map +1 -0
- package/dist/knowledge/record-browser.js +29 -0
- package/dist/knowledge/record-browser.js.map +1 -0
- package/dist/llm/retry.d.ts +4 -2
- package/dist/llm/retry.d.ts.map +1 -1
- package/dist/llm/retry.js +15 -4
- package/dist/llm/retry.js.map +1 -1
- package/dist/sessions/index.d.ts +1 -1
- package/dist/sessions/index.d.ts.map +1 -1
- package/dist/sessions/index.js +1 -1
- package/dist/sessions/index.js.map +1 -1
- package/dist/sessions/manager.d.ts +20 -0
- package/dist/sessions/manager.d.ts.map +1 -1
- package/dist/sessions/manager.js +57 -0
- package/dist/sessions/manager.js.map +1 -1
- package/dist/sessions/types.d.ts +2 -0
- package/dist/sessions/types.d.ts.map +1 -1
- package/dist/skills/chain.d.ts +61 -0
- package/dist/skills/chain.d.ts.map +1 -0
- package/dist/skills/chain.js +182 -0
- package/dist/skills/chain.js.map +1 -0
- package/dist/skills/conditions.d.ts +14 -0
- package/dist/skills/conditions.d.ts.map +1 -0
- package/dist/skills/conditions.js +208 -0
- package/dist/skills/conditions.js.map +1 -0
- package/dist/skills/manager.d.ts +47 -2
- package/dist/skills/manager.d.ts.map +1 -1
- package/dist/skills/manager.js.map +1 -1
- package/dist/skills/parameters.d.ts +49 -0
- package/dist/skills/parameters.d.ts.map +1 -0
- package/dist/skills/parameters.js +157 -0
- package/dist/skills/parameters.js.map +1 -0
- package/dist/stealth/index.d.ts +1 -0
- package/dist/stealth/index.d.ts.map +1 -1
- package/dist/stealth/index.js +61 -13
- package/dist/stealth/index.js.map +1 -1
- package/dist/tools/action-executor.d.ts +64 -0
- package/dist/tools/action-executor.d.ts.map +1 -0
- package/dist/tools/action-executor.js +365 -0
- package/dist/tools/action-executor.js.map +1 -0
- package/dist/tools/batch-scrape.d.ts +2 -2
- package/dist/tools/crawl.d.ts +2 -2
- package/dist/tools/create-skill.d.ts +2 -2
- package/dist/tools/discover-apis.d.ts +1 -1
- package/dist/tools/discover-apis.d.ts.map +1 -1
- package/dist/tools/discover-apis.js +3 -0
- package/dist/tools/discover-apis.js.map +1 -1
- package/dist/tools/download.d.ts +4 -4
- package/dist/tools/download.d.ts.map +1 -1
- package/dist/tools/download.js +3 -0
- package/dist/tools/download.js.map +1 -1
- package/dist/tools/extract.d.ts +1 -1
- package/dist/tools/image-search.d.ts +1 -1
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +3 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/instagram.d.ts +2 -2
- package/dist/tools/interact.d.ts +72 -44
- package/dist/tools/interact.d.ts.map +1 -1
- package/dist/tools/interact.js +33 -299
- package/dist/tools/interact.js.map +1 -1
- package/dist/tools/knowledge.d.ts +24 -0
- package/dist/tools/knowledge.d.ts.map +1 -0
- package/dist/tools/knowledge.js +99 -0
- package/dist/tools/knowledge.js.map +1 -0
- package/dist/tools/list-skills.js +1 -1
- package/dist/tools/list-skills.js.map +1 -1
- package/dist/tools/manifest.d.ts.map +1 -1
- package/dist/tools/manifest.js +5 -0
- package/dist/tools/manifest.js.map +1 -1
- package/dist/tools/monitor-websocket.d.ts +1 -1
- package/dist/tools/news-search.d.ts +1 -1
- package/dist/tools/query-api.d.ts +6 -6
- package/dist/tools/readability.d.ts +2 -2
- package/dist/tools/reddit.d.ts +4 -4
- package/dist/tools/run-skill.d.ts +14 -4
- package/dist/tools/run-skill.d.ts.map +1 -1
- package/dist/tools/run-skill.js +73 -0
- package/dist/tools/run-skill.js.map +1 -1
- package/dist/tools/scrape.d.ts +9 -6
- package/dist/tools/scrape.d.ts.map +1 -1
- package/dist/tools/scrape.js +2 -0
- package/dist/tools/scrape.js.map +1 -1
- package/dist/tools/screenshot.d.ts.map +1 -1
- package/dist/tools/screenshot.js +6 -0
- package/dist/tools/screenshot.js.map +1 -1
- package/dist/tools/search.d.ts +1 -1
- package/dist/tools/snapshot.d.ts +5 -5
- package/dist/tools/snapshot.d.ts.map +1 -1
- package/dist/tools/snapshot.js +3 -0
- package/dist/tools/snapshot.js.map +1 -1
- package/dist/tools/video-search.d.ts +1 -1
- package/dist/tools/youtube.d.ts +2 -2
- package/dist/utils/fetcher.d.ts.map +1 -1
- package/dist/utils/fetcher.js +33 -0
- package/dist/utils/fetcher.js.map +1 -1
- package/package.json +7 -3
package/.env.example
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# ── API Keys ──────────────────────────────────────────────
|
|
2
|
+
# Brave Search — unlocks 4 search tools (web, news, image, video)
|
|
3
|
+
# Free tier: https://brave.com/search/api/
|
|
4
|
+
BRAVE_API_KEY=
|
|
5
|
+
|
|
6
|
+
# LLM — unlocks ai_extract tool + llm_fallback in extract
|
|
7
|
+
# Supports Anthropic, OpenAI, or MiniMax API keys
|
|
8
|
+
LLM_API_KEY=
|
|
9
|
+
LLM_PROVIDER=anthropic # anthropic | openai | minimax
|
|
10
|
+
LLM_MODEL= # override default model (optional)
|
|
11
|
+
|
|
12
|
+
# OpenAI — Whisper transcription for YouTube videos without captions
|
|
13
|
+
# https://platform.openai.com/
|
|
14
|
+
OPENAI_API_KEY=
|
|
15
|
+
|
|
16
|
+
# 2Captcha — auto CAPTCHA solving (reCAPTCHA v2/v3, hCaptcha, Turnstile)
|
|
17
|
+
# https://2captcha.com/
|
|
18
|
+
TWOCAPTCHA_API_KEY=
|
|
19
|
+
|
|
20
|
+
# ── Proxy ─────────────────────────────────────────────────
|
|
21
|
+
# Single proxy (http/https/socks4/socks5)
|
|
22
|
+
PROXY_URL=
|
|
23
|
+
# Rotating proxy pool (comma-separated)
|
|
24
|
+
PROXY_URLS=
|
|
25
|
+
|
|
26
|
+
# ── Browser ───────────────────────────────────────────────
|
|
27
|
+
# Max pooled browser instances (default: 3)
|
|
28
|
+
BROWSER_POOL_SIZE=3
|
|
29
|
+
# Chrome user data dir for authenticated sessions
|
|
30
|
+
CHROME_PROFILE_PATH=
|
|
31
|
+
|
|
32
|
+
# ── Security ──────────────────────────────────────────────
|
|
33
|
+
# 32-byte hex key for encrypting session files at rest
|
|
34
|
+
# Generate: node -e "console.log(require('crypto').randomBytes(32).toString('hex'))"
|
|
35
|
+
SESSION_ENCRYPTION_KEY=
|
|
36
|
+
|
|
37
|
+
# ── Behavior ──────────────────────────────────────────────
|
|
38
|
+
# Honor robots.txt (default: true)
|
|
39
|
+
RESPECT_ROBOTS=true
|
|
40
|
+
|
|
41
|
+
# ── Instagram (advanced — for influencer recipes) ─────────
|
|
42
|
+
# IG_SESSION_ID=
|
|
43
|
+
# IG_CSRF_TOKEN=
|
|
44
|
+
# IG_DS_USER_ID=
|
|
45
|
+
|
|
46
|
+
# ── Debug ─────────────────────────────────────────────────
|
|
47
|
+
# DEBUG=1 # enable debug logging to stderr
|
|
48
|
+
# VERBOSE=1 # same as DEBUG=1
|
|
49
|
+
# NO_COLOR=1 # disable colored output
|
package/README.md
CHANGED
|
@@ -4,11 +4,11 @@
|
|
|
4
4
|
|
|
5
5
|
**The most powerful open-source CLI tool for web scraping, crawling, and data extraction.**
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
29 tools. Zero API keys required. One `npx` command.
|
|
8
8
|
|
|
9
9
|
[npm package](https://www.npmjs.com/package/imperium-crawl)
|
|
10
10
|
[License](./LICENSE)
|
|
11
|
-
[]()
|
|
12
12
|
[](https://www.npmjs.com/package/imperium-crawl)
|
|
13
13
|
|
|
14
14
|
</div>
|
|
@@ -31,7 +31,7 @@ npx -y imperium-crawl scrape --url https://example.com
|
|
|
31
31
|
npm install -g imperium-crawl
|
|
32
32
|
```
|
|
33
33
|
|
|
34
|
-
> That's it.
|
|
34
|
+
> That's it. 23 of 29 tools work with zero API keys. Add optional keys later to unlock search, AI extraction, and CAPTCHA solving.
|
|
35
35
|
|
|
36
36
|
---
|
|
37
37
|
|
|
@@ -107,7 +107,7 @@ Scraping 4 URLs (concurrency: 3)...
|
|
|
107
107
|
## Why imperium-crawl?
|
|
108
108
|
|
|
109
109
|
🔓 **Zero API Keys Required**
|
|
110
|
-
|
|
110
|
+
23 of 29 tools work out of the box. No accounts, no tokens, no credit cards. Just `npx` and go.
|
|
111
111
|
|
|
112
112
|
🛡️ **3-Level Auto-Escalating Stealth**
|
|
113
113
|
Headers → TLS fingerprinting → headless browser + CAPTCHA solving. Automatically escalates until it gets through.
|
|
@@ -115,7 +115,7 @@ Headers → TLS fingerprinting → headless browser + CAPTCHA solving. Automatic
|
|
|
115
115
|
🧠 **Self-Improving**
|
|
116
116
|
Adaptive learning engine remembers what works per domain. Second visit is 3x faster. The more you use it, the smarter it gets.
|
|
117
117
|
|
|
118
|
-
🧰 **
|
|
118
|
+
🧰 **29 Tools, 2 Modes**
|
|
119
119
|
CLI tool or interactive TUI. Scraping, crawling, search, extraction, API discovery, WebSocket monitoring, browser automation, batch processing.
|
|
120
120
|
|
|
121
121
|
📜 **14 Built-in Recipes**
|
|
@@ -131,7 +131,7 @@ Teach it once, run forever. Auto-detect patterns on any page, save as reusable s
|
|
|
131
131
|
| Feature | **imperium-crawl** | Firecrawl | Crawl4AI | Browserbase | Puppeteer |
|
|
132
132
|
|---------|:------------------:|:---------:|:--------:|:-----------:|:---------:|
|
|
133
133
|
| Price | **Free forever** | $19+/month | Free | $0.01/min | Free |
|
|
134
|
-
| Total tools | **
|
|
134
|
+
| Total tools | **29** | 5 | 2 | 4 | N/A |
|
|
135
135
|
| Stealth levels | **3 (auto-escalate)** | Cloud-based | 1 | Cloud-based | None |
|
|
136
136
|
| Anti-bot detection | **7 systems** | Partial | Partial | Partial | None |
|
|
137
137
|
| TLS fingerprinting | **JA3/JA4** | No | No | No | No |
|
|
@@ -224,7 +224,7 @@ Second visit to cloudflare.com:
|
|
|
224
224
|
|
|
225
225
|
---
|
|
226
226
|
|
|
227
|
-
## All
|
|
227
|
+
## All 29 Tools
|
|
228
228
|
|
|
229
229
|
### 📄 Scraping (no API key needed)
|
|
230
230
|
|
|
@@ -272,7 +272,7 @@ Second visit to cloudflare.com:
|
|
|
272
272
|
|
|
273
273
|
| Tool | What It Does |
|
|
274
274
|
|------|-------------|
|
|
275
|
-
| **interact** | Browser automation with
|
|
275
|
+
| **interact** | Browser automation with 19 action types (including click, type, scroll, wait, screenshot, evaluate, select, hover, press, navigate, drag, upload, storage, cookies, pdf, auth_login, refresh). Ref targeting via ARIA snapshot, session encryption, action policy, domain filter, network interception, device emulation. |
|
|
276
276
|
| **snapshot** | ARIA-based page snapshot with interactive element refs. Use refs in interact for precise targeting. Annotated screenshots. |
|
|
277
277
|
|
|
278
278
|
### 📱 Social Media (no API key needed)
|
|
@@ -299,6 +299,12 @@ Second visit to cloudflare.com:
|
|
|
299
299
|
| **job_status** | Full results for a specific batch job including per-URL outcomes. |
|
|
300
300
|
| **delete_job** | Clean up completed or failed batch jobs. |
|
|
301
301
|
|
|
302
|
+
### 🧠 Knowledge Engine (no API key needed)
|
|
303
|
+
|
|
304
|
+
| Tool | What It Does |
|
|
305
|
+
|------|-------------|
|
|
306
|
+
| **knowledge** | Dump adaptive knowledge engine stats — per-domain success rates, optimal stealth levels, anti-bot detection history, rate limits. Use to debug scraping issues and understand problematic domains. |
|
|
307
|
+
|
|
302
308
|
---
|
|
303
309
|
|
|
304
310
|
## Setup
|
|
@@ -366,6 +372,26 @@ imperium-crawl tui
|
|
|
366
372
|
|
|
367
373
|
Interactive slash-command terminal with parameter prompts, table rendering, markdown display, and session state. Use `/save` to export results and `/again` to re-run the last command.
|
|
368
374
|
|
|
375
|
+
### Explore REPL
|
|
376
|
+
|
|
377
|
+
Interactively explore a site in a headed browser, then save the session as a reusable skill:
|
|
378
|
+
|
|
379
|
+
```bash
|
|
380
|
+
imperium-crawl explore https://example.com
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
```
|
|
384
|
+
> navigate https://example.com/login
|
|
385
|
+
> type "#email" "user@example.com"
|
|
386
|
+
> type "#password" "{{env:MY_PASSWORD}}"
|
|
387
|
+
> click "#submit"
|
|
388
|
+
> snapshot
|
|
389
|
+
> save-skill my-login
|
|
390
|
+
✅ Saved skill: my-login (4 actions, 1 parameter detected)
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
Commands: `navigate`, `click`, `type`, `select`, `hover`, `press`, `scroll`, `wait`, `screenshot`, `snapshot`, `evaluate`, `save-skill`, `history`, `undo`, `status`, `help`, `exit` (or `quit`)
|
|
394
|
+
|
|
369
395
|
---
|
|
370
396
|
|
|
371
397
|
## Skills & Recipes
|
|
@@ -389,6 +415,37 @@ run_skill({ name: "tc-ai-news" })
|
|
|
389
415
|
|
|
390
416
|
Skills are saved in `~/.imperium-crawl/skills/` as JSON files — human-readable, editable, portable.
|
|
391
417
|
|
|
418
|
+
### Skill Parameters
|
|
419
|
+
|
|
420
|
+
Use template variables in skills — resolved at run time:
|
|
421
|
+
|
|
422
|
+
```bash
|
|
423
|
+
# In skill JSON actions:
|
|
424
|
+
{ "value": "{{input:query}}" } # passed via --params or prompted
|
|
425
|
+
{ "value": "{{env:SITE_PASSWORD}}" } # from environment variable
|
|
426
|
+
{ "value": "{{computed:date_today}}" } # auto-computed (date_today, timestamp, random_string, year, month, day)
|
|
427
|
+
|
|
428
|
+
# Run with params:
|
|
429
|
+
imperium-crawl run-skill my-search --params '{"query": "machine learning"}'
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
### Skill Chains
|
|
433
|
+
|
|
434
|
+
Chain skills together — output of one step becomes input to the next:
|
|
435
|
+
|
|
436
|
+
```json
|
|
437
|
+
{
|
|
438
|
+
"type": "chain",
|
|
439
|
+
"name": "search-and-extract",
|
|
440
|
+
"steps": [
|
|
441
|
+
{ "skill": "search-results", "output": "search" },
|
|
442
|
+
{ "skill": "extract-details", "input": { "url": "$search.results[0].url" }, "output": "details" }
|
|
443
|
+
]
|
|
444
|
+
}
|
|
445
|
+
```
|
|
446
|
+
|
|
447
|
+
Variable syntax: `$output_name.field.nested[0]` — reference an earlier step by its `output` name (e.g. `$search.results[0].url`); simple dot-path access, no eval.
|
|
448
|
+
|
|
392
449
|
### Built-in Recipes
|
|
393
450
|
|
|
394
451
|
| Recipe | What It Does |
|
|
@@ -435,7 +492,7 @@ Turn any website into an API. No documentation needed.
|
|
|
435
492
|
|
|
436
493
|
## AI Agent Guide
|
|
437
494
|
|
|
438
|
-
imperium-crawl ships with [`SKILL/`](./SKILL/) — a structured guide that teaches AI agents how to use all
|
|
495
|
+
imperium-crawl ships with [`SKILL/`](./SKILL/) — a structured guide that teaches AI agents how to use all 29 tools effectively. Includes proven workflows, decision trees, error recovery, and advanced patterns.
|
|
439
496
|
|
|
440
497
|
### Two Ways to Connect
|
|
441
498
|
|
|
@@ -496,13 +553,14 @@ Every tool tested against production websites with real anti-bot defenses:
|
|
|
496
553
|
| 📋 **list_jobs** | — | Batch jobs with status and progress |
|
|
497
554
|
| 📊 **job_status** | Batch job | Full per-URL results with timing |
|
|
498
555
|
| 🗑️ **delete_job** | Completed job | Cleaned up job data from disk |
|
|
556
|
+
| 🧠 **knowledge** | Local knowledge file | Per-domain stats: stealth levels, success rates, anti-bot systems detected |
|
|
499
557
|
| 🎬 **youtube** | "web scraping tutorial" | Search results, video details, comments, transcripts — no API key |
|
|
500
558
|
| 💬 **reddit** | r/webscraping | Subreddit posts, comments, search — public JSON API |
|
|
501
559
|
| 📸 **instagram** | @nike profile | Profile details, engagement rate, recent posts — internal API |
|
|
502
560
|
| 📥 **download** | YouTube video, web page images | Auto-detect URL type, download media files — images, video, og:image |
|
|
503
561
|
| 📡 **rss** | Hacker News RSS | Parsed feed items with title, link, date, author, categories |
|
|
504
562
|
|
|
505
|
-
> **
|
|
563
|
+
> **29/29 tools. 34 hidden APIs on Airbnb. Live BTC feed. Zero API keys for scraping.**
|
|
506
564
|
|
|
507
565
|
---
|
|
508
566
|
|
|
@@ -535,7 +593,7 @@ cd imperium-crawl
|
|
|
535
593
|
npm install
|
|
536
594
|
npm run build
|
|
537
595
|
npm run dev # Watch mode (rebuild on changes)
|
|
538
|
-
npm test #
|
|
596
|
+
npm test # 546 tests
|
|
539
597
|
npm start # Start CLI (shows help or TUI)
|
|
540
598
|
```
|
|
541
599
|
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Explore REPL — interactive browser session with live Playwright.
|
|
3
|
+
*
|
|
4
|
+
* Usage: imperium-crawl explore <url>
|
|
5
|
+
*
|
|
6
|
+
* Opens a headed (visible) browser window and gives the user a readline REPL
|
|
7
|
+
* to execute actions interactively. Every successful action is recorded.
|
|
8
|
+
* At any point, run `save-skill <name>` to export the session as a reusable skill.
|
|
9
|
+
*
|
|
10
|
+
* Commands:
|
|
11
|
+
* navigate <url> Navigate to URL
|
|
12
|
+
* click <selector> Click element
|
|
13
|
+
* type <selector> <text> Fill input field
|
|
14
|
+
* select <selector> <value> Select option
|
|
15
|
+
* wait [ms] Wait N ms (default 1000)
|
|
16
|
+
* screenshot [file] Save screenshot
|
|
17
|
+
* snapshot Show ARIA tree + refs
|
|
18
|
+
* evaluate <script> Run JS in page
|
|
19
|
+
* scroll [up|down] [px] Scroll page
|
|
20
|
+
* hover <selector> Hover element
|
|
21
|
+
* press <key> Press keyboard key
|
|
22
|
+
* save-skill <name> Export recording as skill JSON
|
|
23
|
+
* status Show URL, action count
|
|
24
|
+
* history List recorded actions
|
|
25
|
+
* undo Remove last action
|
|
26
|
+
* help Show command list
|
|
27
|
+
* exit / quit Close browser and exit
|
|
28
|
+
*/
|
|
29
|
+
export declare function runExplore(startUrl: string, sessionId?: string): Promise<void>;
|
|
30
|
+
//# sourceMappingURL=cli-explore.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli-explore.d.ts","sourceRoot":"","sources":["../src/cli-explore.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAgGH,wBAAsB,UAAU,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CA6RpF"}
|