imperium-crawl 2.3.1 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +146 -11
- package/dist/cli-explore.d.ts +30 -0
- package/dist/cli-explore.d.ts.map +1 -0
- package/dist/cli-explore.js +427 -0
- package/dist/cli-explore.js.map +1 -0
- package/dist/cli-recorder.d.ts +44 -0
- package/dist/cli-recorder.d.ts.map +1 -0
- package/dist/cli-recorder.js +67 -0
- package/dist/cli-recorder.js.map +1 -0
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +51 -3
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +3 -0
- package/dist/config.js.map +1 -1
- package/dist/constants.d.ts +1 -1
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +31 -1
- package/dist/constants.js.map +1 -1
- package/dist/flows/engine.d.ts +7 -0
- package/dist/flows/engine.d.ts.map +1 -0
- package/dist/flows/engine.js +183 -0
- package/dist/flows/engine.js.map +1 -0
- package/dist/flows/index.d.ts +6 -0
- package/dist/flows/index.d.ts.map +1 -0
- package/dist/flows/index.js +6 -0
- package/dist/flows/index.js.map +1 -0
- package/dist/flows/server.d.ts +11 -0
- package/dist/flows/server.d.ts.map +1 -0
- package/dist/flows/server.js +81 -0
- package/dist/flows/server.js.map +1 -0
- package/dist/flows/smart-target.d.ts +9 -0
- package/dist/flows/smart-target.d.ts.map +1 -0
- package/dist/flows/smart-target.js +84 -0
- package/dist/flows/smart-target.js.map +1 -0
- package/dist/flows/storage.d.ts +26 -0
- package/dist/flows/storage.d.ts.map +1 -0
- package/dist/flows/storage.js +118 -0
- package/dist/flows/storage.js.map +1 -0
- package/dist/flows/templates.d.ts +4 -0
- package/dist/flows/templates.d.ts.map +1 -0
- package/dist/flows/templates.js +35 -0
- package/dist/flows/templates.js.map +1 -0
- package/dist/flows/types.d.ts +3356 -0
- package/dist/flows/types.d.ts.map +1 -0
- package/dist/flows/types.js +133 -0
- package/dist/flows/types.js.map +1 -0
- package/dist/knowledge/index.d.ts +1 -0
- package/dist/knowledge/index.d.ts.map +1 -1
- package/dist/knowledge/index.js +1 -0
- package/dist/knowledge/index.js.map +1 -1
- package/dist/knowledge/record-browser.d.ts +17 -0
- package/dist/knowledge/record-browser.d.ts.map +1 -0
- package/dist/knowledge/record-browser.js +29 -0
- package/dist/knowledge/record-browser.js.map +1 -0
- package/dist/knowledge/store.d.ts +19 -0
- package/dist/knowledge/store.d.ts.map +1 -1
- package/dist/knowledge/store.js +63 -4
- package/dist/knowledge/store.js.map +1 -1
- package/dist/llm/retry.d.ts +4 -2
- package/dist/llm/retry.d.ts.map +1 -1
- package/dist/llm/retry.js +15 -4
- package/dist/llm/retry.js.map +1 -1
- package/dist/sessions/browser-connect.d.ts +30 -0
- package/dist/sessions/browser-connect.d.ts.map +1 -0
- package/dist/sessions/browser-connect.js +68 -0
- package/dist/sessions/browser-connect.js.map +1 -0
- package/dist/sessions/browser-state.d.ts +35 -0
- package/dist/sessions/browser-state.d.ts.map +1 -0
- package/dist/sessions/browser-state.js +74 -0
- package/dist/sessions/browser-state.js.map +1 -0
- package/dist/sessions/index.d.ts +1 -1
- package/dist/sessions/index.d.ts.map +1 -1
- package/dist/sessions/index.js +1 -1
- package/dist/sessions/index.js.map +1 -1
- package/dist/sessions/inject-cookies.d.ts +20 -0
- package/dist/sessions/inject-cookies.d.ts.map +1 -0
- package/dist/sessions/inject-cookies.js +57 -0
- package/dist/sessions/inject-cookies.js.map +1 -0
- package/dist/sessions/manager.d.ts +31 -1
- package/dist/sessions/manager.d.ts.map +1 -1
- package/dist/sessions/manager.js +97 -6
- package/dist/sessions/manager.js.map +1 -1
- package/dist/sessions/types.d.ts +2 -0
- package/dist/sessions/types.d.ts.map +1 -1
- package/dist/skills/chain.d.ts +61 -0
- package/dist/skills/chain.d.ts.map +1 -0
- package/dist/skills/chain.js +182 -0
- package/dist/skills/chain.js.map +1 -0
- package/dist/skills/conditions.d.ts +14 -0
- package/dist/skills/conditions.d.ts.map +1 -0
- package/dist/skills/conditions.js +208 -0
- package/dist/skills/conditions.js.map +1 -0
- package/dist/skills/manager.d.ts +47 -2
- package/dist/skills/manager.d.ts.map +1 -1
- package/dist/skills/manager.js.map +1 -1
- package/dist/skills/parameters.d.ts +49 -0
- package/dist/skills/parameters.d.ts.map +1 -0
- package/dist/skills/parameters.js +157 -0
- package/dist/skills/parameters.js.map +1 -0
- package/dist/snapshot/store.d.ts +8 -0
- package/dist/snapshot/store.d.ts.map +1 -1
- package/dist/snapshot/store.js +48 -0
- package/dist/snapshot/store.js.map +1 -1
- package/dist/stealth/antibot-detector.d.ts +1 -1
- package/dist/stealth/antibot-detector.d.ts.map +1 -1
- package/dist/stealth/antibot-detector.js +56 -0
- package/dist/stealth/antibot-detector.js.map +1 -1
- package/dist/stealth/browser-image-extract.d.ts +43 -0
- package/dist/stealth/browser-image-extract.d.ts.map +1 -0
- package/dist/stealth/browser-image-extract.js +268 -0
- package/dist/stealth/browser-image-extract.js.map +1 -0
- package/dist/stealth/browser.d.ts +5 -0
- package/dist/stealth/browser.d.ts.map +1 -1
- package/dist/stealth/browser.js +82 -1
- package/dist/stealth/browser.js.map +1 -1
- package/dist/stealth/chrome-profile.d.ts +1 -0
- package/dist/stealth/chrome-profile.d.ts.map +1 -1
- package/dist/stealth/chrome-profile.js +28 -5
- package/dist/stealth/chrome-profile.js.map +1 -1
- package/dist/stealth/detector.d.ts +10 -1
- package/dist/stealth/detector.d.ts.map +1 -1
- package/dist/stealth/detector.js +117 -25
- package/dist/stealth/detector.js.map +1 -1
- package/dist/stealth/headers.d.ts +1 -1
- package/dist/stealth/headers.d.ts.map +1 -1
- package/dist/stealth/headers.js +94 -2
- package/dist/stealth/headers.js.map +1 -1
- package/dist/stealth/index.d.ts +5 -0
- package/dist/stealth/index.d.ts.map +1 -1
- package/dist/stealth/index.js +257 -27
- package/dist/stealth/index.js.map +1 -1
- package/dist/stealth/proxy.d.ts +40 -1
- package/dist/stealth/proxy.d.ts.map +1 -1
- package/dist/stealth/proxy.js +90 -6
- package/dist/stealth/proxy.js.map +1 -1
- package/dist/tools/action-executor.d.ts +66 -0
- package/dist/tools/action-executor.d.ts.map +1 -0
- package/dist/tools/action-executor.js +403 -0
- package/dist/tools/action-executor.js.map +1 -0
- package/dist/tools/batch-download.d.ts +33 -0
- package/dist/tools/batch-download.d.ts.map +1 -0
- package/dist/tools/batch-download.js +208 -0
- package/dist/tools/batch-download.js.map +1 -0
- package/dist/tools/batch-scrape.d.ts +2 -2
- package/dist/tools/browser.d.ts +100 -0
- package/dist/tools/browser.d.ts.map +1 -0
- package/dist/tools/browser.js +448 -0
- package/dist/tools/browser.js.map +1 -0
- package/dist/tools/crawl.d.ts +2 -2
- package/dist/tools/create-skill.d.ts +2 -2
- package/dist/tools/discover-apis.d.ts +1 -1
- package/dist/tools/discover-apis.d.ts.map +1 -1
- package/dist/tools/discover-apis.js +3 -0
- package/dist/tools/discover-apis.js.map +1 -1
- package/dist/tools/download.d.ts +39 -6
- package/dist/tools/download.d.ts.map +1 -1
- package/dist/tools/download.js +248 -44
- package/dist/tools/download.js.map +1 -1
- package/dist/tools/extract.d.ts +1 -1
- package/dist/tools/image-search.d.ts +1 -1
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +26 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/inspect-flow.d.ts +24 -0
- package/dist/tools/inspect-flow.d.ts.map +1 -0
- package/dist/tools/inspect-flow.js +23 -0
- package/dist/tools/inspect-flow.js.map +1 -0
- package/dist/tools/instagram.d.ts +2 -2
- package/dist/tools/interact.d.ts +91 -50
- package/dist/tools/interact.d.ts.map +1 -1
- package/dist/tools/interact.js +80 -299
- package/dist/tools/interact.js.map +1 -1
- package/dist/tools/knowledge.d.ts +24 -0
- package/dist/tools/knowledge.d.ts.map +1 -0
- package/dist/tools/knowledge.js +99 -0
- package/dist/tools/knowledge.js.map +1 -0
- package/dist/tools/list-flows.d.ts +21 -0
- package/dist/tools/list-flows.d.ts.map +1 -0
- package/dist/tools/list-flows.js +18 -0
- package/dist/tools/list-flows.js.map +1 -0
- package/dist/tools/list-skills.js +1 -1
- package/dist/tools/list-skills.js.map +1 -1
- package/dist/tools/manifest.d.ts.map +1 -1
- package/dist/tools/manifest.js +48 -0
- package/dist/tools/manifest.js.map +1 -1
- package/dist/tools/monitor-websocket.d.ts +1 -1
- package/dist/tools/monitor.d.ts +46 -0
- package/dist/tools/monitor.d.ts.map +1 -0
- package/dist/tools/monitor.js +213 -0
- package/dist/tools/monitor.js.map +1 -0
- package/dist/tools/news-search.d.ts +1 -1
- package/dist/tools/pdf-extract.d.ts +38 -0
- package/dist/tools/pdf-extract.d.ts.map +1 -0
- package/dist/tools/pdf-extract.js +244 -0
- package/dist/tools/pdf-extract.js.map +1 -0
- package/dist/tools/query-api.d.ts +6 -6
- package/dist/tools/readability.d.ts +2 -2
- package/dist/tools/record-flow.d.ts +39 -0
- package/dist/tools/record-flow.d.ts.map +1 -0
- package/dist/tools/record-flow.js +406 -0
- package/dist/tools/record-flow.js.map +1 -0
- package/dist/tools/reddit.d.ts +4 -4
- package/dist/tools/run-flow.d.ts +54 -0
- package/dist/tools/run-flow.d.ts.map +1 -0
- package/dist/tools/run-flow.js +47 -0
- package/dist/tools/run-flow.js.map +1 -0
- package/dist/tools/run-skill.d.ts +14 -4
- package/dist/tools/run-skill.d.ts.map +1 -1
- package/dist/tools/run-skill.js +74 -0
- package/dist/tools/run-skill.js.map +1 -1
- package/dist/tools/scrape.d.ts +9 -6
- package/dist/tools/scrape.d.ts.map +1 -1
- package/dist/tools/scrape.js +19 -1
- package/dist/tools/scrape.js.map +1 -1
- package/dist/tools/screenshot.d.ts.map +1 -1
- package/dist/tools/screenshot.js +6 -0
- package/dist/tools/screenshot.js.map +1 -1
- package/dist/tools/search.d.ts +1 -1
- package/dist/tools/serve-flow.d.ts +36 -0
- package/dist/tools/serve-flow.d.ts.map +1 -0
- package/dist/tools/serve-flow.js +42 -0
- package/dist/tools/serve-flow.js.map +1 -0
- package/dist/tools/snapshot.d.ts +5 -5
- package/dist/tools/snapshot.d.ts.map +1 -1
- package/dist/tools/snapshot.js +3 -0
- package/dist/tools/snapshot.js.map +1 -1
- package/dist/tools/validate-flow.d.ts +24 -0
- package/dist/tools/validate-flow.d.ts.map +1 -0
- package/dist/tools/validate-flow.js +23 -0
- package/dist/tools/validate-flow.js.map +1 -0
- package/dist/tools/video-search.d.ts +1 -1
- package/dist/tools/watch.d.ts +68 -0
- package/dist/tools/watch.d.ts.map +1 -0
- package/dist/tools/watch.js +224 -0
- package/dist/tools/watch.js.map +1 -0
- package/dist/tools/youtube.d.ts +2 -2
- package/dist/utils/fetcher.d.ts +13 -4
- package/dist/utils/fetcher.d.ts.map +1 -1
- package/dist/utils/fetcher.js +153 -23
- package/dist/utils/fetcher.js.map +1 -1
- package/package.json +19 -5
package/README.md
CHANGED
|
@@ -1,20 +1,34 @@
|
|
|
1
1
|
<div align="center">
|
|
2
2
|
|
|
3
|
+
<img src="assets/hero-banner.png" alt="imperium-crawl — 3-level auto-escalating stealth engine" width="800" />
|
|
4
|
+
|
|
3
5
|
# imperium-crawl
|
|
4
6
|
|
|
5
7
|
**The most powerful open-source CLI tool for web scraping, crawling, and data extraction.**
|
|
6
8
|
|
|
7
|
-
|
|
9
|
+
39 tools. Zero API keys required. One `npx` command.
|
|
8
10
|
|
|
9
11
|
[](https://www.npmjs.com/package/imperium-crawl)
|
|
10
12
|
[](./LICENSE)
|
|
11
|
-
[]()
|
|
12
14
|
[](https://www.npmjs.com/package/imperium-crawl)
|
|
13
15
|
|
|
14
16
|
</div>
|
|
15
17
|
|
|
16
18
|
---
|
|
17
19
|
|
|
20
|
+
## What's new in 2.5.0
|
|
21
|
+
|
|
22
|
+
Three new tools for document extraction and content monitoring — all zero-API-key, all native:
|
|
23
|
+
|
|
24
|
+
- **`pdf-extract`** — Pull text, pages, tables, and metadata from any PDF (local or remote) via `pdfjs-dist`. Ideal for regulatory docs, sustainability reports, invoices. Smoke-tested on a 98-page CBAM Guidance PDF (199K chars, confidence 0.99).
|
|
25
|
+
- **`watch`** — Hash-based one-shot change detector for a single URL. Cron-friendly. Fires a webhook on change.
|
|
26
|
+
- **`monitor`** — Multi-URL intelligence digest. Reads a JSON config grouping URLs by topic, emits a markdown digest filtered by minimum change percentage.
|
|
27
|
+
|
|
28
|
+
See [CHANGELOG.md](./CHANGELOG.md) for the full release notes.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
18
32
|
## Quick Start
|
|
19
33
|
|
|
20
34
|
Get running in 30 seconds.
|
|
@@ -31,7 +45,13 @@ npx -y imperium-crawl scrape --url https://example.com
|
|
|
31
45
|
npm install -g imperium-crawl
|
|
32
46
|
```
|
|
33
47
|
|
|
34
|
-
|
|
48
|
+
**Install from a local tarball** (e.g. pre-release testing):
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
npm install -g ./imperium-crawl-2.5.0.tgz
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
> That's it. 33 of 39 tools work with zero API keys. Add optional keys later to unlock search, AI extraction, and CAPTCHA solving.
|
|
35
55
|
|
|
36
56
|
---
|
|
37
57
|
|
|
@@ -107,7 +127,7 @@ Scraping 4 URLs (concurrency: 3)...
|
|
|
107
127
|
## Why imperium-crawl?
|
|
108
128
|
|
|
109
129
|
🔓 **Zero API Keys Required**
|
|
110
|
-
|
|
130
|
+
33 of 39 tools work out of the box. No accounts, no tokens, no credit cards. Just `npx` and go.
|
|
111
131
|
|
|
112
132
|
🛡️ **3-Level Auto-Escalating Stealth**
|
|
113
133
|
Headers → TLS fingerprinting → headless browser + CAPTCHA solving. Automatically escalates until it gets through.
|
|
@@ -115,7 +135,7 @@ Headers → TLS fingerprinting → headless browser + CAPTCHA solving. Automatic
|
|
|
115
135
|
🧠 **Self-Improving**
|
|
116
136
|
Adaptive learning engine remembers what works per domain. Second visit is 3x faster. The more you use it, the smarter it gets.
|
|
117
137
|
|
|
118
|
-
🧰 **
|
|
138
|
+
🧰 **39 Tools, 2 Modes**
|
|
119
139
|
CLI tool or interactive TUI. Scraping, crawling, search, extraction, API discovery, WebSocket monitoring, browser automation, batch processing.
|
|
120
140
|
|
|
121
141
|
📜 **14 Built-in Recipes**
|
|
@@ -131,7 +151,7 @@ Teach it once, run forever. Auto-detect patterns on any page, save as reusable s
|
|
|
131
151
|
| Feature | **imperium-crawl** | Firecrawl | Crawl4AI | Browserbase | Puppeteer |
|
|
132
152
|
|---------|:------------------:|:---------:|:--------:|:-----------:|:---------:|
|
|
133
153
|
| Price | **Free forever** | $19+/month | Free | $0.01/min | Free |
|
|
134
|
-
| Total tools | **
|
|
154
|
+
| Total tools | **39** | 5 | 2 | 4 | N/A |
|
|
135
155
|
| Stealth levels | **3 (auto-escalate)** | Cloud-based | 1 | Cloud-based | None |
|
|
136
156
|
| Anti-bot detection | **7 systems** | Partial | Partial | Partial | None |
|
|
137
157
|
| TLS fingerprinting | **JA3/JA4** | No | No | No | No |
|
|
@@ -224,7 +244,7 @@ Second visit to cloudflare.com:
|
|
|
224
244
|
|
|
225
245
|
---
|
|
226
246
|
|
|
227
|
-
## All
|
|
247
|
+
## All 39 Tools
|
|
228
248
|
|
|
229
249
|
### 📄 Scraping (no API key needed)
|
|
230
250
|
|
|
@@ -272,7 +292,7 @@ Second visit to cloudflare.com:
|
|
|
272
292
|
|
|
273
293
|
| Tool | What It Does |
|
|
274
294
|
|------|-------------|
|
|
275
|
-
| **interact** | Browser automation with
|
|
295
|
+
| **interact** | Browser automation with 19 action types (including click, type, scroll, wait, screenshot, evaluate, select, hover, press, navigate, drag, upload, storage, cookies, pdf, auth_login, refresh). Ref targeting via ARIA snapshot, session encryption, action policy, domain filter, network interception, device emulation. |
|
|
276
296
|
| **snapshot** | ARIA-based page snapshot with interactive element refs. Use refs in interact for precise targeting. Annotated screenshots. |
|
|
277
297
|
|
|
278
298
|
### 📱 Social Media (no API key needed)
|
|
@@ -299,6 +319,69 @@ Second visit to cloudflare.com:
|
|
|
299
319
|
| **job_status** | Full results for a specific batch job including per-URL outcomes. |
|
|
300
320
|
| **delete_job** | Clean up completed or failed batch jobs. |
|
|
301
321
|
|
|
322
|
+
### 🧠 Knowledge Engine (no API key needed)
|
|
323
|
+
|
|
324
|
+
| Tool | What It Does |
|
|
325
|
+
|------|-------------|
|
|
326
|
+
| **knowledge** | Dump adaptive knowledge engine stats — per-domain success rates, optimal stealth levels, anti-bot detection history, rate limits. Use to debug scraping issues and understand problematic domains. |
|
|
327
|
+
|
|
328
|
+
### 📄 Documents (no API key needed)
|
|
329
|
+
|
|
330
|
+
| Tool | What It Does |
|
|
331
|
+
|------|-------------|
|
|
332
|
+
| **pdf_extract** | Extract text, pages, tables, and metadata from a local or remote PDF. Native text-layer strategy via `pdfjs-dist`. OCR + Claude Vision fallbacks deferred to v2.6.0. Use for sustainability reports, invoices, regulatory PDFs. |
|
|
333
|
+
|
|
334
|
+
```bash
|
|
335
|
+
imperium-crawl pdf-extract --input ./report.pdf --output ./extracted.json
|
|
336
|
+
imperium-crawl pdf-extract --input https://example.com/report.pdf --max-pages 20
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### 👀 Change Tracking (no API key needed)
|
|
340
|
+
|
|
341
|
+
| Tool | What It Does |
|
|
342
|
+
|------|-------------|
|
|
343
|
+
| **watch** | One-shot change detector: scrape a URL, hash its content (readability / markdown / full), compare against the last snapshot, fire a webhook on change. Pair with cron for periodic monitoring. |
|
|
344
|
+
| **monitor** | Portfolio-level change tracker across many URLs grouped by topic. Reads a JSON config, runs `watch` on each URL, emits a markdown digest filtered by minimum change percentage. |
|
|
345
|
+
|
|
346
|
+
```bash
|
|
347
|
+
# Watch a single URL — run periodically via cron
|
|
348
|
+
imperium-crawl watch --url https://carbonchain.com/pricing \
|
|
349
|
+
--output-dir ./data/watch \
|
|
350
|
+
--webhook https://hooks.example.com/on-change
|
|
351
|
+
|
|
352
|
+
# Monitor many URLs grouped by topic, emit a daily digest
|
|
353
|
+
imperium-crawl monitor --config ./monitor.json --output-dir ./data/monitor
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
`monitor.json`:
|
|
357
|
+
```json
|
|
358
|
+
{
|
|
359
|
+
"topics": [
|
|
360
|
+
{
|
|
361
|
+
"name": "Competitor pricing",
|
|
362
|
+
"urls": ["https://carbonchain.com/pricing", "https://spherasolutions.com/cbam"]
|
|
363
|
+
}
|
|
364
|
+
]
|
|
365
|
+
}
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
### 🔁 Imperium Flows (no API key needed; browser workflows may require Playwright)
|
|
369
|
+
|
|
370
|
+
| Tool | What It Does |
|
|
371
|
+
|------|-------------|
|
|
372
|
+
| **record_flow** | Record a headed browser workflow as a generic flow family/variant. Stores smart selector metadata and reusable input placeholders. |
|
|
373
|
+
| **run_flow** | Run a saved flow with runtime JSON input, CAPTCHA policy, browser mode, and evidence collection. |
|
|
374
|
+
| **serve_flow** | Expose saved flows through a local HTTP API. Requires bearer auth when bound publicly. |
|
|
375
|
+
| **list_flows** | List project-local and global flow definitions. |
|
|
376
|
+
| **inspect_flow** | Inspect a saved flow JSON definition. |
|
|
377
|
+
| **validate_flow** | Validate a flow schema and report inputs, steps, and storage path. |
|
|
378
|
+
|
|
379
|
+
```bash
|
|
380
|
+
imperium-crawl record-flow --family generic-search --variant site-a --url https://example.com
|
|
381
|
+
imperium-crawl run-flow generic-search/site-a --input '{"query":"example"}'
|
|
382
|
+
imperium-crawl serve-flow generic-search --port 8787
|
|
383
|
+
```
|
|
384
|
+
|
|
302
385
|
---
|
|
303
386
|
|
|
304
387
|
## Setup
|
|
@@ -366,6 +449,26 @@ imperium-crawl tui
|
|
|
366
449
|
|
|
367
450
|
Interactive slash-command terminal with parameter prompts, table rendering, markdown display, and session state. Use `/save` to export results and `/again` to re-run the last command.
|
|
368
451
|
|
|
452
|
+
### Explore REPL
|
|
453
|
+
|
|
454
|
+
Interactively explore a site in a headed browser, then save the session as a reusable skill:
|
|
455
|
+
|
|
456
|
+
```bash
|
|
457
|
+
imperium-crawl explore https://example.com
|
|
458
|
+
```
|
|
459
|
+
|
|
460
|
+
```
|
|
461
|
+
> navigate https://example.com/login
|
|
462
|
+
> type "#email" "user@example.com"
|
|
463
|
+
> type "#password" "{{env:MY_PASSWORD}}"
|
|
464
|
+
> click "#submit"
|
|
465
|
+
> snapshot
|
|
466
|
+
> save-skill my-login
|
|
467
|
+
✅ Saved skill: my-login (4 actions, 1 parameter detected)
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
Commands: `navigate`, `click`, `type`, `select`, `hover`, `press`, `scroll`, `wait`, `screenshot`, `snapshot`, `evaluate`, `save-skill`, `history`, `undo`, `status`, `help`, `exit`
|
|
471
|
+
|
|
369
472
|
---
|
|
370
473
|
|
|
371
474
|
## Skills & Recipes
|
|
@@ -389,6 +492,37 @@ run_skill({ name: "tc-ai-news" })
|
|
|
389
492
|
|
|
390
493
|
Skills are saved in `~/.imperium-crawl/skills/` as JSON files — human-readable, editable, portable.
|
|
391
494
|
|
|
495
|
+
### Skill Parameters
|
|
496
|
+
|
|
497
|
+
Use template variables in skills — resolved at run time:
|
|
498
|
+
|
|
499
|
+
```bash
|
|
500
|
+
# In skill JSON actions:
|
|
501
|
+
{ "value": "{{input:query}}" } # passed via --params or prompted
|
|
502
|
+
{ "value": "{{env:SITE_PASSWORD}}" } # from environment variable
|
|
503
|
+
{ "value": "{{computed:date_today}}" } # auto-computed (date_today, timestamp, random_string, year, month, day)
|
|
504
|
+
|
|
505
|
+
# Run with params:
|
|
506
|
+
imperium-crawl run-skill my-search --params '{"query": "machine learning"}'
|
|
507
|
+
```
|
|
508
|
+
|
|
509
|
+
### Skill Chains
|
|
510
|
+
|
|
511
|
+
Chain skills together — output of one step becomes input to the next:
|
|
512
|
+
|
|
513
|
+
```json
|
|
514
|
+
{
|
|
515
|
+
"type": "chain",
|
|
516
|
+
"name": "search-and-extract",
|
|
517
|
+
"steps": [
|
|
518
|
+
{ "skill": "search-results", "output": "search" },
|
|
519
|
+
{ "skill": "extract-details", "input": { "url": "$search.results[0].url" }, "output": "details" }
|
|
520
|
+
]
|
|
521
|
+
}
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
Variable syntax: `$step_name.field.nested[0]` — simple dot-path access, no eval.
|
|
525
|
+
|
|
392
526
|
### Built-in Recipes
|
|
393
527
|
|
|
394
528
|
| Recipe | What It Does |
|
|
@@ -435,7 +569,7 @@ Turn any website into an API. No documentation needed.
|
|
|
435
569
|
|
|
436
570
|
## AI Agent Guide
|
|
437
571
|
|
|
438
|
-
imperium-crawl ships with [`SKILL/`](./SKILL/) — a structured guide that teaches AI agents how to use all
|
|
572
|
+
imperium-crawl ships with [`SKILL/`](./SKILL/) — a structured guide that teaches AI agents how to use all 39 tools effectively. Includes proven workflows, decision trees, error recovery, and advanced patterns.
|
|
439
573
|
|
|
440
574
|
### Two Ways to Connect
|
|
441
575
|
|
|
@@ -496,13 +630,14 @@ Every tool tested against production websites with real anti-bot defenses:
|
|
|
496
630
|
| 📋 **list_jobs** | — | Batch jobs with status and progress |
|
|
497
631
|
| 📊 **job_status** | Batch job | Full per-URL results with timing |
|
|
498
632
|
| 🗑️ **delete_job** | Completed job | Cleaned up job data from disk |
|
|
633
|
+
| 🧠 **knowledge** | Local knowledge file | Per-domain stats: stealth levels, success rates, anti-bot systems detected |
|
|
499
634
|
| 🎬 **youtube** | "web scraping tutorial" | Search results, video details, comments, transcripts — no API key |
|
|
500
635
|
| 💬 **reddit** | r/webscraping | Subreddit posts, comments, search — public JSON API |
|
|
501
636
|
| 📸 **instagram** | @nike profile | Profile details, engagement rate, recent posts — internal API |
|
|
502
637
|
| 📥 **download** | YouTube video, web page images | Auto-detect URL type, download media files — images, video, og:image |
|
|
503
638
|
| 📡 **rss** | Hacker News RSS | Parsed feed items with title, link, date, author, categories |
|
|
504
639
|
|
|
505
|
-
> **
|
|
640
|
+
> **39 tools. 34 hidden APIs on Airbnb. Live BTC feed. Reusable browser flows. Zero API keys for scraping.**
|
|
506
641
|
|
|
507
642
|
---
|
|
508
643
|
|
|
@@ -535,7 +670,7 @@ cd imperium-crawl
|
|
|
535
670
|
npm install
|
|
536
671
|
npm run build
|
|
537
672
|
npm run dev # Watch mode (rebuild on changes)
|
|
538
|
-
npm test #
|
|
673
|
+
npm test # 546 tests
|
|
539
674
|
npm start # Start CLI (shows help or TUI)
|
|
540
675
|
```
|
|
541
676
|
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Explore REPL — interactive browser session with live Playwright.
|
|
3
|
+
*
|
|
4
|
+
* Usage: imperium-crawl explore <url>
|
|
5
|
+
*
|
|
6
|
+
* Opens a headed (visible) browser window and gives the user a readline REPL
|
|
7
|
+
* to execute actions interactively. Every successful action is recorded.
|
|
8
|
+
* At any point, run `save-skill <name>` to export the session as a reusable skill.
|
|
9
|
+
*
|
|
10
|
+
* Commands:
|
|
11
|
+
* navigate <url> Navigate to URL
|
|
12
|
+
* click <selector> Click element
|
|
13
|
+
* type <selector> <text> Fill input field
|
|
14
|
+
* select <selector> <value> Select option
|
|
15
|
+
* wait [ms] Wait N ms (default 1000)
|
|
16
|
+
* screenshot [file] Save screenshot
|
|
17
|
+
* snapshot Show ARIA tree + refs
|
|
18
|
+
* evaluate <script> Run JS in page
|
|
19
|
+
* scroll [up|down] [px] Scroll page
|
|
20
|
+
* hover <selector> Hover element
|
|
21
|
+
* press <key> Press keyboard key
|
|
22
|
+
* save-skill <name> Export recording as skill JSON
|
|
23
|
+
* status Show URL, action count
|
|
24
|
+
* history List recorded actions
|
|
25
|
+
* undo Remove last action
|
|
26
|
+
* help Show command list
|
|
27
|
+
* exit / quit Close browser and exit
|
|
28
|
+
*/
|
|
29
|
+
export declare function runExplore(startUrl: string, sessionId?: string): Promise<void>;
|
|
30
|
+
//# sourceMappingURL=cli-explore.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli-explore.d.ts","sourceRoot":"","sources":["../src/cli-explore.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAgGH,wBAAsB,UAAU,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CA6RpF"}
|