imperium-crawl 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. package/.env.example +49 -0
  2. package/README.md +69 -11
  3. package/dist/cli-explore.d.ts +30 -0
  4. package/dist/cli-explore.d.ts.map +1 -0
  5. package/dist/cli-explore.js +427 -0
  6. package/dist/cli-explore.js.map +1 -0
  7. package/dist/cli-recorder.d.ts +44 -0
  8. package/dist/cli-recorder.d.ts.map +1 -0
  9. package/dist/cli-recorder.js +67 -0
  10. package/dist/cli-recorder.js.map +1 -0
  11. package/dist/cli.d.ts.map +1 -1
  12. package/dist/cli.js +28 -0
  13. package/dist/cli.js.map +1 -1
  14. package/dist/config.d.ts.map +1 -1
  15. package/dist/config.js +3 -0
  16. package/dist/config.js.map +1 -1
  17. package/dist/constants.d.ts +1 -1
  18. package/dist/constants.js +1 -1
  19. package/dist/knowledge/index.d.ts +1 -0
  20. package/dist/knowledge/index.d.ts.map +1 -1
  21. package/dist/knowledge/index.js +1 -0
  22. package/dist/knowledge/index.js.map +1 -1
  23. package/dist/knowledge/record-browser.d.ts +17 -0
  24. package/dist/knowledge/record-browser.d.ts.map +1 -0
  25. package/dist/knowledge/record-browser.js +29 -0
  26. package/dist/knowledge/record-browser.js.map +1 -0
  27. package/dist/llm/retry.d.ts +4 -2
  28. package/dist/llm/retry.d.ts.map +1 -1
  29. package/dist/llm/retry.js +15 -4
  30. package/dist/llm/retry.js.map +1 -1
  31. package/dist/sessions/index.d.ts +1 -1
  32. package/dist/sessions/index.d.ts.map +1 -1
  33. package/dist/sessions/index.js +1 -1
  34. package/dist/sessions/index.js.map +1 -1
  35. package/dist/sessions/manager.d.ts +20 -0
  36. package/dist/sessions/manager.d.ts.map +1 -1
  37. package/dist/sessions/manager.js +57 -0
  38. package/dist/sessions/manager.js.map +1 -1
  39. package/dist/sessions/types.d.ts +2 -0
  40. package/dist/sessions/types.d.ts.map +1 -1
  41. package/dist/skills/chain.d.ts +61 -0
  42. package/dist/skills/chain.d.ts.map +1 -0
  43. package/dist/skills/chain.js +182 -0
  44. package/dist/skills/chain.js.map +1 -0
  45. package/dist/skills/conditions.d.ts +14 -0
  46. package/dist/skills/conditions.d.ts.map +1 -0
  47. package/dist/skills/conditions.js +208 -0
  48. package/dist/skills/conditions.js.map +1 -0
  49. package/dist/skills/manager.d.ts +47 -2
  50. package/dist/skills/manager.d.ts.map +1 -1
  51. package/dist/skills/manager.js.map +1 -1
  52. package/dist/skills/parameters.d.ts +49 -0
  53. package/dist/skills/parameters.d.ts.map +1 -0
  54. package/dist/skills/parameters.js +157 -0
  55. package/dist/skills/parameters.js.map +1 -0
  56. package/dist/stealth/index.d.ts +1 -0
  57. package/dist/stealth/index.d.ts.map +1 -1
  58. package/dist/stealth/index.js +61 -13
  59. package/dist/stealth/index.js.map +1 -1
  60. package/dist/tools/action-executor.d.ts +64 -0
  61. package/dist/tools/action-executor.d.ts.map +1 -0
  62. package/dist/tools/action-executor.js +365 -0
  63. package/dist/tools/action-executor.js.map +1 -0
  64. package/dist/tools/batch-scrape.d.ts +2 -2
  65. package/dist/tools/crawl.d.ts +2 -2
  66. package/dist/tools/create-skill.d.ts +2 -2
  67. package/dist/tools/discover-apis.d.ts +1 -1
  68. package/dist/tools/discover-apis.d.ts.map +1 -1
  69. package/dist/tools/discover-apis.js +3 -0
  70. package/dist/tools/discover-apis.js.map +1 -1
  71. package/dist/tools/download.d.ts +4 -4
  72. package/dist/tools/download.d.ts.map +1 -1
  73. package/dist/tools/download.js +3 -0
  74. package/dist/tools/download.js.map +1 -1
  75. package/dist/tools/extract.d.ts +1 -1
  76. package/dist/tools/image-search.d.ts +1 -1
  77. package/dist/tools/index.d.ts.map +1 -1
  78. package/dist/tools/index.js +3 -0
  79. package/dist/tools/index.js.map +1 -1
  80. package/dist/tools/instagram.d.ts +2 -2
  81. package/dist/tools/interact.d.ts +72 -44
  82. package/dist/tools/interact.d.ts.map +1 -1
  83. package/dist/tools/interact.js +33 -299
  84. package/dist/tools/interact.js.map +1 -1
  85. package/dist/tools/knowledge.d.ts +24 -0
  86. package/dist/tools/knowledge.d.ts.map +1 -0
  87. package/dist/tools/knowledge.js +99 -0
  88. package/dist/tools/knowledge.js.map +1 -0
  89. package/dist/tools/list-skills.js +1 -1
  90. package/dist/tools/list-skills.js.map +1 -1
  91. package/dist/tools/manifest.d.ts.map +1 -1
  92. package/dist/tools/manifest.js +5 -0
  93. package/dist/tools/manifest.js.map +1 -1
  94. package/dist/tools/monitor-websocket.d.ts +1 -1
  95. package/dist/tools/news-search.d.ts +1 -1
  96. package/dist/tools/query-api.d.ts +6 -6
  97. package/dist/tools/readability.d.ts +2 -2
  98. package/dist/tools/reddit.d.ts +4 -4
  99. package/dist/tools/run-skill.d.ts +14 -4
  100. package/dist/tools/run-skill.d.ts.map +1 -1
  101. package/dist/tools/run-skill.js +73 -0
  102. package/dist/tools/run-skill.js.map +1 -1
  103. package/dist/tools/scrape.d.ts +9 -6
  104. package/dist/tools/scrape.d.ts.map +1 -1
  105. package/dist/tools/scrape.js +2 -0
  106. package/dist/tools/scrape.js.map +1 -1
  107. package/dist/tools/screenshot.d.ts.map +1 -1
  108. package/dist/tools/screenshot.js +6 -0
  109. package/dist/tools/screenshot.js.map +1 -1
  110. package/dist/tools/search.d.ts +1 -1
  111. package/dist/tools/snapshot.d.ts +5 -5
  112. package/dist/tools/snapshot.d.ts.map +1 -1
  113. package/dist/tools/snapshot.js +3 -0
  114. package/dist/tools/snapshot.js.map +1 -1
  115. package/dist/tools/video-search.d.ts +1 -1
  116. package/dist/tools/youtube.d.ts +2 -2
  117. package/dist/utils/fetcher.d.ts.map +1 -1
  118. package/dist/utils/fetcher.js +33 -0
  119. package/dist/utils/fetcher.js.map +1 -1
  120. package/package.json +7 -3
package/.env.example ADDED
@@ -0,0 +1,49 @@
1
+ # ── API Keys ──────────────────────────────────────────────
2
+ # Brave Search — unlocks 4 search tools (web, news, image, video)
3
+ # Free tier: https://brave.com/search/api/
4
+ BRAVE_API_KEY=
5
+
6
+ # LLM — unlocks ai_extract tool + llm_fallback in extract
7
+ # Supports Anthropic, OpenAI, or MiniMax API keys
8
+ LLM_API_KEY=
9
+ LLM_PROVIDER=anthropic # anthropic | openai | minimax
10
+ LLM_MODEL= # override default model (optional)
11
+
12
+ # OpenAI — Whisper transcription for YouTube videos without captions
13
+ # https://platform.openai.com/
14
+ OPENAI_API_KEY=
15
+
16
+ # 2Captcha — auto CAPTCHA solving (reCAPTCHA v2/v3, hCaptcha, Turnstile)
17
+ # https://2captcha.com/
18
+ TWOCAPTCHA_API_KEY=
19
+
20
+ # ── Proxy ─────────────────────────────────────────────────
21
+ # Single proxy (http/https/socks4/socks5)
22
+ PROXY_URL=
23
+ # Rotating proxy pool (comma-separated)
24
+ PROXY_URLS=
25
+
26
+ # ── Browser ───────────────────────────────────────────────
27
+ # Max pooled browser instances (default: 3)
28
+ BROWSER_POOL_SIZE=3
29
+ # Chrome user data dir for authenticated sessions
30
+ CHROME_PROFILE_PATH=
31
+
32
+ # ── Security ──────────────────────────────────────────────
33
+ # 32-byte key, hex-encoded (64 hex characters), for encrypting session files at rest
34
+ # Generate: node -e "console.log(require('crypto').randomBytes(32).toString('hex'))"
35
+ SESSION_ENCRYPTION_KEY=
36
+
37
+ # ── Behavior ──────────────────────────────────────────────
38
+ # Honor robots.txt (default: true)
39
+ RESPECT_ROBOTS=true
40
+
41
+ # ── Instagram (advanced — for influencer recipes) ─────────
42
+ # IG_SESSION_ID=
43
+ # IG_CSRF_TOKEN=
44
+ # IG_DS_USER_ID=
45
+
46
+ # ── Debug ─────────────────────────────────────────────────
47
+ # DEBUG=1 # enable debug logging to stderr
48
+ # VERBOSE=1 # same as DEBUG=1
49
+ # NO_COLOR=1 # disable colored output
package/README.md CHANGED
@@ -4,11 +4,11 @@
4
4
 
5
5
  **The most powerful open-source CLI tool for web scraping, crawling, and data extraction.**
6
6
 
7
- 28 tools. Zero API keys required. One `npx` command.
7
+ 29 tools. Zero API keys required. One `npx` command.
8
8
 
9
9
  [![npm version](https://img.shields.io/npm/v/imperium-crawl.svg)](https://www.npmjs.com/package/imperium-crawl)
10
10
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE)
11
- [![Tests](https://img.shields.io/badge/tests-466%20passing-brightgreen.svg)]()
11
+ [![Tests](https://img.shields.io/badge/tests-546%20passing-brightgreen.svg)]()
12
12
  [![npm downloads](https://img.shields.io/npm/dm/imperium-crawl.svg)](https://www.npmjs.com/package/imperium-crawl)
13
13
 
14
14
  </div>
@@ -31,7 +31,7 @@ npx -y imperium-crawl scrape --url https://example.com
31
31
  npm install -g imperium-crawl
32
32
  ```
33
33
 
34
- > That's it. 22 of 28 tools work with zero API keys. Add optional keys later to unlock search, AI extraction, and CAPTCHA solving.
34
+ > That's it. 23 of 29 tools work with zero API keys. Add optional keys later to unlock search, AI extraction, and CAPTCHA solving.
35
35
 
36
36
  ---
37
37
 
@@ -107,7 +107,7 @@ Scraping 4 URLs (concurrency: 3)...
107
107
  ## Why imperium-crawl?
108
108
 
109
109
  🔓 **Zero API Keys Required**
110
- 22 of 28 tools work out of the box. No accounts, no tokens, no credit cards. Just `npx` and go.
110
+ 23 of 29 tools work out of the box. No accounts, no tokens, no credit cards. Just `npx` and go.
111
111
 
112
112
  🛡️ **3-Level Auto-Escalating Stealth**
113
113
  Headers → TLS fingerprinting → headless browser + CAPTCHA solving. Automatically escalates until it gets through.
@@ -115,7 +115,7 @@ Headers → TLS fingerprinting → headless browser + CAPTCHA solving. Automatic
115
115
  🧠 **Self-Improving**
116
116
  Adaptive learning engine remembers what works per domain. Second visit is 3x faster. The more you use it, the smarter it gets.
117
117
 
118
- 🧰 **28 Tools, 2 Modes**
118
+ 🧰 **29 Tools, 2 Modes**
119
119
  CLI tool or interactive TUI. Scraping, crawling, search, extraction, API discovery, WebSocket monitoring, browser automation, batch processing.
120
120
 
121
121
  📜 **14 Built-in Recipes**
@@ -131,7 +131,7 @@ Teach it once, run forever. Auto-detect patterns on any page, save as reusable s
131
131
  | Feature | **imperium-crawl** | Firecrawl | Crawl4AI | Browserbase | Puppeteer |
132
132
  |---------|:------------------:|:---------:|:--------:|:-----------:|:---------:|
133
133
  | Price | **Free forever** | $19+/month | Free | $0.01/min | Free |
134
- | Total tools | **28** | 5 | 2 | 4 | N/A |
134
+ | Total tools | **29** | 5 | 2 | 4 | N/A |
135
135
  | Stealth levels | **3 (auto-escalate)** | Cloud-based | 1 | Cloud-based | None |
136
136
  | Anti-bot detection | **7 systems** | Partial | Partial | Partial | None |
137
137
  | TLS fingerprinting | **JA3/JA4** | No | No | No | No |
@@ -224,7 +224,7 @@ Second visit to cloudflare.com:
224
224
 
225
225
  ---
226
226
 
227
- ## All 28 Tools
227
+ ## All 29 Tools
228
228
 
229
229
  ### 📄 Scraping (no API key needed)
230
230
 
@@ -272,7 +272,7 @@ Second visit to cloudflare.com:
272
272
 
273
273
  | Tool | What It Does |
274
274
  |------|-------------|
275
- | **interact** | Browser automation with 18 action types (click, type, scroll, wait, screenshot, evaluate, select, hover, press, navigate, drag, upload, storage, cookies, pdf, auth_login). Ref targeting via ARIA snapshot, session encryption, action policy, domain filter, network interception, device emulation. |
275
+ | **interact** | Browser automation with 19 action types (including click, type, scroll, wait, screenshot, evaluate, select, hover, press, navigate, drag, upload, storage, cookies, pdf, auth_login, refresh). Ref targeting via ARIA snapshot, session encryption, action policy, domain filter, network interception, device emulation. |
276
276
  | **snapshot** | ARIA-based page snapshot with interactive element refs. Use refs in interact for precise targeting. Annotated screenshots. |
277
277
 
278
278
  ### 📱 Social Media (no API key needed)
@@ -299,6 +299,12 @@ Second visit to cloudflare.com:
299
299
  | **job_status** | Full results for a specific batch job including per-URL outcomes. |
300
300
  | **delete_job** | Clean up completed or failed batch jobs. |
301
301
 
302
+ ### 🧠 Knowledge Engine (no API key needed)
303
+
304
+ | Tool | What It Does |
305
+ |------|-------------|
306
+ | **knowledge** | Dump adaptive knowledge engine stats — per-domain success rates, optimal stealth levels, anti-bot detection history, rate limits. Use to debug scraping issues and understand problematic domains. |
307
+
302
308
  ---
303
309
 
304
310
  ## Setup
@@ -366,6 +372,26 @@ imperium-crawl tui
366
372
 
367
373
  Interactive slash-command terminal with parameter prompts, table rendering, markdown display, and session state. Use `/save` to export results and `/again` to re-run the last command.
368
374
 
375
+ ### Explore REPL
376
+
377
+ Interactively explore a site in a headed browser, then save the session as a reusable skill:
378
+
379
+ ```bash
380
+ imperium-crawl explore https://example.com
381
+ ```
382
+
383
+ ```
384
+ > navigate https://example.com/login
385
+ > type "#email" "user@example.com"
386
+ > type "#password" "{{env:MY_PASSWORD}}"
387
+ > click "#submit"
388
+ > snapshot
389
+ > save-skill my-login
390
+ ✅ Saved skill: my-login (4 actions, 1 parameter detected)
391
+ ```
392
+
393
+ Commands: `navigate`, `click`, `type`, `select`, `hover`, `press`, `scroll`, `wait`, `screenshot`, `snapshot`, `evaluate`, `save-skill`, `history`, `undo`, `status`, `help`, `exit`
394
+
369
395
  ---
370
396
 
371
397
  ## Skills & Recipes
@@ -389,6 +415,37 @@ run_skill({ name: "tc-ai-news" })
389
415
 
390
416
  Skills are saved in `~/.imperium-crawl/skills/` as JSON files — human-readable, editable, portable.
391
417
 
418
+ ### Skill Parameters
419
+
420
+ Use template variables in skills — resolved at run time:
421
+
422
+ ```bash
423
+ # In skill JSON actions:
424
+ { "value": "{{input:query}}" } # passed via --params or prompted
425
+ { "value": "{{env:SITE_PASSWORD}}" } # from environment variable
426
+ { "value": "{{computed:date_today}}" } # auto-computed (date_today, timestamp, random_string, year, month, day)
427
+
428
+ # Run with params:
429
+ imperium-crawl run-skill my-search --params '{"query": "machine learning"}'
430
+ ```
431
+
432
+ ### Skill Chains
433
+
434
+ Chain skills together — output of one step becomes input to the next:
435
+
436
+ ```json
437
+ {
438
+ "type": "chain",
439
+ "name": "search-and-extract",
440
+ "steps": [
441
+ { "skill": "search-results", "output": "search" },
442
+ { "skill": "extract-details", "input": { "url": "$search.results[0].url" }, "output": "details" }
443
+ ]
444
+ }
445
+ ```
446
+
447
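+ Variable syntax: `$step_name.field.nested[0]`, where `step_name` is the name a step declares in its `output` field (e.g. `$search.results[0].url` above) — simple dot-path access, no eval.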
448
+
392
449
  ### Built-in Recipes
393
450
 
394
451
  | Recipe | What It Does |
@@ -435,7 +492,7 @@ Turn any website into an API. No documentation needed.
435
492
 
436
493
  ## AI Agent Guide
437
494
 
438
- imperium-crawl ships with [`SKILL/`](./SKILL/) — a structured guide that teaches AI agents how to use all 28 tools effectively. Includes proven workflows, decision trees, error recovery, and advanced patterns.
495
+ imperium-crawl ships with [`SKILL/`](./SKILL/) — a structured guide that teaches AI agents how to use all 29 tools effectively. Includes proven workflows, decision trees, error recovery, and advanced patterns.
439
496
 
440
497
  ### Two Ways to Connect
441
498
 
@@ -496,13 +553,14 @@ Every tool tested against production websites with real anti-bot defenses:
496
553
  | 📋 **list_jobs** | — | Batch jobs with status and progress |
497
554
  | 📊 **job_status** | Batch job | Full per-URL results with timing |
498
555
  | 🗑️ **delete_job** | Completed job | Cleaned up job data from disk |
556
+ | 🧠 **knowledge** | Local knowledge file | Per-domain stats: stealth levels, success rates, anti-bot systems detected |
499
557
  | 🎬 **youtube** | "web scraping tutorial" | Search results, video details, comments, transcripts — no API key |
500
558
  | 💬 **reddit** | r/webscraping | Subreddit posts, comments, search — public JSON API |
501
559
  | 📸 **instagram** | @nike profile | Profile details, engagement rate, recent posts — internal API |
502
560
  | 📥 **download** | YouTube video, web page images | Auto-detect URL type, download media files — images, video, og:image |
503
561
  | 📡 **rss** | Hacker News RSS | Parsed feed items with title, link, date, author, categories |
504
562
 
505
- > **28/28 tools. 34 hidden APIs on Airbnb. Live BTC feed. Zero API keys for scraping.**
563
+ > **29/29 tools. 34 hidden APIs on Airbnb. Live BTC feed. Zero API keys for scraping.**
506
564
 
507
565
  ---
508
566
 
@@ -535,7 +593,7 @@ cd imperium-crawl
535
593
  npm install
536
594
  npm run build
537
595
  npm run dev # Watch mode (rebuild on changes)
538
- npm test # 466 tests
596
+ npm test # 546 tests
539
597
  npm start # Start CLI (shows help or TUI)
540
598
  ```
541
599
 
package/dist/cli-explore.d.ts ADDED
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Explore REPL — interactive browser session with live Playwright.
3
+ *
4
+ * Usage: imperium-crawl explore <url>
5
+ *
6
+ * Opens a headed (visible) browser window and gives the user a readline REPL
7
+ * to execute actions interactively. Every successful action is recorded.
8
+ * At any point, run `save-skill <name>` to export the session as a reusable skill.
9
+ *
10
+ * Commands:
11
+ * navigate <url> Navigate to URL
12
+ * click <selector> Click element
13
+ * type <selector> <text> Fill input field
14
+ * select <selector> <value> Select option
15
+ * wait [ms] Wait N ms (default 1000)
16
+ * screenshot [file] Save screenshot
17
+ * snapshot Show ARIA tree + refs
18
+ * evaluate <script> Run JS in page
19
+ * scroll [up|down] [px] Scroll page
20
+ * hover <selector> Hover element
21
+ * press <key> Press keyboard key
22
+ * save-skill <name> Export recording as skill JSON
23
+ * status Show URL, action count
24
+ * history List recorded actions
25
+ * undo Remove last action
26
+ * help Show command list
27
+ * exit / quit Close browser and exit
28
+ */
29
+ export declare function runExplore(startUrl: string, sessionId?: string): Promise<void>;
30
+ //# sourceMappingURL=cli-explore.d.ts.map
package/dist/cli-explore.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli-explore.d.ts","sourceRoot":"","sources":["../src/cli-explore.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAgGH,wBAAsB,UAAU,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CA6RpF"}