enterprise-ai-recursive-web-scraper 1.0.4 → 1.0.5
- package/README.md +40 -20
- package/package.json +43 -36
package/README.md
CHANGED
@@ -17,7 +17,10 @@
 
 ## ✨ Features
 
-* 🚀 **High Performance**:
+* 🚀 **High Performance**:
+  - Blazing fast multi-threaded scraping with concurrent processing
+  - Smart rate limiting to prevent API throttling and server overload
+  - Automatic request queuing and retry mechanisms
 * 🤖 **AI-Powered**: Intelligent content extraction using Groq LLMs
 * 🌐 **Multi-Browser**: Support for Chromium, Firefox, and WebKit
 * 📊 **Smart Extraction**:
@@ -26,6 +29,7 @@
   - Cosine similarity clustering for content deduplication
 * 🎯 **Advanced Capabilities**:
   - Recursive domain crawling with boundary respect
+  - Intelligent rate limiting with token bucket algorithm
   - Session management for complex multi-page flows
   - Custom JavaScript execution support
   - Enhanced screenshot capture with lazy-load detection
@@ -33,8 +37,9 @@
 * 🔒 **Enterprise Ready**:
   - Proxy support with authentication
   - Custom headers and user-agent configuration
-  - Comprehensive error handling
-  - Flexible timeout management
+  - Comprehensive error handling and retry mechanisms
+  - Flexible timeout and rate limit management
+  - Detailed logging and monitoring
 
 ## 🚀 Quick Start
 
@@ -66,29 +71,44 @@ web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
 
 #### CLI Options
 
-- `-k, --api-key <key>`: **(Required)** Your Google Gemini API key
-- `-u, --url <url>`: **(Required)** The URL of the website
-- `-o, --output <directory>`:
-- `-d, --depth <number>`: Maximum crawl depth
-- `-c, --concurrency <number>`: Concurrent scraping limit
-- `-
-- `-
-
-
-- `--
-
-
-
-#### Example Command
+- `-k, --api-key <key>`: **(Required)** Your Google Gemini API key
+- `-u, --url <url>`: **(Required)** The URL of the website to scrape
+- `-o, --output <directory>`: Output directory for scraped data (default: `scraping_output`)
+- `-d, --depth <number>`: Maximum crawl depth (default: `3`)
+- `-c, --concurrency <number>`: Concurrent scraping limit (default: `5`)
+- `-r, --rate-limit <number>`: Requests per second (default: `5`)
+- `-t, --timeout <number>`: Request timeout in milliseconds (default: `30000`)
+- `-f, --format <type>`: Output format: json|csv|markdown (default: `json`)
+- `-v, --verbose`: Enable verbose logging
+- `--retry-attempts <number>`: Number of retry attempts (default: `3`)
+- `--retry-delay <number>`: Delay between retries in ms (default: `1000`)
+
+Example usage with rate limiting:
 
 ```bash
-web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
+web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output \
+  --depth 5 --concurrency 10 --rate-limit 2 --retry-attempts 3 --format csv --verbose
 ```
 
-This command will scrape the specified URL with a maximum depth of 5, using 10 concurrent requests, and save the output in CSV format in the `./output` directory with verbose logging enabled.
-
 ## 🔧 Advanced Usage
 
+### Rate Limiting Configuration
+
+Configure rate limiting to respect server limits and prevent throttling:
+
+```typescript
+import { WebScraper, RateLimiter } from "enterprise-ai-recursive-web-scraper";
+
+const scraper = new WebScraper({
+  rateLimiter: new RateLimiter({
+    maxTokens: 5,      // Maximum number of tokens
+    refillRate: 1,     // Tokens refilled per second
+    retryAttempts: 3,  // Number of retry attempts
+    retryDelay: 1000   // Delay between retries (ms)
+  })
+});
+```
+
 ### Structured Data Extraction
 
 To extract structured data using a JSON schema, you can use the `JsonExtractionStrategy`:
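The new rate-limiting docs describe a token bucket driven by `maxTokens`, `refillRate`, `retryAttempts`, and `retryDelay`. Below is a minimal sketch of how those options typically interact; it is illustrative only, since the package's actual `RateLimiter` implementation is not part of this diff, and the `TokenBucket` class here is a hypothetical stand-in.

```typescript
interface RateLimiterOptions {
  maxTokens: number;     // bucket capacity (maximum burst size)
  refillRate: number;    // tokens added back per second
  retryAttempts: number; // how many extra waits acquire() makes before giving up
  retryDelay: number;    // wait between attempts, in milliseconds
}

class TokenBucket {
  private tokens: number;
  private lastRefill = Date.now();

  constructor(private readonly opts: RateLimiterOptions) {
    this.tokens = opts.maxTokens; // start with a full bucket
  }

  private refill(): void {
    // add tokens proportionally to elapsed time, capped at maxTokens
    const now = Date.now();
    const elapsedSeconds = (now - this.lastRefill) / 1000;
    this.tokens = Math.min(
      this.opts.maxTokens,
      this.tokens + elapsedSeconds * this.opts.refillRate
    );
    this.lastRefill = now;
  }

  async acquire(): Promise<void> {
    for (let attempt = 0; attempt <= this.opts.retryAttempts; attempt++) {
      this.refill();
      if (this.tokens >= 1) {
        this.tokens -= 1; // spend one token per request
        return;
      }
      // no token yet: wait retryDelay ms and try again
      await new Promise((resolve) => setTimeout(resolve, this.opts.retryDelay));
    }
    throw new Error("rate limit: no token available after retries");
  }
}

// Hypothetical usage mirroring the README's RateLimiter options
const bucket = new TokenBucket({ maxTokens: 5, refillRate: 1, retryAttempts: 3, retryDelay: 1000 });
bucket.acquire().then(() => { /* proceed with one scraping request */ });
```

With the README's example values (`maxTokens: 5`, `refillRate: 1`), a burst of up to five requests is allowed immediately, after which throughput settles at roughly one request per second.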
package/package.json
CHANGED
@@ -1,7 +1,16 @@
|
|
1
1
|
{
|
2
2
|
"name": "enterprise-ai-recursive-web-scraper",
|
3
|
-
"version": "1.0.
|
3
|
+
"version": "1.0.5",
|
4
4
|
"description": "AI powered, recursive, web-scraper utilizing Gemini models, Puppeteer, and Playwright",
|
5
|
+
"keywords": [
|
6
|
+
"web-scraper",
|
7
|
+
"ai",
|
8
|
+
"gemini",
|
9
|
+
"puppeteer",
|
10
|
+
"playwright",
|
11
|
+
"typescript",
|
12
|
+
"cli"
|
13
|
+
],
|
5
14
|
"repository": {
|
6
15
|
"type": "git",
|
7
16
|
"url": "https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper"
|
@@ -12,30 +21,52 @@
     "email": "airwolf635@gmail.com"
   },
   "type": "module",
+  "exports": {
+    ".": {
+      "require": "./lib/index.cjs",
+      "import": "./lib/index.js",
+      "types": "./lib/index.d.ts"
+    },
+    "./cli": {
+      "require": "./lib/cli.cjs",
+      "import": "./lib/cli.js",
+      "types": "./lib/cli.d.ts"
+    }
+  },
   "main": "lib/cli.cjs",
-
-
-
-
-
-
-
-
-
-
+  "bin": {
+    "web-scraper": "./lib/cli.cjs"
+  },
+  "files": [
+    "lib/*.js",
+    "lib/*.cjs",
+    "lib/*.d.ts",
+    "lib/*.d.cts",
+    "lib/*.map",
+    "package.json",
+    "LICENSE.md",
+    "README.md"
+  ],
   "scripts": {
     "build": "bun run build:pre && tsup",
     "build:pre": "bunx puppeteer browsers install chrome",
+    "prepack": "bun run build",
     "prepublishOnly": "bun run build",
-    "
+    "lint": "bunx biome check --write ./ || true",
+    "pretest": "bun run build",
+    "test": "bun test"
   },
   "dependencies": {
+    "@biomejs/biome": "1.9.4",
     "@google/generative-ai": "^0.21.0",
+    "async-sema": "^3.1.1",
+    "bottleneck": "^2.19.5",
     "chalk": "^5.3.0",
     "cli-table3": "^0.6.5",
     "commander": "^11.1.0",
     "dotenv": "^16.4.5",
     "inquirer": "^9.2.15",
+    "lru-cache": "^11.0.2",
     "natural": "^8.0.1",
     "ora": "^8.0.1",
     "playwright": "^1.48.2",
@@ -54,29 +85,5 @@
   },
   "engines": {
     "node": ">=20.12.2"
-  },
-  "keywords": [
-    "web-scraper",
-    "ai",
-    "gemini",
-    "puppeteer",
-    "playwright",
-    "typescript",
-    "cli"
-  ],
-  "bin": {
-    "web-scraper": "./lib/cli.cjs"
-  },
-  "exports": {
-    ".": {
-      "require": "./lib/index.cjs",
-      "import": "./lib/index.js",
-      "types": "./lib/index.d.ts"
-    },
-    "./cli": {
-      "require": "./lib/cli.cjs",
-      "import": "./lib/cli.js",
-      "types": "./lib/cli.d.ts"
-    }
   }
 }
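One note on the `exports` map added above: Node resolves the `import` condition for ESM consumers and the `require` condition for CommonJS, and only the listed entry points remain reachable from outside the package. A minimal sketch, assuming the `WebScraper` named export shown in the README snippet (not confirmed by this diff):

```typescript
// ESM: the bare specifier resolves "." to ./lib/index.js via the "import" condition;
// a CommonJS require() of the same specifier would resolve to ./lib/index.cjs instead.
import { WebScraper } from "enterprise-ai-recursive-web-scraper";

// The "./cli" subpath resolves to ./lib/cli.js (ESM) or ./lib/cli.cjs (CJS);
// files not listed in "exports" are no longer importable by consumers.
import "enterprise-ai-recursive-web-scraper/cli";

console.log(typeof WebScraper); // expected "function" if the named export exists as documented
```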