enterprise-ai-recursive-web-scraper 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -20
- package/package.json +43 -36
package/README.md
CHANGED
@@ -17,7 +17,10 @@
 
 ## ✨ Features
 
-* 🚀 **High Performance**:
+* 🚀 **High Performance**:
+  - Blazing fast multi-threaded scraping with concurrent processing
+  - Smart rate limiting to prevent API throttling and server overload
+  - Automatic request queuing and retry mechanisms
 * 🤖 **AI-Powered**: Intelligent content extraction using Groq LLMs
 * 🌐 **Multi-Browser**: Support for Chromium, Firefox, and WebKit
 * 📊 **Smart Extraction**:
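The expanded High Performance bullets describe bounded concurrent scraping with request queuing. As a rough illustration of that technique (not the package's internal code), a fixed concurrency cap can be enforced with a semaphore such as `async-sema`, which this diff adds to the package's dependencies further down:

```typescript
// Illustrative only: caps in-flight page fetches at a fixed concurrency,
// the same idea the "concurrent processing" bullet describes.
import { Sema } from "async-sema";

const concurrency = 5;              // mirrors the CLI's --concurrency default
const sema = new Sema(concurrency); // at most `concurrency` fetches run at once

async function fetchWithLimit(url: string): Promise<string> {
  await sema.acquire();
  try {
    const res = await fetch(url);
    return await res.text();
  } finally {
    sema.release();
  }
}

// Usage: queue many URLs; only `concurrency` of them run at the same time.
const urls = ["https://example.com/a", "https://example.com/b"];
await Promise.all(urls.map(fetchWithLimit));
```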
@@ -26,6 +29,7 @@
 
   - Cosine similarity clustering for content deduplication
 * 🎯 **Advanced Capabilities**:
   - Recursive domain crawling with boundary respect
+  - Intelligent rate limiting with token bucket algorithm
   - Session management for complex multi-page flows
   - Custom JavaScript execution support
   - Enhanced screenshot capture with lazy-load detection
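The new Advanced Capabilities bullet names a token bucket algorithm for rate limiting. A minimal standalone sketch of that algorithm, reusing the `maxTokens`/`refillRate` vocabulary from the `RateLimiter` example later in this README diff; the package's actual implementation may differ:

```typescript
// Minimal token bucket: requests consume tokens; tokens refill at a steady rate.
class TokenBucket {
  private tokens: number;
  private lastRefill = Date.now();

  constructor(private maxTokens: number, private refillRate: number) {
    this.tokens = maxTokens;
  }

  private refill(): void {
    const now = Date.now();
    const elapsedSec = (now - this.lastRefill) / 1000;
    this.tokens = Math.min(this.maxTokens, this.tokens + elapsedSec * this.refillRate);
    this.lastRefill = now;
  }

  // Resolves once a token is available, spacing requests to the refill rate.
  async take(): Promise<void> {
    this.refill();
    while (this.tokens < 1) {
      await new Promise((resolve) => setTimeout(resolve, 50));
      this.refill();
    }
    this.tokens -= 1;
  }
}

// Usage: allow bursts of up to 5 requests, then roughly 1 request per second.
const bucket = new TokenBucket(5, 1);
await bucket.take(); // call before each outbound request
```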
@@ -33,8 +37,9 @@
 
 * 🔒 **Enterprise Ready**:
   - Proxy support with authentication
   - Custom headers and user-agent configuration
-  - Comprehensive error handling
-  - Flexible timeout management
+  - Comprehensive error handling and retry mechanisms
+  - Flexible timeout and rate limit management
+  - Detailed logging and monitoring
 
 ## 🚀 Quick Start
 
@@ -66,29 +71,44 @@ web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
 
 #### CLI Options
 
-- `-k, --api-key <key>`: **(Required)** Your Google Gemini API key
-- `-u, --url <url>`: **(Required)** The URL of the website
-- `-o, --output <directory>`:
-- `-d, --depth <number>`: Maximum crawl depth
-- `-c, --concurrency <number>`: Concurrent scraping limit
-- `-
-- `-
--
--
-- `--
--
-
-
-#### Example Command
+- `-k, --api-key <key>`: **(Required)** Your Google Gemini API key
+- `-u, --url <url>`: **(Required)** The URL of the website to scrape
+- `-o, --output <directory>`: Output directory for scraped data (default: `scraping_output`)
+- `-d, --depth <number>`: Maximum crawl depth (default: `3`)
+- `-c, --concurrency <number>`: Concurrent scraping limit (default: `5`)
+- `-r, --rate-limit <number>`: Requests per second (default: `5`)
+- `-t, --timeout <number>`: Request timeout in milliseconds (default: `30000`)
+- `-f, --format <type>`: Output format: json|csv|markdown (default: `json`)
+- `-v, --verbose`: Enable verbose logging
+- `--retry-attempts <number>`: Number of retry attempts (default: `3`)
+- `--retry-delay <number>`: Delay between retries in ms (default: `1000`)
+
+Example usage with rate limiting:
 
 ```bash
-web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
+web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output \
+  --depth 5 --concurrency 10 --rate-limit 2 --retry-attempts 3 --format csv --verbose
 ```
 
-This command will scrape the specified URL with a maximum depth of 5, using 10 concurrent requests, and save the output in CSV format in the `./output` directory with verbose logging enabled.
-
 ## 🔧 Advanced Usage
 
+### Rate Limiting Configuration
+
+Configure rate limiting to respect server limits and prevent throttling:
+
+```typescript
+import { WebScraper, RateLimiter } from "enterprise-ai-recursive-web-scraper";
+
+const scraper = new WebScraper({
+  rateLimiter: new RateLimiter({
+    maxTokens: 5,      // Maximum number of tokens
+    refillRate: 1,     // Tokens refilled per second
+    retryAttempts: 3,  // Number of retry attempts
+    retryDelay: 1000   // Delay between retries (ms)
+  })
+});
+```
+
 ### Structured Data Extraction
 
 To extract structured data using a JSON schema, you can use the `JsonExtractionStrategy`:
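The new `--retry-attempts` and `--retry-delay` options describe a fixed-delay retry policy. A minimal sketch of that policy in isolation, matching the documented defaults; this is illustrative only, not the package's own retry code:

```typescript
// Illustrative fixed-delay retry wrapper matching the semantics the new
// --retry-attempts / --retry-delay flags describe.
async function withRetries<T>(
  fn: () => Promise<T>,
  retryAttempts = 3, // mirrors --retry-attempts default
  retryDelay = 1000  // mirrors --retry-delay default (ms)
): Promise<T> {
  let lastError: unknown;
  for (let attempt = 0; attempt <= retryAttempts; attempt++) {
    try {
      return await fn();
    } catch (err) {
      lastError = err;
      if (attempt < retryAttempts) {
        await new Promise((resolve) => setTimeout(resolve, retryDelay));
      }
    }
  }
  throw lastError;
}

// Usage: retry a flaky fetch up to 3 times with a 1 s pause between attempts.
const html = await withRetries(() => fetch("https://example.com").then((r) => r.text()));
```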
package/package.json
CHANGED
@@ -1,7 +1,16 @@
 
 {
   "name": "enterprise-ai-recursive-web-scraper",
-  "version": "1.0.3",
+  "version": "1.0.5",
   "description": "AI powered, recursive, web-scraper utilizing Gemini models, Puppeteer, and Playwright",
+  "keywords": [
+    "web-scraper",
+    "ai",
+    "gemini",
+    "puppeteer",
+    "playwright",
+    "typescript",
+    "cli"
+  ],
   "repository": {
     "type": "git",
     "url": "https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper"
@@ -12,30 +21,52 @@
 
     "email": "airwolf635@gmail.com"
   },
   "type": "module",
+  "exports": {
+    ".": {
+      "require": "./lib/index.cjs",
+      "import": "./lib/index.js",
+      "types": "./lib/index.d.ts"
+    },
+    "./cli": {
+      "require": "./lib/cli.cjs",
+      "import": "./lib/cli.js",
+      "types": "./lib/cli.d.ts"
+    }
+  },
   "main": "lib/cli.cjs",
-
-
-
-
-
-
-
-
-
-
+  "bin": {
+    "web-scraper": "./lib/cli.cjs"
+  },
+  "files": [
+    "lib/*.js",
+    "lib/*.cjs",
+    "lib/*.d.ts",
+    "lib/*.d.cts",
+    "lib/*.map",
+    "package.json",
+    "LICENSE.md",
+    "README.md"
+  ],
   "scripts": {
     "build": "bun run build:pre && tsup",
     "build:pre": "bunx puppeteer browsers install chrome",
+    "prepack": "bun run build",
     "prepublishOnly": "bun run build",
-    "
+    "lint": "bunx biome check --write ./ || true",
+    "pretest": "bun run build",
+    "test": "bun test"
   },
   "dependencies": {
+    "@biomejs/biome": "1.9.4",
     "@google/generative-ai": "^0.21.0",
+    "async-sema": "^3.1.1",
+    "bottleneck": "^2.19.5",
     "chalk": "^5.3.0",
     "cli-table3": "^0.6.5",
     "commander": "^11.1.0",
     "dotenv": "^16.4.5",
     "inquirer": "^9.2.15",
+    "lru-cache": "^11.0.2",
     "natural": "^8.0.1",
     "ora": "^8.0.1",
     "playwright": "^1.48.2",
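The new conditional `exports` map routes ESM consumers to `./lib/index.js` and CommonJS consumers to `./lib/index.cjs`, with a separate `./cli` subpath for the CLI entry point. A short sketch of how a consumer resolves through it; the imported names follow the README examples above:

```typescript
// ESM consumers resolve through the "import" condition to ./lib/index.js
import { WebScraper, RateLimiter } from "enterprise-ai-recursive-web-scraper";

// The "./cli" subpath maps to ./lib/cli.js (import) or ./lib/cli.cjs (require).
// CommonJS consumers resolve through the "require" condition instead:
//   const { WebScraper } = require("enterprise-ai-recursive-web-scraper");

const scraper = new WebScraper({
  rateLimiter: new RateLimiter({ maxTokens: 5, refillRate: 1 }),
});
```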
@@ -54,29 +85,5 @@
 
   },
   "engines": {
     "node": ">=20.12.2"
-  },
-  "keywords": [
-    "web-scraper",
-    "ai",
-    "gemini",
-    "puppeteer",
-    "playwright",
-    "typescript",
-    "cli"
-  ],
-  "bin": {
-    "web-scraper": "./lib/cli.cjs"
-  },
-  "exports": {
-    ".": {
-      "require": "./lib/index.cjs",
-      "import": "./lib/index.js",
-      "types": "./lib/index.d.ts"
-    },
-    "./cli": {
-      "require": "./lib/cli.cjs",
-      "import": "./lib/cli.js",
-      "types": "./lib/cli.d.ts"
-    }
-    }
   }
 }