enterprise-ai-recursive-web-scraper 1.0.4 → 1.0.5
- package/README.md +40 -20
- package/package.json +43 -36
package/README.md
CHANGED
@@ -17,7 +17,10 @@
 
 ## ✨ Features
 
-* 🚀 **High Performance**:
+* 🚀 **High Performance**:
+  - Blazing fast multi-threaded scraping with concurrent processing
+  - Smart rate limiting to prevent API throttling and server overload
+  - Automatic request queuing and retry mechanisms
 * 🤖 **AI-Powered**: Intelligent content extraction using Groq LLMs
 * 🌐 **Multi-Browser**: Support for Chromium, Firefox, and WebKit
 * 📊 **Smart Extraction**:
@@ -26,6 +29,7 @@
   - Cosine similarity clustering for content deduplication
 * 🎯 **Advanced Capabilities**:
   - Recursive domain crawling with boundary respect
+  - Intelligent rate limiting with token bucket algorithm
   - Session management for complex multi-page flows
   - Custom JavaScript execution support
   - Enhanced screenshot capture with lazy-load detection
@@ -33,8 +37,9 @@
 * 🔒 **Enterprise Ready**:
   - Proxy support with authentication
   - Custom headers and user-agent configuration
-  - Comprehensive error handling
-  - Flexible timeout management
+  - Comprehensive error handling and retry mechanisms
+  - Flexible timeout and rate limit management
+  - Detailed logging and monitoring
 
 ## 🚀 Quick Start
 
@@ -66,29 +71,44 @@ web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
 
 #### CLI Options
 
-- `-k, --api-key <key>`: **(Required)** Your Google Gemini API key
-- `-u, --url <url>`: **(Required)** The URL of the website
-- `-o, --output <directory>`:
-- `-d, --depth <number>`: Maximum crawl depth
-- `-c, --concurrency <number>`: Concurrent scraping limit
-- `-
-- `-
-
-
-- `--
-
-
-
-#### Example Command
+- `-k, --api-key <key>`: **(Required)** Your Google Gemini API key
+- `-u, --url <url>`: **(Required)** The URL of the website to scrape
+- `-o, --output <directory>`: Output directory for scraped data (default: `scraping_output`)
+- `-d, --depth <number>`: Maximum crawl depth (default: `3`)
+- `-c, --concurrency <number>`: Concurrent scraping limit (default: `5`)
+- `-r, --rate-limit <number>`: Requests per second (default: `5`)
+- `-t, --timeout <number>`: Request timeout in milliseconds (default: `30000`)
+- `-f, --format <type>`: Output format: json|csv|markdown (default: `json`)
+- `-v, --verbose`: Enable verbose logging
+- `--retry-attempts <number>`: Number of retry attempts (default: `3`)
+- `--retry-delay <number>`: Delay between retries in ms (default: `1000`)
+
+Example usage with rate limiting:
 
 ```bash
-web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
+web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output \
+  --depth 5 --concurrency 10 --rate-limit 2 --retry-attempts 3 --format csv --verbose
 ```
 
-This command will scrape the specified URL with a maximum depth of 5, using 10 concurrent requests, and save the output in CSV format in the `./output` directory with verbose logging enabled.
-
 ## 🔧 Advanced Usage
 
+### Rate Limiting Configuration
+
+Configure rate limiting to respect server limits and prevent throttling:
+
+```typescript
+import { WebScraper, RateLimiter } from "enterprise-ai-recursive-web-scraper";
+
+const scraper = new WebScraper({
+  rateLimiter: new RateLimiter({
+    maxTokens: 5,      // Maximum number of tokens
+    refillRate: 1,     // Tokens refilled per second
+    retryAttempts: 3,  // Number of retry attempts
+    retryDelay: 1000   // Delay between retries (ms)
+  })
+});
+```
+
 ### Structured Data Extraction
 
 To extract structured data using a JSON schema, you can use the `JsonExtractionStrategy`:
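The new rate-limiting docs describe a token bucket driven by `maxTokens`, `refillRate`, `retryAttempts`, and `retryDelay`. Below is a minimal sketch of how those options typically interact; it is illustrative only, since the package's actual `RateLimiter` implementation is not part of this diff, and the `TokenBucket` class here is a hypothetical stand-in.

```typescript
interface RateLimiterOptions {
  maxTokens: number;     // bucket capacity (maximum burst size)
  refillRate: number;    // tokens added back per second
  retryAttempts: number; // how many extra waits acquire() makes before giving up
  retryDelay: number;    // wait between attempts, in milliseconds
}

class TokenBucket {
  private tokens: number;
  private lastRefill = Date.now();

  constructor(private readonly opts: RateLimiterOptions) {
    this.tokens = opts.maxTokens; // start with a full bucket
  }

  private refill(): void {
    // add tokens proportionally to elapsed time, capped at maxTokens
    const now = Date.now();
    const elapsedSeconds = (now - this.lastRefill) / 1000;
    this.tokens = Math.min(
      this.opts.maxTokens,
      this.tokens + elapsedSeconds * this.opts.refillRate
    );
    this.lastRefill = now;
  }

  async acquire(): Promise<void> {
    for (let attempt = 0; attempt <= this.opts.retryAttempts; attempt++) {
      this.refill();
      if (this.tokens >= 1) {
        this.tokens -= 1; // spend one token per request
        return;
      }
      // no token yet: wait retryDelay ms and try again
      await new Promise((resolve) => setTimeout(resolve, this.opts.retryDelay));
    }
    throw new Error("rate limit: no token available after retries");
  }
}

// Hypothetical usage mirroring the README's RateLimiter options
const bucket = new TokenBucket({ maxTokens: 5, refillRate: 1, retryAttempts: 3, retryDelay: 1000 });
bucket.acquire().then(() => { /* proceed with one scraping request */ });
```

With the README's example values (`maxTokens: 5`, `refillRate: 1`), a burst of up to five requests is allowed immediately, after which throughput settles at roughly one request per second.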
package/package.json
CHANGED
@@ -1,7 +1,16 @@
|
|
1
1
|
{
|
2
2
|
"name": "enterprise-ai-recursive-web-scraper",
|
3
|
-
"version": "1.0.
|
3
|
+
"version": "1.0.5",
|
4
4
|
"description": "AI powered, recursive, web-scraper utilizing Gemini models, Puppeteer, and Playwright",
|
5
|
+
"keywords": [
|
6
|
+
"web-scraper",
|
7
|
+
"ai",
|
8
|
+
"gemini",
|
9
|
+
"puppeteer",
|
10
|
+
"playwright",
|
11
|
+
"typescript",
|
12
|
+
"cli"
|
13
|
+
],
|
5
14
|
"repository": {
|
6
15
|
"type": "git",
|
7
16
|
"url": "https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper"
|
@@ -12,30 +21,52 @@
     "email": "airwolf635@gmail.com"
   },
   "type": "module",
+  "exports": {
+    ".": {
+      "require": "./lib/index.cjs",
+      "import": "./lib/index.js",
+      "types": "./lib/index.d.ts"
+    },
+    "./cli": {
+      "require": "./lib/cli.cjs",
+      "import": "./lib/cli.js",
+      "types": "./lib/cli.d.ts"
+    }
+  },
   "main": "lib/cli.cjs",
-
-
-
-
-
-
-
-
-
-
+  "bin": {
+    "web-scraper": "./lib/cli.cjs"
+  },
+  "files": [
+    "lib/*.js",
+    "lib/*.cjs",
+    "lib/*.d.ts",
+    "lib/*.d.cts",
+    "lib/*.map",
+    "package.json",
+    "LICENSE.md",
+    "README.md"
+  ],
   "scripts": {
     "build": "bun run build:pre && tsup",
     "build:pre": "bunx puppeteer browsers install chrome",
+    "prepack": "bun run build",
     "prepublishOnly": "bun run build",
-    "
+    "lint": "bunx biome check --write ./ || true",
+    "pretest": "bun run build",
+    "test": "bun test"
   },
   "dependencies": {
+    "@biomejs/biome": "1.9.4",
     "@google/generative-ai": "^0.21.0",
+    "async-sema": "^3.1.1",
+    "bottleneck": "^2.19.5",
     "chalk": "^5.3.0",
     "cli-table3": "^0.6.5",
     "commander": "^11.1.0",
     "dotenv": "^16.4.5",
     "inquirer": "^9.2.15",
+    "lru-cache": "^11.0.2",
     "natural": "^8.0.1",
     "ora": "^8.0.1",
     "playwright": "^1.48.2",
@@ -54,29 +85,5 @@
   },
   "engines": {
     "node": ">=20.12.2"
-  },
-  "keywords": [
-    "web-scraper",
-    "ai",
-    "gemini",
-    "puppeteer",
-    "playwright",
-    "typescript",
-    "cli"
-  ],
-  "bin": {
-    "web-scraper": "./lib/cli.cjs"
-  },
-  "exports": {
-    ".": {
-      "require": "./lib/index.cjs",
-      "import": "./lib/index.js",
-      "types": "./lib/index.d.ts"
-    },
-    "./cli": {
-      "require": "./lib/cli.cjs",
-      "import": "./lib/cli.js",
-      "types": "./lib/cli.d.ts"
-    }
   }
 }
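One note on the `exports` map added above: Node resolves the `import` condition for ESM consumers and the `require` condition for CommonJS, and only the listed entry points remain reachable from outside the package. A minimal sketch, assuming the `WebScraper` named export shown in the README snippet (not confirmed by this diff):

```typescript
// ESM: the bare specifier resolves "." to ./lib/index.js via the "import" condition;
// a CommonJS require() of the same specifier would resolve to ./lib/index.cjs instead.
import { WebScraper } from "enterprise-ai-recursive-web-scraper";

// The "./cli" subpath resolves to ./lib/cli.js (ESM) or ./lib/cli.cjs (CJS);
// files not listed in "exports" are no longer importable by consumers.
import "enterprise-ai-recursive-web-scraper/cli";

console.log(typeof WebScraper); // expected "function" if the named export exists as documented
```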