enterprise-ai-recursive-web-scraper 1.0.0 → 1.0.2
- package/README.md +46 -13
- package/package.json +24 -8
package/README.md CHANGED

````diff
@@ -38,30 +38,61 @@
 
 ## 🚀 Quick Start
 
+To install the package, run:
+
 ```bash
-npm
+npm install enterprise-ai-recursive-web-scraper
 ```
 
-
-
+### Using the CLI
+
+The `enterprise-ai-recursive-web-scraper` package includes a command-line interface (CLI) that you can use to perform web scraping tasks directly from the terminal.
+
+#### Installation
 
-
-const scraper = new WebScraper({
-  outputDir: "scraping_output",
-  verbose: true
-});
+Ensure that the package is installed globally to use the CLI:
 
-
-
-
+```bash
+npm install -g enterprise-ai-recursive-web-scraper
+```
+
+#### Running the CLI
 
-
+Once installed, you can use the `web-scraper` command to start scraping. Here's a basic example of how to use it:
+
+```bash
+web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
 ```
 
+#### CLI Options
+
+- `-k, --api-key <key>`: **(Required)** Your Google Gemini API key.
+- `-u, --url <url>`: **(Required)** The URL of the website you want to scrape.
+- `-o, --output <directory>`: The directory where the scraped data will be saved. Default is `scraping_output`.
+- `-d, --depth <number>`: Maximum crawl depth. Default is `3`.
+- `-c, --concurrency <number>`: Concurrent scraping limit. Default is `5`.
+- `-t, --timeout <seconds>`: Request timeout in seconds. Default is `30`.
+- `-f, --format <type>`: Output format (`json`, `csv`, `markdown`). Default is `json`.
+- `--screenshot`: Capture screenshots of pages.
+- `--no-headless`: Run the browser in non-headless mode.
+- `--proxy <url>`: Use a proxy server.
+- `-v, --verbose`: Enable verbose logging.
+- `--config <path>`: Path to a configuration file.
+
+#### Example Command
+
+```bash
+web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output --depth 5 --concurrency 10 --format csv --verbose
+```
+
+This command will scrape the specified URL with a maximum depth of 5, using 10 concurrent requests, and save the output in CSV format in the `./output` directory with verbose logging enabled.
+
 ## 🔧 Advanced Usage
 
 ### Structured Data Extraction
 
+To extract structured data using a JSON schema, you can use the `JsonExtractionStrategy`:
+
 ```typescript
 import { WebScraper, JsonExtractionStrategy } from "enterprise-ai-recursive-web-scraper";
 
@@ -81,6 +112,8 @@ const scraper = new WebScraper({
 
 ### Custom Browser Session
 
+You can customize the browser session with specific configurations:
+
 ```typescript
 import { WebScraper } from "enterprise-ai-recursive-web-scraper";
 
@@ -119,4 +152,4 @@ const scraper = new WebScraper({
 
 MIT © [Mike Odnis](https://github.com/WomB0ComB0)
 
-> 💙 Built with [`create-typescript-app`](https://github.com/JoshuaKGoldberg/create-typescript-app)
+> 💙 Built with [`create-typescript-app`](https://github.com/JoshuaKGoldberg/create-typescript-app)
````
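The README hunks above introduce `JsonExtractionStrategy` and a configurable browser session, but both hunks end right after the import line, so the actual examples are not visible in this diff. As rough orientation only, here is a minimal sketch of a structured-extraction setup: `WebScraper`, `JsonExtractionStrategy`, and the `outputDir`/`verbose` options appear in the diff itself, while the schema shape, the `extractionStrategy` option name, and the `scrape` call are assumptions made for illustration.

```typescript
import { WebScraper, JsonExtractionStrategy } from "enterprise-ai-recursive-web-scraper";

// Assumed schema shape: output field names mapped to CSS selectors.
// The real schema format is defined by the package, not by this diff.
const strategy = new JsonExtractionStrategy({
  title: "h1",
  price: ".product-price",
  description: ".product-description",
});

// `outputDir` and `verbose` come from the 1.0.0 snippet removed above;
// `extractionStrategy` is an assumed option name for wiring in the strategy.
const scraper = new WebScraper({
  outputDir: "scraping_output",
  verbose: true,
  extractionStrategy: strategy,
});

// Hypothetical programmatic entry point; the CLI equivalent from the README is
// `web-scraper --api-key YOUR_API_KEY --url https://example.com`.
const result = await scraper.scrape("https://example.com");
console.log(result);
```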
package/package.json CHANGED

```diff
@@ -1,6 +1,6 @@
 {
   "name": "enterprise-ai-recursive-web-scraper",
-  "version": "1.0.0",
+  "version": "1.0.2",
   "description": "AI powered, recursive, web-scraper utilizing Gemini models, Puppeteer, and Playwright",
   "repository": {
     "type": "git",
@@ -13,14 +13,18 @@
   },
   "type": "module",
   "main": "lib/cli.cjs",
-
-
-
-
-
-
+  "files": [
+    "lib/*.js",
+    "lib/*.cjs",
+    "lib/*.d.ts",
+    "lib/*.d.cts",
+    "lib/*.map",
+    "package.json",
+    "LICENSE.md",
+    "README.md"
+  ],
   "scripts": {
-    "build": "bun run build:pre && tsup
+    "build": "bun run build:pre && tsup",
     "build:pre": "bunx puppeteer browsers install chrome",
     "prepublishOnly": "bun run build",
     "prepack": "bun run build"
```
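The new `files` whitelist means only the built `lib/` artifacts, the manifest, the license, and the README are included in the published tarball; running `npm pack --dry-run` locally lists exactly what would be packed, which is a convenient way to sanity-check a whitelist like this before publishing.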
```diff
@@ -62,5 +66,17 @@
   ],
   "bin": {
     "web-scraper": "./lib/cli.cjs"
+  },
+  "exports": {
+    ".": {
+      "require": "./lib/index.cjs",
+      "import": "./lib/index.js",
+      "types": "./lib/index.d.ts"
+    },
+    "./cli": {
+      "require": "./lib/cli.cjs",
+      "import": "./lib/cli.js",
+      "types": "./lib/cli.d.ts"
+    }
   }
 }
```
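The added `exports` map makes the entry points explicit: the package root resolves to `lib/index.js` under `import` and `lib/index.cjs` under `require`, and a `./cli` subpath resolves to the corresponding CLI builds. A consumer-side sketch, assuming the root export exposes `WebScraper` as the README examples suggest:

```typescript
// Resolved through the "." entry of the exports map:
//   import -> ./lib/index.js, require -> ./lib/index.cjs, types -> ./lib/index.d.ts
import { WebScraper } from "enterprise-ai-recursive-web-scraper";

// The "./cli" subpath maps to ./lib/cli.js (ESM) or ./lib/cli.cjs (CJS); what it
// exports beyond the `web-scraper` bin entry is not shown in this diff.
// import "enterprise-ai-recursive-web-scraper/cli";

// `outputDir` and `verbose` are the options shown in the 1.0.0 README snippet.
const scraper = new WebScraper({ outputDir: "scraping_output", verbose: true });
```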