enterprise-ai-recursive-web-scraper 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -13
- package/package.json +23 -7
package/README.md
CHANGED
@@ -38,30 +38,61 @@

 ## Quick Start

+To install the package, run:
+
 ```bash
-npm
+npm install enterprise-ai-recursive-web-scraper
 ```

-
-
+### Using the CLI
+
+The `enterprise-ai-recursive-web-scraper` package includes a command-line interface (CLI) that you can use to perform web scraping tasks directly from the terminal.
+
+#### Installation

-
-const scraper = new WebScraper({
-  outputDir: "scraping_output",
-  verbose: true
-});
+Ensure that the package is installed globally to use the CLI:

-
-
-
+```bash
+npm install -g enterprise-ai-recursive-web-scraper
+```
+
+#### Running the CLI

-
+Once installed, you can use the `web-scraper` command to start scraping. Here's a basic example of how to use it:
+
+```bash
+web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
 ```

+#### CLI Options
+
+- `-k, --api-key <key>`: **(Required)** Your Google Gemini API key.
+- `-u, --url <url>`: **(Required)** The URL of the website you want to scrape.
+- `-o, --output <directory>`: The directory where the scraped data will be saved. Default is `scraping_output`.
+- `-d, --depth <number>`: Maximum crawl depth. Default is `3`.
+- `-c, --concurrency <number>`: Concurrent scraping limit. Default is `5`.
+- `-t, --timeout <seconds>`: Request timeout in seconds. Default is `30`.
+- `-f, --format <type>`: Output format (`json`, `csv`, `markdown`). Default is `json`.
+- `--screenshot`: Capture screenshots of pages.
+- `--no-headless`: Run the browser in non-headless mode.
+- `--proxy <url>`: Use a proxy server.
+- `-v, --verbose`: Enable verbose logging.
+- `--config <path>`: Path to a configuration file.
+
+#### Example Command
+
+```bash
+web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output --depth 5 --concurrency 10 --format csv --verbose
+```
+
+This command will scrape the specified URL with a maximum depth of 5, using 10 concurrent requests, and save the output in CSV format in the `./output` directory with verbose logging enabled.
+
 ## Advanced Usage

 ### Structured Data Extraction

+To extract structured data using a JSON schema, you can use the `JsonExtractionStrategy`:
+
 ```typescript
 import { WebScraper, JsonExtractionStrategy } from "enterprise-ai-recursive-web-scraper";

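The hunk above swaps the 1.0.0 programmatic Quick Start for CLI-first instructions. For orientation, here is a minimal sketch of the programmatic usage implied by the removed snippet and the import lines that remain in the README; only the `WebScraper` import and the `outputDir`/`verbose` options appear in this diff, while the `apiKey` option and the `scrapeWebsite()` call are illustrative assumptions.

```typescript
import { WebScraper } from "enterprise-ai-recursive-web-scraper";

// Options taken from the removed Quick Start snippet.
const scraper = new WebScraper({
  outputDir: "scraping_output", // where results are written
  verbose: true,                // verbose logging
  // apiKey: process.env.GEMINI_API_KEY, // assumed: the CLI requires --api-key,
                                         // so the library likely accepts a key too
});

// Hypothetical entry point, shown only for illustration; check the package's
// type definitions (lib/index.d.ts) for the real method name.
// const results = await scraper.scrapeWebsite("https://example.com");
```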
@@ -81,6 +112,8 @@ const scraper = new WebScraper({

 ### Custom Browser Session

+You can customize the browser session with specific configurations:
+
 ```typescript
 import { WebScraper } from "enterprise-ai-recursive-web-scraper";

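Only the intro sentence and the import line of the Custom Browser Session example are visible in this hunk. A sketch of what such a configuration could look like, assuming constructor options that mirror the CLI flags `--no-headless` and `--proxy` (the option names `headless` and `proxy` are assumptions, not confirmed by this diff):

```typescript
import { WebScraper } from "enterprise-ai-recursive-web-scraper";

// Assumed option names, chosen to mirror the documented CLI flags;
// verify them against the package's typings before relying on this.
const scraper = new WebScraper({
  outputDir: "scraping_output",
  headless: false,                // counterpart of --no-headless: show the browser window
  proxy: "http://127.0.0.1:8080", // counterpart of --proxy <url>
});
```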
@@ -119,4 +152,4 @@ const scraper = new WebScraper({

 MIT © [Mike Odnis](https://github.com/WomB0ComB0)

-> Built with [`create-typescript-app`](https://github.com/JoshuaKGoldberg/create-typescript-app)
+> Built with [`create-typescript-app`](https://github.com/JoshuaKGoldberg/create-typescript-app)
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "enterprise-ai-recursive-web-scraper",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "AI powered, recursive, web-scraper utilizing Gemini models, Puppeteer, and Playwright",
   "repository": {
     "type": "git",
@@ -13,12 +13,16 @@
   },
   "type": "module",
   "main": "lib/cli.cjs",
-
-
-
-
-
-
+  "files": [
+    "lib/*.js",
+    "lib/*.cjs",
+    "lib/*.d.ts",
+    "lib/*.d.cts",
+    "lib/*.map",
+    "package.json",
+    "LICENSE.md",
+    "README.md"
+  ],
   "scripts": {
     "build": "bun run build:pre && tsup src/cli.ts src/index.ts --format esm,cjs --dts --outDir lib",
     "build:pre": "bunx puppeteer browsers install chrome",
@@ -62,5 +66,17 @@
   ],
   "bin": {
     "web-scraper": "./lib/cli.cjs"
+  },
+  "exports": {
+    ".": {
+      "require": "./lib/index.cjs",
+      "import": "./lib/index.js",
+      "types": "./lib/index.d.ts"
+    },
+    "./cli": {
+      "require": "./lib/cli.cjs",
+      "import": "./lib/cli.js",
+      "types": "./lib/cli.d.ts"
+    }
   }
 }
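The new `exports` map declares two entry points, `.` and `./cli`, each with separate CommonJS, ESM, and type-definition conditions. A consumer resolves them roughly as sketched below; the root `WebScraper` import matches the README, and the comments on the `./cli` subpath simply restate the mapping added in this hunk.

```typescript
// "." entry point: ESM consumers load ./lib/index.js, CommonJS consumers load
// ./lib/index.cjs, and TypeScript picks up ./lib/index.d.ts.
import { WebScraper } from "enterprise-ai-recursive-web-scraper";

// The "./cli" subpath maps to ./lib/cli.js (ESM) / ./lib/cli.cjs (CJS); it backs
// the `web-scraper` bin script, so importing it directly is rarely needed.

const scraper = new WebScraper({ outputDir: "scraping_output", verbose: true });
```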