enterprise-ai-recursive-web-scraper 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/README.md +46 -13
  2. package/package.json +23 -7
package/README.md CHANGED
@@ -38,30 +38,61 @@

  ## 🚀 Quick Start

+ To install the package, run:
+
  ```bash
- npm i enterprise-ai-recursive-web-scraper
+ npm install enterprise-ai-recursive-web-scraper
  ```

- ```typescript
- import { WebScraper } from "enterprise-ai-recursive-web-scraper";
+ ### Using the CLI
+
+ The `enterprise-ai-recursive-web-scraper` package includes a command-line interface (CLI) that you can use to perform web scraping tasks directly from the terminal.
+
+ #### Installation

- async function main() {
-   const scraper = new WebScraper({
-     outputDir: "scraping_output",
-     verbose: true
-   });
+ Ensure that the package is installed globally to use the CLI:

-   const results = await scraper.scrapeWebsite("https://example.com");
-   console.log(results);
- }
+ ```bash
+ npm install -g enterprise-ai-recursive-web-scraper
+ ```
+
+ #### Running the CLI

- main().catch(console.error);
+ Once installed, you can use the `web-scraper` command to start scraping. Here’s a basic example of how to use it:
+
+ ```bash
+ web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output
  ```

+ #### CLI Options
+
+ - `-k, --api-key <key>`: **(Required)** Your Google Gemini API key.
+ - `-u, --url <url>`: **(Required)** The URL of the website you want to scrape.
+ - `-o, --output <directory>`: The directory where the scraped data will be saved. Default is `scraping_output`.
+ - `-d, --depth <number>`: Maximum crawl depth. Default is `3`.
+ - `-c, --concurrency <number>`: Concurrent scraping limit. Default is `5`.
+ - `-t, --timeout <seconds>`: Request timeout in seconds. Default is `30`.
+ - `-f, --format <type>`: Output format (`json`, `csv`, `markdown`). Default is `json`.
+ - `--screenshot`: Capture screenshots of pages.
+ - `--no-headless`: Run the browser in non-headless mode.
+ - `--proxy <url>`: Use a proxy server.
+ - `-v, --verbose`: Enable verbose logging.
+ - `--config <path>`: Path to a configuration file.
+
+ #### Example Command
+
+ ```bash
+ web-scraper --api-key YOUR_API_KEY --url https://example.com --output ./output --depth 5 --concurrency 10 --format csv --verbose
+ ```
+
+ This command will scrape the specified URL with a maximum depth of 5, using 10 concurrent requests, and save the output in CSV format in the `./output` directory with verbose logging enabled.
+
  ## 🔧 Advanced Usage

  ### Structured Data Extraction

+ To extract structured data using a JSON schema, you can use the `JsonExtractionStrategy`:
+
  ```typescript
  import { WebScraper, JsonExtractionStrategy } from "enterprise-ai-recursive-web-scraper";

@@ -81,6 +112,8 @@ const scraper = new WebScraper({

  ### Custom Browser Session

+ You can customize the browser session with specific configurations:
+
  ```typescript
  import { WebScraper } from "enterprise-ai-recursive-web-scraper";

@@ -119,4 +152,4 @@ const scraper = new WebScraper({

  MIT © [Mike Odnis](https://github.com/WomB0ComB0)

- > 💙 Built with [`create-typescript-app`](https://github.com/JoshuaKGoldberg/create-typescript-app)
+ > 💙 Built with [`create-typescript-app`](https://github.com/JoshuaKGoldberg/create-typescript-app)
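
Note: the 1.0.1 README replaces the programmatic Quick Start with CLI instructions, while the library API is still used in the Advanced Usage sections. Reconstructed from the deleted lines above, the removed example amounts to the following sketch; the `WebScraper` options (`outputDir`, `verbose`) and the `scrapeWebsite` method are those shown in the 1.0.0 README and are not verified against the 1.0.1 build.

```typescript
import { WebScraper } from "enterprise-ai-recursive-web-scraper";

async function main() {
  // Options as shown in the removed 1.0.0 Quick Start: output directory and verbose logging.
  const scraper = new WebScraper({
    outputDir: "scraping_output",
    verbose: true,
  });

  // Recursively scrape the target site and print the collected results.
  const results = await scraper.scrapeWebsite("https://example.com");
  console.log(results);
}

main().catch(console.error);
```
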
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "enterprise-ai-recursive-web-scraper",
-   "version": "1.0.0",
+   "version": "1.0.1",
    "description": "AI powered, recursive, web-scraper utilizing Gemini models, Puppeteer, and Playwright",
    "repository": {
      "type": "git",
@@ -13,12 +13,16 @@
    },
    "type": "module",
    "main": "lib/cli.cjs",
-   "files": [
-     "lib/**/*",
-     "package.json",
-     "LICENSE.md",
-     "README.md"
-   ],
+   "files": [
+     "lib/*.js",
+     "lib/*.cjs",
+     "lib/*.d.ts",
+     "lib/*.d.cts",
+     "lib/*.map",
+     "package.json",
+     "LICENSE.md",
+     "README.md"
+   ],
    "scripts": {
      "build": "bun run build:pre && tsup src/cli.ts src/index.ts --format esm,cjs --dts --outDir lib",
      "build:pre": "bunx puppeteer browsers install chrome",
@@ -62,5 +66,17 @@
    ],
    "bin": {
      "web-scraper": "./lib/cli.cjs"
+   },
+   "exports": {
+     ".": {
+       "require": "./lib/index.cjs",
+       "import": "./lib/index.js",
+       "types": "./lib/index.d.ts"
+     },
+     "./cli": {
+       "require": "./lib/cli.cjs",
+       "import": "./lib/cli.js",
+       "types": "./lib/cli.d.ts"
+     }
    }
  }
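
The added `exports` map gives the package explicit ESM and CJS entry points for the library root and a `./cli` subpath. Below is a minimal sketch of how Node's conditional exports are expected to resolve these entries, assuming the index entry exposes `WebScraper` as the README examples do.

```typescript
// ESM: the "import" condition maps the package root "." to ./lib/index.js.
import { WebScraper } from "enterprise-ai-recursive-web-scraper";

// CommonJS resolves the same specifier to ./lib/index.cjs via the "require" condition:
//   const { WebScraper } = require("enterprise-ai-recursive-web-scraper");

// The new "./cli" subpath maps to ./lib/cli.js or ./lib/cli.cjs, the same file the
// "web-scraper" bin entry points at, so it can also be loaded programmatically:
//   import "enterprise-ai-recursive-web-scraper/cli";

console.log(typeof WebScraper); // "function" if the entry points are wired up correctly
```
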