enterprise-ai-recursive-web-scraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +20 -0
- package/README.md +122 -0
- package/lib/cli.cjs +45382 -0
- package/lib/cli.cjs.map +1 -0
- package/lib/cli.d.cts +1 -0
- package/lib/cli.d.ts +1 -0
- package/lib/cli.js +45364 -0
- package/lib/cli.js.map +1 -0
- package/lib/index.cjs +45402 -0
- package/lib/index.cjs.map +1 -0
- package/lib/index.d.cts +1303 -0
- package/lib/index.d.ts +1303 -0
- package/lib/index.js +45373 -0
- package/lib/index.js.map +1 -0
- package/package.json +66 -0
package/LICENSE.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# MIT License
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
'Software'), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
<h1 align="center">Enterprise AI Recursive Web Scraper</h1>
|
2
|
+
|
3
|
+
<p align="center">Advanced AI-powered recursive web scraper utilizing Groq LLMs, Puppeteer, and Playwright for intelligent content extraction</p>
|
4
|
+
|
5
|
+
<p align="center">
|
6
|
+
<!-- prettier-ignore-start -->
|
7
|
+
<!-- ALL-CONTRIBUTORS-BADGE:START -->
|
8
|
+
<a href="#contributors" target="_blank"><img alt="👪 All Contributors: 1" src="https://img.shields.io/badge/%F0%9F%91%AA_all_contributors-1-21bb42.svg" /></a>
|
9
|
+
<!-- ALL-CONTRIBUTORS-BADGE:END -->
|
10
|
+
<!-- prettier-ignore-end -->
|
11
|
+
<a href="https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper/blob/main/.github/CODE_OF_CONDUCT.md" target="_blank"><img alt="🤝 Code of Conduct: Kept" src="https://img.shields.io/badge/%F0%9F%A4%9D_code_of_conduct-kept-21bb42" /></a>
|
12
|
+
<a href="https://codecov.io/gh/WomB0ComB0/enterprise-ai-recursive-web-scraper" target="_blank"><img alt="🧪 Coverage" src="https://img.shields.io/codecov/c/github/WomB0ComB0/enterprise-ai-recursive-web-scraper?label=%F0%9F%A7%AA%20coverage" /></a>
|
13
|
+
<a href="https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper/blob/main/LICENSE.md" target="_blank"><img alt="📝 License: MIT" src="https://img.shields.io/badge/%F0%9F%93%9D_license-MIT-21bb42.svg"></a>
|
14
|
+
<a href="http://npmjs.com/package/enterprise-ai-recursive-web-scraper"><img alt="📦 npm version" src="https://img.shields.io/npm/v/enterprise-ai-recursive-web-scraper?color=21bb42&label=%F0%9F%93%A6%20npm" /></a>
|
15
|
+
<img alt="💪 TypeScript: Strict" src="https://img.shields.io/badge/%F0%9F%92%AA_typescript-strict-21bb42.svg" />
|
16
|
+
</p>
|
17
|
+
|
18
|
+
## ✨ Features
|
19
|
+
|
20
|
+
* 🚀 **High Performance**: Blazing fast multi-threaded scraping with concurrent processing
|
21
|
+
* 🤖 **AI-Powered**: Intelligent content extraction using Groq LLMs
|
22
|
+
* 🌐 **Multi-Browser**: Support for Chromium, Firefox, and WebKit
|
23
|
+
* 📊 **Smart Extraction**:
|
24
|
+
- Structured data extraction without LLMs using CSS selectors
|
25
|
+
- Topic-based and semantic chunking strategies
|
26
|
+
- Cosine similarity clustering for content deduplication
|
27
|
+
* 🎯 **Advanced Capabilities**:
|
28
|
+
- Recursive domain crawling with boundary respect
|
29
|
+
- Session management for complex multi-page flows
|
30
|
+
- Custom JavaScript execution support
|
31
|
+
- Enhanced screenshot capture with lazy-load detection
|
32
|
+
- iframe content extraction
|
33
|
+
* 🔒 **Enterprise Ready**:
|
34
|
+
- Proxy support with authentication
|
35
|
+
- Custom headers and user-agent configuration
|
36
|
+
- Comprehensive error handling
|
37
|
+
- Flexible timeout management
|
38
|
+
|
39
|
+
## 🚀 Quick Start
|
40
|
+
|
41
|
+
```bash
|
42
|
+
npm i enterprise-ai-recursive-web-scraper
|
43
|
+
```
|
44
|
+
|
45
|
+
```typescript
|
46
|
+
import { WebScraper } from "enterprise-ai-recursive-web-scraper";
|
47
|
+
|
48
|
+
async function main() {
|
49
|
+
const scraper = new WebScraper({
|
50
|
+
outputDir: "scraping_output",
|
51
|
+
verbose: true
|
52
|
+
});
|
53
|
+
|
54
|
+
const results = await scraper.scrapeWebsite("https://example.com");
|
55
|
+
console.log(results);
|
56
|
+
}
|
57
|
+
|
58
|
+
main().catch(console.error);
|
59
|
+
```
|
60
|
+
|
61
|
+
## 🔧 Advanced Usage
|
62
|
+
|
63
|
+
### Structured Data Extraction
|
64
|
+
|
65
|
+
```typescript
|
66
|
+
import { WebScraper, JsonExtractionStrategy } from "enterprise-ai-recursive-web-scraper";
|
67
|
+
|
68
|
+
const schema = {
|
69
|
+
baseSelector: "article",
|
70
|
+
fields: [
|
71
|
+
{ name: "title", selector: "h1" },
|
72
|
+
{ name: "content", selector: ".content" },
|
73
|
+
{ name: "date", selector: "time", attribute: "datetime" }
|
74
|
+
]
|
75
|
+
};
|
76
|
+
|
77
|
+
const scraper = new WebScraper({
|
78
|
+
extractionStrategy: new JsonExtractionStrategy(schema)
|
79
|
+
});
|
80
|
+
```
|
81
|
+
|
82
|
+
### Custom Browser Session
|
83
|
+
|
84
|
+
```typescript
|
85
|
+
import { WebScraper } from "enterprise-ai-recursive-web-scraper";
|
86
|
+
|
87
|
+
const scraper = new WebScraper({
|
88
|
+
browserConfig: {
|
89
|
+
headless: false,
|
90
|
+
proxy: "http://proxy.example.com",
|
91
|
+
userAgent: "Custom User Agent"
|
92
|
+
}
|
93
|
+
});
|
94
|
+
```
|
95
|
+
|
96
|
+
## 🤝 Contributors
|
97
|
+
|
98
|
+
<!-- ALL-CONTRIBUTORS-LIST:START -->
|
99
|
+
<table>
|
100
|
+
<tbody>
|
101
|
+
<tr>
|
102
|
+
<td align="center" valign="top" width="14.28%">
|
103
|
+
<a href="https://www.mikeodnis.dev/">
|
104
|
+
<img src="https://avatars.githubusercontent.com/u/95197809?v=4?s=100" width="100px;" alt="Mike Odnis"/>
|
105
|
+
<br /><sub><b>Mike Odnis</b></sub>
|
106
|
+
</a>
|
107
|
+
<br />
|
108
|
+
<a href="https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper/commits?author=WomB0ComB0" title="Code">💻</a>
|
109
|
+
<a href="#content-WomB0ComB0" title="Content">🖋</a>
|
110
|
+
<a href="#ideas-WomB0ComB0" title="Ideas">🤔</a>
|
111
|
+
<a href="#infra-WomB0ComB0" title="Infrastructure">🚇</a>
|
112
|
+
</td>
|
113
|
+
</tr>
|
114
|
+
</tbody>
|
115
|
+
</table>
|
116
|
+
<!-- ALL-CONTRIBUTORS-LIST:END -->
|
117
|
+
|
118
|
+
## 📝 License
|
119
|
+
|
120
|
+
MIT © [Mike Odnis](https://github.com/WomB0ComB0)
|
121
|
+
|
122
|
+
> 💙 Built with [`create-typescript-app`](https://github.com/JoshuaKGoldberg/create-typescript-app)
|