enterprise-ai-recursive-web-scraper 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- package/LICENSE.md +20 -0
- package/README.md +122 -0
- package/lib/cli.cjs +45382 -0
- package/lib/cli.cjs.map +1 -0
- package/lib/cli.d.cts +1 -0
- package/lib/cli.d.ts +1 -0
- package/lib/cli.js +45364 -0
- package/lib/cli.js.map +1 -0
- package/lib/index.cjs +45402 -0
- package/lib/index.cjs.map +1 -0
- package/lib/index.d.cts +1303 -0
- package/lib/index.d.ts +1303 -0
- package/lib/index.js +45373 -0
- package/lib/index.js.map +1 -0
- package/package.json +66 -0
package/LICENSE.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# MIT License
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
'Software'), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
<h1 align="center">Enterprise AI Recursive Web Scraper</h1>
|
2
|
+
|
3
|
+
<p align="center">Advanced AI-powered recursive web scraper utilizing Groq LLMs, Puppeteer, and Playwright for intelligent content extraction</p>
|
4
|
+
|
5
|
+
<p align="center">
|
6
|
+
<!-- prettier-ignore-start -->
|
7
|
+
<!-- ALL-CONTRIBUTORS-BADGE:START -->
|
8
|
+
<a href="#contributors" target="_blank"><img alt="👪 All Contributors: 1" src="https://img.shields.io/badge/%F0%9F%91%AA_all_contributors-1-21bb42.svg" /></a>
|
9
|
+
<!-- ALL-CONTRIBUTORS-BADGE:END -->
|
10
|
+
<!-- prettier-ignore-end -->
|
11
|
+
<a href="https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper/blob/main/.github/CODE_OF_CONDUCT.md" target="_blank"><img alt="🤝 Code of Conduct: Kept" src="https://img.shields.io/badge/%F0%9F%A4%9D_code_of_conduct-kept-21bb42" /></a>
|
12
|
+
<a href="https://codecov.io/gh/WomB0ComB0/enterprise-ai-recursive-web-scraper" target="_blank"><img alt="🧪 Coverage" src="https://img.shields.io/codecov/c/github/WomB0ComB0/enterprise-ai-recursive-web-scraper?label=%F0%9F%A7%AA%20coverage" /></a>
|
13
|
+
<a href="https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper/blob/main/LICENSE.md" target="_blank"><img alt="📝 License: MIT" src="https://img.shields.io/badge/%F0%9F%93%9D_license-MIT-21bb42.svg"></a>
|
14
|
+
<a href="http://npmjs.com/package/enterprise-ai-recursive-web-scraper"><img alt="📦 npm version" src="https://img.shields.io/npm/v/enterprise-ai-recursive-web-scraper?color=21bb42&label=%F0%9F%93%A6%20npm" /></a>
|
15
|
+
<img alt="💪 TypeScript: Strict" src="https://img.shields.io/badge/%F0%9F%92%AA_typescript-strict-21bb42.svg" />
|
16
|
+
</p>
|
17
|
+
|
18
|
+
## ✨ Features
|
19
|
+
|
20
|
+
* 🚀 **High Performance**: Blazing fast multi-threaded scraping with concurrent processing
|
21
|
+
* 🤖 **AI-Powered**: Intelligent content extraction using Groq LLMs
|
22
|
+
* 🌐 **Multi-Browser**: Support for Chromium, Firefox, and WebKit
|
23
|
+
* 📊 **Smart Extraction**:
|
24
|
+
- Structured data extraction without LLMs using CSS selectors
|
25
|
+
- Topic-based and semantic chunking strategies
|
26
|
+
- Cosine similarity clustering for content deduplication
|
27
|
+
* 🎯 **Advanced Capabilities**:
|
28
|
+
- Recursive domain crawling with boundary respect
|
29
|
+
- Session management for complex multi-page flows
|
30
|
+
- Custom JavaScript execution support
|
31
|
+
- Enhanced screenshot capture with lazy-load detection
|
32
|
+
- iframe content extraction
|
33
|
+
* 🔒 **Enterprise Ready**:
|
34
|
+
- Proxy support with authentication
|
35
|
+
- Custom headers and user-agent configuration
|
36
|
+
- Comprehensive error handling
|
37
|
+
- Flexible timeout management
|
38
|
+
|
39
|
+
## 🚀 Quick Start
|
40
|
+
|
41
|
+
```bash
|
42
|
+
npm i enterprise-ai-recursive-web-scraper
|
43
|
+
```
|
44
|
+
|
45
|
+
```typescript
|
46
|
+
import { WebScraper } from "enterprise-ai-recursive-web-scraper";
|
47
|
+
|
48
|
+
async function main() {
|
49
|
+
const scraper = new WebScraper({
|
50
|
+
outputDir: "scraping_output",
|
51
|
+
verbose: true
|
52
|
+
});
|
53
|
+
|
54
|
+
const results = await scraper.scrapeWebsite("https://example.com");
|
55
|
+
console.log(results);
|
56
|
+
}
|
57
|
+
|
58
|
+
main().catch(console.error);
|
59
|
+
```
|
60
|
+
|
61
|
+
## 🔧 Advanced Usage
|
62
|
+
|
63
|
+
### Structured Data Extraction
|
64
|
+
|
65
|
+
```typescript
|
66
|
+
import { WebScraper, JsonExtractionStrategy } from "enterprise-ai-recursive-web-scraper";
|
67
|
+
|
68
|
+
const schema = {
|
69
|
+
baseSelector: "article",
|
70
|
+
fields: [
|
71
|
+
{ name: "title", selector: "h1" },
|
72
|
+
{ name: "content", selector: ".content" },
|
73
|
+
{ name: "date", selector: "time", attribute: "datetime" }
|
74
|
+
]
|
75
|
+
};
|
76
|
+
|
77
|
+
const scraper = new WebScraper({
|
78
|
+
extractionStrategy: new JsonExtractionStrategy(schema)
|
79
|
+
});
|
80
|
+
```
|
81
|
+
|
82
|
+
### Custom Browser Session
|
83
|
+
|
84
|
+
```typescript
|
85
|
+
import { WebScraper } from "enterprise-ai-recursive-web-scraper";
|
86
|
+
|
87
|
+
const scraper = new WebScraper({
|
88
|
+
browserConfig: {
|
89
|
+
headless: false,
|
90
|
+
proxy: "http://proxy.example.com",
|
91
|
+
userAgent: "Custom User Agent"
|
92
|
+
}
|
93
|
+
});
|
94
|
+
```
|
95
|
+
|
96
|
+
## 🤝 Contributors
|
97
|
+
|
98
|
+
<!-- ALL-CONTRIBUTORS-LIST:START -->
|
99
|
+
<table>
|
100
|
+
<tbody>
|
101
|
+
<tr>
|
102
|
+
<td align="center" valign="top" width="14.28%">
|
103
|
+
<a href="https://www.mikeodnis.dev/">
|
104
|
+
<img src="https://avatars.githubusercontent.com/u/95197809?v=4?s=100" width="100px;" alt="Mike Odnis"/>
|
105
|
+
<br /><sub><b>Mike Odnis</b></sub>
|
106
|
+
</a>
|
107
|
+
<br />
|
108
|
+
<a href="https://github.com/WomB0ComB0/enterprise-ai-recursive-web-scraper/commits?author=WomB0ComB0" title="Code">💻</a>
|
109
|
+
<a href="#content-WomB0ComB0" title="Content">🖋</a>
|
110
|
+
<a href="#ideas-WomB0ComB0" title="Ideas">🤔</a>
|
111
|
+
<a href="#infra-WomB0ComB0" title="Infrastructure">🚇</a>
|
112
|
+
</td>
|
113
|
+
</tr>
|
114
|
+
</tbody>
|
115
|
+
</table>
|
116
|
+
<!-- ALL-CONTRIBUTORS-LIST:END -->
|
117
|
+
|
118
|
+
## 📝 License
|
119
|
+
|
120
|
+
MIT © [Mike Odnis](https://github.com/WomB0ComB0)
|
121
|
+
|
122
|
+
> 💙 Built with [`create-typescript-app`](https://github.com/JoshuaKGoldberg/create-typescript-app)
|