ag-webscrape 0.0.14 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +206 -206
- package/dist/WebScraper.d.ts.map +1 -1
- package/dist/WebScraper.js +7 -7
- package/dist/WebScraper.js.map +1 -1
- package/dist/helpers/dom.d.ts.map +1 -1
- package/dist/helpers/dom.js +64 -13
- package/dist/helpers/dom.js.map +1 -1
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -1,207 +1,207 @@
|
|
|
1
|
-
# ag-webscrape
|
|
2
|
-
|
|
3
|
-
A TypeScript web scraper with intelligent fallback strategy. Attempts direct HTTP fetching first, then falls back to Playwright for anti-scraping protection.
|
|
4
|
-
|
|
5
|
-
## Features
|
|
6
|
-
|
|
7
|
-
- **Dual Strategy**: Direct fetch first, Playwright fallback
|
|
8
|
-
- **Anti-Scraping Detection**: Automatically detects and bypasses common anti-scraping measures
|
|
9
|
-
- **Persistent Browser**: Maintains browser instance for faster subsequent scrapes
|
|
10
|
-
- **Error Handling**: Comprehensive error detection for 4xx/5xx responses
|
|
11
|
-
- **TypeScript Support**: Full type safety and IntelliSense
|
|
12
|
-
- **Configurable**: Extensive customization options
|
|
13
|
-
|
|
14
|
-
## Installation
|
|
15
|
-
|
|
16
|
-
```bash
|
|
17
|
-
npm install ag-webscrape
|
|
18
|
-
```
|
|
19
|
-
|
|
20
|
-
## Quick Start
|
|
21
|
-
|
|
22
|
-
```typescript
|
|
23
|
-
import { WebScraper } from 'ag-webscrape';
|
|
24
|
-
|
|
25
|
-
const scraper = new WebScraper();
|
|
26
|
-
|
|
27
|
-
// Scrape a single URL
|
|
28
|
-
const result = await scraper.scrape('https://example.com');
|
|
29
|
-
console.log(result.html);
|
|
30
|
-
|
|
31
|
-
// Clean up when done
|
|
32
|
-
await scraper.dispose();
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
## API Reference
|
|
36
|
-
|
|
37
|
-
### WebScraper Class
|
|
38
|
-
|
|
39
|
-
#### Constructor
|
|
40
|
-
|
|
41
|
-
```typescript
|
|
42
|
-
new WebScraper(options?: ScrapingOptions)
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
#### Options
|
|
46
|
-
|
|
47
|
-
```typescript
|
|
48
|
-
interface ScrapingOptions {
|
|
49
|
-
timeout?: number; // Request timeout in ms (default: 30000)
|
|
50
|
-
userAgent?: string; // Custom user agent
|
|
51
|
-
headers?: Record<string, string>; // Additional headers
|
|
52
|
-
retries?: number; // Number of retries (default: 3)
|
|
53
|
-
waitForSelector?: string; // CSS selector to wait for
|
|
54
|
-
waitForTimeout?: number; // Time to wait in ms (default: 5000)
|
|
55
|
-
}
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
#### Methods
|
|
59
|
-
|
|
60
|
-
##### `scrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>`
|
|
61
|
-
|
|
62
|
-
Scrapes a single URL with fallback strategy.
|
|
63
|
-
|
|
64
|
-
```typescript
|
|
65
|
-
const result = await scraper.scrape('https://example.com', {
|
|
66
|
-
timeout: 60000,
|
|
67
|
-
waitForSelector: '.main-content'
|
|
68
|
-
});
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
##### `scrapeMultiple(urls: string[], options?: ScrapingOptions): Promise<ScrapingResult[]>`
|
|
72
|
-
|
|
73
|
-
Scrapes multiple URLs efficiently.
|
|
74
|
-
|
|
75
|
-
```typescript
|
|
76
|
-
const results = await scraper.scrapeMultiple([
|
|
77
|
-
'https://example1.com',
|
|
78
|
-
'https://example2.com'
|
|
79
|
-
]);
|
|
80
|
-
```
|
|
81
|
-
|
|
82
|
-
##### `dispose(): Promise<void>`
|
|
83
|
-
|
|
84
|
-
Cleans up browser resources. Always call this when done.
|
|
85
|
-
|
|
86
|
-
```typescript
|
|
87
|
-
await scraper.dispose();
|
|
88
|
-
```
|
|
89
|
-
|
|
90
|
-
#### Result Object
|
|
91
|
-
|
|
92
|
-
```typescript
|
|
93
|
-
interface ScrapingResult {
|
|
94
|
-
url: string; // Original URL
|
|
95
|
-
html: string; // HTML content
|
|
96
|
-
status: number; // HTTP status code
|
|
97
|
-
method: 'fetch' | 'playwright'; // Method used
|
|
98
|
-
error?: string; // Error message if any
|
|
99
|
-
redirected?: boolean; // Whether request was redirected
|
|
100
|
-
finalUrl?: string; // Final URL after redirects
|
|
101
|
-
}
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## Advanced Usage
|
|
105
|
-
|
|
106
|
-
### Custom Headers and User Agent
|
|
107
|
-
|
|
108
|
-
```typescript
|
|
109
|
-
const scraper = new WebScraper({
|
|
110
|
-
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
111
|
-
headers: {
|
|
112
|
-
'Accept': 'text/html,application/xhtml+xml',
|
|
113
|
-
'Accept-Language': 'en-US,en;q=0.9'
|
|
114
|
-
}
|
|
115
|
-
});
|
|
116
|
-
```
|
|
117
|
-
|
|
118
|
-
### Waiting for Content
|
|
119
|
-
|
|
120
|
-
```typescript
|
|
121
|
-
// Wait for specific element
|
|
122
|
-
const result = await scraper.scrape('https://spa-app.com', {
|
|
123
|
-
waitForSelector: '.dynamic-content'
|
|
124
|
-
});
|
|
125
|
-
|
|
126
|
-
// Wait for specific time
|
|
127
|
-
const result = await scraper.scrape('https://slow-app.com', {
|
|
128
|
-
waitForTimeout: 10000
|
|
129
|
-
});
|
|
130
|
-
```
|
|
131
|
-
|
|
132
|
-
### Error Handling
|
|
133
|
-
|
|
134
|
-
```typescript
|
|
135
|
-
const result = await scraper.scrape('https://example.com');
|
|
136
|
-
|
|
137
|
-
if (result.error) {
|
|
138
|
-
console.error('Scraping failed:', result.error);
|
|
139
|
-
} else {
|
|
140
|
-
console.log('Success:', result.html.length, 'characters');
|
|
141
|
-
}
|
|
142
|
-
```
|
|
143
|
-
|
|
144
|
-
### Batch Scraping
|
|
145
|
-
|
|
146
|
-
```typescript
|
|
147
|
-
const urls = [
|
|
148
|
-
'https://news.site.com/article1',
|
|
149
|
-
'https://news.site.com/article2',
|
|
150
|
-
'https://news.site.com/article3'
|
|
151
|
-
];
|
|
152
|
-
|
|
153
|
-
const results = await scraper.scrapeMultiple(urls, {
|
|
154
|
-
waitForSelector: '.article-content'
|
|
155
|
-
});
|
|
156
|
-
|
|
157
|
-
results.forEach((result, index) => {
|
|
158
|
-
if (!result.error) {
|
|
159
|
-
console.log(`Article ${index + 1}: ${result.html.length} chars`);
|
|
160
|
-
}
|
|
161
|
-
});
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
## How It Works
|
|
165
|
-
|
|
166
|
-
1. **Direct Fetch**: First attempts HTTP request using `node-fetch`
|
|
167
|
-
2. **Anti-Scraping Detection**: Checks response for common anti-scraping patterns
|
|
168
|
-
3. **Playwright Fallback**: If direct fetch fails or anti-scraping detected, uses Playwright
|
|
169
|
-
4. **Error Detection**: Monitors for 4xx/5xx responses in both methods
|
|
170
|
-
5. **Resource Management**: Maintains browser instance for performance
|
|
171
|
-
|
|
172
|
-
## Anti-Scraping Protection
|
|
173
|
-
|
|
174
|
-
The scraper automatically detects and handles:
|
|
175
|
-
|
|
176
|
-
- Cloudflare protection
|
|
177
|
-
- DistilNetworks
|
|
178
|
-
- PerimeterX
|
|
179
|
-
- DataDome
|
|
180
|
-
- Akamai Bot Manager
|
|
181
|
-
- CAPTCHA challenges
|
|
182
|
-
- JavaScript requirement checks
|
|
183
|
-
- Rate limiting
|
|
184
|
-
- Access denied pages
|
|
185
|
-
|
|
186
|
-
## Performance
|
|
187
|
-
|
|
188
|
-
- **Fast**: Direct fetch for simple pages
|
|
189
|
-
- **Efficient**: Reuses browser instance
|
|
190
|
-
- **Robust**: Fallback ensures high success rate
|
|
191
|
-
- **Intelligent**: Only uses Playwright when necessary
|
|
192
|
-
|
|
193
|
-
## Examples
|
|
194
|
-
|
|
195
|
-
Check out the `src/example.ts` file for complete usage examples.
|
|
196
|
-
|
|
197
|
-
## License
|
|
198
|
-
|
|
199
|
-
MIT
|
|
200
|
-
|
|
201
|
-
## Contributing
|
|
202
|
-
|
|
203
|
-
Pull requests welcome! Please ensure TypeScript compilation and tests pass.
|
|
204
|
-
|
|
205
|
-
## Support
|
|
206
|
-
|
|
1
|
+
# ag-webscrape
|
|
2
|
+
|
|
3
|
+
A TypeScript web scraper with intelligent fallback strategy. Attempts direct HTTP fetching first, then falls back to Playwright for anti-scraping protection.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Dual Strategy**: Direct fetch first, Playwright fallback
|
|
8
|
+
- **Anti-Scraping Detection**: Automatically detects and bypasses common anti-scraping measures
|
|
9
|
+
- **Persistent Browser**: Maintains browser instance for faster subsequent scrapes
|
|
10
|
+
- **Error Handling**: Comprehensive error detection for 4xx/5xx responses
|
|
11
|
+
- **TypeScript Support**: Full type safety and IntelliSense
|
|
12
|
+
- **Configurable**: Extensive customization options
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
npm install ag-webscrape
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
```typescript
|
|
23
|
+
import { WebScraper } from 'ag-webscrape';
|
|
24
|
+
|
|
25
|
+
const scraper = new WebScraper();
|
|
26
|
+
|
|
27
|
+
// Scrape a single URL
|
|
28
|
+
const result = await scraper.scrape('https://example.com');
|
|
29
|
+
console.log(result.html);
|
|
30
|
+
|
|
31
|
+
// Clean up when done
|
|
32
|
+
await scraper.dispose();
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## API Reference
|
|
36
|
+
|
|
37
|
+
### WebScraper Class
|
|
38
|
+
|
|
39
|
+
#### Constructor
|
|
40
|
+
|
|
41
|
+
```typescript
|
|
42
|
+
new WebScraper(options?: ScrapingOptions)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
#### Options
|
|
46
|
+
|
|
47
|
+
```typescript
|
|
48
|
+
interface ScrapingOptions {
|
|
49
|
+
timeout?: number; // Request timeout in ms (default: 30000)
|
|
50
|
+
userAgent?: string; // Custom user agent
|
|
51
|
+
headers?: Record<string, string>; // Additional headers
|
|
52
|
+
retries?: number; // Number of retries (default: 3)
|
|
53
|
+
waitForSelector?: string; // CSS selector to wait for
|
|
54
|
+
waitForTimeout?: number; // Time to wait in ms (default: 5000)
|
|
55
|
+
}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
#### Methods
|
|
59
|
+
|
|
60
|
+
##### `scrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>`
|
|
61
|
+
|
|
62
|
+
Scrapes a single URL with fallback strategy.
|
|
63
|
+
|
|
64
|
+
```typescript
|
|
65
|
+
const result = await scraper.scrape('https://example.com', {
|
|
66
|
+
timeout: 60000,
|
|
67
|
+
waitForSelector: '.main-content'
|
|
68
|
+
});
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
##### `scrapeMultiple(urls: string[], options?: ScrapingOptions): Promise<ScrapingResult[]>`
|
|
72
|
+
|
|
73
|
+
Scrapes multiple URLs efficiently.
|
|
74
|
+
|
|
75
|
+
```typescript
|
|
76
|
+
const results = await scraper.scrapeMultiple([
|
|
77
|
+
'https://example1.com',
|
|
78
|
+
'https://example2.com'
|
|
79
|
+
]);
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
##### `dispose(): Promise<void>`
|
|
83
|
+
|
|
84
|
+
Cleans up browser resources. Always call this when done.
|
|
85
|
+
|
|
86
|
+
```typescript
|
|
87
|
+
await scraper.dispose();
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
#### Result Object
|
|
91
|
+
|
|
92
|
+
```typescript
|
|
93
|
+
interface ScrapingResult {
|
|
94
|
+
url: string; // Original URL
|
|
95
|
+
html: string; // HTML content
|
|
96
|
+
status: number; // HTTP status code
|
|
97
|
+
method: 'fetch' | 'playwright'; // Method used
|
|
98
|
+
error?: string; // Error message if any
|
|
99
|
+
redirected?: boolean; // Whether request was redirected
|
|
100
|
+
finalUrl?: string; // Final URL after redirects
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Advanced Usage
|
|
105
|
+
|
|
106
|
+
### Custom Headers and User Agent
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
const scraper = new WebScraper({
|
|
110
|
+
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
111
|
+
headers: {
|
|
112
|
+
'Accept': 'text/html,application/xhtml+xml',
|
|
113
|
+
'Accept-Language': 'en-US,en;q=0.9'
|
|
114
|
+
}
|
|
115
|
+
});
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Waiting for Content
|
|
119
|
+
|
|
120
|
+
```typescript
|
|
121
|
+
// Wait for specific element
|
|
122
|
+
const result = await scraper.scrape('https://spa-app.com', {
|
|
123
|
+
waitForSelector: '.dynamic-content'
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
// Wait for specific time
|
|
127
|
+
const result = await scraper.scrape('https://slow-app.com', {
|
|
128
|
+
waitForTimeout: 10000
|
|
129
|
+
});
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Error Handling
|
|
133
|
+
|
|
134
|
+
```typescript
|
|
135
|
+
const result = await scraper.scrape('https://example.com');
|
|
136
|
+
|
|
137
|
+
if (result.error) {
|
|
138
|
+
console.error('Scraping failed:', result.error);
|
|
139
|
+
} else {
|
|
140
|
+
console.log('Success:', result.html.length, 'characters');
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Batch Scraping
|
|
145
|
+
|
|
146
|
+
```typescript
|
|
147
|
+
const urls = [
|
|
148
|
+
'https://news.site.com/article1',
|
|
149
|
+
'https://news.site.com/article2',
|
|
150
|
+
'https://news.site.com/article3'
|
|
151
|
+
];
|
|
152
|
+
|
|
153
|
+
const results = await scraper.scrapeMultiple(urls, {
|
|
154
|
+
waitForSelector: '.article-content'
|
|
155
|
+
});
|
|
156
|
+
|
|
157
|
+
results.forEach((result, index) => {
|
|
158
|
+
if (!result.error) {
|
|
159
|
+
console.log(`Article ${index + 1}: ${result.html.length} chars`);
|
|
160
|
+
}
|
|
161
|
+
});
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## How It Works
|
|
165
|
+
|
|
166
|
+
1. **Direct Fetch**: First attempts HTTP request using `node-fetch`
|
|
167
|
+
2. **Anti-Scraping Detection**: Checks response for common anti-scraping patterns
|
|
168
|
+
3. **Playwright Fallback**: If direct fetch fails or anti-scraping detected, uses Playwright
|
|
169
|
+
4. **Error Detection**: Monitors for 4xx/5xx responses in both methods
|
|
170
|
+
5. **Resource Management**: Maintains browser instance for performance
|
|
171
|
+
|
|
172
|
+
## Anti-Scraping Protection
|
|
173
|
+
|
|
174
|
+
The scraper automatically detects and handles:
|
|
175
|
+
|
|
176
|
+
- Cloudflare protection
|
|
177
|
+
- DistilNetworks
|
|
178
|
+
- PerimeterX
|
|
179
|
+
- DataDome
|
|
180
|
+
- Akamai Bot Manager
|
|
181
|
+
- CAPTCHA challenges
|
|
182
|
+
- JavaScript requirement checks
|
|
183
|
+
- Rate limiting
|
|
184
|
+
- Access denied pages
|
|
185
|
+
|
|
186
|
+
## Performance
|
|
187
|
+
|
|
188
|
+
- **Fast**: Direct fetch for simple pages
|
|
189
|
+
- **Efficient**: Reuses browser instance
|
|
190
|
+
- **Robust**: Fallback ensures high success rate
|
|
191
|
+
- **Intelligent**: Only uses Playwright when necessary
|
|
192
|
+
|
|
193
|
+
## Examples
|
|
194
|
+
|
|
195
|
+
Check out the `src/example.ts` file for complete usage examples.
|
|
196
|
+
|
|
197
|
+
## License
|
|
198
|
+
|
|
199
|
+
MIT
|
|
200
|
+
|
|
201
|
+
## Contributing
|
|
202
|
+
|
|
203
|
+
Pull requests welcome! Please ensure TypeScript compilation and tests pass.
|
|
204
|
+
|
|
205
|
+
## Support
|
|
206
|
+
|
|
207
207
|
For issues and questions, please use the GitHub issue tracker.
|
package/dist/WebScraper.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"WebScraper.d.ts","sourceRoot":"","sources":["../src/WebScraper.ts"],"names":[],"mappings":"AAKA,MAAM,WAAW,eAAe;IAC9B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,cAAc;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,GAAG,QAAQ,CAAC;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,qBAAa,UAAU;IACrB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,cAAc,CAAkB;gBAE5B,OAAO,GAAE,eAAoB;YAc3B,aAAa;
|
|
1
|
+
{"version":3,"file":"WebScraper.d.ts","sourceRoot":"","sources":["../src/WebScraper.ts"],"names":[],"mappings":"AAKA,MAAM,WAAW,eAAe;IAC9B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,cAAc;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,GAAG,QAAQ,CAAC;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,qBAAa,UAAU;IACrB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,cAAc,CAAkB;gBAE5B,OAAO,GAAE,eAAoB;YAc3B,aAAa;YAkDb,mBAAmB;IAqE3B,MAAM,CACV,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC;IAsDpB,cAAc,CAClB,IAAI,EAAE,MAAM,EAAE,EACd,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,EAAE,CAAC;IAwBtB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B"}
|
package/dist/WebScraper.js
CHANGED
|
@@ -38,6 +38,7 @@ class WebScraper {
|
|
|
38
38
|
url,
|
|
39
39
|
html,
|
|
40
40
|
status: response.status,
|
|
41
|
+
error: response.status === 200 ? undefined : response.status.toString(),
|
|
41
42
|
method: 'fetch',
|
|
42
43
|
redirected: response.redirected,
|
|
43
44
|
finalUrl: response.url,
|
|
@@ -62,9 +63,8 @@ class WebScraper {
|
|
|
62
63
|
html = pageResult.html.outerHTML;
|
|
63
64
|
status = pageResult.status;
|
|
64
65
|
finalUrl = pageResult.url;
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
}
|
|
66
|
+
error =
|
|
67
|
+
status === 200 ? undefined : `HTTP ${status}: ${pageResult.statusText}`;
|
|
68
68
|
return {
|
|
69
69
|
url,
|
|
70
70
|
html,
|
|
@@ -98,7 +98,7 @@ class WebScraper {
|
|
|
98
98
|
html: '',
|
|
99
99
|
status,
|
|
100
100
|
method: 'visual',
|
|
101
|
-
error: errorMessage,
|
|
101
|
+
error: errorMessage || 'err',
|
|
102
102
|
finalUrl,
|
|
103
103
|
};
|
|
104
104
|
}
|
|
@@ -123,9 +123,9 @@ class WebScraper {
|
|
|
123
123
|
(0, log_1.info)(`Direct fetch failed or anti-scraping detected for ${url}: ${lastError.message}. Falling back to puppeteer.`);
|
|
124
124
|
}
|
|
125
125
|
try {
|
|
126
|
-
const
|
|
127
|
-
(0, log_1.debug)(`Puppeteer scrape successful for ${url}.`, JSON.stringify(
|
|
128
|
-
return
|
|
126
|
+
const resultP = await this.scrapeWithpuppeteer(url, mergedOptions);
|
|
127
|
+
(0, log_1.debug)(`Puppeteer scrape successful for ${url}.`, JSON.stringify(resultP, null, 2));
|
|
128
|
+
return resultP;
|
|
129
129
|
}
|
|
130
130
|
catch (error) {
|
|
131
131
|
const puppeteerError = error instanceof Error ? error : new Error('Unknown puppeteer error');
|
package/dist/WebScraper.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"WebScraper.js","sourceRoot":"","sources":["../src/WebScraper.ts"],"names":[],"mappings":";;;AACA,2DAAsE;AAEtE,uCAAwE;AAsBxE,MAAa,UAAU;IAIrB,YAAY,UAA2B,EAAE;QACvC,IAAI,CAAC,SAAS;YACZ,iHAAiH,CAAC;QACpH,IAAI,CAAC,cAAc,GAAG;YACpB,OAAO,EAAE,KAAK;YACd,OAAO,EAAE,CAAC;YACV,cAAc,EAAE,IAAI;YACpB,GAAG,OAAO;SACX,CAAC;IACJ,CAAC;IAKO,KAAK,CAAC,aAAa,CACzB,GAAW,EACX,OAAwB;QAExB,MAAM,OAAO,GAAG;YACd,YAAY,EAAE,OAAO,CAAC,SAAS,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE;YAC5D,MAAM,EACJ,4EAA4E;YAC9E,iBAAiB,EAAE,gBAAgB;YACnC,iBAAiB,EAAE,eAAe;YAClC,UAAU,EAAE,YAAY;YACxB,2BAA2B,EAAE,GAAG;YAChC,GAAG,OAAO,CAAC,OAAO;SACnB,CAAC;QAEF,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,SAAS,GAAG,UAAU,CAC1B,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EACxB,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,CAAC,OAAQ,CAChD,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO;gBACP,MAAM,EAAE,UAAU,CAAC,MAAM;gBACzB,QAAQ,EAAE,QAAQ;aACnB,CAAC,CAAC;YAEH,YAAY,CAAC,SAAS,CAAC,CAAC;YAExB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,OAAO;gBACL,GAAG;gBACH,IAAI;gBACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;gBACvB,MAAM,EAAE,OAAO;gBACf,UAAU,EAAE,QAAQ,CAAC,UAAU;gBAC/B,QAAQ,EAAE,QAAQ,CAAC,GAAG;aACvB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,YAAY,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAKO,KAAK,CAAC,mBAAmB,CAC/B,GAAW,EACX,OAAwB;QAExB,IAAI,IAAI,GAAG,EAAE,CAAC;QACd,IAAI,MAAM,GAAG,GAAG,CAAC;QACjB,IAAI,KAAyB,CAAC;QAC9B,IAAI,QAAQ,GAAG,GAAG,CAAC;QAEnB,IAAI,CAAC;YAEH,MAAM,UAAU,GAAe,MAAM,IAAA,cAAQ,EAAC,GAAG,EAAE;gBACjD,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,CAAC,OAAO;gBACvD,iBAAiB,EAAE,OAAO,CAAC,eAAe;gBAC1C,cAAc,EAAE,OAAO,CAAC,cAAc;aACvC,CAAC,CAAC;YAGH,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC;YACjC,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC;YAC3B,QAAQ,GAAG,UAAU,CAAC,GAAG,CAAC;
|
|
1
|
+
{"version":3,"file":"WebScraper.js","sourceRoot":"","sources":["../src/WebScraper.ts"],"names":[],"mappings":";;;AACA,2DAAsE;AAEtE,uCAAwE;AAsBxE,MAAa,UAAU;IAIrB,YAAY,UAA2B,EAAE;QACvC,IAAI,CAAC,SAAS;YACZ,iHAAiH,CAAC;QACpH,IAAI,CAAC,cAAc,GAAG;YACpB,OAAO,EAAE,KAAK;YACd,OAAO,EAAE,CAAC;YACV,cAAc,EAAE,IAAI;YACpB,GAAG,OAAO;SACX,CAAC;IACJ,CAAC;IAKO,KAAK,CAAC,aAAa,CACzB,GAAW,EACX,OAAwB;QAExB,MAAM,OAAO,GAAG;YACd,YAAY,EAAE,OAAO,CAAC,SAAS,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE;YAC5D,MAAM,EACJ,4EAA4E;YAC9E,iBAAiB,EAAE,gBAAgB;YACnC,iBAAiB,EAAE,eAAe;YAClC,UAAU,EAAE,YAAY;YACxB,2BAA2B,EAAE,GAAG;YAChC,GAAG,OAAO,CAAC,OAAO;SACnB,CAAC;QAEF,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,SAAS,GAAG,UAAU,CAC1B,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EACxB,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,CAAC,OAAQ,CAChD,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO;gBACP,MAAM,EAAE,UAAU,CAAC,MAAM;gBACzB,QAAQ,EAAE,QAAQ;aACnB,CAAC,CAAC;YAEH,YAAY,CAAC,SAAS,CAAC,CAAC;YAExB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,OAAO;gBACL,GAAG;gBACH,IAAI;gBACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;gBACvB,KAAK,EAAE,QAAQ,CAAC,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,EAAE;gBACvE,MAAM,EAAE,OAAO;gBACf,UAAU,EAAE,QAAQ,CAAC,UAAU;gBAC/B,QAAQ,EAAE,QAAQ,CAAC,GAAG;aACvB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,YAAY,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAKO,KAAK,CAAC,mBAAmB,CAC/B,GAAW,EACX,OAAwB;QAExB,IAAI,IAAI,GAAG,EAAE,CAAC;QACd,IAAI,MAAM,GAAG,GAAG,CAAC;QACjB,IAAI,KAAyB,CAAC;QAC9B,IAAI,QAAQ,GAAG,GAAG,CAAC;QAEnB,IAAI,CAAC;YAEH,MAAM,UAAU,GAAe,MAAM,IAAA,cAAQ,EAAC,GAAG,EAAE;gBACjD,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,CAAC,OAAO;gBACvD,iBAAiB,EAAE,OAAO,CAAC,eAAe;gBAC1C,cAAc,EAAE,OAAO,CAAC,cAAc;aACvC,CAAC,CAAC;YAGH,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC;YACjC,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC;YAC3B,QAAQ,GAAG,UAAU,CAAC,GAAG,CAAC;YAE1B,KAAK;gBACH,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,MAAM,KAAK,UAAU,CAAC,UAAU,EAAE,CAAC;YAE1E,OAAO;gBACL,GAAG;gBACH,IAAI;gBACJ,MAAM;gBACN,MAAM,EAAE,QAAQ;gBAChB,KAAK;gBACL,QAAQ;aACT,CAAC;QACJ,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,YAAY,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;YAG1E,IAAI,YAAY,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBACrC,MAAM,GAAG,GAAG,CAAC;YACf,CAAC;iBAAM,IACL,YAAY,CAAC,QAAQ,CAAC,KAAK,CAAC;gBAC5B,YAAY,CAAC,QAAQ,CAAC,WAAW,CAAC,EAClC,CAAC;gBACD,MAAM,GAAG,GAAG,CAAC;YACf,CAAC;iBAAM,IACL,YAAY,CAAC,QAAQ,CAAC,KAAK,CAAC;gBAC5B,YAAY,CAAC,QAAQ,CAAC,WAAW,CAAC,EAClC,CAAC;gBACD,MAAM,GAAG,GAAG,CAAC;YACf,CAAC;iBAAM,IAAI,YAAY,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;gBACxC,MAAM,GAAG,GAAG,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,MAAM,GAAG,CAAC,CAAC;YACb,CAAC;YAED,OAAO;gBACL,GAAG;gBACH,IAAI,EAAE,EAAE;gBACR,MAAM;gBACN,MAAM,EAAE,QAAQ;gBAChB,KAAK,EAAE,YAAY,IAAI,KAAK;gBAC5B,QAAQ;aACT,CAAC;QACJ,CAAC;IACH,CAAC;IAKD,KAAK,CAAC,MAAM,CACV,GAAW,EACX,UAA2B,EAAE;QAE7B,MAAM,aAAa,GAAG,EAAE,GAAG,IAAI,CAAC,cAAc,EAAE,GAAG,OAAO,EAAE,CAAC;QAC7D,IAAI,SAAS,GAAiB,IAAI,CAAC;QAGnC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,GAAG,EAAE,aAAa,CAAC,CAAC;YAG5D,IAAI,MAAM,CAAC,MAAM,IAAI,GAAG,IAAI,MAAM,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;gBAChD,IAAA,UAAI,EAAC,WAAW,EAAE,GAAG,CAAC,CAAC;gBACvB,OAAO,MAAM,CAAC;YAChB,CAAC;YAGD,IAAI,MAAM,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;gBAC1B,IAAA,UAAI,EAAC,SAAS,MAAM,CAAC,MAAM,SAAS,EAAE,GAAG,CAAC,CAAC;gBAC3C,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,SAAS;gBACP,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;YACpE,IAAA,UAAI,EACF,qDAAqD,GAAG,KAAK,SAAS,CAAC,OAAO,8BAA8B,CAC7G,CAAC;QACJ,CAAC;QAGD,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,mBAAmB,CAAC,GAAG,EAAE,aAAa,CAAC,CAAC;YACnE,IAAA,WAAK,EACH,mCAAmC,GAAG,GAAG,EACzC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CACjC,CAAC;YACF,OAAO,OAAO,CAAC;QACjB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,cAAc,GAClB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;YAExE,MAAM,CAAC,GAAG,+BAA+B,SAAS,EAAE,OAAO,IAAI,SAAS,gBAAgB,cAAc,CAAC,OAAO,SAAU,KAAe,CAAC,OAAO,EAAE,CAAC;YAClJ,IAAA,UAAI,EAAC,CAAC,CAAC,CAAC;YACR,OAAO;gBACL,GAAG;gBACH,IAAI,EAAE,EAAE;gBACR,MAAM,EAAE,CAAC;gBACT,MAAM,EAAE,QAAQ;gBAChB,KAAK,EAAE,CAAC;aACT,CAAC;QACJ,CAAC;IACH,CAAC;IAKD,KAAK,CAAC,cAAc,CAClB,IAAc,EACd,UAA2B,EAAE;QAE7B,MAAM,OAAO,GAAqB,EAAE,CAAC;QAErC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;gBAC/C,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC;oBACX,GAAG;oBACH,IAAI,EAAE,EAAE;oBACR,MAAM,EAAE,CAAC;oBACT,MAAM,EAAE,OAAO;oBACf,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe;iBAChE,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAKD,KAAK,CAAC,OAAO;QACX,MAAM,IAAA,kBAAY,GAAE,CAAC;IACvB,CAAC;CACF;AAhOD,gCAgOC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dom.d.ts","sourceRoot":"","sources":["../../src/helpers/dom.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAOpD,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,WAAW,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjC;
|
|
1
|
+
{"version":3,"file":"dom.d.ts","sourceRoot":"","sources":["../../src/helpers/dom.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAOpD,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,WAAW,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjC;AAgFD,eAAO,MAAM,aAAa,GAAU,iBAAiB,MAAM,kBAuC1D,CAAC;AAEF,eAAO,MAAM,YAAY,qBAUxB,CAAC;AAEF,eAAO,MAAM,QAAQ,GACnB,KAAK,MAAM,GAAG,GAAG,EACjB,MAAM;IAEJ,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB,KACA,OAAO,CAAC,UAAU,CAuGpB,CAAC"}
|
package/dist/helpers/dom.js
CHANGED
|
@@ -10,23 +10,74 @@ const fs_1 = require("fs");
|
|
|
10
10
|
const node_html_parser_1 = require("node-html-parser");
|
|
11
11
|
const puppeteer_core_1 = require("puppeteer-core");
|
|
12
12
|
let browser;
|
|
13
|
+
const ENV_EXECUTABLE_PATH_KEYS = [
|
|
14
|
+
'CHROME_EXECUTABLE_PATH',
|
|
15
|
+
'PUPPETEER_EXECUTABLE_PATH',
|
|
16
|
+
'BROWSER_EXECUTABLE_PATH',
|
|
17
|
+
];
|
|
18
|
+
function getPlatformChromePaths() {
|
|
19
|
+
switch (process.platform) {
|
|
20
|
+
case 'win32':
|
|
21
|
+
return [
|
|
22
|
+
'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
|
|
23
|
+
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
|
|
24
|
+
'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
|
|
25
|
+
'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
|
|
26
|
+
];
|
|
27
|
+
case 'darwin':
|
|
28
|
+
return [
|
|
29
|
+
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
|
30
|
+
'/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge',
|
|
31
|
+
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
|
32
|
+
];
|
|
33
|
+
case 'linux':
|
|
34
|
+
return [
|
|
35
|
+
'/usr/bin/google-chrome',
|
|
36
|
+
'/usr/bin/google-chrome-stable',
|
|
37
|
+
'/usr/bin/chromium',
|
|
38
|
+
'/usr/bin/chromium-browser',
|
|
39
|
+
'/snap/bin/chromium',
|
|
40
|
+
];
|
|
41
|
+
case 'aix':
|
|
42
|
+
case 'android':
|
|
43
|
+
case 'freebsd':
|
|
44
|
+
case 'haiku':
|
|
45
|
+
case 'openbsd':
|
|
46
|
+
case 'sunos':
|
|
47
|
+
case 'cygwin':
|
|
48
|
+
case 'netbsd':
|
|
49
|
+
return [];
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
const pathExists = (path) => {
|
|
53
|
+
try {
|
|
54
|
+
(0, fs_1.accessSync)(path);
|
|
55
|
+
return true;
|
|
56
|
+
}
|
|
57
|
+
catch {
|
|
58
|
+
return false;
|
|
59
|
+
}
|
|
60
|
+
};
|
|
13
61
|
const getSystemChromePath = async () => {
|
|
14
|
-
const
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
try {
|
|
23
|
-
(0, fs_1.accessSync)(path);
|
|
24
|
-
return path;
|
|
62
|
+
for (const key of ENV_EXECUTABLE_PATH_KEYS) {
|
|
63
|
+
const value = process.env[key];
|
|
64
|
+
if (!value) {
|
|
65
|
+
continue;
|
|
66
|
+
}
|
|
67
|
+
if (pathExists(value)) {
|
|
68
|
+
(0, log_1.info)(`using browser executable from ${key}: ${value}`);
|
|
69
|
+
return value;
|
|
25
70
|
}
|
|
26
|
-
|
|
71
|
+
(0, log_1.info)(`browser executable from ${key} not found: ${value}`);
|
|
72
|
+
}
|
|
73
|
+
for (const path of getPlatformChromePaths()) {
|
|
74
|
+
if (pathExists(path)) {
|
|
75
|
+
(0, log_1.info)(`using detected browser executable: ${path}`);
|
|
76
|
+
return path;
|
|
27
77
|
}
|
|
28
78
|
}
|
|
29
79
|
const ret = await chromium_1.default.executablePath();
|
|
80
|
+
(0, log_1.info)(`using sparticuz chromium executable: ${ret}`);
|
|
30
81
|
return ret;
|
|
31
82
|
};
|
|
32
83
|
const launchBrowser = async (executablePath) => {
|
|
@@ -117,7 +168,7 @@ const goToPage = async (url, opt) => {
|
|
|
117
168
|
}
|
|
118
169
|
const content = await page.content();
|
|
119
170
|
const doc = (0, node_html_parser_1.parse)(content);
|
|
120
|
-
doc.querySelectorAll('.visually-hidden')
|
|
171
|
+
doc.querySelectorAll('.visually-hidden').forEach((n) => n.remove());
|
|
121
172
|
await page.close();
|
|
122
173
|
const result = {
|
|
123
174
|
html: doc,
|
package/dist/helpers/dom.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dom.js","sourceRoot":"","sources":["../../src/helpers/dom.ts"],"names":[],"mappings":";;;;;;AACA,mEAA2C;AAC3C,2DAA8E;AAC9E,2BAAgC;AAEhC,uDAAyC;AAEzC,mDAAwC;AAExC,IAAI,OAA4B,CAAC;AAUjC,MAAM,
|
|
1
|
+
{"version":3,"file":"dom.js","sourceRoot":"","sources":["../../src/helpers/dom.ts"],"names":[],"mappings":";;;;;;AACA,mEAA2C;AAC3C,2DAA8E;AAC9E,2BAAgC;AAEhC,uDAAyC;AAEzC,mDAAwC;AAExC,IAAI,OAA4B,CAAC;AAUjC,MAAM,wBAAwB,GAAG;IAC/B,wBAAwB;IACxB,2BAA2B;IAC3B,yBAAyB;CACjB,CAAC;AAEX,SAAS,sBAAsB;IAC7B,QAAQ,OAAO,CAAC,QAAQ,EAAE,CAAC;QACzB,KAAK,OAAO;YACV,OAAO;gBACL,4DAA4D;gBAC5D,kEAAkE;gBAClE,6DAA6D;gBAC7D,mEAAmE;aACpE,CAAC;QACJ,KAAK,QAAQ;YACX,OAAO;gBACL,8DAA8D;gBAC9D,gEAAgE;gBAChE,oDAAoD;aACrD,CAAC;QACJ,KAAK,OAAO;YACV,OAAO;gBACL,wBAAwB;gBACxB,+BAA+B;gBAC/B,mBAAmB;gBACnB,2BAA2B;gBAC3B,oBAAoB;aACrB,CAAC;QACJ,KAAK,KAAK,CAAC;QACX,KAAK,SAAS,CAAC;QACf,KAAK,SAAS,CAAC;QACf,KAAK,OAAO,CAAC;QACb,KAAK,SAAS,CAAC;QACf,KAAK,OAAO,CAAC;QACb,KAAK,QAAQ,CAAC;QACd,KAAK,QAAQ;YACX,OAAO,EAAE,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,UAAU,GAAG,CAAC,IAAY,EAAW,EAAE;IAC3C,IAAI,CAAC;QACH,IAAA,eAAU,EAAC,IAAI,CAAC,CAAC;QACjB,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC,CAAC;AAEF,MAAM,mBAAmB,GAAG,KAAK,IAAI,EAAE;IACrC,KAAK,MAAM,GAAG,IAAI,wBAAwB,EAAE,CAAC;QAC3C,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAC/B,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,SAAS;QACX,CAAC;QAED,IAAI,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;YACtB,IAAA,UAAI,EAAC,iCAAiC,GAAG,KAAK,KAAK,EAAE,CAAC,CAAC;YACvD,OAAO,KAAK,CAAC;QACf,CAAC;QAED,IAAA,UAAI,EAAC,2BAA2B,GAAG,eAAe,KAAK,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,sBAAsB,EAAE,EAAE,CAAC;QAC5C,IAAI,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YACrB,IAAA,UAAI,EAAC,sCAAsC,IAAI,EAAE,CAAC,CAAC;YACnD,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAGD,MAAM,GAAG,GAAG,MAAM,kBAAQ,CAAC,cAAc,EAAE,CAAC;IAC5C,IAAA,UAAI,EAAC,wCAAwC,GAAG,EAAE,CAAC,CAAC;IACpD,OAAO,GAAG,CAAC;AACb,CAAC,CAAC;AAEK,MAAM,aAAa,GAAG,KAAK,EAAE,cAAuB,EAAE,EAAE;IAC7D,MAAM,qBAAqB,GAAG,cAAc,IAAI,CAAC,MAAM,mBAAmB,EAAE,CAAC,CAAC;IAC9E,MAAM,GAAG,GAAG;QACV,eAAe,EAAE;YACf,MAAM,EAAE,IAAI;YACZ,KAAK,EAAE,IAAI;SACZ;QACD,QAAQ,EAAE,OAAO,CAAC,GAAG,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI;QACzD,iBAAiB,EAAE,IAAI;QACvB,QAAQ,EAAE,KAAK;QACf,cAAc,EAAE,qBAAqB;QACrC,IAAI,EAAE;YACJ,cAAc;YACd,0BAA0B;YAC1B,yBAAyB;YACzB,iCAAiC;YACjC,gBAAgB;YAChB,aAAa;YACb,kBAAkB;YAClB,eAAe;YACf,uCAAuC;YACvC,kCAAkC;YAClC,0CAA0C;YAC1C,mCAAmC;YACnC,4BAA4B;YAC5B,0BAA0B;YAC1B,sBAAsB;SACvB;KACF,CAAC;IAEF,IAAA,WAAK,EAAC,sBAAsB,EAAE,GAAG,CAAC,CAAC;IACnC,IAAI,CAAC;QACH,IAAI,OAAO,EAAE,KAAK,EAAE,CAAC;YACnB,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;QACxB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;IAET,CAAC;IACD,OAAO,GAAG,CAAC,MAAM,IAAA,uBAAM,EAAC,GAAG,CAAC,CAAuB,CAAC;AACtD,CAAC,CAAC;AAvCW,QAAA,aAAa,iBAuCxB;AAEK,MAAM,YAAY,GAAG,KAAK,IAAI,EAAE;IACrC,IAAI,CAAC;QACH,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO;QACT,CAAC;QAED,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IACxB,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,IAAA,UAAI,EAAC,wBAAwB,EAAE,CAAC,CAAC,CAAC;IACpC,CAAC;AACH,CAAC,CAAC;AAVW,QAAA,YAAY,gBAUvB;AAEK,MAAM,QAAQ,GAAG,KAAK,EAC3B,GAAiB,EACjB,GAKC,EACoB,EAAE;IACvB,IAAI,UAAU,GAAG,KAAK,CAAC;IACvB,GAAG,CAAC;QACF,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,MAAM,IAAA,qBAAa,EAAC,GAAG,EAAE,cAAc,CAAC,CAAC;YAC3C,CAAC;YAED,IAAA,WAAK,EAAC,aAAa,GAAG,GAAG,CAAC,CAAC;YAC3B,MAAM,IAAI,GAAG,MAAM,OAAQ,CAAC,OAAO,EAAE,CAAC;YACtC,IAAI,CAAC,GAAG,GAAG,EAAE,OAAO,IAAI,IAAI,CAAC;YAC7B,IAAI,UAAU,EAAE,CAAC;gBACf,CAAC,IAAI,IAAI,CAAC;YACZ,CAAC;YAED,MAAM,IAAI,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;YAC5D,IAAI,QAAQ,CAAC;YAEb,IAAI,CAAC,GAAG,EAAE,iBAAiB,EAAE,CAAC;gBAE5B,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;oBAC/B,SAAS,EAAE,CAAC,MAAM,EAAE,kBAAkB,CAAC;oBACvC,OAAO,EAAE,CAAC;iBACX,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;oBAC/B,SAAS,EAAE,CAAC,MAAM,CAAC;oBACnB,OAAO,EAAE,CAAC;iBACX,CAAC,CAAC;gBACH,MAAM,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,iBAAiB,EAAE;oBAChD,OAAO,EAAE,CAAC;oBACV,OAAO,EAAE,IAAI;iBACd,CAAC,CAAC;YACL,CAAC;YAGD,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACd,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAC;YAC/D,CAAC;YAED,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YACrC,MAAM,GAAG,GAAG,IAAA,wBAAK,EAAC,OAAO,CAAC,CAAC;YAG3B,GAAG,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YACpE,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YAGnB,MAAM,MAAM,GAAe;gBACzB,IAAI,EAAE,GAAG;gBACT,MAAM,EAAE,QAAQ,CAAC,MAAM,EAAE;gBACzB,UAAU,EAAE,QAAQ,CAAC,UAAU,EAAE;gBACjC,GAAG,EAAE,QAAQ,CAAC,GAAG,EAAE;gBACnB,OAAO,EAAE,QAAQ,CAAC,OAAO,EAAE;aAC5B,CAAC;YAEF,UAAU,GAAG,KAAK,CAAC;YACnB,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,CAAC,GAAG,GAAY,CAAC;YACvB,IAAI,UAAU,EAAE,CAAC;gBACf,IAAA,WAAK,EAAC,qBAAqB,EAAE,GAAG,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;gBAChD,MAAM,CAAC,CAAC;YACV,CAAC;YAGD,IACE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,kBAAkB,CAAC;gBACzC,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,YAAY,CAAC;gBACnC,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,oBAAoB,CAAC;gBAC3C,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,+BAA+B,CAAC;gBACtD,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gBAAgB,CAAC;gBACvC,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,eAAe,CAAC;gBACtC,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC;gBAChC,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC;gBAC9B,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC;gBAC/B,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAC/B,CAAC;gBACD,IAAI,CAAC;oBACH,IAAA,WAAK,EAAC,QAAQ,EAAE,GAAG,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;oBAGnC,IACE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC;wBAChC,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,EAC9B,CAAC;wBACD,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;oBAC5D,CAAC;oBAED,MAAM,IAAA,qBAAa,EAAC,GAAG,EAAE,cAAc,CAAC,CAAC;oBACzC,UAAU,GAAG,IAAI,CAAC;gBACpB,CAAC;gBAAC,OAAO,EAAE,EAAE,CAAC;oBACZ,IAAA,WAAK,EAAC,4BAA4B,EAAE,EAAE,CAAC,CAAC;oBACxC,MAAM,EAAE,CAAC;gBACX,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAA,WAAK,EAAC,gBAAgB,CAAC,EAAE,CAAC,CAAC;gBAC3B,MAAM,CAAC,CAAC;YACV,CAAC;QACH,CAAC;IAEH,CAAC,QAAQ,UAAU,EAAE;IACrB,MAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,CAAC;AACrC,CAAC,CAAC;AA/GW,QAAA,QAAQ,YA+GnB"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ag-webscrape",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.16",
|
|
4
4
|
"author": "admin@gec.dev",
|
|
5
5
|
"description": "TypeScript web scraper with Playwright fallback for anti-scraping protection",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -14,14 +14,14 @@
|
|
|
14
14
|
],
|
|
15
15
|
"license": "MIT",
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@sparticuz/chromium": "^
|
|
18
|
-
"ag-common": "^0.0.
|
|
17
|
+
"@sparticuz/chromium": "^143.0.0",
|
|
18
|
+
"ag-common": "^0.0.874",
|
|
19
19
|
"node-html-parser": "^7.0.1",
|
|
20
20
|
"puppeteer": "^24.15.0",
|
|
21
21
|
"puppeteer-core": "^24.15.0"
|
|
22
22
|
},
|
|
23
23
|
"devDependencies": {
|
|
24
|
-
"@types/node": "^
|
|
24
|
+
"@types/node": "^25.0.0",
|
|
25
25
|
"eslint": "^9.32.0",
|
|
26
26
|
"eslint-config-e7npm": "^0.1.23",
|
|
27
27
|
"tsx": "^4.20.3",
|