ag-webscrape 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,207 @@
1
+ # ag-webscrape
2
+
3
+ A TypeScript web scraper with intelligent fallback strategy. Attempts direct HTTP fetching first, then falls back to Playwright for anti-scraping protection.
4
+
5
+ ## Features
6
+
7
+ - **Dual Strategy**: Direct fetch first, Playwright fallback
8
+ - **Anti-Scraping Detection**: Automatically detects and bypasses common anti-scraping measures
9
+ - **Persistent Browser**: Maintains browser instance for faster subsequent scrapes
10
+ - **Error Handling**: Comprehensive error detection for 4xx/5xx responses
11
+ - **TypeScript Support**: Full type safety and IntelliSense
12
+ - **Configurable**: Extensive customization options
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ npm install ag-webscrape
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ```typescript
23
+ import { WebScraper } from 'ag-webscrape';
24
+
25
+ const scraper = new WebScraper();
26
+
27
+ // Scrape a single URL
28
+ const result = await scraper.scrape('https://example.com');
29
+ console.log(result.html);
30
+
31
+ // Clean up when done
32
+ await scraper.dispose();
33
+ ```
34
+
35
+ ## API Reference
36
+
37
+ ### WebScraper Class
38
+
39
+ #### Constructor
40
+
41
+ ```typescript
42
+ new WebScraper(options?: ScrapingOptions)
43
+ ```
44
+
45
+ #### Options
46
+
47
+ ```typescript
48
+ interface ScrapingOptions {
49
+ timeout?: number; // Request timeout in ms (default: 30000)
50
+ userAgent?: string; // Custom user agent
51
+ headers?: Record<string, string>; // Additional headers
52
+ retries?: number; // Number of retries (default: 3) — accepted and stored, but not yet applied by scrape()
53
+ waitForSelector?: string; // CSS selector to wait for
54
+ waitForTimeout?: number; // Time to wait in ms (default: 5000)
55
+ }
56
+ ```
57
+
58
+ #### Methods
59
+
60
+ ##### `scrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>`
61
+
62
+ Scrapes a single URL with fallback strategy.
63
+
64
+ ```typescript
65
+ const result = await scraper.scrape('https://example.com', {
66
+ timeout: 60000,
67
+ waitForSelector: '.main-content'
68
+ });
69
+ ```
70
+
71
+ ##### `scrapeMultiple(urls: string[], options?: ScrapingOptions): Promise<ScrapingResult[]>`
72
+
73
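+ Scrapes multiple URLs one after another, reusing the same scraper (and browser) instance. Returns one result per URL, in the same order as the input.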
74
+
75
+ ```typescript
76
+ const results = await scraper.scrapeMultiple([
77
+ 'https://example1.com',
78
+ 'https://example2.com'
79
+ ]);
80
+ ```
81
+
82
+ ##### `dispose(): Promise<void>`
83
+
84
+ Cleans up browser resources. Always call this when done.
85
+
86
+ ```typescript
87
+ await scraper.dispose();
88
+ ```
89
+
90
+ #### Result Object
91
+
92
+ ```typescript
93
+ interface ScrapingResult {
94
+ url: string; // Original URL
95
+ html: string; // HTML content
96
+ status: number; // HTTP status code
97
+ method: 'fetch' | 'playwright'; // Method used
98
+ error?: string; // Error message if any
99
+ redirected?: boolean; // Whether request was redirected
100
+ finalUrl?: string; // Final URL after redirects
101
+ }
102
+ ```
103
+
104
+ ## Advanced Usage
105
+
106
+ ### Custom Headers and User Agent
107
+
108
+ ```typescript
109
+ const scraper = new WebScraper({
110
+ userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
111
+ headers: {
112
+ 'Accept': 'text/html,application/xhtml+xml',
113
+ 'Accept-Language': 'en-US,en;q=0.9'
114
+ }
115
+ });
116
+ ```
117
+
118
+ ### Waiting for Content
119
+
120
+ ```typescript
121
+ // Wait for specific element
122
+ const result = await scraper.scrape('https://spa-app.com', {
123
+ waitForSelector: '.dynamic-content'
124
+ });
125
+
126
+ // Wait for specific time
127
+ const slowResult = await scraper.scrape('https://slow-app.com', {
128
+ waitForTimeout: 10000
129
+ });
130
+ ```
131
+
132
+ ### Error Handling
133
+
134
+ ```typescript
135
+ const result = await scraper.scrape('https://example.com');
136
+
137
+ if (result.error) {
138
+ console.error('Scraping failed:', result.error);
139
+ } else {
140
+ console.log('Success:', result.html.length, 'characters');
141
+ }
142
+ ```
143
+
144
+ ### Batch Scraping
145
+
146
+ ```typescript
147
+ const urls = [
148
+ 'https://news.site.com/article1',
149
+ 'https://news.site.com/article2',
150
+ 'https://news.site.com/article3'
151
+ ];
152
+
153
+ const results = await scraper.scrapeMultiple(urls, {
154
+ waitForSelector: '.article-content'
155
+ });
156
+
157
+ results.forEach((result, index) => {
158
+ if (!result.error) {
159
+ console.log(`Article ${index + 1}: ${result.html.length} chars`);
160
+ }
161
+ });
162
+ ```
163
+
164
+ ## How It Works
165
+
166
+ 1. **Direct Fetch**: First attempts a plain HTTP request using the global `fetch` API built into Node.js (no `node-fetch` dependency is required)
167
+ 2. **Anti-Scraping Detection**: Checks response for common anti-scraping patterns
168
+ 3. **Playwright Fallback**: If direct fetch fails or anti-scraping detected, uses Playwright
169
+ 4. **Error Detection**: Monitors for 4xx/5xx responses in both methods
170
+ 5. **Resource Management**: Maintains browser instance for performance
171
+
172
+ ## Anti-Scraping Protection
173
+
174
+ The scraper automatically detects and handles:
175
+
176
+ - Cloudflare protection
177
+ - DistilNetworks
178
+ - PerimeterX
179
+ - DataDome
180
+ - Akamai Bot Manager
181
+ - CAPTCHA challenges
182
+ - JavaScript requirement checks
183
+ - Rate limiting
184
+ - Access denied pages
185
+
186
+ ## Performance
187
+
188
+ - **Fast**: Direct fetch for simple pages
189
+ - **Efficient**: Reuses browser instance
190
+ - **Robust**: Fallback ensures high success rate
191
+ - **Intelligent**: Only uses Playwright when necessary
192
+
193
+ ## Examples
194
+
195
+ Check out `dist/example.js` in the published package (compiled from `src/example.ts` in the source repository) for complete usage examples.
196
+
197
+ ## License
198
+
199
+ MIT
200
+
201
+ ## Contributing
202
+
203
+ Pull requests welcome! Please ensure TypeScript compilation and tests pass.
204
+
205
+ ## Support
206
+
207
+ For issues and questions, please use the GitHub issue tracker.
@@ -0,0 +1,34 @@
1
/**
 * Settings accepted by the WebScraper constructor (as instance defaults) and
 * by scrape() / scrapeMultiple() (as per-call overrides).
 *
 * As used by the compiled implementation in dist/WebScraper.js:
 * - timeout: milliseconds before the direct fetch is aborted; also passed as
 *   the page.goto() timeout. The constructor defaults it to 30000.
 * - userAgent: sent as the User-Agent header of the direct fetch. For the
 *   Playwright path only the constructor-level value is applied, when the
 *   browser context is created.
 * - headers: spread over the built-in fetch headers, and passed to
 *   page.setExtraHTTPHeaders() on the Playwright path.
 * - retries: defaulted to 3 by the constructor.
 *   NOTE(review): no method of the compiled class reads this value.
 * - waitForSelector: CSS selector awaited after navigation (Playwright only).
 * - waitForTimeout: milliseconds; the timeout for waitForSelector, or, when
 *   no selector is given, a fixed delay after navigation. The constructor
 *   defaults it to 5000.
 */
+ export interface ScrapingOptions {
2
+ timeout?: number;
3
+ userAgent?: string;
4
+ headers?: Record<string, string>;
5
+ retries?: number;
6
+ waitForSelector?: string;
7
+ waitForTimeout?: number;
8
+ }
9
/**
 * Outcome of scraping one URL.
 *
 * As produced by the compiled implementation in dist/WebScraper.js:
 * - url: the URL exactly as it was passed in.
 * - html: response body (fetch) or page.content() (Playwright); '' when the
 *   attempt threw.
 * - status: HTTP status code; 0 when the Playwright attempt threw or when
 *   both methods failed.
 * - method: which strategy produced this result.
 * - error: set on the Playwright path for status >= 400, for HTML that looks
 *   like an error page, or with the message of a thrown exception. A result
 *   returned from the fetch path never carries it.
 * - redirected: only populated by the fetch path.
 * - finalUrl: response.url (fetch) or page.url() (Playwright); absent when
 *   both methods failed.
 */
+ export interface ScrapingResult {
10
+ url: string;
11
+ html: string;
12
+ status: number;
13
+ method: 'fetch' | 'playwright';
14
+ error?: string;
15
+ redirected?: boolean;
16
+ finalUrl?: string;
17
+ }
18
/**
 * Scraper that first tries a direct HTTP fetch and falls back to a Playwright
 * page when the fetch throws, returns a non-2xx status, or returns HTML that
 * matches one of the anti-scraping patterns.
 *
 * Public surface:
 * - constructor(options): stores instance-wide defaults.
 * - scrape(url, options): scrapes one URL; failures are reported through
 *   ScrapingResult.error rather than by rejecting.
 * - scrapeMultiple(urls, options): calls scrape() for each URL in turn and
 *   returns the results in input order.
 * - dispose(): closes the Playwright page, context and browser if they were
 *   created.
 *
 * The private members are implementation details of dist/WebScraper.js.
 */
+ export declare class WebScraper {
19
+ private browser;
20
+ private context;
21
+ private page;
22
+ private userAgent;
23
+ private defaultOptions;
24
+ constructor(options?: ScrapingOptions);
25
+ private initializeBrowser;
26
+ private detectAntiScraping;
27
+ private fetchDirectly;
28
+ private scrapeWithPlaywright;
29
+ private isErrorPage;
30
+ scrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
31
+ scrapeMultiple(urls: string[], options?: ScrapingOptions): Promise<ScrapingResult[]>;
32
+ dispose(): Promise<void>;
33
+ }
34
+ //# sourceMappingURL=WebScraper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"WebScraper.d.ts","sourceRoot":"","sources":["../src/WebScraper.ts"],"names":[],"mappings":"AAGA,MAAM,WAAW,eAAe;IAC9B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,cAAc;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,GAAG,YAAY,CAAC;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,qBAAa,UAAU;IACrB,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,OAAO,CAA+B;IAC9C,OAAO,CAAC,IAAI,CAAqB;IACjC,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,cAAc,CAAkB;gBAE5B,OAAO,GAAE,eAAoB;YAa3B,iBAAiB;IA2B/B,OAAO,CAAC,kBAAkB;YAwBZ,aAAa;YAiDb,oBAAoB;IA4ElC,OAAO,CAAC,WAAW;IAmBb,MAAM,CACV,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC;IAkDpB,cAAc,CAClB,IAAI,EAAE,MAAM,EAAE,EACd,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,EAAE,CAAC;IAwBtB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAgB/B"}
@@ -0,0 +1,228 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.WebScraper = void 0;
4
+ const playwright_1 = require("playwright");
5
+ class WebScraper {
6
+ constructor(options = {}) {
7
+ this.browser = null;
8
+ this.context = null;
9
+ this.page = null;
10
+ this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36";
11
+ this.defaultOptions = {
12
+ timeout: 30000,
13
+ retries: 3,
14
+ waitForTimeout: 5000,
15
+ ...options,
16
+ };
17
+ }
18
+ async initializeBrowser() {
19
+ if (!this.browser) {
20
+ this.browser = await playwright_1.chromium.launch({
21
+ headless: true,
22
+ args: [
23
+ '--no-sandbox',
24
+ '--disable-setuid-sandbox',
25
+ '--disable-dev-shm-usage',
26
+ '--disable-accelerated-2d-canvas',
27
+ '--disable-gpu',
28
+ '--window-size=1920,1080',
29
+ ],
30
+ });
31
+ this.context = await this.browser.newContext({
32
+ userAgent: this.defaultOptions.userAgent || this.userAgent.toString(),
33
+ viewport: { width: 1920, height: 1080 },
34
+ extraHTTPHeaders: this.defaultOptions.headers || {},
35
+ });
36
+ this.page = await this.context.newPage();
37
+ }
38
+ }
39
+ detectAntiScraping(html) {
40
+ const antiScrapingPatterns = [
41
+ /cloudflare/i,
42
+ /distil.networks/i,
43
+ /perimeterx/i,
44
+ /datadome/i,
45
+ /akamai/i,
46
+ /bot.protection/i,
47
+ /please.enable.javascript/i,
48
+ /access.denied/i,
49
+ /blocked/i,
50
+ /captcha/i,
51
+ /challenge/i,
52
+ /security.check/i,
53
+ /rate.limit/i,
54
+ /temporarily.unavailable/i,
55
+ ];
56
+ return antiScrapingPatterns.some((pattern) => pattern.test(html));
57
+ }
58
+ async fetchDirectly(url, options) {
59
+ const headers = {
60
+ 'User-Agent': options.userAgent || this.userAgent.toString(),
61
+ Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
62
+ 'Accept-Language': 'en-US,en;q=0.5',
63
+ 'Accept-Encoding': 'gzip, deflate',
64
+ Connection: 'keep-alive',
65
+ 'Upgrade-Insecure-Requests': '1',
66
+ ...options.headers,
67
+ };
68
+ const controller = new AbortController();
69
+ const timeoutId = setTimeout(() => controller.abort(), options.timeout || this.defaultOptions.timeout);
70
+ try {
71
+ const response = await fetch(url, {
72
+ headers,
73
+ signal: controller.signal,
74
+ redirect: 'follow',
75
+ });
76
+ clearTimeout(timeoutId);
77
+ const html = await response.text();
78
+ return {
79
+ url,
80
+ html,
81
+ status: response.status,
82
+ method: 'fetch',
83
+ redirected: response.redirected,
84
+ finalUrl: response.url,
85
+ };
86
+ }
87
+ catch (error) {
88
+ clearTimeout(timeoutId);
89
+ throw error;
90
+ }
91
+ }
92
+ async scrapeWithPlaywright(url, options) {
93
+ await this.initializeBrowser();
94
+ if (!this.page) {
95
+ throw new Error('Failed to initialize Playwright page');
96
+ }
97
+ const page = this.page;
98
+ let html = '';
99
+ let status = 0;
100
+ let error;
101
+ try {
102
+ if (options.headers) {
103
+ await page.setExtraHTTPHeaders(options.headers);
104
+ }
105
+ const response = await page.goto(url, {
106
+ waitUntil: 'networkidle',
107
+ timeout: options.timeout || this.defaultOptions.timeout,
108
+ });
109
+ if (response) {
110
+ status = response.status();
111
+ if (status >= 400) {
112
+ error = `HTTP ${status} error`;
113
+ }
114
+ }
115
+ if (options.waitForSelector) {
116
+ await page.waitForSelector(options.waitForSelector, {
117
+ timeout: options.waitForTimeout || this.defaultOptions.waitForTimeout,
118
+ });
119
+ }
120
+ else if (options.waitForTimeout) {
121
+ await page.waitForTimeout(options.waitForTimeout);
122
+ }
123
+ html = await page.content();
124
+ if (this.isErrorPage(html)) {
125
+ error = error || 'Error page detected in HTML content';
126
+ }
127
+ return {
128
+ url,
129
+ html,
130
+ status,
131
+ method: 'playwright',
132
+ error,
133
+ finalUrl: page.url(),
134
+ };
135
+ }
136
+ catch (err) {
137
+ return {
138
+ url,
139
+ html: '',
140
+ status: 0,
141
+ method: 'playwright',
142
+ error: err instanceof Error ? err.message : 'Unknown error',
143
+ finalUrl: page.url(),
144
+ };
145
+ }
146
+ }
147
+ isErrorPage(html) {
148
+ const errorPatterns = [
149
+ /<title>.*4\d\d.*<\/title>/i,
150
+ /<title>.*5\d\d.*<\/title>/i,
151
+ /<title>.*error.*<\/title>/i,
152
+ /<title>.*not found.*<\/title>/i,
153
+ /<title>.*forbidden.*<\/title>/i,
154
+ /<title>.*unauthorized.*<\/title>/i,
155
+ /<h1>.*4\d\d.*<\/h1>/i,
156
+ /<h1>.*5\d\d.*<\/h1>/i,
157
+ /<h1>.*error.*<\/h1>/i,
158
+ ];
159
+ return errorPatterns.some((pattern) => pattern.test(html));
160
+ }
161
+ async scrape(url, options = {}) {
162
+ const mergedOptions = { ...this.defaultOptions, ...options };
163
+ let lastError = null;
164
+ try {
165
+ const result = await this.fetchDirectly(url, mergedOptions);
166
+ if (result.status >= 200 &&
167
+ result.status < 300 &&
168
+ !this.detectAntiScraping(result.html)) {
169
+ return result;
170
+ }
171
+ console.log(`Direct fetch failed or anti-scraping detected for ${url}. Falling back to Playwright.`);
172
+ }
173
+ catch (error) {
174
+ lastError =
175
+ error instanceof Error ? error : new Error('Unknown fetch error');
176
+ console.log(`Direct fetch failed for ${url}: ${lastError.message}. Falling back to Playwright.`);
177
+ }
178
+ try {
179
+ const result = await this.scrapeWithPlaywright(url, mergedOptions);
180
+ return result;
181
+ }
182
+ catch (error) {
183
+ const playwrightError = error instanceof Error ? error : new Error('Unknown Playwright error');
184
+ return {
185
+ url,
186
+ html: '',
187
+ status: 0,
188
+ method: 'playwright',
189
+ error: `Both methods failed. Fetch: ${lastError?.message || 'Unknown'}. Playwright: ${playwrightError.message}`,
190
+ };
191
+ }
192
+ }
193
+ async scrapeMultiple(urls, options = {}) {
194
+ const results = [];
195
+ for (const url of urls) {
196
+ try {
197
+ const result = await this.scrape(url, options);
198
+ results.push(result);
199
+ }
200
+ catch (error) {
201
+ results.push({
202
+ url,
203
+ html: '',
204
+ status: 0,
205
+ method: 'fetch',
206
+ error: error instanceof Error ? error.message : 'Unknown error',
207
+ });
208
+ }
209
+ }
210
+ return results;
211
+ }
212
+ async dispose() {
213
+ if (this.page) {
214
+ await this.page.close();
215
+ this.page = null;
216
+ }
217
+ if (this.context) {
218
+ await this.context.close();
219
+ this.context = null;
220
+ }
221
+ if (this.browser) {
222
+ await this.browser.close();
223
+ this.browser = null;
224
+ }
225
+ }
226
+ }
227
+ exports.WebScraper = WebScraper;
228
+ //# sourceMappingURL=WebScraper.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"WebScraper.js","sourceRoot":"","sources":["../src/WebScraper.ts"],"names":[],"mappings":";;;AACA,2CAAsC;AAqBtC,MAAa,UAAU;IAOrB,YAAY,UAA2B,EAAE;QANjC,YAAO,GAAmB,IAAI,CAAC;QAC/B,YAAO,GAA0B,IAAI,CAAC;QACtC,SAAI,GAAgB,IAAI,CAAC;QAK/B,IAAI,CAAC,SAAS,GAAG,iHAAiH,CAAC;QACnI,IAAI,CAAC,cAAc,GAAG;YACpB,OAAO,EAAE,KAAK;YACd,OAAO,EAAE,CAAC;YACV,cAAc,EAAE,IAAI;YACpB,GAAG,OAAO;SACX,CAAC;IACJ,CAAC;IAKO,KAAK,CAAC,iBAAiB;QAC7B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,IAAI,CAAC,OAAO,GAAG,MAAM,qBAAQ,CAAC,MAAM,CAAC;gBACnC,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE;oBACJ,cAAc;oBACd,0BAA0B;oBAC1B,yBAAyB;oBACzB,iCAAiC;oBACjC,eAAe;oBACf,yBAAyB;iBAC1B;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC;gBAC3C,SAAS,EAAE,IAAI,CAAC,cAAc,CAAC,SAAS,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE;gBACrE,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE;gBACvC,gBAAgB,EAAE,IAAI,CAAC,cAAc,CAAC,OAAO,IAAI,EAAE;aACpD,CAAC,CAAC;YAEH,IAAI,CAAC,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;QAC3C,CAAC;IACH,CAAC;IAKO,kBAAkB,CAAC,IAAY;QACrC,MAAM,oBAAoB,GAAG;YAC3B,aAAa;YACb,kBAAkB;YAClB,aAAa;YACb,WAAW;YACX,SAAS;YACT,iBAAiB;YACjB,2BAA2B;YAC3B,gBAAgB;YAChB,UAAU;YACV,UAAU;YACV,YAAY;YACZ,iBAAiB;YACjB,aAAa;YACb,0BAA0B;SAC3B,CAAC;QAEF,OAAO,oBAAoB,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACpE,CAAC;IAKO,KAAK,CAAC,aAAa,CACzB,GAAW,EACX,OAAwB;QAExB,MAAM,OAAO,GAAG;YACd,YAAY,EAAE,OAAO,CAAC,SAAS,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE;YAC5D,MAAM,EACJ,4EAA4E;YAC9E,iBAAiB,EAAE,gBAAgB;YACnC,iBAAiB,EAAE,eAAe;YAClC,UAAU,EAAE,YAAY;YACxB,2BAA2B,EAAE,GAAG;YAChC,GAAG,OAAO,CAAC,OAAO;SACnB,CAAC;QAEF,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,SAAS,GAAG,UAAU,CAC1B,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EACxB,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,CAAC,OAAQ,CAChD,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO;gBACP,MAAM,EAAE,UAAU,CAAC,MAAM;gBACzB,QAAQ,EAAE,QAAQ;aACnB,CAAC,CAAC;YAEH,YAAY,CAAC,SAAS,CAAC,CAAC;YAExB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EA
AE,CAAC;YAEnC,OAAO;gBACL,GAAG;gBACH,IAAI;gBACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;gBACvB,MAAM,EAAE,OAAO;gBACf,UAAU,EAAE,QAAQ,CAAC,UAAU;gBAC/B,QAAQ,EAAE,QAAQ,CAAC,GAAG;aACvB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,YAAY,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAKO,KAAK,CAAC,oBAAoB,CAChC,GAAW,EACX,OAAwB;QAExB,MAAM,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAE/B,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC;QACvB,IAAI,IAAI,GAAG,EAAE,CAAC;QACd,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,KAAyB,CAAC;QAE9B,IAAI,CAAC;YAEH,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;gBACpB,MAAM,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAClD,CAAC;YAGD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACpC,SAAS,EAAE,aAAa;gBACxB,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC,cAAc,CAAC,OAAO;aACxD,CAAC,CAAC;YAEH,IAAI,QAAQ,EAAE,CAAC;gBACb,MAAM,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC;gBAG3B,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;oBAClB,KAAK,GAAG,QAAQ,MAAM,QAAQ,CAAC;gBACjC,CAAC;YACH,CAAC;YAGD,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;gBAC5B,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,eAAe,EAAE;oBAClD,OAAO,EAAE,OAAO,CAAC,cAAc,IAAI,IAAI,CAAC,cAAc,CAAC,cAAc;iBACtE,CAAC,CAAC;YACL,CAAC;iBAAM,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;gBAClC,MAAM,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;YACpD,CAAC;YAGD,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAG5B,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC3B,KAAK,GAAG,KAAK,IAAI,qCAAqC,CAAC;YACzD,CAAC;YAED,OAAO;gBACL,GAAG;gBACH,IAAI;gBACJ,MAAM;gBACN,MAAM,EAAE,YAAY;gBACpB,KAAK;gBACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE;aACrB,CAAC;QACJ,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO;gBACL,GAAG;gBACH,IAAI,EAAE,EAAE;gBACR,MAAM,EAAE,CAAC;gBACT,MAAM,EAAE,YAAY;gBACpB,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe;gBAC3D,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE;aACrB,CAAC;QACJ,CAAC;IACH,CAAC;IAKO,WAAW,CAAC,IAAY;QAC9B,MAAM,aAAa,GAAG;YACpB,4BAA4B;YAC5B,4BAA4B;YAC5B,4BAA4B;YAC5B,gCAAgC;YAChC,gCAAgC;YAChC,mCAAmC;YACnC,sBAAsB;YACtB,sBAAsB;YACtB,sBAAsB;SACvB,CAAC;QAE
F,OAAO,aAAa,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAC7D,CAAC;IAKD,KAAK,CAAC,MAAM,CACV,GAAW,EACX,UAA2B,EAAE;QAE7B,MAAM,aAAa,GAAG,EAAE,GAAG,IAAI,CAAC,cAAc,EAAE,GAAG,OAAO,EAAE,CAAC;QAC7D,IAAI,SAAS,GAAiB,IAAI,CAAC;QAGnC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,GAAG,EAAE,aAAa,CAAC,CAAC;YAG5D,IACE,MAAM,CAAC,MAAM,IAAI,GAAG;gBACpB,MAAM,CAAC,MAAM,GAAG,GAAG;gBACnB,CAAC,IAAI,CAAC,kBAAkB,CAAC,MAAM,CAAC,IAAI,CAAC,EACrC,CAAC;gBACD,OAAO,MAAM,CAAC;YAChB,CAAC;YAGD,OAAO,CAAC,GAAG,CACT,qDAAqD,GAAG,+BAA+B,CACxF,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,SAAS;gBACP,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;YACpE,OAAO,CAAC,GAAG,CACT,2BAA2B,GAAG,KAAK,SAAS,CAAC,OAAO,+BAA+B,CACpF,CAAC;QACJ,CAAC;QAGD,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,oBAAoB,CAAC,GAAG,EAAE,aAAa,CAAC,CAAC;YACnE,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,eAAe,GACnB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,0BAA0B,CAAC,CAAC;YAEzE,OAAO;gBACL,GAAG;gBACH,IAAI,EAAE,EAAE;gBACR,MAAM,EAAE,CAAC;gBACT,MAAM,EAAE,YAAY;gBACpB,KAAK,EAAE,+BAA+B,SAAS,EAAE,OAAO,IAAI,SAAS,iBAAiB,eAAe,CAAC,OAAO,EAAE;aAChH,CAAC;QACJ,CAAC;IACH,CAAC;IAKD,KAAK,CAAC,cAAc,CAClB,IAAc,EACd,UAA2B,EAAE;QAE7B,MAAM,OAAO,GAAqB,EAAE,CAAC;QAErC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;gBAC/C,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC;oBACX,GAAG;oBACH,IAAI,EAAE,EAAE;oBACR,MAAM,EAAE,CAAC;oBACT,MAAM,EAAE,OAAO;oBACf,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe;iBAChE,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAKD,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YACd,MAAM,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;YACxB,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;QACnB,CAAC;QAED,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,G
AAG,IAAI,CAAC;QACtB,CAAC;QAED,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;CACF;AAvTD,gCAuTC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=example.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"example.d.ts","sourceRoot":"","sources":["../src/example.ts"],"names":[],"mappings":""}
@@ -0,0 +1,47 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ const index_1 = require("./index");
4
/**
 * Demo run: scrapes one URL, then a small batch, printing a short summary of
 * every result, and always disposes the scraper afterwards.
 */
async function runExample() {
    const scraper = new index_1.WebScraper({ timeout: 30000, waitForTimeout: 3000 });
    // Reduces a result to the fields worth printing (HTML length instead of
    // the full HTML).
    const summarize = (outcome) => ({
        url: outcome.url,
        method: outcome.method,
        status: outcome.status,
        htmlLength: outcome.html.length,
        error: outcome.error,
    });
    try {
        console.log('Starting web scraping example...');
        const single = await scraper.scrape('https://httpbin.org/html');
        console.log('Scraped successfully:', summarize(single));
        // The last target answers 404 so the output also shows an `error`.
        const targets = [
            'https://httpbin.org/html',
            'https://httpbin.org/status/200',
            'https://httpbin.org/status/404',
        ];
        console.log('\nScraping multiple URLs...');
        const batch = await scraper.scrapeMultiple(targets);
        for (let position = 0; position < batch.length; position += 1) {
            console.log(`Result ${position + 1}:`, summarize(batch[position]));
        }
    }
    catch (failure) {
        console.error('Error during scraping:', failure);
    }
    finally {
        // Release the browser whether or not scraping succeeded.
        await scraper.dispose();
        console.log('Scraper disposed successfully');
    }
}
// Run the demo only when this file is executed directly, not when required.
if (require.main === module) {
    runExample().catch(console.error);
}
47
+ //# sourceMappingURL=example.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"example.js","sourceRoot":"","sources":["../src/example.ts"],"names":[],"mappings":";;AAAA,mCAAqC;AAErC,KAAK,UAAU,UAAU;IACvB,MAAM,OAAO,GAAG,IAAI,kBAAU,CAAC;QAC7B,OAAO,EAAE,KAAK;QACd,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;IAEH,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,kCAAkC,CAAC,CAAC;QAGhD,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,MAAM,CAAC,0BAA0B,CAAC,CAAC;QAChE,OAAO,CAAC,GAAG,CAAC,uBAAuB,EAAE;YACnC,GAAG,EAAE,MAAM,CAAC,GAAG;YACf,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,UAAU,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM;YAC9B,KAAK,EAAE,MAAM,CAAC,KAAK;SACpB,CAAC,CAAC;QAGH,MAAM,IAAI,GAAG;YACX,0BAA0B;YAC1B,gCAAgC;YAChC,gCAAgC;SACjC,CAAC;QAEF,OAAO,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC;QAC3C,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;QAEnD,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE;YAChC,OAAO,CAAC,GAAG,CAAC,UAAU,KAAK,GAAG,CAAC,GAAG,EAAE;gBAClC,GAAG,EAAE,MAAM,CAAC,GAAG;gBACf,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,UAAU,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM;gBAC9B,KAAK,EAAE,MAAM,CAAC,KAAK;aACpB,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,wBAAwB,EAAE,KAAK,CAAC,CAAC;IACjD,CAAC;YAAS,CAAC;QAET,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QACxB,OAAO,CAAC,GAAG,CAAC,+BAA+B,CAAC,CAAC;IAC/C,CAAC;AACH,CAAC;AAGD,IAAI,OAAO,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;IAC5B,UAAU,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;AACpC,CAAC"}
@@ -0,0 +1,5 @@
1
+ import { WebScraper } from './WebScraper';
2
+ export type { ScrapingOptions, ScrapingResult } from './WebScraper';
3
+ export { WebScraper } from './WebScraper';
4
+ export default WebScraper;
5
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAE1C,YAAY,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AACpE,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAE1C,eAAe,UAAU,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,8 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.WebScraper = void 0;
4
+ const WebScraper_1 = require("./WebScraper");
5
+ var WebScraper_2 = require("./WebScraper");
6
+ Object.defineProperty(exports, "WebScraper", { enumerable: true, get: function () { return WebScraper_2.WebScraper; } });
7
+ exports.default = WebScraper_1.WebScraper;
8
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AACA,6CAA0C;AAG1C,2CAA0C;AAAjC,wGAAA,UAAU,OAAA;AAEnB,kBAAe,uBAAU,CAAC"}
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "ag-webscrape",
3
+ "version": "0.0.1",
4
+ "author": "admin@gec.dev",
5
+ "description": "TypeScript web scraper with Playwright fallback for anti-scraping protection",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "keywords": [
9
+ "scraping",
10
+ "playwright",
11
+ "typescript",
12
+ "web-scraper",
13
+ "anti-scraping"
14
+ ],
15
+ "license": "MIT",
16
+ "dependencies": {
17
+ "playwright": "1.54.1"
18
+ },
19
+ "devDependencies": {
20
+ "@types/node": "24.0.13",
21
+ "eslint": "9.31.0",
22
+ "eslint-config-e7npm": "0.1.23",
23
+ "typescript": "5.8.3"
24
+ },
25
+ "files": [
26
+ "dist/**/*",
27
+ "README.md"
28
+ ],
29
+ "engines": {
30
+ "node": ">=20",
31
+ "yarn": "use pnpm",
32
+ "npm": "use pnpm",
33
+ "pnpm": ">=3"
34
+ },
35
+ "scripts": {
36
+ "preinstall": "npx only-allow pnpm",
37
+ "build": "tsc",
38
+ "dev": "tsc --watch",
39
+ "lint": "next lint",
40
+ "format": "eslint src --fix"
41
+ }
42
+ }