@olib-ai/owl-browser-sdk 2.0.5 → 2.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/README.md +107 -0
  2. package/dist/extraction/content-cleaner.d.ts +40 -0
  3. package/dist/extraction/content-cleaner.d.ts.map +1 -0
  4. package/dist/extraction/content-cleaner.js +393 -0
  5. package/dist/extraction/content-cleaner.js.map +1 -0
  6. package/dist/extraction/extractor.d.ts +139 -0
  7. package/dist/extraction/extractor.d.ts.map +1 -0
  8. package/dist/extraction/extractor.js +212 -0
  9. package/dist/extraction/extractor.js.map +1 -0
  10. package/dist/extraction/html-processor.d.ts +75 -0
  11. package/dist/extraction/html-processor.d.ts.map +1 -0
  12. package/dist/extraction/html-processor.js +192 -0
  13. package/dist/extraction/html-processor.js.map +1 -0
  14. package/dist/extraction/index.d.ts +14 -0
  15. package/dist/extraction/index.d.ts.map +1 -0
  16. package/dist/extraction/index.js +19 -0
  17. package/dist/extraction/index.js.map +1 -0
  18. package/dist/extraction/list-extractor.d.ts +24 -0
  19. package/dist/extraction/list-extractor.d.ts.map +1 -0
  20. package/dist/extraction/list-extractor.js +303 -0
  21. package/dist/extraction/list-extractor.js.map +1 -0
  22. package/dist/extraction/meta-extractor.d.ts +40 -0
  23. package/dist/extraction/meta-extractor.d.ts.map +1 -0
  24. package/dist/extraction/meta-extractor.js +216 -0
  25. package/dist/extraction/meta-extractor.js.map +1 -0
  26. package/dist/extraction/pagination.d.ts +29 -0
  27. package/dist/extraction/pagination.d.ts.map +1 -0
  28. package/dist/extraction/pagination.js +323 -0
  29. package/dist/extraction/pagination.js.map +1 -0
  30. package/dist/extraction/pattern-detector.d.ts +16 -0
  31. package/dist/extraction/pattern-detector.d.ts.map +1 -0
  32. package/dist/extraction/pattern-detector.js +390 -0
  33. package/dist/extraction/pattern-detector.js.map +1 -0
  34. package/dist/extraction/scrape-session.d.ts +23 -0
  35. package/dist/extraction/scrape-session.d.ts.map +1 -0
  36. package/dist/extraction/scrape-session.js +192 -0
  37. package/dist/extraction/scrape-session.js.map +1 -0
  38. package/dist/extraction/selector-engine.d.ts +23 -0
  39. package/dist/extraction/selector-engine.d.ts.map +1 -0
  40. package/dist/extraction/selector-engine.js +127 -0
  41. package/dist/extraction/selector-engine.js.map +1 -0
  42. package/dist/extraction/table-extractor.d.ts +29 -0
  43. package/dist/extraction/table-extractor.d.ts.map +1 -0
  44. package/dist/extraction/table-extractor.js +282 -0
  45. package/dist/extraction/table-extractor.js.map +1 -0
  46. package/dist/extraction/transforms.d.ts +47 -0
  47. package/dist/extraction/transforms.d.ts.map +1 -0
  48. package/dist/extraction/transforms.js +277 -0
  49. package/dist/extraction/transforms.js.map +1 -0
  50. package/dist/extraction/types.d.ts +199 -0
  51. package/dist/extraction/types.d.ts.map +1 -0
  52. package/dist/extraction/types.js +5 -0
  53. package/dist/extraction/types.js.map +1 -0
  54. package/dist/index.d.ts +1 -0
  55. package/dist/index.d.ts.map +1 -1
  56. package/dist/index.js +2 -0
  57. package/dist/index.js.map +1 -1
  58. package/dist/playwright/browser-type.d.ts +101 -0
  59. package/dist/playwright/browser-type.d.ts.map +1 -0
  60. package/dist/playwright/browser-type.js +134 -0
  61. package/dist/playwright/browser-type.js.map +1 -0
  62. package/dist/playwright/browser.d.ts +98 -0
  63. package/dist/playwright/browser.d.ts.map +1 -0
  64. package/dist/playwright/browser.js +229 -0
  65. package/dist/playwright/browser.js.map +1 -0
  66. package/dist/playwright/context.d.ts +217 -0
  67. package/dist/playwright/context.d.ts.map +1 -0
  68. package/dist/playwright/context.js +518 -0
  69. package/dist/playwright/context.js.map +1 -0
  70. package/dist/playwright/extractor.d.ts +108 -0
  71. package/dist/playwright/extractor.d.ts.map +1 -0
  72. package/dist/playwright/extractor.js +404 -0
  73. package/dist/playwright/extractor.js.map +1 -0
  74. package/dist/playwright/frame.d.ts +147 -0
  75. package/dist/playwright/frame.d.ts.map +1 -0
  76. package/dist/playwright/frame.js +492 -0
  77. package/dist/playwright/frame.js.map +1 -0
  78. package/dist/playwright/index.d.ts +163 -0
  79. package/dist/playwright/index.d.ts.map +1 -0
  80. package/dist/playwright/index.js +313 -0
  81. package/dist/playwright/index.js.map +1 -0
  82. package/dist/playwright/keyboard.d.ts +74 -0
  83. package/dist/playwright/keyboard.d.ts.map +1 -0
  84. package/dist/playwright/keyboard.js +187 -0
  85. package/dist/playwright/keyboard.js.map +1 -0
  86. package/dist/playwright/locator.d.ts +237 -0
  87. package/dist/playwright/locator.d.ts.map +1 -0
  88. package/dist/playwright/locator.js +667 -0
  89. package/dist/playwright/locator.js.map +1 -0
  90. package/dist/playwright/mouse.d.ts +82 -0
  91. package/dist/playwright/mouse.d.ts.map +1 -0
  92. package/dist/playwright/mouse.js +137 -0
  93. package/dist/playwright/mouse.js.map +1 -0
  94. package/dist/playwright/page-helpers.d.ts +267 -0
  95. package/dist/playwright/page-helpers.d.ts.map +1 -0
  96. package/dist/playwright/page-helpers.js +449 -0
  97. package/dist/playwright/page-helpers.js.map +1 -0
  98. package/dist/playwright/page.d.ts +605 -0
  99. package/dist/playwright/page.d.ts.map +1 -0
  100. package/dist/playwright/page.js +1698 -0
  101. package/dist/playwright/page.js.map +1 -0
  102. package/dist/playwright/response.d.ts +100 -0
  103. package/dist/playwright/response.d.ts.map +1 -0
  104. package/dist/playwright/response.js +194 -0
  105. package/dist/playwright/response.js.map +1 -0
  106. package/dist/playwright/types.d.ts +354 -0
  107. package/dist/playwright/types.d.ts.map +1 -0
  108. package/dist/playwright/types.js +8 -0
  109. package/dist/playwright/types.js.map +1 -0
  110. package/openapi.json +327 -35
  111. package/package.json +10 -1
package/README.md CHANGED
@@ -249,6 +249,89 @@ const expectation = {
249
249
  }
250
250
  ```
251
251
 
252
+ ## Playwright-Compatible API
253
+
254
+ Drop-in Playwright API that translates Playwright calls to Owl Browser tools. Use your existing Playwright code with Owl Browser's antidetect capabilities.
255
+
256
+ ```typescript
257
+ import { chromium, devices } from '@olib-ai/owl-browser-sdk/playwright';
258
+
259
+ const browser = await chromium.connect('http://localhost:8080', { token: 'your-token' });
260
+ const context = await browser.newContext(devices['iPhone 15 Pro']);
261
+ const page = await context.newPage();
262
+
263
+ await page.goto('https://example.com');
264
+ await page.click('button#submit');
265
+ await page.fill('#search', 'query');
266
+
267
+ const text = await page.textContent('h1');
268
+ const screenshot = await page.screenshot({ path: 'page.png' });
269
+
270
+ // Locators
271
+ const button = page.locator('button.primary');
272
+ await button.click();
273
+
274
+ // Playwright-style selectors
275
+ const login = page.getByRole('button', { name: 'Log in' });
276
+ const input = page.getByPlaceholder('Enter email');
277
+ const heading = page.getByText('Welcome');
278
+
279
+ await context.close();
280
+ await browser.close();
281
+ ```
282
+
283
+ **Supported features:** Page navigation, click/fill/type/press, locators (CSS, text, role, test-id, xpath), frames, keyboard & mouse input, screenshots, network interception (`route`/`unroute`), dialogs, downloads, viewport emulation, and 20+ device descriptors (iPhone, Pixel, Galaxy, iPad, Desktop).
284
+
285
+ ## Data Extraction
286
+
287
+ Universal structured data extraction from any website — CSS selectors, auto-detection, tables, metadata, and multi-page scraping with pagination. No AI dependencies, works deterministically with cheerio.
288
+
289
+ ```typescript
290
+ import { OwlBrowser } from '@olib-ai/owl-browser-sdk';
291
+ import { Extractor } from '@olib-ai/owl-browser-sdk/extraction';
292
+
293
+ const browser = new OwlBrowser({ url: '...', token: '...' });
294
+ await browser.connect();
295
+ const ctx = await browser.createContext();
296
+
297
+ const ex = new Extractor(browser, ctx.context_id);
298
+ await ex.goto('https://example.com/products');
299
+
300
+ // CSS selector extraction
301
+ const products = await ex.select('.product-card', {
302
+ name: 'h3',
303
+ price: '.price',
304
+ image: 'img@src',
305
+ link: 'a@href',
306
+ });
307
+
308
+ // Auto-detect repeating patterns (zero-config)
309
+ const patterns = await ex.detect();
310
+
311
+ // Multi-page scraping with automatic pagination
312
+ const result = await ex.scrape('.product-card', {
313
+ fields: { name: 'h3', price: '.price', sku: '@data-sku' },
314
+ maxPages: 10,
315
+ deduplicateBy: 'sku',
316
+ });
317
+ console.log(`${result.totalItems} items from ${result.pagesScraped} pages`);
318
+ ```
319
+
320
+ **Capabilities:**
321
+
322
+ | Method | Description |
323
+ |--------|-------------|
324
+ | `select()` / `selectFirst()` | Extract with CSS selectors and field specs (`"selector"`, `"selector@attr"`, object specs with transforms) |
325
+ | `table()` / `grid()` / `definitionList()` | Parse `<table>`, CSS grid/flexbox, and `<dl>` structures |
326
+ | `meta()` / `jsonLd()` | Extract OpenGraph, Twitter Card, JSON-LD, microdata, feeds |
327
+ | `detect()` / `detectAndExtract()` | Auto-discover repeating DOM patterns |
328
+ | `lists()` | Extract list/card containers with auto-field inference |
329
+ | `scrape()` | Multi-page with pagination detection (click-next, URL patterns, buttons, load-more, infinite scroll) |
330
+ | `clean()` | Remove cookie banners, modals, fixed elements, ads |
331
+ | `html()` / `markdown()` / `text()` | Raw content with cleaning levels |
332
+
333
+ All extraction functions are also available as standalone pure functions for use without a browser connection.
334
+
252
335
  ## Error Handling
253
336
 
254
337
  ```typescript
@@ -334,6 +417,30 @@ for (const [name, tool] of loader.tools) {
334
417
  - `static loadFlow(path): Flow` - Load flow from JSON file
335
418
  - `static parseFlow(data): Flow` - Parse flow from object
336
419
 
420
+ ### Extractor
421
+
422
+ - `goto(url, options?): Promise<void>` - Navigate to URL
423
+ - `select(selector, fields): Promise<Record[]>` - Extract from all matches
424
+ - `selectFirst(selector, fields): Promise<Record | null>` - Extract first match
425
+ - `count(selector): Promise<number>` - Count matching elements
426
+ - `table(selector?, options?): Promise<Record[]>` - Parse HTML tables
427
+ - `grid(container, item?): Promise<Record[]>` - Parse CSS grids
428
+ - `definitionList(selector?): Promise<Record>` - Parse `<dl>` lists
429
+ - `detectTables(): Promise<TableInfo[]>` - Auto-detect tables
430
+ - `meta(): Promise<MetaData>` - Extract page metadata
431
+ - `jsonLd(): Promise<object[]>` - Extract JSON-LD
432
+ - `detect(options?): Promise<DetectedPattern[]>` - Detect repeating patterns
433
+ - `detectAndExtract(options?): Promise<Record[]>` - Detect + extract
434
+ - `lists(selector, options?): Promise<Record[]>` - Extract lists/cards
435
+ - `scrape(selector, options?): Promise<ExtractionResult>` - Multi-page scrape
436
+ - `abortScrape(): void` - Abort running scrape
437
+ - `clean(options?): Promise<CleanResult>` - Remove obstructions
438
+ - `html(options?): Promise<string>` - Get page HTML
439
+ - `markdown(): Promise<string>` - Get page markdown
440
+ - `text(selector?, regex?): Promise<string>` - Get filtered text
441
+ - `detectSite(): Promise<string>` - Detect site type
442
+ - `siteData(template?): Promise<unknown>` - Site-specific extraction
443
+
337
444
  ## Requirements
338
445
 
339
446
  - Node.js 18+
@@ -0,0 +1,40 @@
1
+ /**
2
+ * Remove obstructions from live pages via browser_evaluate JS injection.
3
+ *
4
+ * Handles cookie banners, modals, fixed elements, ads, and lazy loading.
+ * Newsletter popups and paywall overlays are covered by dedicated helpers
+ * as well. Every helper takes the HTMLProcessor whose page should be cleaned.
5
+ */
6
+ import type { CleanOptions, CleanResult } from './types.js';
7
+ import type { HTMLProcessor } from './html-processor.js';
8
+ /**
9
+ * Remove all obstructions from the page.
+ *
+ * Cookie banners, modals and fixed elements are handled unless switched off
+ * in `options`; ads and lazy-load triggering only run when switched on.
+ * Newsletter popups and paywall overlays are always processed.
+ *
+ * @param proc - Processor for the page to clean.
+ * @param options - Optional overrides for the defaults described above.
+ * @returns The summed count reported by the steps (`removedCount`) and the
+ * labels of the steps that reported at least one hit (`removedTypes`).
10
+ */
11
+ export declare function removeObstructions(proc: HTMLProcessor, options?: CleanOptions): Promise<CleanResult>;
12
+ /**
13
+ * Try to dismiss cookie consent banners by clicking accept buttons.
+ *
+ * Known banner containers are also removed from the page afterwards.
+ *
+ * @returns Number of clicks plus removed containers reported by the injected
+ * script; 0 when the script does not return a number.
14
+ */
15
+ export declare function dismissCookieBanners(proc: HTMLProcessor): Promise<number>;
16
+ /**
17
+ * Close modal dialogs and overlays.
+ *
+ * @returns Number of elements reported as removed; 0 when the injected script
+ * does not return a number.
18
+ */
19
+ export declare function closeModals(proc: HTMLProcessor): Promise<number>;
20
+ /**
21
+ * Remove position:fixed/sticky elements with high z-index.
+ *
+ * An element qualifies when its z-index is at least 100 and it is wider than
+ * 200 or taller than 50 (bounding-rect units).
+ *
+ * @returns Number of elements removed; 0 when the injected script does not
+ * return a number.
22
+ */
23
+ export declare function removeFixedElements(proc: HTMLProcessor): Promise<number>;
24
+ /**
25
+ * Dismiss newsletter signup popups and overlays.
+ *
+ * Matching popups are removed and close buttons inside newsletter/subscribe
+ * containers are clicked.
+ *
+ * @returns Number of removals plus clicks; 0 when the injected script does
+ * not return a number.
26
+ */
27
+ export declare function dismissNewsletterPopups(proc: HTMLProcessor): Promise<number>;
28
+ /**
29
+ * Remove paywall overlays and restore scrolling.
+ *
+ * @returns Number of overlays removed, plus one when a hidden body overflow
+ * was reset; 0 when the injected script does not return a number.
30
+ */
31
+ export declare function removePaywallOverlays(proc: HTMLProcessor): Promise<number>;
32
+ /**
33
+ * Remove common ad network elements.
+ *
+ * @returns Number of elements removed; 0 when the injected script does not
+ * return a number.
34
+ */
35
+ export declare function removeAds(proc: HTMLProcessor): Promise<number>;
36
+ /**
37
+ * Trigger lazy-loaded content by scrolling the page.
+ *
+ * Scrolls down by 800 per step, waits after each step, then scrolls back to
+ * the top of the page.
+ *
+ * @param scrolls - Number of scroll steps (implementation default: 3).
+ * @param wait - Value handed to `proc.wait` after each step (implementation
+ * default: 500; presumably milliseconds - confirm against HTMLProcessor).
38
+ */
39
+ export declare function triggerLazyLoad(proc: HTMLProcessor, scrolls?: number, wait?: number): Promise<void>;
40
+ //# sourceMappingURL=content-cleaner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content-cleaner.d.ts","sourceRoot":"","sources":["../../src/extraction/content-cleaner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAC5D,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AA2CzD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,IAAI,EAAE,aAAa,EACnB,OAAO,CAAC,EAAE,YAAY,GACrB,OAAO,CAAC,WAAW,CAAC,CAqEtB;AAED;;GAEG;AACH,wBAAsB,oBAAoB,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAkE/E;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAuBtE;AAED;;GAEG;AACH,wBAAsB,mBAAmB,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAgE9E;AAED;;GAEG;AACH,wBAAsB,uBAAuB,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAsClF;AAED;;GAEG;AACH,wBAAsB,qBAAqB,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAmChF;AAED;;GAEG;AACH,wBAAsB,SAAS,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAuBpE;AAED;;GAEG;AACH,wBAAsB,eAAe,CACnC,IAAI,EAAE,aAAa,EACnB,OAAO,GAAE,MAAU,EACnB,IAAI,GAAE,MAAY,GACjB,OAAO,CAAC,IAAI,CAAC,CAOf"}
@@ -0,0 +1,393 @@
1
+ /**
2
+ * Remove obstructions from live pages via browser_evaluate JS injection.
3
+ *
4
+ * Handles cookie banners, modals, fixed elements, ads, and lazy loading.
5
+ */
// CSS selectors for "accept"-style consent buttons. The list is serialized
// into the script built by dismissCookieBanners(), so every entry must be a
// plain selector string.
const COOKIE_SELECTORS = [
  // Accept/agree/allow buttons scoped to cookie or consent containers
  '[class*="cookie"] button[class*="accept"]',
  '[class*="cookie"] button[class*="agree"]',
  '[class*="cookie"] button[class*="allow"]',
  '[class*="consent"] button[class*="accept"]',
  '[class*="consent"] button[class*="agree"]',
  // Any button inside an element whose id mentions cookie/consent
  '[id*="cookie"] button',
  '[id*="consent"] button',

  // Well-known consent widgets and conventional ids/classes
  '#onetrust-accept-btn-handler',
  '#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll',
  '.cc-accept',
  '.cc-btn.cc-dismiss',
  '#accept-cookies',
  '#acceptAllCookies',
  '.cookie-accept',
  '.cookie-consent-accept',
  '[data-action="accept-cookies"]',
  '[data-testid="cookie-accept"]',

  // GDPR-specific banners
  '.gdpr-accept',
  '#gdpr-accept',
  '[class*="gdpr"] button[class*="accept"]',

  // Unscoped fallbacks: any button whose class mentions accept/agree/allow
  'button[class*="accept"]',
  'button[class*="agree"]',
  'button[class*="allow"]',
];
// CSS selectors for modal dialogs and overlay layers. The list is serialized
// into the script built by closeModals().
const MODAL_SELECTORS = [
  // Native dialogs that are currently open
  'dialog[open]',
  // Class-name conventions for modal/popup overlays and backdrops
  '[class*="modal"][class*="overlay"]',
  '[class*="popup"][class*="overlay"]',
  '[class*="modal-backdrop"]',
  // Overlays that set a z-index through an inline style attribute
  '[class*="overlay"][style*="z-index"]',
  // ARIA dialogs
  '[role="dialog"]',
];
/**
 * Run the page-cleaning steps in a fixed order and summarize what they did.
 *
 * Defaults: cookie banners, modals and fixed elements are cleaned; ads and
 * lazy-load triggering are off. Newsletter popups and paywall overlays are
 * processed on every call - no option in this function switches them off.
 *
 * @param proc - Processor handed unchanged to every step.
 * @param [options] - Overrides merged over the defaults listed above.
 * @returns `{ removedCount, removedTypes }`: the sum of the step counts and
 *   the labels of the steps that reported at least one hit, in execution
 *   order (plus `'lazy-load-triggered'` whenever lazy loading ran).
 */
export async function removeObstructions(proc, options) {
  const settings = Object.assign(
    {
      cookieBanners: true,
      modals: true,
      fixedElements: true,
      ads: false,
      lazyLoad: false,
    },
    options,
  );

  // [enabled, label, step] - executed strictly one after another, top to bottom.
  const pipeline = [
    [settings.cookieBanners, 'cookie-banners', dismissCookieBanners],
    [settings.modals, 'modals', closeModals],
    [true, 'newsletter-popups', dismissNewsletterPopups],
    [true, 'paywall-overlays', removePaywallOverlays],
    [settings.fixedElements, 'fixed-elements', removeFixedElements],
    [settings.ads, 'ads', removeAds],
  ];

  const removedTypes = [];
  let removedCount = 0;
  for (const [enabled, label, step] of pipeline) {
    if (!enabled) {
      continue;
    }
    const hits = await step(proc);
    if (hits > 0) {
      removedTypes.push(label);
      removedCount += hits;
    }
  }

  if (settings.lazyLoad) {
    await triggerLazyLoad(proc, settings.lazyLoadScrolls, settings.lazyLoadWait);
    // Lazy loading removes nothing; it is only recorded as having run.
    removedTypes.push('lazy-load-triggered');
  }

  return { removedCount, removedTypes };
}
/**
 * Try to dismiss cookie consent banners by clicking accept buttons.
 *
 * The injected script works in three passes:
 *  1. click visible elements matching COOKIE_SELECTORS,
 *  2. click visible buttons inside cookie/consent/gdpr/privacy containers
 *     whose label starts a word with a known "accept" phrase,
 *  3. remove well-known banner containers from the DOM.
 *
 * Safeguards applied by the script:
 *  - every element is clicked (and counted) at most once, even when several
 *    selectors or both click passes match it;
 *  - phrases must begin a word, so "ok" no longer matches "cookie" and
 *    "agree" no longer matches "disagree";
 *  - buttons whose label reads like a refusal (reject, decline, ...) are
 *    never clicked, which matters for broad selectors such as
 *    '[id*="consent"] button' and class matches like "btn-disagree";
 *  - <html> and <body> are never removed, even if they carry a
 *    "cookie-banner-..." style class.
 *
 * @param proc - Object exposing `evaluate(js)`, used to run the script.
 * @returns Number of clicks plus removed containers, or 0 when the script
 *   does not yield a number.
 */
export async function dismissCookieBanners(proc) {
  const js = `
    (function() {
      var selectors = ${JSON.stringify(COOKIE_SELECTORS)};
      var removed = 0;
      var clicked = new Set();
      var declineTexts = [
        'reject', 'decline', 'deny', 'refuse', 'disagree', 'disallow',
        'ablehnen', 'refuser', 'rechazar', 'rifiuta', 'weigeren',
      ];
      // A character counts as a letter when it has distinct upper/lower forms.
      function isLetter(ch) {
        return ch !== '' && ch.toLowerCase() !== ch.toUpperCase();
      }
      // True when needle occurs in text at the beginning of a word.
      function startsWord(text, needle) {
        var at = text.indexOf(needle);
        while (at !== -1) {
          if (at === 0 || !isLetter(text.charAt(at - 1))) return true;
          at = text.indexOf(needle, at + 1);
        }
        return false;
      }
      function matchesAny(text, needles) {
        for (var n = 0; n < needles.length; n++) {
          if (startsWord(text, needles[n])) return true;
        }
        return false;
      }
      function labelOf(el) {
        return (el.textContent || '').trim().toLowerCase();
      }
      // Click el unless it was already clicked or reads like a refusal.
      function clickOnce(el, label) {
        if (clicked.has(el) || matchesAny(label, declineTexts)) return;
        clicked.add(el);
        el.click();
        removed++;
      }
      for (var i = 0; i < selectors.length; i++) {
        try {
          var els = document.querySelectorAll(selectors[i]);
          for (var j = 0; j < els.length; j++) {
            var el = els[j];
            if (el.offsetParent !== null) {
              clickOnce(el, labelOf(el));
            }
          }
        } catch(e) {}
      }
      // i18n text-based cookie accept detection
      var acceptTexts = [
        'accept', 'agree', 'allow', 'ok', 'got it', 'i understand',
        'akzeptieren', 'zustimmen', 'einverstanden',
        'accepter', 'accepte',
        'aceptar', 'acepto',
        'accetta', 'accetto',
        'accepteren',
        'godk\\u00e4nn', 'acceptera',
        '\\u043f\\u0440\\u0438\\u043d\\u044f\\u0442\\u044c', '\\u0441\\u043e\\u0433\\u043b\\u0430\\u0441\\u0435\\u043d',
      ];
      var cookieContainers = document.querySelectorAll(
        '[class*="cookie"], [class*="consent"], [class*="gdpr"], [class*="privacy"], [id*="cookie"], [id*="consent"]'
      );
      for (var c = 0; c < cookieContainers.length; c++) {
        var buttons = cookieContainers[c].querySelectorAll('button, a[role="button"], [class*="btn"]');
        for (var b = 0; b < buttons.length; b++) {
          var btn = buttons[b];
          if (btn.offsetParent === null) continue;
          var text = labelOf(btn);
          if (matchesAny(text, acceptTexts)) {
            clickOnce(btn, text);
          }
        }
      }
      // Remove common banner containers
      var bannerSels = [
        '[class*="cookie-banner"]', '[class*="cookie-notice"]',
        '[class*="cookie-bar"]', '[id*="cookie-banner"]',
        '[class*="consent-banner"]', '[class*="consent-bar"]',
      ];
      for (var s = 0; s < bannerSels.length; s++) {
        try {
          var banners = document.querySelectorAll(bannerSels[s]);
          for (var k = 0; k < banners.length; k++) {
            var banner = banners[k];
            // Never delete the page itself when it carries a banner state class.
            if (banner === document.body || banner === document.documentElement) continue;
            banner.remove();
            removed++;
          }
        } catch(e) {}
      }
      return removed;
    })()
  `;
  const result = await proc.evaluate(js);
  return typeof result === 'number' ? result : 0;
}
/**
 * Close modal dialogs and overlays.
 *
 * Every element matching MODAL_SELECTORS is removed from the DOM. Open
 * <dialog> elements are asked to close() first and are then removed and
 * counted once. (The previous separate "close remaining dialogs" pass could
 * never find anything, because 'dialog[open]' matches had already been
 * removed.) <html> and <body> are never removed, and a selector that throws
 * is skipped instead of aborting the whole script.
 *
 * @param proc - Object exposing `evaluate(js)`, used to run the script.
 * @returns Number of elements removed, or 0 when the script does not yield
 *   a number.
 */
export async function closeModals(proc) {
  const js = `
    (function() {
      var selectors = ${JSON.stringify(MODAL_SELECTORS)};
      var removed = 0;
      for (var i = 0; i < selectors.length; i++) {
        try {
          var els = document.querySelectorAll(selectors[i]);
          for (var j = 0; j < els.length; j++) {
            var el = els[j];
            // Pages often flag <body> with classes like "modal-open"; keep it.
            if (el === document.body || el === document.documentElement) continue;
            // Let an open <dialog> leave its open state before detaching it.
            if (el.open && typeof el.close === 'function') {
              try { el.close(); } catch(e) {}
            }
            el.remove();
            removed++;
          }
        } catch(e) {}
      }
      return removed;
    })()
  `;
  const result = await proc.evaluate(js);
  return typeof result === 'number' ? result : 0;
}
/**
 * Remove position:fixed/sticky elements with high z-index.
 *
 * Candidates are gathered in two passes and each element is inspected once:
 *  1. elements whose class name suggests a pinned layer (sticky, header, ...),
 *  2. the children of <body>, then up to 500 matches of
 *     'body > *, body > * > *'.
 *
 * A candidate is removed when its computed position is fixed or sticky, its
 * z-index is at least 100, and its bounding rect is wider than 200 or taller
 * than 50. <html> and <body> are never removed. A failure while inspecting
 * one element only skips that element.
 *
 * @param proc - Object exposing `evaluate(js)`, used to run the script.
 * @returns Number of elements removed, or 0 when the script does not yield
 *   a number.
 */
export async function removeFixedElements(proc) {
  const js = `
    (function() {
      var removed = 0;
      var seen = new Set();
      // Inspect el once and remove it when it is a large pinned layer.
      function inspect(el) {
        if (seen.has(el)) return;
        seen.add(el);
        if (el === document.body || el === document.documentElement) return;
        try {
          var style = window.getComputedStyle(el);
          var pos = style.position;
          if (pos !== 'fixed' && pos !== 'sticky') return;
          // "auto" and other non-numeric values count as 0.
          var zIndex = parseInt(style.zIndex, 10) || 0;
          var rect = el.getBoundingClientRect();
          if ((rect.width > 200 || rect.height > 50) && zIndex >= 100) {
            el.remove();
            removed++;
          }
        } catch(e) {}
      }
      // Strategy 1: Check elements with known fixed/sticky class patterns first
      var knownPatterns = [
        '[class*="sticky"]', '[class*="fixed"]', '[class*="toolbar"]',
        '[class*="header"]', '[class*="navbar"]', '[class*="topbar"]',
        '[class*="bottombar"]', '[class*="footer"]', '[class*="dock"]',
        '[class*="float"]', '[class*="overlay"]',
      ];
      for (var p = 0; p < knownPatterns.length; p++) {
        try {
          var matches = document.querySelectorAll(knownPatterns[p]);
          for (var i = 0; i < matches.length; i++) {
            inspect(matches[i]);
          }
        } catch(e) {}
      }
      // Strategy 2: Limited scan - top-level children, then up to 500 shallow elements
      var topLevel = Array.from(document.body ? document.body.children : []);
      var shallow = document.querySelectorAll('body > *, body > * > *');
      var limit = Math.min(shallow.length, 500);
      for (var t = 0; t < topLevel.length; t++) {
        inspect(topLevel[t]);
      }
      for (var s = 0; s < limit; s++) {
        inspect(shallow[s]);
      }
      return removed;
    })()
  `;
  const result = await proc.evaluate(js);
  return typeof result === 'number' ? result : 0;
}
/**
 * Dismiss newsletter signup popups and overlays.
 *
 * The injected script first removes elements matching newsletter/subscribe/
 * signup popup selectors, then clicks the first close-like control found
 * inside each remaining newsletter/subscribe container.
 *
 * <html> and <body> are skipped in both passes: pages commonly tag them with
 * state classes such as "newsletter-popup modal-open", and treating them as a
 * popup would delete the page or click an unrelated close button somewhere
 * in the document. A lookup that throws only skips that selector/container.
 *
 * @param proc - Object exposing `evaluate(js)`, used to run the script.
 * @returns Number of removed elements plus clicked close controls, or 0 when
 *   the script does not yield a number.
 */
export async function dismissNewsletterPopups(proc) {
  const js = `
    (function() {
      var removed = 0;
      function isPageRoot(el) {
        return el === document.body || el === document.documentElement;
      }
      var selectors = [
        '[class*="newsletter"][class*="modal"]',
        '[class*="newsletter"][class*="popup"]',
        '[class*="newsletter"][class*="overlay"]',
        '[class*="subscribe"][class*="modal"]',
        '[class*="subscribe"][class*="popup"]',
        '[class*="signup"][class*="modal"]',
        '[class*="signup"][class*="popup"]',
        '[class*="email-capture"]',
        '[class*="newsletter-signup"]',
      ];
      for (var i = 0; i < selectors.length; i++) {
        try {
          var els = document.querySelectorAll(selectors[i]);
          for (var j = 0; j < els.length; j++) {
            if (isPageRoot(els[j])) continue;
            els[j].remove();
            removed++;
          }
        } catch(e) {}
      }
      // Also try to find and click close buttons within newsletter popups
      var containers = [];
      try {
        containers = document.querySelectorAll('[class*="newsletter"], [class*="subscribe"]');
      } catch(e) {}
      for (var c = 0; c < containers.length; c++) {
        var box = containers[c];
        if (isPageRoot(box)) continue;
        try {
          var close = box.querySelector('[class*="close"], [aria-label*="close" i], button[class*="dismiss"]');
          if (close) {
            close.click();
            removed++;
          }
        } catch(e) {}
      }
      return removed;
    })()
  `;
  const result = await proc.evaluate(js);
  return typeof result === 'number' ? result : 0;
}
/**
 * Remove paywall overlays and restore scrolling.
 *
 * The injected script removes elements matching paywall/regwall/gate/metered
 * selectors, then unlocks scrolling on <html> and <body> when their computed
 * overflow (or overflow-y) is "hidden".
 *
 * Unlocking first clears the inline overflow styles. If the element is still
 * locked afterwards - the lock comes from a stylesheet rule - overflow is
 * forced to "visible" with !important. <html> and <body> themselves are
 * never removed, even when they carry a class such as "has-paywall".
 *
 * @param proc - Object exposing `evaluate(js)`, used to run the script.
 * @returns Number of overlays removed plus one per unlocked root element, or
 *   0 when the script does not yield a number.
 */
export async function removePaywallOverlays(proc) {
  const js = `
    (function() {
      var removed = 0;
      var root = document.documentElement;
      var body = document.body;
      function isScrollLocked(el) {
        var computed = window.getComputedStyle(el);
        return computed.overflow === 'hidden' || computed.overflowY === 'hidden';
      }
      var selectors = [
        '[class*="paywall"]',
        '[class*="regwall"]',
        '[class*="gate"][class*="overlay"]',
        '[class*="gate"][class*="modal"]',
        '[id*="paywall"]',
        '[data-testid*="paywall"]',
        '[class*="metered"]',
      ];
      for (var i = 0; i < selectors.length; i++) {
        try {
          var els = document.querySelectorAll(selectors[i]);
          for (var j = 0; j < els.length; j++) {
            // Keep the page itself; only overlays inside it are removed.
            if (els[j] === body || els[j] === root) continue;
            els[j].remove();
            removed++;
          }
        } catch(e) {}
      }
      // Undo the overflow:hidden scroll lock that paywalls often set
      var scrollRoots = [root, body];
      for (var k = 0; k < scrollRoots.length; k++) {
        var target = scrollRoots[k];
        if (!target) continue;
        try {
          if (!isScrollLocked(target)) continue;
          target.style.overflow = '';
          target.style.overflowY = '';
          if (isScrollLocked(target)) {
            // Lock comes from a stylesheet; override it inline.
            target.style.setProperty('overflow', 'visible', 'important');
          }
          removed++;
        } catch(e) {}
      }
      return removed;
    })()
  `;
  const result = await proc.evaluate(js);
  return typeof result === 'number' ? result : 0;
}
/**
 * Remove common ad network elements.
 *
 * The injected script removes every element matching a list of ad-container,
 * ad-slot and ad-iframe selectors. <html> and <body> are never removed (a
 * class such as "category-sponsored" on <body> would otherwise match
 * '[class*="sponsored"]'), and a selector that throws is skipped so the
 * remaining selectors still run.
 *
 * @param proc - Object exposing `evaluate(js)`, used to run the script.
 * @returns Number of elements removed, or 0 when the script does not yield
 *   a number.
 */
export async function removeAds(proc) {
  const js = `
    (function() {
      var selectors = [
        '[class*="ad-container"]', '[class*="ad-wrapper"]',
        '[class*="advertisement"]', '[id*="google_ads"]',
        '[id*="ad-slot"]', 'ins.adsbygoogle',
        '[data-ad]', '[class*="sponsored"]',
        'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
      ];
      var removed = 0;
      for (var i = 0; i < selectors.length; i++) {
        try {
          var els = document.querySelectorAll(selectors[i]);
          for (var j = 0; j < els.length; j++) {
            // Keep the page itself; only ad elements inside it are removed.
            if (els[j] === document.body || els[j] === document.documentElement) continue;
            els[j].remove();
            removed++;
          }
        } catch(e) {}
      }
      return removed;
    })()
  `;
  const result = await proc.evaluate(js);
  return typeof result === 'number' ? result : 0;
}
/**
 * Scroll the page step by step so lazy-loaded content gets a chance to load,
 * then jump back to the top.
 *
 * @param proc - Object exposing `scrollBy(x, y)`, `wait(value)` and
 *   `evaluate(js)`.
 * @param [scrolls=3] - How many times to scroll down by 800.
 * @param [wait=500] - Value passed to `proc.wait` after each scroll step.
 * @returns Resolves with no value once the page is back at the top.
 */
export async function triggerLazyLoad(proc, scrolls = 3, wait = 500) {
  let completed = 0;
  while (completed < scrolls) {
    await proc.scrollBy(0, 800);
    await proc.wait(wait);
    completed += 1;
  }
  // Return to the top so later steps start from the initial scroll position.
  await proc.evaluate('window.scrollTo(0, 0)');
}
393
+ //# sourceMappingURL=content-cleaner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content-cleaner.js","sourceRoot":"","sources":["../../src/extraction/content-cleaner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,yCAAyC;AACzC,MAAM,gBAAgB,GAAG;IACvB,0BAA0B;IAC1B,2CAA2C;IAC3C,0CAA0C;IAC1C,0CAA0C;IAC1C,4CAA4C;IAC5C,2CAA2C;IAC3C,uBAAuB;IACvB,wBAAwB;IACxB,2BAA2B;IAC3B,8BAA8B;IAC9B,wDAAwD;IACxD,YAAY;IACZ,oBAAoB;IACpB,iBAAiB;IACjB,mBAAmB;IACnB,gBAAgB;IAChB,wBAAwB;IACxB,gCAAgC;IAChC,+BAA+B;IAC/B,eAAe;IACf,cAAc;IACd,cAAc;IACd,yCAAyC;IACzC,uBAAuB;IACvB,yBAAyB;IACzB,wBAAwB;IACxB,wBAAwB;CACzB,CAAC;AAEF,iCAAiC;AACjC,MAAM,eAAe,GAAG;IACtB,cAAc;IACd,oCAAoC;IACpC,oCAAoC;IACpC,2BAA2B;IAC3B,sCAAsC;IACtC,iBAAiB;CAClB,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,IAAmB,EACnB,OAAsB;IAEtB,MAAM,IAAI,GAAiB;QACzB,aAAa,EAAE,IAAI;QACnB,MAAM,EAAE,IAAI;QACZ,aAAa,EAAE,IAAI;QACnB,GAAG,EAAE,KAAK;QACV,QAAQ,EAAE,KAAK;QACf,GAAG,OAAO;KACX,CAAC;IAEF,MAAM,YAAY,GAAa,EAAE,CAAC;IAClC,IAAI,YAAY,GAAG,CAAC,CAAC;IAErB,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,MAAM,oBAAoB,CAAC,IAAI,CAAC,CAAC;QAC/C,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,YAAY,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;YACpC,YAAY,IAAI,KAAK,CAAC;QACxB,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QAChB,MAAM,KAAK,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,CAAC;QACtC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YAC5B,YAAY,IAAI,KAAK,CAAC;QACxB,CAAC;IACH,CAAC;IAED,2CAA2C;IAC3C,CAAC;QACC,MAAM,KAAK,GAAG,MAAM,uBAAuB,CAAC,IAAI,CAAC,CAAC;QAClD,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,YAAY,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;YACvC,YAAY,IAAI,KAAK,CAAC;QACxB,CAAC;IACH,CAAC;IAED,0CAA0C;IAC1C,CAAC;QACC,MAAM,KAAK,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,YAAY,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;YACtC,YAAY,IAAI,KAAK,CAAC;QACxB,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,MAAM,mBAAmB,CAAC,IAAI,CAAC,CAAC;QAC9C,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,YAAY,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;YACpC,YAAY,IAAI,KAAK,CAAC;QACxB,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,GAAG,EAAE,CAA
C;QACb,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,IAAI,CAAC,CAAC;QACpC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACzB,YAAY,IAAI,KAAK,CAAC;QACxB,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAClB,MAAM,eAAe,CAAC,IAAI,EAAE,IAAI,CAAC,eAAe,EAAE,IAAI,CAAC,YAAY,CAAC,CAAC;QACrE,YAAY,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,YAAY,EAAE,CAAC;AACtD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,IAAmB;IAC5D,MAAM,EAAE,GAAG;;wBAEW,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4DrD,CAAC;IACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvC,OAAO,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,IAAmB;IACnD,MAAM,EAAE,GAAG;;wBAEW,IAAI,CAAC,SAAS,CAAC,eAAe,CAAC;;;;;;;;;;;;;;;;;GAiBpD,CAAC;IACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvC,OAAO,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,IAAmB;IAC3D,MAAM,EAAE,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4DV,CAAC;IACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvC,OAAO,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAAC,IAAmB;IAC/D,MAAM,EAAE,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCV,CAAC;IACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvC,OAAO,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CAAC,IAAmB;IAC7D,MAAM,EAAE,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BV,CAAC;IACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IACvC,OAAO,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,IAAmB;IACjD,MAAM,EAAE,GAAG;;;;;;;;;;;;;;;;;;;GAmBV,CAAC;IACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC
;IACvC,OAAO,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,IAAmB,EACnB,UAAkB,CAAC,EACnB,OAAe,GAAG;IAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;QACjC,MAAM,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QAC5B,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACxB,CAAC;IACD,qBAAqB;IACrB,MAAM,IAAI,CAAC,QAAQ,CAAC,uBAAuB,CAAC,CAAC;AAC/C,CAAC"}