maxun-core 0.0.12 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +93 -45
- package/build/interpret.js +137 -72
- package/package.json +1 -1
|
@@ -174,7 +174,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
174
174
|
if (!config.selector.includes('>>') && !config.selector.includes(':>>')) {
|
|
175
175
|
return Array.from(document.querySelectorAll(config.selector));
|
|
176
176
|
}
|
|
177
|
-
// First handle iframe traversal if present
|
|
178
177
|
if (config.selector.includes(':>>')) {
|
|
179
178
|
const parts = config.selector.split(':>>').map(s => s.trim());
|
|
180
179
|
let currentElements = [document];
|
|
@@ -185,24 +184,42 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
185
184
|
const isLast = i === parts.length - 1;
|
|
186
185
|
for (const element of currentElements) {
|
|
187
186
|
try {
|
|
188
|
-
// For document or iframe document
|
|
189
187
|
const doc = element.contentDocument || element || ((_a = element.contentWindow) === null || _a === void 0 ? void 0 : _a.document);
|
|
190
188
|
if (!doc)
|
|
191
189
|
continue;
|
|
192
|
-
|
|
190
|
+
if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
|
|
191
|
+
const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
|
|
192
|
+
if (nameMatch && nameMatch[1]) {
|
|
193
|
+
const frameName = nameMatch[1];
|
|
194
|
+
let foundFrames = [];
|
|
195
|
+
if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
|
|
196
|
+
foundFrames = Array.from(doc.getElementsByName(frameName))
|
|
197
|
+
.filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
|
|
198
|
+
}
|
|
199
|
+
if (foundFrames.length === 0) {
|
|
200
|
+
const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
|
|
201
|
+
foundFrames = framesBySelector;
|
|
202
|
+
}
|
|
203
|
+
if (isLast) {
|
|
204
|
+
nextElements.push(...foundFrames);
|
|
205
|
+
}
|
|
206
|
+
else {
|
|
207
|
+
nextElements.push(...foundFrames);
|
|
208
|
+
}
|
|
209
|
+
continue;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
193
212
|
const found = Array.from(doc.querySelectorAll(part));
|
|
194
213
|
if (isLast) {
|
|
195
|
-
// If it's the last part, keep all matching elements
|
|
196
214
|
nextElements.push(...found);
|
|
197
215
|
}
|
|
198
216
|
else {
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
nextElements.push(...iframes);
|
|
217
|
+
const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
|
|
218
|
+
nextElements.push(...frames);
|
|
202
219
|
}
|
|
203
220
|
}
|
|
204
221
|
catch (error) {
|
|
205
|
-
console.warn('Cannot access iframe content:', error, {
|
|
222
|
+
console.warn('Cannot access iframe/frame content:', error, {
|
|
206
223
|
part,
|
|
207
224
|
element,
|
|
208
225
|
index: i
|
|
@@ -242,13 +259,19 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
242
259
|
}
|
|
243
260
|
return [];
|
|
244
261
|
}
|
|
245
|
-
// Modified to handle iframe context for URL resolution
|
|
246
262
|
function getElementValue(element, attribute) {
|
|
247
|
-
var _a, _b, _c, _d, _e;
|
|
263
|
+
var _a, _b, _c, _d, _e, _f;
|
|
248
264
|
if (!element)
|
|
249
265
|
return null;
|
|
250
|
-
|
|
251
|
-
|
|
266
|
+
let baseURL;
|
|
267
|
+
try {
|
|
268
|
+
baseURL = ((_b = (_a = element.ownerDocument) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) ||
|
|
269
|
+
((_c = element.ownerDocument) === null || _c === void 0 ? void 0 : _c.baseURI) ||
|
|
270
|
+
window.location.origin;
|
|
271
|
+
}
|
|
272
|
+
catch (e) {
|
|
273
|
+
baseURL = window.location.origin;
|
|
274
|
+
}
|
|
252
275
|
switch (attribute) {
|
|
253
276
|
case 'href': {
|
|
254
277
|
const relativeHref = element.getAttribute('href');
|
|
@@ -259,11 +282,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
259
282
|
return relativeSrc ? new URL(relativeSrc, baseURL).href : null;
|
|
260
283
|
}
|
|
261
284
|
case 'innerText':
|
|
262
|
-
return (
|
|
285
|
+
return (_d = element.innerText) === null || _d === void 0 ? void 0 : _d.trim();
|
|
263
286
|
case 'textContent':
|
|
264
|
-
return (
|
|
287
|
+
return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
|
|
288
|
+
case 'innerHTML':
|
|
289
|
+
return element.innerHTML;
|
|
290
|
+
case 'outerHTML':
|
|
291
|
+
return element.outerHTML;
|
|
265
292
|
default:
|
|
266
|
-
return element.getAttribute(attribute) || ((
|
|
293
|
+
return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
|
|
267
294
|
}
|
|
268
295
|
}
|
|
269
296
|
// Rest of the functions remain largely the same
|
|
@@ -332,7 +359,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
332
359
|
*/
|
|
333
360
|
window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
|
|
334
361
|
return __awaiter(this, void 0, void 0, function* () {
|
|
335
|
-
// Enhanced query function to handle
|
|
362
|
+
// Enhanced query function to handle iframe, frame and shadow DOM
|
|
336
363
|
const queryElement = (rootElement, selector) => {
|
|
337
364
|
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
|
338
365
|
return rootElement.querySelector(selector);
|
|
@@ -342,15 +369,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
342
369
|
for (let i = 0; i < parts.length; i++) {
|
|
343
370
|
if (!currentElement)
|
|
344
371
|
return null;
|
|
345
|
-
// Handle iframe traversal
|
|
346
|
-
if (currentElement.tagName === 'IFRAME') {
|
|
372
|
+
// Handle iframe and frame traversal
|
|
373
|
+
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
|
347
374
|
try {
|
|
348
|
-
const
|
|
349
|
-
currentElement =
|
|
375
|
+
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
|
376
|
+
currentElement = frameDoc.querySelector(parts[i]);
|
|
350
377
|
continue;
|
|
351
378
|
}
|
|
352
379
|
catch (e) {
|
|
353
|
-
console.warn(
|
|
380
|
+
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
|
|
354
381
|
return null;
|
|
355
382
|
}
|
|
356
383
|
}
|
|
@@ -385,14 +412,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
385
412
|
for (const part of parts) {
|
|
386
413
|
const nextElements = [];
|
|
387
414
|
for (const element of currentElements) {
|
|
388
|
-
// Handle iframe traversal
|
|
389
|
-
if (element.tagName === 'IFRAME') {
|
|
415
|
+
// Handle iframe and frame traversal
|
|
416
|
+
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
|
390
417
|
try {
|
|
391
|
-
const
|
|
392
|
-
nextElements.push(...
|
|
418
|
+
const frameDoc = element.contentDocument || element.contentWindow.document;
|
|
419
|
+
nextElements.push(...frameDoc.querySelectorAll(part));
|
|
393
420
|
}
|
|
394
421
|
catch (e) {
|
|
395
|
-
console.warn(
|
|
422
|
+
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
|
396
423
|
continue;
|
|
397
424
|
}
|
|
398
425
|
}
|
|
@@ -440,7 +467,25 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
440
467
|
}
|
|
441
468
|
else if (attribute === 'src' || attribute === 'href') {
|
|
442
469
|
const attrValue = element.getAttribute(attribute);
|
|
443
|
-
|
|
470
|
+
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
|
471
|
+
if (!dataAttr || dataAttr.trim() === '') {
|
|
472
|
+
if (attribute === 'src') {
|
|
473
|
+
const style = window.getComputedStyle(element);
|
|
474
|
+
const bgImage = style.backgroundImage;
|
|
475
|
+
if (bgImage && bgImage !== 'none') {
|
|
476
|
+
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
|
477
|
+
return matches ? new URL(matches[1], baseURL).href : null;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
return null;
|
|
481
|
+
}
|
|
482
|
+
try {
|
|
483
|
+
return new URL(dataAttr, baseURL).href;
|
|
484
|
+
}
|
|
485
|
+
catch (e) {
|
|
486
|
+
console.warn('Error creating URL from', dataAttr, e);
|
|
487
|
+
return dataAttr; // Return the original value if URL construction fails
|
|
488
|
+
}
|
|
444
489
|
}
|
|
445
490
|
return element.getAttribute(attribute);
|
|
446
491
|
}
|
|
@@ -461,8 +506,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
461
506
|
else if (currentElement.tagName === 'TR') {
|
|
462
507
|
return { type: 'TR', element: currentElement };
|
|
463
508
|
}
|
|
464
|
-
// Handle iframe crossing
|
|
465
|
-
if (currentElement.tagName === 'IFRAME') {
|
|
509
|
+
// Handle iframe and frame crossing
|
|
510
|
+
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
|
466
511
|
try {
|
|
467
512
|
currentElement = currentElement.contentDocument.body;
|
|
468
513
|
}
|
|
@@ -504,7 +549,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
504
549
|
}
|
|
505
550
|
if (current.tagName === 'TH')
|
|
506
551
|
return true;
|
|
507
|
-
if (current.tagName === 'IFRAME') {
|
|
552
|
+
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
|
508
553
|
try {
|
|
509
554
|
current = current.contentDocument.body;
|
|
510
555
|
}
|
|
@@ -556,15 +601,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
556
601
|
const shadowHost = baseElement.getRootNode().host;
|
|
557
602
|
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
|
558
603
|
}
|
|
559
|
-
// Get elements from iframes
|
|
560
|
-
const
|
|
561
|
-
|
|
604
|
+
// Get elements from iframes and frames
|
|
605
|
+
const frames = [
|
|
606
|
+
...Array.from(document.getElementsByTagName('iframe')),
|
|
607
|
+
...Array.from(document.getElementsByTagName('frame'))
|
|
608
|
+
];
|
|
609
|
+
for (const frame of frames) {
|
|
562
610
|
try {
|
|
563
|
-
const
|
|
564
|
-
allElements.push(...
|
|
611
|
+
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
|
612
|
+
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
|
565
613
|
}
|
|
566
614
|
catch (e) {
|
|
567
|
-
console.warn(
|
|
615
|
+
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
|
568
616
|
}
|
|
569
617
|
}
|
|
570
618
|
return allElements.filter(element => {
|
|
@@ -611,7 +659,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
611
659
|
});
|
|
612
660
|
const tableData = [];
|
|
613
661
|
const nonTableData = [];
|
|
614
|
-
// Process table data with
|
|
662
|
+
// Process table data with support for iframes, frames, and shadow DOM
|
|
615
663
|
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
616
664
|
const container = containers[containerIndex];
|
|
617
665
|
const { tableFields } = containerFields[containerIndex];
|
|
@@ -619,13 +667,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
619
667
|
const firstField = Object.values(tableFields)[0];
|
|
620
668
|
const firstElement = queryElement(container, firstField.selector);
|
|
621
669
|
let tableContext = firstElement;
|
|
622
|
-
// Find table context including
|
|
670
|
+
// Find table context including iframe, frame and shadow DOM
|
|
623
671
|
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
|
624
672
|
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
|
625
673
|
tableContext = tableContext.getRootNode().host;
|
|
626
674
|
continue;
|
|
627
675
|
}
|
|
628
|
-
if (tableContext.tagName === 'IFRAME') {
|
|
676
|
+
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
|
629
677
|
try {
|
|
630
678
|
tableContext = tableContext.contentDocument.body;
|
|
631
679
|
}
|
|
@@ -646,14 +694,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
646
694
|
if (tableContext.shadowRoot) {
|
|
647
695
|
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
|
648
696
|
}
|
|
649
|
-
// Get rows from iframes
|
|
650
|
-
if (tableContext.tagName === 'IFRAME') {
|
|
697
|
+
// Get rows from iframes and frames
|
|
698
|
+
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
|
651
699
|
try {
|
|
652
|
-
const
|
|
653
|
-
rows.push(...
|
|
700
|
+
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
|
701
|
+
rows.push(...frameDoc.getElementsByTagName('TR'));
|
|
654
702
|
}
|
|
655
703
|
catch (e) {
|
|
656
|
-
console.warn(
|
|
704
|
+
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
|
657
705
|
}
|
|
658
706
|
}
|
|
659
707
|
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
|
@@ -713,7 +761,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
713
761
|
}
|
|
714
762
|
}
|
|
715
763
|
}
|
|
716
|
-
// Process non-table data with
|
|
764
|
+
// Process non-table data with all contexts support
|
|
717
765
|
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
718
766
|
if (nonTableData.length >= limit)
|
|
719
767
|
break;
|
package/build/interpret.js
CHANGED
|
@@ -234,6 +234,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
234
234
|
parsedSuperset[key] = Array.isArray(superset[key])
|
|
235
235
|
? (0, utils_1.arrayToObject)(superset[key])
|
|
236
236
|
: superset[key];
|
|
237
|
+
if ((key === 'url' || key === 'selectors') &&
|
|
238
|
+
Array.isArray(value) && Array.isArray(superset[key]) &&
|
|
239
|
+
value.length === 0 && superset[key].length === 0) {
|
|
240
|
+
return true;
|
|
241
|
+
}
|
|
237
242
|
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
|
|
238
243
|
return value.some(selector => superset[key].includes(selector));
|
|
239
244
|
}
|
|
@@ -495,29 +500,45 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
495
500
|
return false;
|
|
496
501
|
};
|
|
497
502
|
// Enhanced button finder with retry mechanism
|
|
498
|
-
const findWorkingButton = (selectors
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
503
|
+
const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
|
|
504
|
+
let updatedSelectors = [...selectors];
|
|
505
|
+
for (let i = 0; i < selectors.length; i++) {
|
|
506
|
+
const selector = selectors[i];
|
|
507
|
+
let retryCount = 0;
|
|
508
|
+
let selectorSuccess = false;
|
|
509
|
+
while (retryCount < MAX_RETRIES && !selectorSuccess) {
|
|
510
|
+
try {
|
|
511
|
+
const button = yield page.waitForSelector(selector, {
|
|
512
|
+
state: 'attached',
|
|
513
|
+
timeout: 10000
|
|
514
|
+
});
|
|
515
|
+
if (button) {
|
|
516
|
+
debugLog('Found working selector:', selector);
|
|
517
|
+
return {
|
|
518
|
+
button,
|
|
519
|
+
workingSelector: selector,
|
|
520
|
+
updatedSelectors
|
|
521
|
+
};
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
catch (error) {
|
|
525
|
+
retryCount++;
|
|
526
|
+
debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
|
|
527
|
+
if (retryCount < MAX_RETRIES) {
|
|
528
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
529
|
+
}
|
|
530
|
+
else {
|
|
531
|
+
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
|
|
532
|
+
updatedSelectors = updatedSelectors.filter(s => s !== selector);
|
|
533
|
+
}
|
|
508
534
|
}
|
|
509
535
|
}
|
|
510
|
-
catch (error) {
|
|
511
|
-
debugLog(`Selector failed: ${selector}`);
|
|
512
|
-
}
|
|
513
|
-
}
|
|
514
|
-
// Implement retry mechanism when no selectors work
|
|
515
|
-
if (selectors.length > 0 && retryCount < MAX_RETRIES) {
|
|
516
|
-
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
517
|
-
yield page.waitForTimeout(RETRY_DELAY);
|
|
518
|
-
return findWorkingButton(selectors, retryCount + 1);
|
|
519
536
|
}
|
|
520
|
-
return {
|
|
537
|
+
return {
|
|
538
|
+
button: null,
|
|
539
|
+
workingSelector: null,
|
|
540
|
+
updatedSelectors
|
|
541
|
+
};
|
|
521
542
|
});
|
|
522
543
|
const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
|
|
523
544
|
try {
|
|
@@ -569,7 +590,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
569
590
|
yield scrapeCurrentPage();
|
|
570
591
|
if (checkLimit())
|
|
571
592
|
return allResults;
|
|
572
|
-
const { button, workingSelector } = yield findWorkingButton(availableSelectors);
|
|
593
|
+
const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
594
|
+
availableSelectors = updatedSelectors;
|
|
573
595
|
if (!button || !workingSelector) {
|
|
574
596
|
// Final retry for navigation when no selectors work
|
|
575
597
|
const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -586,82 +608,114 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
586
608
|
return allResults;
|
|
587
609
|
break;
|
|
588
610
|
}
|
|
589
|
-
availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
|
|
590
611
|
let retryCount = 0;
|
|
591
|
-
let
|
|
592
|
-
|
|
612
|
+
let paginationSuccess = false;
|
|
613
|
+
// Capture basic content signature before click
|
|
614
|
+
const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
|
|
615
|
+
return yield page.evaluate((selector) => {
|
|
616
|
+
const items = document.querySelectorAll(selector);
|
|
617
|
+
return {
|
|
618
|
+
url: window.location.href,
|
|
619
|
+
itemCount: items.length,
|
|
620
|
+
firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
|
|
621
|
+
};
|
|
622
|
+
}, config.listSelector);
|
|
623
|
+
});
|
|
624
|
+
const beforeSignature = yield captureContentSignature();
|
|
625
|
+
debugLog(`Before click: ${beforeSignature.itemCount} items`);
|
|
626
|
+
while (retryCount < MAX_RETRIES && !paginationSuccess) {
|
|
593
627
|
try {
|
|
594
628
|
try {
|
|
595
629
|
yield Promise.all([
|
|
596
630
|
page.waitForNavigation({
|
|
597
631
|
waitUntil: 'networkidle',
|
|
598
632
|
timeout: 15000
|
|
633
|
+
}).catch(e => {
|
|
634
|
+
throw e;
|
|
599
635
|
}),
|
|
600
636
|
button.click()
|
|
601
637
|
]);
|
|
602
|
-
|
|
638
|
+
debugLog("Navigation successful after regular click");
|
|
639
|
+
paginationSuccess = true;
|
|
603
640
|
}
|
|
604
|
-
catch (
|
|
605
|
-
debugLog(
|
|
606
|
-
|
|
607
|
-
|
|
641
|
+
catch (navError) {
|
|
642
|
+
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
|
|
643
|
+
try {
|
|
644
|
+
yield Promise.all([
|
|
645
|
+
page.waitForNavigation({
|
|
646
|
+
waitUntil: 'networkidle',
|
|
647
|
+
timeout: 15000
|
|
648
|
+
}).catch(e => {
|
|
649
|
+
throw e;
|
|
650
|
+
}),
|
|
651
|
+
button.dispatchEvent('click')
|
|
652
|
+
]);
|
|
653
|
+
debugLog("Navigation successful after dispatch event");
|
|
654
|
+
paginationSuccess = true;
|
|
655
|
+
}
|
|
656
|
+
catch (dispatchNavError) {
|
|
608
657
|
try {
|
|
609
|
-
yield
|
|
610
|
-
|
|
611
|
-
waitUntil: 'networkidle',
|
|
612
|
-
timeout: 15000
|
|
613
|
-
}),
|
|
614
|
-
button.dispatchEvent('click')
|
|
615
|
-
]);
|
|
616
|
-
navigationSuccess = true;
|
|
658
|
+
yield button.click();
|
|
659
|
+
yield page.waitForTimeout(2000);
|
|
617
660
|
}
|
|
618
|
-
catch (
|
|
619
|
-
|
|
661
|
+
catch (clickError) {
|
|
662
|
+
yield button.dispatchEvent('click');
|
|
663
|
+
yield page.waitForTimeout(2000);
|
|
620
664
|
}
|
|
621
665
|
}
|
|
622
|
-
else {
|
|
623
|
-
navigationSuccess = true;
|
|
624
|
-
}
|
|
625
|
-
}
|
|
626
|
-
const newUrl = page.url();
|
|
627
|
-
if (visitedUrls.has(newUrl)) {
|
|
628
|
-
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
|
|
629
|
-
navigationSuccess = false;
|
|
630
666
|
}
|
|
631
|
-
|
|
632
|
-
|
|
667
|
+
yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { });
|
|
668
|
+
if (!paginationSuccess) {
|
|
669
|
+
const newUrl = page.url();
|
|
670
|
+
const afterSignature = yield captureContentSignature();
|
|
671
|
+
if (newUrl !== currentUrl) {
|
|
672
|
+
debugLog(`URL changed to ${newUrl}`);
|
|
673
|
+
visitedUrls.add(newUrl);
|
|
674
|
+
paginationSuccess = true;
|
|
675
|
+
}
|
|
676
|
+
else if (afterSignature.firstItems !== beforeSignature.firstItems) {
|
|
677
|
+
debugLog("Content changed without URL change");
|
|
678
|
+
paginationSuccess = true;
|
|
679
|
+
}
|
|
680
|
+
else if (afterSignature.itemCount !== beforeSignature.itemCount) {
|
|
681
|
+
debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
|
|
682
|
+
paginationSuccess = true;
|
|
683
|
+
}
|
|
633
684
|
}
|
|
634
685
|
}
|
|
635
686
|
catch (error) {
|
|
636
|
-
debugLog(`
|
|
637
|
-
navigationSuccess = false;
|
|
687
|
+
debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
|
|
638
688
|
}
|
|
639
|
-
if (!
|
|
689
|
+
if (!paginationSuccess) {
|
|
640
690
|
retryCount++;
|
|
641
691
|
if (retryCount < MAX_RETRIES) {
|
|
642
|
-
debugLog(`Retrying
|
|
692
|
+
debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
643
693
|
yield page.waitForTimeout(RETRY_DELAY);
|
|
644
694
|
}
|
|
645
695
|
}
|
|
646
696
|
}
|
|
647
|
-
if (!
|
|
648
|
-
debugLog(`
|
|
697
|
+
if (!paginationSuccess) {
|
|
698
|
+
debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
|
|
649
699
|
return allResults;
|
|
650
700
|
}
|
|
651
701
|
break;
|
|
652
702
|
}
|
|
653
703
|
case 'clickLoadMore': {
|
|
704
|
+
yield scrapeCurrentPage();
|
|
705
|
+
if (checkLimit())
|
|
706
|
+
return allResults;
|
|
707
|
+
let loadMoreCounter = 0;
|
|
708
|
+
let previousResultCount = allResults.length;
|
|
709
|
+
let noNewItemsCounter = 0;
|
|
710
|
+
const MAX_NO_NEW_ITEMS = 2;
|
|
654
711
|
while (true) {
|
|
655
|
-
// Find working button with retry mechanism
|
|
656
|
-
const { button: loadMoreButton, workingSelector } = yield findWorkingButton(availableSelectors);
|
|
712
|
+
// Find working button with retry mechanism
|
|
713
|
+
const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
714
|
+
availableSelectors = updatedSelectors;
|
|
657
715
|
if (!workingSelector || !loadMoreButton) {
|
|
658
716
|
debugLog('No working Load More selector found after retries');
|
|
659
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
660
|
-
allResults = allResults.concat(finalResults);
|
|
661
717
|
return allResults;
|
|
662
718
|
}
|
|
663
|
-
// Update available selectors to start from the working one
|
|
664
|
-
availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
|
|
665
719
|
// Implement retry mechanism for clicking the button
|
|
666
720
|
let retryCount = 0;
|
|
667
721
|
let clickSuccess = false;
|
|
@@ -685,6 +739,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
685
739
|
}
|
|
686
740
|
if (clickSuccess) {
|
|
687
741
|
yield page.waitForTimeout(1000);
|
|
742
|
+
loadMoreCounter++;
|
|
743
|
+
debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
|
|
688
744
|
}
|
|
689
745
|
}
|
|
690
746
|
catch (error) {
|
|
@@ -698,8 +754,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
698
754
|
}
|
|
699
755
|
if (!clickSuccess) {
|
|
700
756
|
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
|
701
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
702
|
-
allResults = allResults.concat(finalResults);
|
|
703
757
|
return allResults;
|
|
704
758
|
}
|
|
705
759
|
// Wait for content to load and check scroll height
|
|
@@ -707,19 +761,30 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
707
761
|
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
708
762
|
yield page.waitForTimeout(2000);
|
|
709
763
|
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
710
|
-
|
|
764
|
+
const heightChanged = currentHeight !== previousHeight;
|
|
765
|
+
previousHeight = currentHeight;
|
|
766
|
+
yield scrapeCurrentPage();
|
|
767
|
+
const currentResultCount = allResults.length;
|
|
768
|
+
const newItemsAdded = currentResultCount > previousResultCount;
|
|
769
|
+
if (!newItemsAdded) {
|
|
770
|
+
noNewItemsCounter++;
|
|
771
|
+
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
|
|
772
|
+
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
|
|
773
|
+
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
|
|
774
|
+
return allResults;
|
|
775
|
+
}
|
|
776
|
+
}
|
|
777
|
+
else {
|
|
778
|
+
noNewItemsCounter = 0;
|
|
779
|
+
previousResultCount = currentResultCount;
|
|
780
|
+
}
|
|
781
|
+
if (checkLimit())
|
|
782
|
+
return allResults;
|
|
783
|
+
if (!heightChanged) {
|
|
711
784
|
debugLog('No more items loaded after Load More');
|
|
712
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
713
|
-
allResults = allResults.concat(finalResults);
|
|
714
785
|
return allResults;
|
|
715
786
|
}
|
|
716
|
-
previousHeight = currentHeight;
|
|
717
|
-
if (config.limit && allResults.length >= config.limit) {
|
|
718
|
-
allResults = allResults.slice(0, config.limit);
|
|
719
|
-
break;
|
|
720
|
-
}
|
|
721
787
|
}
|
|
722
|
-
break;
|
|
723
788
|
}
|
|
724
789
|
default: {
|
|
725
790
|
yield scrapeCurrentPage();
|