maxun-core 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.js +108 -7
- package/package.json +1 -1
package/build/interpret.js
CHANGED
|
@@ -461,6 +461,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
461
461
|
let previousHeight = 0;
|
|
462
462
|
// track unique items per page to avoid re-scraping
|
|
463
463
|
let scrapedItems = new Set();
|
|
464
|
+
let visitedUrls = [];
|
|
465
|
+
let availableSelectors = config.pagination.selector.split(',');
|
|
464
466
|
while (true) {
|
|
465
467
|
switch (config.pagination.type) {
|
|
466
468
|
case 'scrollDown':
|
|
@@ -486,6 +488,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
486
488
|
previousHeight = currentTopHeight;
|
|
487
489
|
break;
|
|
488
490
|
case 'clickNext':
|
|
491
|
+
console.log("Page URL:", page.url());
|
|
489
492
|
const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
490
493
|
// console.log("Page results:", pageResults);
|
|
491
494
|
// Filter out already scraped items
|
|
@@ -497,30 +500,128 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
497
500
|
return true;
|
|
498
501
|
});
|
|
499
502
|
allResults = allResults.concat(newResults);
|
|
503
|
+
console.log("Results so far:", allResults.length);
|
|
500
504
|
if (config.limit && allResults.length >= config.limit) {
|
|
501
505
|
return allResults.slice(0, config.limit);
|
|
502
506
|
}
|
|
503
|
-
|
|
507
|
+
let checkButton = null;
|
|
508
|
+
let workingSelector = null;
|
|
509
|
+
for (let i = 0; i < availableSelectors.length; i++) {
|
|
510
|
+
const selector = availableSelectors[i];
|
|
511
|
+
try {
|
|
512
|
+
// Wait for selector with a short timeout
|
|
513
|
+
checkButton = yield page.waitForSelector(selector, { state: 'attached' });
|
|
514
|
+
if (checkButton) {
|
|
515
|
+
workingSelector = selector;
|
|
516
|
+
break;
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
catch (error) {
|
|
520
|
+
console.log(`Selector failed: ${selector}`);
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
if (!workingSelector) {
|
|
524
|
+
return allResults;
|
|
525
|
+
}
|
|
526
|
+
// const nextButton = await page.$(config.pagination.selector);
|
|
527
|
+
const nextButton = yield page.$(workingSelector);
|
|
504
528
|
if (!nextButton) {
|
|
505
529
|
return allResults; // No more pages to scrape
|
|
506
530
|
}
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
531
|
+
const selectorIndex = availableSelectors.indexOf(workingSelector);
|
|
532
|
+
availableSelectors = availableSelectors.slice(selectorIndex);
|
|
533
|
+
// await Promise.all([
|
|
534
|
+
// nextButton.dispatchEvent('click'),
|
|
535
|
+
// page.waitForNavigation({ waitUntil: 'networkidle' })
|
|
536
|
+
// ]);
|
|
537
|
+
const previousUrl = page.url();
|
|
538
|
+
visitedUrls.push(previousUrl);
|
|
539
|
+
try {
|
|
540
|
+
// Try both click methods simultaneously
|
|
541
|
+
yield Promise.race([
|
|
542
|
+
Promise.all([
|
|
543
|
+
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
|
|
544
|
+
nextButton.click()
|
|
545
|
+
]),
|
|
546
|
+
Promise.all([
|
|
547
|
+
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
|
|
548
|
+
nextButton.dispatchEvent('click')
|
|
549
|
+
])
|
|
550
|
+
]);
|
|
551
|
+
}
|
|
552
|
+
catch (error) {
|
|
553
|
+
// Verify if navigation actually succeeded
|
|
554
|
+
const currentUrl = page.url();
|
|
555
|
+
if (currentUrl === previousUrl) {
|
|
556
|
+
console.log("Previous URL same as current URL. Navigation failed.");
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
const currentUrl = page.url();
|
|
560
|
+
if (visitedUrls.includes(currentUrl)) {
|
|
561
|
+
console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);
|
|
562
|
+
// Extract the current page number from the URL
|
|
563
|
+
const match = currentUrl.match(/\d+/);
|
|
564
|
+
if (match) {
|
|
565
|
+
const currentNumber = match[0];
|
|
566
|
+
// Use visitedUrls.length + 1 as the next page number
|
|
567
|
+
const nextNumber = visitedUrls.length + 1;
|
|
568
|
+
// Create new URL by replacing the current number with the next number
|
|
569
|
+
const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());
|
|
570
|
+
console.log(`Navigating to constructed URL: ${nextUrl}`);
|
|
571
|
+
// Navigate to the next page
|
|
572
|
+
yield Promise.all([
|
|
573
|
+
page.waitForNavigation({ waitUntil: 'networkidle' }),
|
|
574
|
+
page.goto(nextUrl)
|
|
575
|
+
]);
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
// Give the page a moment to stabilize after navigation
|
|
511
579
|
yield page.waitForTimeout(1000);
|
|
512
580
|
break;
|
|
513
581
|
case 'clickLoadMore':
|
|
514
582
|
while (true) {
|
|
515
|
-
|
|
583
|
+
let checkButton = null;
|
|
584
|
+
let workingSelector = null;
|
|
585
|
+
for (let i = 0; i < availableSelectors.length; i++) {
|
|
586
|
+
const selector = availableSelectors[i];
|
|
587
|
+
try {
|
|
588
|
+
// Wait for selector with a short timeout
|
|
589
|
+
checkButton = yield page.waitForSelector(selector, { state: 'attached' });
|
|
590
|
+
if (checkButton) {
|
|
591
|
+
workingSelector = selector;
|
|
592
|
+
break;
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
catch (error) {
|
|
596
|
+
console.log(`Selector failed: ${selector}`);
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
if (!workingSelector) {
|
|
600
|
+
// No more working selectors available, so scrape the remaining items
|
|
601
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
602
|
+
allResults = allResults.concat(finalResults);
|
|
603
|
+
return allResults;
|
|
604
|
+
}
|
|
605
|
+
const loadMoreButton = yield page.$(workingSelector);
|
|
516
606
|
if (!loadMoreButton) {
|
|
517
607
|
// No more "Load More" button, so scrape the remaining items
|
|
518
608
|
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
519
609
|
allResults = allResults.concat(finalResults);
|
|
520
610
|
return allResults;
|
|
521
611
|
}
|
|
612
|
+
const selectorIndex = availableSelectors.indexOf(workingSelector);
|
|
613
|
+
availableSelectors = availableSelectors.slice(selectorIndex);
|
|
522
614
|
// Click the 'Load More' button to load additional items
|
|
523
|
-
|
|
615
|
+
// await loadMoreButton.dispatchEvent('click');
|
|
616
|
+
try {
|
|
617
|
+
yield Promise.race([
|
|
618
|
+
loadMoreButton.click(),
|
|
619
|
+
loadMoreButton.dispatchEvent('click')
|
|
620
|
+
]);
|
|
621
|
+
}
|
|
622
|
+
catch (error) {
|
|
623
|
+
console.log('Both click attempts failed');
|
|
624
|
+
}
|
|
524
625
|
yield page.waitForTimeout(2000); // Wait for new items to load
|
|
525
626
|
// After clicking 'Load More', scroll down to load more items
|
|
526
627
|
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|