maxun-core 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/build/interpret.js +108 -7
  2. package/package.json +1 -1
@@ -461,6 +461,8 @@ class Interpreter extends events_1.EventEmitter {
461
461
  let previousHeight = 0;
462
462
  // track unique items per page to avoid re-scraping
463
463
  let scrapedItems = new Set();
464
+ let visitedUrls = [];
465
+ let availableSelectors = config.pagination.selector.split(',');
464
466
  while (true) {
465
467
  switch (config.pagination.type) {
466
468
  case 'scrollDown':
@@ -486,6 +488,7 @@ class Interpreter extends events_1.EventEmitter {
486
488
  previousHeight = currentTopHeight;
487
489
  break;
488
490
  case 'clickNext':
491
+ console.log("Page URL:", page.url());
489
492
  const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
490
493
  // console.log("Page results:", pageResults);
491
494
  // Filter out already scraped items
@@ -497,30 +500,128 @@ class Interpreter extends events_1.EventEmitter {
497
500
  return true;
498
501
  });
499
502
  allResults = allResults.concat(newResults);
503
+ console.log("Results so far:", allResults.length);
500
504
  if (config.limit && allResults.length >= config.limit) {
501
505
  return allResults.slice(0, config.limit);
502
506
  }
503
- const nextButton = yield page.$(config.pagination.selector);
507
+ let checkButton = null;
508
+ let workingSelector = null;
509
+ for (let i = 0; i < availableSelectors.length; i++) {
510
+ const selector = availableSelectors[i];
511
+ try {
512
+ // Wait for selector with a short timeout
513
+ checkButton = yield page.waitForSelector(selector, { state: 'attached' });
514
+ if (checkButton) {
515
+ workingSelector = selector;
516
+ break;
517
+ }
518
+ }
519
+ catch (error) {
520
+ console.log(`Selector failed: ${selector}`);
521
+ }
522
+ }
523
+ if (!workingSelector) {
524
+ return allResults;
525
+ }
526
+ // const nextButton = await page.$(config.pagination.selector);
527
+ const nextButton = yield page.$(workingSelector);
504
528
  if (!nextButton) {
505
529
  return allResults; // No more pages to scrape
506
530
  }
507
- yield Promise.all([
508
- nextButton.dispatchEvent('click'),
509
- page.waitForNavigation({ waitUntil: 'networkidle' })
510
- ]);
531
+ const selectorIndex = availableSelectors.indexOf(workingSelector);
532
+ availableSelectors = availableSelectors.slice(selectorIndex);
533
+ // await Promise.all([
534
+ // nextButton.dispatchEvent('click'),
535
+ // page.waitForNavigation({ waitUntil: 'networkidle' })
536
+ // ]);
537
+ const previousUrl = page.url();
538
+ visitedUrls.push(previousUrl);
539
+ try {
540
+ // Try both click methods simultaneously
541
+ yield Promise.race([
542
+ Promise.all([
543
+ page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
544
+ nextButton.click()
545
+ ]),
546
+ Promise.all([
547
+ page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
548
+ nextButton.dispatchEvent('click')
549
+ ])
550
+ ]);
551
+ }
552
+ catch (error) {
553
+ // Verify if navigation actually succeeded
554
+ const currentUrl = page.url();
555
+ if (currentUrl === previousUrl) {
556
+ console.log("Previous URL same as current URL. Navigation failed.");
557
+ }
558
+ }
559
+ const currentUrl = page.url();
560
+ if (visitedUrls.includes(currentUrl)) {
561
+ console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);
562
+ // Extract the current page number from the URL
563
+ const match = currentUrl.match(/\d+/);
564
+ if (match) {
565
+ const currentNumber = match[0];
566
+ // Use visitedUrls.length + 1 as the next page number
567
+ const nextNumber = visitedUrls.length + 1;
568
+ // Create new URL by replacing the current number with the next number
569
+ const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());
570
+ console.log(`Navigating to constructed URL: ${nextUrl}`);
571
+ // Navigate to the next page
572
+ yield Promise.all([
573
+ page.waitForNavigation({ waitUntil: 'networkidle' }),
574
+ page.goto(nextUrl)
575
+ ]);
576
+ }
577
+ }
578
+ // Give the page a moment to stabilize after navigation
511
579
  yield page.waitForTimeout(1000);
512
580
  break;
513
581
  case 'clickLoadMore':
514
582
  while (true) {
515
- const loadMoreButton = yield page.$(config.pagination.selector);
583
+ let checkButton = null;
584
+ let workingSelector = null;
585
+ for (let i = 0; i < availableSelectors.length; i++) {
586
+ const selector = availableSelectors[i];
587
+ try {
588
+ // Wait for selector with a short timeout
589
+ checkButton = yield page.waitForSelector(selector, { state: 'attached' });
590
+ if (checkButton) {
591
+ workingSelector = selector;
592
+ break;
593
+ }
594
+ }
595
+ catch (error) {
596
+ console.log(`Selector failed: ${selector}`);
597
+ }
598
+ }
599
+ if (!workingSelector) {
600
+ // No more working selectors available, so scrape the remaining items
601
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
602
+ allResults = allResults.concat(finalResults);
603
+ return allResults;
604
+ }
605
+ const loadMoreButton = yield page.$(workingSelector);
516
606
  if (!loadMoreButton) {
517
607
  // No more "Load More" button, so scrape the remaining items
518
608
  const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
519
609
  allResults = allResults.concat(finalResults);
520
610
  return allResults;
521
611
  }
612
+ const selectorIndex = availableSelectors.indexOf(workingSelector);
613
+ availableSelectors = availableSelectors.slice(selectorIndex);
522
614
  // Click the 'Load More' button to load additional items
523
- yield loadMoreButton.dispatchEvent('click');
615
+ // await loadMoreButton.dispatchEvent('click');
616
+ try {
617
+ yield Promise.race([
618
+ loadMoreButton.click(),
619
+ loadMoreButton.dispatchEvent('click')
620
+ ]);
621
+ }
622
+ catch (error) {
623
+ console.log('Both click attempts failed');
624
+ }
524
625
  yield page.waitForTimeout(2000); // Wait for new items to load
525
626
  // After clicking 'Load More', scroll down to load more items
526
627
  yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.8",
3
+ "version": "0.0.9",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",