maxun-core 0.0.8 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/build/interpret.js +129 -19
  2. package/package.json +1 -1
@@ -404,6 +404,12 @@ class Interpreter extends events_1.EventEmitter {
404
404
  };
405
405
  const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
406
406
  console.log("Executing action:", methodName, args);
407
+ if (methodName === 'press' || methodName === 'type') {
408
+ // Extract only the first two arguments for these methods
409
+ const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args];
410
+ yield invokee[methodName](...limitedArgs);
411
+ return;
412
+ }
407
413
  if (!args || Array.isArray(args)) {
408
414
  yield invokee[methodName](...(args !== null && args !== void 0 ? args : []));
409
415
  }
@@ -461,6 +467,12 @@ class Interpreter extends events_1.EventEmitter {
461
467
  let previousHeight = 0;
462
468
  // track unique items per page to avoid re-scraping
463
469
  let scrapedItems = new Set();
470
+ let visitedUrls = [];
471
+ // Debug logging helper
472
+ const debugLog = (message, ...args) => {
473
+ console.log(`[Page ${visitedUrls.length + 1}] ${message}`, ...args);
474
+ };
475
+ let availableSelectors = config.pagination.selector.split(',');
464
476
  while (true) {
465
477
  switch (config.pagination.type) {
466
478
  case 'scrollDown':
@@ -486,56 +498,154 @@ class Interpreter extends events_1.EventEmitter {
486
498
  previousHeight = currentTopHeight;
487
499
  break;
488
500
  case 'clickNext':
501
+ debugLog("Current URL:", page.url());
489
502
  const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
490
- // console.log("Page results:", pageResults);
491
503
  // Filter out already scraped items
492
504
  const newResults = pageResults.filter(item => {
493
505
  const uniqueKey = JSON.stringify(item);
494
506
  if (scrapedItems.has(uniqueKey))
495
- return false; // Ignore if already scraped
496
- scrapedItems.add(uniqueKey); // Mark as scraped
507
+ return false;
508
+ scrapedItems.add(uniqueKey);
497
509
  return true;
498
510
  });
499
511
  allResults = allResults.concat(newResults);
512
+ debugLog("Results collected so far:", allResults.length);
500
513
  if (config.limit && allResults.length >= config.limit) {
501
514
  return allResults.slice(0, config.limit);
502
515
  }
503
- const nextButton = yield page.$(config.pagination.selector);
516
+ yield page.waitForLoadState('networkidle', { timeout: 30000 });
517
+ yield page.waitForTimeout(2000);
518
+ let checkButton = null;
519
+ let workingSelector = null;
520
+ // Try each selector with explicit waiting
521
+ for (const selector of availableSelectors) {
522
+ try {
523
+ checkButton = yield page.waitForSelector(selector, {
524
+ state: 'attached',
525
+ timeout: 30000
526
+ });
527
+ if (checkButton) {
528
+ workingSelector = selector;
529
+ debugLog('Found working selector:', selector);
530
+ break;
531
+ }
532
+ }
533
+ catch (error) {
534
+ debugLog(`Selector failed: ${selector} - ${error.message}`);
535
+ }
536
+ }
537
+ if (!workingSelector) {
538
+ debugLog('No working selector found after trying all options');
539
+ return allResults;
540
+ }
541
+ const nextButton = yield page.$(workingSelector);
504
542
  if (!nextButton) {
505
- return allResults; // No more pages to scrape
543
+ debugLog('Next button not found');
544
+ return allResults;
545
+ }
546
+ const selectorIndex = availableSelectors.indexOf(workingSelector);
547
+ availableSelectors = availableSelectors.slice(selectorIndex);
548
+ try {
549
+ // Store current URL to check if navigation succeeded
550
+ const previousUrl = page.url();
551
+ visitedUrls.push(previousUrl);
552
+ // Try both click methods in sequence
553
+ try {
554
+ yield Promise.all([
555
+ page.waitForNavigation({
556
+ waitUntil: 'networkidle',
557
+ timeout: 15000
558
+ }),
559
+ nextButton.click()
560
+ ]);
561
+ }
562
+ catch (error) {
563
+ // If we're still on the same URL, try dispatch event
564
+ if (page.url() === previousUrl) {
565
+ yield Promise.all([
566
+ page.waitForNavigation({
567
+ waitUntil: 'networkidle',
568
+ timeout: 15000
569
+ }),
570
+ nextButton.dispatchEvent('click')
571
+ ]);
572
+ }
573
+ }
574
+ yield page.waitForLoadState('domcontentloaded');
575
+ yield page.waitForLoadState('networkidle', { timeout: 30000 });
576
+ const currentUrl = page.url();
577
+ if (visitedUrls.includes(currentUrl)) {
578
+ debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
579
+ return allResults;
580
+ }
581
+ // Give the page a moment to stabilize after navigation
582
+ yield page.waitForTimeout(1000);
583
+ }
584
+ catch (error) {
585
+ debugLog(`Navigation failed completely: ${error.message}`);
586
+ return allResults;
506
587
  }
507
- yield Promise.all([
508
- nextButton.dispatchEvent('click'),
509
- page.waitForNavigation({ waitUntil: 'networkidle' })
510
- ]);
511
- yield page.waitForTimeout(1000);
512
588
  break;
513
589
  case 'clickLoadMore':
514
590
  while (true) {
515
- const loadMoreButton = yield page.$(config.pagination.selector);
591
+ let checkButton = null;
592
+ let workingSelector = null;
593
+ for (const selector of availableSelectors) {
594
+ try {
595
+ checkButton = yield page.waitForSelector(selector, {
596
+ state: 'attached',
597
+ timeout: 30000
598
+ });
599
+ if (checkButton) {
600
+ workingSelector = selector;
601
+ debugLog('Found working selector:', selector);
602
+ break;
603
+ }
604
+ }
605
+ catch (error) {
606
+ debugLog(`Load More selector failed: ${selector}`);
607
+ }
608
+ }
609
+ if (!workingSelector) {
610
+ debugLog('No working Load More selector found');
611
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
612
+ allResults = allResults.concat(finalResults);
613
+ return allResults;
614
+ }
615
+ const loadMoreButton = yield page.$(workingSelector);
516
616
  if (!loadMoreButton) {
517
- // No more "Load More" button, so scrape the remaining items
617
+ debugLog('Load More button not found');
518
618
  const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
519
619
  allResults = allResults.concat(finalResults);
520
620
  return allResults;
521
621
  }
522
- // Click the 'Load More' button to load additional items
523
- yield loadMoreButton.dispatchEvent('click');
524
- yield page.waitForTimeout(2000); // Wait for new items to load
525
- // After clicking 'Load More', scroll down to load more items
622
+ const selectorIndex = availableSelectors.indexOf(workingSelector);
623
+ availableSelectors = availableSelectors.slice(selectorIndex);
624
+ try {
625
+ try {
626
+ yield loadMoreButton.click();
627
+ }
628
+ catch (error) {
629
+ yield loadMoreButton.dispatchEvent('click');
630
+ }
631
+ }
632
+ catch (error) {
633
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
634
+ allResults = allResults.concat(finalResults);
635
+ return allResults;
636
+ }
637
+ yield page.waitForTimeout(2000);
526
638
  yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
527
639
  yield page.waitForTimeout(2000);
528
- // Check if more items are available
529
640
  const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
530
641
  if (currentHeight === previousHeight) {
531
- // No more items loaded, return the scraped results
642
+ debugLog('No more items loaded after Load More');
532
643
  const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
533
644
  allResults = allResults.concat(finalResults);
534
645
  return allResults;
535
646
  }
536
647
  previousHeight = currentHeight;
537
648
  if (config.limit && allResults.length >= config.limit) {
538
- // If limit is set and reached, return the limited results
539
649
  allResults = allResults.slice(0, config.limit);
540
650
  break;
541
651
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.8",
3
+ "version": "0.0.10",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",