maxun-core 0.0.12 → 0.0.14

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -174,7 +174,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
174
174
  if (!config.selector.includes('>>') && !config.selector.includes(':>>')) {
175
175
  return Array.from(document.querySelectorAll(config.selector));
176
176
  }
177
- // First handle iframe traversal if present
178
177
  if (config.selector.includes(':>>')) {
179
178
  const parts = config.selector.split(':>>').map(s => s.trim());
180
179
  let currentElements = [document];
@@ -185,24 +184,42 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
185
184
  const isLast = i === parts.length - 1;
186
185
  for (const element of currentElements) {
187
186
  try {
188
- // For document or iframe document
189
187
  const doc = element.contentDocument || element || ((_a = element.contentWindow) === null || _a === void 0 ? void 0 : _a.document);
190
188
  if (!doc)
191
189
  continue;
192
- // Query elements in current context
190
+ if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
191
+ const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
192
+ if (nameMatch && nameMatch[1]) {
193
+ const frameName = nameMatch[1];
194
+ let foundFrames = [];
195
+ if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
196
+ foundFrames = Array.from(doc.getElementsByName(frameName))
197
+ .filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
198
+ }
199
+ if (foundFrames.length === 0) {
200
+ const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
201
+ foundFrames = framesBySelector;
202
+ }
203
+ if (isLast) {
204
+ nextElements.push(...foundFrames);
205
+ }
206
+ else {
207
+ nextElements.push(...foundFrames);
208
+ }
209
+ continue;
210
+ }
211
+ }
193
212
  const found = Array.from(doc.querySelectorAll(part));
194
213
  if (isLast) {
195
- // If it's the last part, keep all matching elements
196
214
  nextElements.push(...found);
197
215
  }
198
216
  else {
199
- // If not last, only keep iframes for next iteration
200
- const iframes = found.filter(el => el.tagName === 'IFRAME');
201
- nextElements.push(...iframes);
217
+ const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
218
+ nextElements.push(...frames);
202
219
  }
203
220
  }
204
221
  catch (error) {
205
- console.warn('Cannot access iframe content:', error, {
222
+ console.warn('Cannot access iframe/frame content:', error, {
206
223
  part,
207
224
  element,
208
225
  index: i
@@ -242,13 +259,19 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
242
259
  }
243
260
  return [];
244
261
  }
245
- // Modified to handle iframe context for URL resolution
246
262
  function getElementValue(element, attribute) {
247
- var _a, _b, _c, _d, _e;
263
+ var _a, _b, _c, _d, _e, _f;
248
264
  if (!element)
249
265
  return null;
250
- // Get the base URL for resolving relative URLs
251
- const baseURL = ((_b = (_a = element.ownerDocument) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) || window.location.origin;
266
+ let baseURL;
267
+ try {
268
+ baseURL = ((_b = (_a = element.ownerDocument) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) ||
269
+ ((_c = element.ownerDocument) === null || _c === void 0 ? void 0 : _c.baseURI) ||
270
+ window.location.origin;
271
+ }
272
+ catch (e) {
273
+ baseURL = window.location.origin;
274
+ }
252
275
  switch (attribute) {
253
276
  case 'href': {
254
277
  const relativeHref = element.getAttribute('href');
@@ -259,11 +282,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
259
282
  return relativeSrc ? new URL(relativeSrc, baseURL).href : null;
260
283
  }
261
284
  case 'innerText':
262
- return (_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim();
285
+ return (_d = element.innerText) === null || _d === void 0 ? void 0 : _d.trim();
263
286
  case 'textContent':
264
- return (_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim();
287
+ return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
288
+ case 'innerHTML':
289
+ return element.innerHTML;
290
+ case 'outerHTML':
291
+ return element.outerHTML;
265
292
  default:
266
- return element.getAttribute(attribute) || ((_e = element.innerText) === null || _e === void 0 ? void 0 : _e.trim());
293
+ return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
267
294
  }
268
295
  }
269
296
  // Rest of the functions remain largely the same
@@ -332,7 +359,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
332
359
  */
333
360
  window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
334
361
  return __awaiter(this, void 0, void 0, function* () {
335
- // Enhanced query function to handle both iframe and shadow DOM
362
+ // Enhanced query function to handle iframe, frame and shadow DOM
336
363
  const queryElement = (rootElement, selector) => {
337
364
  if (!selector.includes('>>') && !selector.includes(':>>')) {
338
365
  return rootElement.querySelector(selector);
@@ -342,15 +369,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
342
369
  for (let i = 0; i < parts.length; i++) {
343
370
  if (!currentElement)
344
371
  return null;
345
- // Handle iframe traversal
346
- if (currentElement.tagName === 'IFRAME') {
372
+ // Handle iframe and frame traversal
373
+ if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
347
374
  try {
348
- const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
349
- currentElement = iframeDoc.querySelector(parts[i]);
375
+ const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
376
+ currentElement = frameDoc.querySelector(parts[i]);
350
377
  continue;
351
378
  }
352
379
  catch (e) {
353
- console.warn('Cannot access iframe content:', e);
380
+ console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
354
381
  return null;
355
382
  }
356
383
  }
@@ -385,14 +412,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
385
412
  for (const part of parts) {
386
413
  const nextElements = [];
387
414
  for (const element of currentElements) {
388
- // Handle iframe traversal
389
- if (element.tagName === 'IFRAME') {
415
+ // Handle iframe and frame traversal
416
+ if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
390
417
  try {
391
- const iframeDoc = element.contentDocument || element.contentWindow.document;
392
- nextElements.push(...iframeDoc.querySelectorAll(part));
418
+ const frameDoc = element.contentDocument || element.contentWindow.document;
419
+ nextElements.push(...frameDoc.querySelectorAll(part));
393
420
  }
394
421
  catch (e) {
395
- console.warn('Cannot access iframe content:', e);
422
+ console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
396
423
  continue;
397
424
  }
398
425
  }
@@ -440,7 +467,25 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
440
467
  }
441
468
  else if (attribute === 'src' || attribute === 'href') {
442
469
  const attrValue = element.getAttribute(attribute);
443
- return attrValue ? new URL(attrValue, baseURL).href : null;
470
+ const dataAttr = attrValue || element.getAttribute('data-' + attribute);
471
+ if (!dataAttr || dataAttr.trim() === '') {
472
+ if (attribute === 'src') {
473
+ const style = window.getComputedStyle(element);
474
+ const bgImage = style.backgroundImage;
475
+ if (bgImage && bgImage !== 'none') {
476
+ const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
477
+ return matches ? new URL(matches[1], baseURL).href : null;
478
+ }
479
+ }
480
+ return null;
481
+ }
482
+ try {
483
+ return new URL(dataAttr, baseURL).href;
484
+ }
485
+ catch (e) {
486
+ console.warn('Error creating URL from', dataAttr, e);
487
+ return dataAttr; // Return the original value if URL construction fails
488
+ }
444
489
  }
445
490
  return element.getAttribute(attribute);
446
491
  }
@@ -461,8 +506,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
461
506
  else if (currentElement.tagName === 'TR') {
462
507
  return { type: 'TR', element: currentElement };
463
508
  }
464
- // Handle iframe crossing
465
- if (currentElement.tagName === 'IFRAME') {
509
+ // Handle iframe and frame crossing
510
+ if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
466
511
  try {
467
512
  currentElement = currentElement.contentDocument.body;
468
513
  }
@@ -504,7 +549,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
504
549
  }
505
550
  if (current.tagName === 'TH')
506
551
  return true;
507
- if (current.tagName === 'IFRAME') {
552
+ if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
508
553
  try {
509
554
  current = current.contentDocument.body;
510
555
  }
@@ -556,15 +601,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
556
601
  const shadowHost = baseElement.getRootNode().host;
557
602
  allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
558
603
  }
559
- // Get elements from iframes
560
- const iframes = document.getElementsByTagName('iframe');
561
- for (const iframe of iframes) {
604
+ // Get elements from iframes and frames
605
+ const frames = [
606
+ ...Array.from(document.getElementsByTagName('iframe')),
607
+ ...Array.from(document.getElementsByTagName('frame'))
608
+ ];
609
+ for (const frame of frames) {
562
610
  try {
563
- const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
564
- allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName));
611
+ const frameDoc = frame.contentDocument || frame.contentWindow.document;
612
+ allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
565
613
  }
566
614
  catch (e) {
567
- console.warn('Cannot access iframe content:', e);
615
+ console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
568
616
  }
569
617
  }
570
618
  return allElements.filter(element => {
@@ -611,7 +659,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
611
659
  });
612
660
  const tableData = [];
613
661
  const nonTableData = [];
614
- // Process table data with both iframe and shadow DOM support
662
+ // Process table data with support for iframes, frames, and shadow DOM
615
663
  for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
616
664
  const container = containers[containerIndex];
617
665
  const { tableFields } = containerFields[containerIndex];
@@ -619,13 +667,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
619
667
  const firstField = Object.values(tableFields)[0];
620
668
  const firstElement = queryElement(container, firstField.selector);
621
669
  let tableContext = firstElement;
622
- // Find table context including both iframe and shadow DOM
670
+ // Find table context including iframe, frame and shadow DOM
623
671
  while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
624
672
  if (tableContext.getRootNode() instanceof ShadowRoot) {
625
673
  tableContext = tableContext.getRootNode().host;
626
674
  continue;
627
675
  }
628
- if (tableContext.tagName === 'IFRAME') {
676
+ if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
629
677
  try {
630
678
  tableContext = tableContext.contentDocument.body;
631
679
  }
@@ -646,14 +694,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
646
694
  if (tableContext.shadowRoot) {
647
695
  rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
648
696
  }
649
- // Get rows from iframes
650
- if (tableContext.tagName === 'IFRAME') {
697
+ // Get rows from iframes and frames
698
+ if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
651
699
  try {
652
- const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
653
- rows.push(...iframeDoc.getElementsByTagName('TR'));
700
+ const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
701
+ rows.push(...frameDoc.getElementsByTagName('TR'));
654
702
  }
655
703
  catch (e) {
656
- console.warn('Cannot access iframe rows:', e);
704
+ console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
657
705
  }
658
706
  }
659
707
  const processedRows = filterRowsBasedOnTag(rows, tableFields);
@@ -713,7 +761,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
713
761
  }
714
762
  }
715
763
  }
716
- // Process non-table data with both contexts support
764
+ // Process non-table data with all contexts support
717
765
  for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
718
766
  if (nonTableData.length >= limit)
719
767
  break;
@@ -234,6 +234,11 @@ class Interpreter extends events_1.EventEmitter {
234
234
  parsedSuperset[key] = Array.isArray(superset[key])
235
235
  ? (0, utils_1.arrayToObject)(superset[key])
236
236
  : superset[key];
237
+ if ((key === 'url' || key === 'selectors') &&
238
+ Array.isArray(value) && Array.isArray(superset[key]) &&
239
+ value.length === 0 && superset[key].length === 0) {
240
+ return true;
241
+ }
237
242
  if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
238
243
  return value.some(selector => superset[key].includes(selector));
239
244
  }
@@ -495,29 +500,45 @@ class Interpreter extends events_1.EventEmitter {
495
500
  return false;
496
501
  };
497
502
  // Enhanced button finder with retry mechanism
498
- const findWorkingButton = (selectors, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
499
- for (const selector of selectors) {
500
- try {
501
- const button = yield page.waitForSelector(selector, {
502
- state: 'attached',
503
- timeout: 10000 // Reduced timeout for faster checks
504
- });
505
- if (button) {
506
- debugLog('Found working selector:', selector);
507
- return { button, workingSelector: selector };
503
+ const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
504
+ let updatedSelectors = [...selectors];
505
+ for (let i = 0; i < selectors.length; i++) {
506
+ const selector = selectors[i];
507
+ let retryCount = 0;
508
+ let selectorSuccess = false;
509
+ while (retryCount < MAX_RETRIES && !selectorSuccess) {
510
+ try {
511
+ const button = yield page.waitForSelector(selector, {
512
+ state: 'attached',
513
+ timeout: 10000
514
+ });
515
+ if (button) {
516
+ debugLog('Found working selector:', selector);
517
+ return {
518
+ button,
519
+ workingSelector: selector,
520
+ updatedSelectors
521
+ };
522
+ }
523
+ }
524
+ catch (error) {
525
+ retryCount++;
526
+ debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
527
+ if (retryCount < MAX_RETRIES) {
528
+ yield page.waitForTimeout(RETRY_DELAY);
529
+ }
530
+ else {
531
+ debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
532
+ updatedSelectors = updatedSelectors.filter(s => s !== selector);
533
+ }
508
534
  }
509
535
  }
510
- catch (error) {
511
- debugLog(`Selector failed: ${selector}`);
512
- }
513
- }
514
- // Implement retry mechanism when no selectors work
515
- if (selectors.length > 0 && retryCount < MAX_RETRIES) {
516
- debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
517
- yield page.waitForTimeout(RETRY_DELAY);
518
- return findWorkingButton(selectors, retryCount + 1);
519
536
  }
520
- return { button: null, workingSelector: null };
537
+ return {
538
+ button: null,
539
+ workingSelector: null,
540
+ updatedSelectors
541
+ };
521
542
  });
522
543
  const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
523
544
  try {
@@ -569,7 +590,8 @@ class Interpreter extends events_1.EventEmitter {
569
590
  yield scrapeCurrentPage();
570
591
  if (checkLimit())
571
592
  return allResults;
572
- const { button, workingSelector } = yield findWorkingButton(availableSelectors);
593
+ const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
594
+ availableSelectors = updatedSelectors;
573
595
  if (!button || !workingSelector) {
574
596
  // Final retry for navigation when no selectors work
575
597
  const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
@@ -586,82 +608,114 @@ class Interpreter extends events_1.EventEmitter {
586
608
  return allResults;
587
609
  break;
588
610
  }
589
- availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
590
611
  let retryCount = 0;
591
- let navigationSuccess = false;
592
- while (retryCount < MAX_RETRIES && !navigationSuccess) {
612
+ let paginationSuccess = false;
613
+ // Capture basic content signature before click
614
+ const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
615
+ return yield page.evaluate((selector) => {
616
+ const items = document.querySelectorAll(selector);
617
+ return {
618
+ url: window.location.href,
619
+ itemCount: items.length,
620
+ firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
621
+ };
622
+ }, config.listSelector);
623
+ });
624
+ const beforeSignature = yield captureContentSignature();
625
+ debugLog(`Before click: ${beforeSignature.itemCount} items`);
626
+ while (retryCount < MAX_RETRIES && !paginationSuccess) {
593
627
  try {
594
628
  try {
595
629
  yield Promise.all([
596
630
  page.waitForNavigation({
597
631
  waitUntil: 'networkidle',
598
632
  timeout: 15000
633
+ }).catch(e => {
634
+ throw e;
599
635
  }),
600
636
  button.click()
601
637
  ]);
602
- navigationSuccess = true;
638
+ debugLog("Navigation successful after regular click");
639
+ paginationSuccess = true;
603
640
  }
604
- catch (error) {
605
- debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
606
- // If regular click fails, try dispatchEvent
607
- if (page.url() === currentUrl) {
641
+ catch (navError) {
642
+ debugLog("Regular click with navigation failed, trying dispatch event with navigation");
643
+ try {
644
+ yield Promise.all([
645
+ page.waitForNavigation({
646
+ waitUntil: 'networkidle',
647
+ timeout: 15000
648
+ }).catch(e => {
649
+ throw e;
650
+ }),
651
+ button.dispatchEvent('click')
652
+ ]);
653
+ debugLog("Navigation successful after dispatch event");
654
+ paginationSuccess = true;
655
+ }
656
+ catch (dispatchNavError) {
608
657
  try {
609
- yield Promise.all([
610
- page.waitForNavigation({
611
- waitUntil: 'networkidle',
612
- timeout: 15000
613
- }),
614
- button.dispatchEvent('click')
615
- ]);
616
- navigationSuccess = true;
658
+ yield button.click();
659
+ yield page.waitForTimeout(2000);
617
660
  }
618
- catch (dispatchError) {
619
- debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
661
+ catch (clickError) {
662
+ yield button.dispatchEvent('click');
663
+ yield page.waitForTimeout(2000);
620
664
  }
621
665
  }
622
- else {
623
- navigationSuccess = true;
624
- }
625
- }
626
- const newUrl = page.url();
627
- if (visitedUrls.has(newUrl)) {
628
- debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
629
- navigationSuccess = false;
630
666
  }
631
- if (navigationSuccess) {
632
- yield page.waitForTimeout(1000);
667
+ yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { });
668
+ if (!paginationSuccess) {
669
+ const newUrl = page.url();
670
+ const afterSignature = yield captureContentSignature();
671
+ if (newUrl !== currentUrl) {
672
+ debugLog(`URL changed to ${newUrl}`);
673
+ visitedUrls.add(newUrl);
674
+ paginationSuccess = true;
675
+ }
676
+ else if (afterSignature.firstItems !== beforeSignature.firstItems) {
677
+ debugLog("Content changed without URL change");
678
+ paginationSuccess = true;
679
+ }
680
+ else if (afterSignature.itemCount !== beforeSignature.itemCount) {
681
+ debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
682
+ paginationSuccess = true;
683
+ }
633
684
  }
634
685
  }
635
686
  catch (error) {
636
- debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
637
- navigationSuccess = false;
687
+ debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
638
688
  }
639
- if (!navigationSuccess) {
689
+ if (!paginationSuccess) {
640
690
  retryCount++;
641
691
  if (retryCount < MAX_RETRIES) {
642
- debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
692
+ debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
643
693
  yield page.waitForTimeout(RETRY_DELAY);
644
694
  }
645
695
  }
646
696
  }
647
- if (!navigationSuccess) {
648
- debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
697
+ if (!paginationSuccess) {
698
+ debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
649
699
  return allResults;
650
700
  }
651
701
  break;
652
702
  }
653
703
  case 'clickLoadMore': {
704
+ yield scrapeCurrentPage();
705
+ if (checkLimit())
706
+ return allResults;
707
+ let loadMoreCounter = 0;
708
+ let previousResultCount = allResults.length;
709
+ let noNewItemsCounter = 0;
710
+ const MAX_NO_NEW_ITEMS = 2;
654
711
  while (true) {
655
- // Find working button with retry mechanism, consistent with clickNext
656
- const { button: loadMoreButton, workingSelector } = yield findWorkingButton(availableSelectors);
712
+ // Find working button with retry mechanism
713
+ const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
714
+ availableSelectors = updatedSelectors;
657
715
  if (!workingSelector || !loadMoreButton) {
658
716
  debugLog('No working Load More selector found after retries');
659
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
660
- allResults = allResults.concat(finalResults);
661
717
  return allResults;
662
718
  }
663
- // Update available selectors to start from the working one
664
- availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
665
719
  // Implement retry mechanism for clicking the button
666
720
  let retryCount = 0;
667
721
  let clickSuccess = false;
@@ -685,6 +739,8 @@ class Interpreter extends events_1.EventEmitter {
685
739
  }
686
740
  if (clickSuccess) {
687
741
  yield page.waitForTimeout(1000);
742
+ loadMoreCounter++;
743
+ debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
688
744
  }
689
745
  }
690
746
  catch (error) {
@@ -698,8 +754,6 @@ class Interpreter extends events_1.EventEmitter {
698
754
  }
699
755
  if (!clickSuccess) {
700
756
  debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
701
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
702
- allResults = allResults.concat(finalResults);
703
757
  return allResults;
704
758
  }
705
759
  // Wait for content to load and check scroll height
@@ -707,19 +761,30 @@ class Interpreter extends events_1.EventEmitter {
707
761
  yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
708
762
  yield page.waitForTimeout(2000);
709
763
  const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
710
- if (currentHeight === previousHeight) {
764
+ const heightChanged = currentHeight !== previousHeight;
765
+ previousHeight = currentHeight;
766
+ yield scrapeCurrentPage();
767
+ const currentResultCount = allResults.length;
768
+ const newItemsAdded = currentResultCount > previousResultCount;
769
+ if (!newItemsAdded) {
770
+ noNewItemsCounter++;
771
+ debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
772
+ if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
773
+ debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
774
+ return allResults;
775
+ }
776
+ }
777
+ else {
778
+ noNewItemsCounter = 0;
779
+ previousResultCount = currentResultCount;
780
+ }
781
+ if (checkLimit())
782
+ return allResults;
783
+ if (!heightChanged) {
711
784
  debugLog('No more items loaded after Load More');
712
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
713
- allResults = allResults.concat(finalResults);
714
785
  return allResults;
715
786
  }
716
- previousHeight = currentHeight;
717
- if (config.limit && allResults.length >= config.limit) {
718
- allResults = allResults.slice(0, config.limit);
719
- break;
720
- }
721
787
  }
722
- break;
723
788
  }
724
789
  default: {
725
790
  yield scrapeCurrentPage();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.12",
3
+ "version": "0.0.14",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",