maxun-core 0.0.11 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -174,7 +174,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
174
174
  if (!config.selector.includes('>>') && !config.selector.includes(':>>')) {
175
175
  return Array.from(document.querySelectorAll(config.selector));
176
176
  }
177
- // First handle iframe traversal if present
178
177
  if (config.selector.includes(':>>')) {
179
178
  const parts = config.selector.split(':>>').map(s => s.trim());
180
179
  let currentElements = [document];
@@ -185,24 +184,42 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
185
184
  const isLast = i === parts.length - 1;
186
185
  for (const element of currentElements) {
187
186
  try {
188
- // For document or iframe document
189
187
  const doc = element.contentDocument || element || ((_a = element.contentWindow) === null || _a === void 0 ? void 0 : _a.document);
190
188
  if (!doc)
191
189
  continue;
192
- // Query elements in current context
190
+ if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
191
+ const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
192
+ if (nameMatch && nameMatch[1]) {
193
+ const frameName = nameMatch[1];
194
+ let foundFrames = [];
195
+ if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
196
+ foundFrames = Array.from(doc.getElementsByName(frameName))
197
+ .filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
198
+ }
199
+ if (foundFrames.length === 0) {
200
+ const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
201
+ foundFrames = framesBySelector;
202
+ }
203
+ if (isLast) {
204
+ nextElements.push(...foundFrames);
205
+ }
206
+ else {
207
+ nextElements.push(...foundFrames);
208
+ }
209
+ continue;
210
+ }
211
+ }
193
212
  const found = Array.from(doc.querySelectorAll(part));
194
213
  if (isLast) {
195
- // If it's the last part, keep all matching elements
196
214
  nextElements.push(...found);
197
215
  }
198
216
  else {
199
- // If not last, only keep iframes for next iteration
200
- const iframes = found.filter(el => el.tagName === 'IFRAME');
201
- nextElements.push(...iframes);
217
+ const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
218
+ nextElements.push(...frames);
202
219
  }
203
220
  }
204
221
  catch (error) {
205
- console.warn('Cannot access iframe content:', error, {
222
+ console.warn('Cannot access iframe/frame content:', error, {
206
223
  part,
207
224
  element,
208
225
  index: i
@@ -242,13 +259,19 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
242
259
  }
243
260
  return [];
244
261
  }
245
- // Modified to handle iframe context for URL resolution
246
262
  function getElementValue(element, attribute) {
247
- var _a, _b, _c, _d, _e;
263
+ var _a, _b, _c, _d, _e, _f;
248
264
  if (!element)
249
265
  return null;
250
- // Get the base URL for resolving relative URLs
251
- const baseURL = ((_b = (_a = element.ownerDocument) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) || window.location.origin;
266
+ let baseURL;
267
+ try {
268
+ baseURL = ((_b = (_a = element.ownerDocument) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) ||
269
+ ((_c = element.ownerDocument) === null || _c === void 0 ? void 0 : _c.baseURI) ||
270
+ window.location.origin;
271
+ }
272
+ catch (e) {
273
+ baseURL = window.location.origin;
274
+ }
252
275
  switch (attribute) {
253
276
  case 'href': {
254
277
  const relativeHref = element.getAttribute('href');
@@ -259,11 +282,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
259
282
  return relativeSrc ? new URL(relativeSrc, baseURL).href : null;
260
283
  }
261
284
  case 'innerText':
262
- return (_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim();
285
+ return (_d = element.innerText) === null || _d === void 0 ? void 0 : _d.trim();
263
286
  case 'textContent':
264
- return (_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim();
287
+ return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
288
+ case 'innerHTML':
289
+ return element.innerHTML;
290
+ case 'outerHTML':
291
+ return element.outerHTML;
265
292
  default:
266
- return element.getAttribute(attribute) || ((_e = element.innerText) === null || _e === void 0 ? void 0 : _e.trim());
293
+ return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
267
294
  }
268
295
  }
269
296
  // Rest of the functions remain largely the same
@@ -332,7 +359,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
332
359
  */
333
360
  window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
334
361
  return __awaiter(this, void 0, void 0, function* () {
335
- // Enhanced query function to handle both iframe and shadow DOM
362
+ // Enhanced query function to handle iframe, frame and shadow DOM
336
363
  const queryElement = (rootElement, selector) => {
337
364
  if (!selector.includes('>>') && !selector.includes(':>>')) {
338
365
  return rootElement.querySelector(selector);
@@ -342,15 +369,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
342
369
  for (let i = 0; i < parts.length; i++) {
343
370
  if (!currentElement)
344
371
  return null;
345
- // Handle iframe traversal
346
- if (currentElement.tagName === 'IFRAME') {
372
+ // Handle iframe and frame traversal
373
+ if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
347
374
  try {
348
- const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
349
- currentElement = iframeDoc.querySelector(parts[i]);
375
+ const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
376
+ currentElement = frameDoc.querySelector(parts[i]);
350
377
  continue;
351
378
  }
352
379
  catch (e) {
353
- console.warn('Cannot access iframe content:', e);
380
+ console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
354
381
  return null;
355
382
  }
356
383
  }
@@ -385,14 +412,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
385
412
  for (const part of parts) {
386
413
  const nextElements = [];
387
414
  for (const element of currentElements) {
388
- // Handle iframe traversal
389
- if (element.tagName === 'IFRAME') {
415
+ // Handle iframe and frame traversal
416
+ if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
390
417
  try {
391
- const iframeDoc = element.contentDocument || element.contentWindow.document;
392
- nextElements.push(...iframeDoc.querySelectorAll(part));
418
+ const frameDoc = element.contentDocument || element.contentWindow.document;
419
+ nextElements.push(...frameDoc.querySelectorAll(part));
393
420
  }
394
421
  catch (e) {
395
- console.warn('Cannot access iframe content:', e);
422
+ console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
396
423
  continue;
397
424
  }
398
425
  }
@@ -461,8 +488,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
461
488
  else if (currentElement.tagName === 'TR') {
462
489
  return { type: 'TR', element: currentElement };
463
490
  }
464
- // Handle iframe crossing
465
- if (currentElement.tagName === 'IFRAME') {
491
+ // Handle iframe and frame crossing
492
+ if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
466
493
  try {
467
494
  currentElement = currentElement.contentDocument.body;
468
495
  }
@@ -504,7 +531,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
504
531
  }
505
532
  if (current.tagName === 'TH')
506
533
  return true;
507
- if (current.tagName === 'IFRAME') {
534
+ if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
508
535
  try {
509
536
  current = current.contentDocument.body;
510
537
  }
@@ -556,15 +583,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
556
583
  const shadowHost = baseElement.getRootNode().host;
557
584
  allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
558
585
  }
559
- // Get elements from iframes
560
- const iframes = document.getElementsByTagName('iframe');
561
- for (const iframe of iframes) {
586
+ // Get elements from iframes and frames
587
+ const frames = [
588
+ ...Array.from(document.getElementsByTagName('iframe')),
589
+ ...Array.from(document.getElementsByTagName('frame'))
590
+ ];
591
+ for (const frame of frames) {
562
592
  try {
563
- const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
564
- allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName));
593
+ const frameDoc = frame.contentDocument || frame.contentWindow.document;
594
+ allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
565
595
  }
566
596
  catch (e) {
567
- console.warn('Cannot access iframe content:', e);
597
+ console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
568
598
  }
569
599
  }
570
600
  return allElements.filter(element => {
@@ -611,7 +641,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
611
641
  });
612
642
  const tableData = [];
613
643
  const nonTableData = [];
614
- // Process table data with both iframe and shadow DOM support
644
+ // Process table data with support for iframes, frames, and shadow DOM
615
645
  for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
616
646
  const container = containers[containerIndex];
617
647
  const { tableFields } = containerFields[containerIndex];
@@ -619,13 +649,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
619
649
  const firstField = Object.values(tableFields)[0];
620
650
  const firstElement = queryElement(container, firstField.selector);
621
651
  let tableContext = firstElement;
622
- // Find table context including both iframe and shadow DOM
652
+ // Find table context including iframe, frame and shadow DOM
623
653
  while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
624
654
  if (tableContext.getRootNode() instanceof ShadowRoot) {
625
655
  tableContext = tableContext.getRootNode().host;
626
656
  continue;
627
657
  }
628
- if (tableContext.tagName === 'IFRAME') {
658
+ if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
629
659
  try {
630
660
  tableContext = tableContext.contentDocument.body;
631
661
  }
@@ -646,14 +676,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
646
676
  if (tableContext.shadowRoot) {
647
677
  rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
648
678
  }
649
- // Get rows from iframes
650
- if (tableContext.tagName === 'IFRAME') {
679
+ // Get rows from iframes and frames
680
+ if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
651
681
  try {
652
- const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
653
- rows.push(...iframeDoc.getElementsByTagName('TR'));
682
+ const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
683
+ rows.push(...frameDoc.getElementsByTagName('TR'));
654
684
  }
655
685
  catch (e) {
656
- console.warn('Cannot access iframe rows:', e);
686
+ console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
657
687
  }
658
688
  }
659
689
  const processedRows = filterRowsBasedOnTag(rows, tableFields);
@@ -713,7 +743,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
713
743
  }
714
744
  }
715
745
  }
716
- // Process non-table data with both contexts support
746
+ // Process non-table data with all contexts support
717
747
  for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
718
748
  if (nonTableData.length >= limit)
719
749
  break;
@@ -234,6 +234,14 @@ class Interpreter extends events_1.EventEmitter {
234
234
  parsedSuperset[key] = Array.isArray(superset[key])
235
235
  ? (0, utils_1.arrayToObject)(superset[key])
236
236
  : superset[key];
237
+ if ((key === 'url' || key === 'selectors') &&
238
+ Array.isArray(value) && Array.isArray(superset[key]) &&
239
+ value.length === 0 && superset[key].length === 0) {
240
+ return true;
241
+ }
242
+ if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
243
+ return value.some(selector => superset[key].includes(selector));
244
+ }
237
245
  // Every `subset` key must exist in the `superset` and
238
246
  // have the same value (strict equality), or subset[key] <= superset[key]
239
247
  return parsedSuperset[key]
@@ -492,29 +500,45 @@ class Interpreter extends events_1.EventEmitter {
492
500
  return false;
493
501
  };
494
502
  // Enhanced button finder with retry mechanism
495
- const findWorkingButton = (selectors, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
496
- for (const selector of selectors) {
497
- try {
498
- const button = yield page.waitForSelector(selector, {
499
- state: 'attached',
500
- timeout: 10000 // Reduced timeout for faster checks
501
- });
502
- if (button) {
503
- debugLog('Found working selector:', selector);
504
- return { button, workingSelector: selector };
503
+ const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
504
+ let updatedSelectors = [...selectors];
505
+ for (let i = 0; i < selectors.length; i++) {
506
+ const selector = selectors[i];
507
+ let retryCount = 0;
508
+ let selectorSuccess = false;
509
+ while (retryCount < MAX_RETRIES && !selectorSuccess) {
510
+ try {
511
+ const button = yield page.waitForSelector(selector, {
512
+ state: 'attached',
513
+ timeout: 10000
514
+ });
515
+ if (button) {
516
+ debugLog('Found working selector:', selector);
517
+ return {
518
+ button,
519
+ workingSelector: selector,
520
+ updatedSelectors
521
+ };
522
+ }
523
+ }
524
+ catch (error) {
525
+ retryCount++;
526
+ debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
527
+ if (retryCount < MAX_RETRIES) {
528
+ yield page.waitForTimeout(RETRY_DELAY);
529
+ }
530
+ else {
531
+ debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
532
+ updatedSelectors = updatedSelectors.filter(s => s !== selector);
533
+ }
505
534
  }
506
535
  }
507
- catch (error) {
508
- debugLog(`Selector failed: ${selector}`);
509
- }
510
- }
511
- // Implement retry mechanism when no selectors work
512
- if (selectors.length > 0 && retryCount < MAX_RETRIES) {
513
- debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
514
- yield page.waitForTimeout(RETRY_DELAY);
515
- return findWorkingButton(selectors, retryCount + 1);
516
536
  }
517
- return { button: null, workingSelector: null };
537
+ return {
538
+ button: null,
539
+ workingSelector: null,
540
+ updatedSelectors
541
+ };
518
542
  });
519
543
  const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
520
544
  try {
@@ -566,7 +590,8 @@ class Interpreter extends events_1.EventEmitter {
566
590
  yield scrapeCurrentPage();
567
591
  if (checkLimit())
568
592
  return allResults;
569
- const { button, workingSelector } = yield findWorkingButton(availableSelectors);
593
+ const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
594
+ availableSelectors = updatedSelectors;
570
595
  if (!button || !workingSelector) {
571
596
  // Final retry for navigation when no selectors work
572
597
  const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
@@ -583,7 +608,6 @@ class Interpreter extends events_1.EventEmitter {
583
608
  return allResults;
584
609
  break;
585
610
  }
586
- availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
587
611
  let retryCount = 0;
588
612
  let navigationSuccess = false;
589
613
  while (retryCount < MAX_RETRIES && !navigationSuccess) {
@@ -648,17 +672,21 @@ class Interpreter extends events_1.EventEmitter {
648
672
  break;
649
673
  }
650
674
  case 'clickLoadMore': {
675
+ yield scrapeCurrentPage();
676
+ if (checkLimit())
677
+ return allResults;
678
+ let loadMoreCounter = 0;
679
+ let previousResultCount = allResults.length;
680
+ let noNewItemsCounter = 0;
681
+ const MAX_NO_NEW_ITEMS = 2;
651
682
  while (true) {
652
- // Find working button with retry mechanism, consistent with clickNext
653
- const { button: loadMoreButton, workingSelector } = yield findWorkingButton(availableSelectors);
683
+ // Find working button with retry mechanism
684
+ const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
685
+ availableSelectors = updatedSelectors;
654
686
  if (!workingSelector || !loadMoreButton) {
655
687
  debugLog('No working Load More selector found after retries');
656
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
657
- allResults = allResults.concat(finalResults);
658
688
  return allResults;
659
689
  }
660
- // Update available selectors to start from the working one
661
- availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
662
690
  // Implement retry mechanism for clicking the button
663
691
  let retryCount = 0;
664
692
  let clickSuccess = false;
@@ -682,6 +710,8 @@ class Interpreter extends events_1.EventEmitter {
682
710
  }
683
711
  if (clickSuccess) {
684
712
  yield page.waitForTimeout(1000);
713
+ loadMoreCounter++;
714
+ debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
685
715
  }
686
716
  }
687
717
  catch (error) {
@@ -695,8 +725,6 @@ class Interpreter extends events_1.EventEmitter {
695
725
  }
696
726
  if (!clickSuccess) {
697
727
  debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
698
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
699
- allResults = allResults.concat(finalResults);
700
728
  return allResults;
701
729
  }
702
730
  // Wait for content to load and check scroll height
@@ -704,19 +732,30 @@ class Interpreter extends events_1.EventEmitter {
704
732
  yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
705
733
  yield page.waitForTimeout(2000);
706
734
  const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
707
- if (currentHeight === previousHeight) {
735
+ const heightChanged = currentHeight !== previousHeight;
736
+ previousHeight = currentHeight;
737
+ yield scrapeCurrentPage();
738
+ const currentResultCount = allResults.length;
739
+ const newItemsAdded = currentResultCount > previousResultCount;
740
+ if (!newItemsAdded) {
741
+ noNewItemsCounter++;
742
+ debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
743
+ if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
744
+ debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
745
+ return allResults;
746
+ }
747
+ }
748
+ else {
749
+ noNewItemsCounter = 0;
750
+ previousResultCount = currentResultCount;
751
+ }
752
+ if (checkLimit())
753
+ return allResults;
754
+ if (!heightChanged) {
708
755
  debugLog('No more items loaded after Load More');
709
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
710
- allResults = allResults.concat(finalResults);
711
756
  return allResults;
712
757
  }
713
- previousHeight = currentHeight;
714
- if (config.limit && allResults.length >= config.limit) {
715
- allResults = allResults.slice(0, config.limit);
716
- break;
717
- }
718
758
  }
719
- break;
720
759
  }
721
760
  default: {
722
761
  yield scrapeCurrentPage();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.11",
3
+ "version": "0.0.13",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",