mx-cloud 0.0.11 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -359,20 +359,170 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
359
359
  */
360
360
  window.scrapeList = function (_a) {
361
361
  return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
362
- var _b;
363
362
  // XPath evaluation functions
364
- const evaluateXPath = (rootElement, xpath) => {
363
// [review note] queryInsideContext(context, part) is new in this release. It resolves ONE XPath
// step by hand: parseXPathPart(part) yields { tagName, conditions }, context.querySelectorAll(tagName)
// collects the candidates, and elementMatchesConditions(el, conditions) filters them.
// querySelectorAll returns descendants at any depth, so a step is matched like "//tag"
// rather than as a direct child ("/tag").
// Returns an array of matching elements; returns [] when nothing matches or when anything throws
// (the error is logged with console.error).
// The "-" lines interleaved below are the body of the previous evaluateXPath, which resolved the
// owner document of rootElement before calling evaluate; this release removes that logic.
+ const queryInsideContext = (context, part) => {
365
364
  try {
366
- const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
367
- ? rootElement
368
- : rootElement.ownerDocument;
369
- if (!ownerDoc)
370
- return null;
371
- const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
372
- return result.singleNodeValue;
365
+ const { tagName, conditions } = parseXPathPart(part);
366
+ const candidateElements = Array.from(context.querySelectorAll(tagName));
367
+ if (candidateElements.length === 0) {
368
+ return [];
369
+ }
370
+ const matchingElements = candidateElements.filter((el) => {
371
+ return elementMatchesConditions(el, conditions);
372
+ });
373
+ return matchingElements;
373
374
  }
374
- catch (error) {
375
- console.warn('XPath evaluation failed:', xpath, error);
375
+ catch (err) {
376
+ console.error("Error in queryInsideContext:", err);
377
+ return [];
378
+ }
379
+ };
380
/**
 * Splits a single XPath step (e.g. `div[@class="card"][2]`) into its tag name
 * and the raw text of each `[...]` predicate.
 *
 * @param {string} part - One step of an XPath, without the surrounding slashes.
 * @returns {{ tagName: string, conditions: string[] }} `tagName` is `"*"` when the
 *   step does not start with a name; `conditions` holds each predicate without its
 *   enclosing brackets, in source order.
 */
const parseXPathPart = (part) => {
  // Tolerate non-string / padded input instead of throwing inside a scrape loop.
  const step = typeof part === "string" ? part.trim() : "";

  // Underscores are legal in element names, so accept them alongside the
  // letters, digits and hyphens that were already recognised.
  const tagMatch = step.match(/^([a-zA-Z0-9_-]+)/);
  const tagName = tagMatch ? tagMatch[1] : "*";

  // A predicate runs to its closing bracket, but a `]` that sits inside a quoted
  // string literal belongs to the value (e.g. [@title="a]b"]) and must not end it.
  const conditionMatches = step.match(/\[((?:"[^"]*"|'[^']*'|[^\]"'])+)\]/g);
  const conditions = conditionMatches
    ? conditionMatches.map((predicate) => predicate.slice(1, -1))
    : [];

  return { tagName, conditions };
};
390
/**
 * Tells whether `element` satisfies every predicate in `conditions`.
 * An empty list is trivially satisfied.
 *
 * @param {Element} element - Candidate element.
 * @param {string[]} conditions - Predicate texts as produced by parseXPathPart.
 * @returns {boolean} true only when no predicate rejects the element.
 */
const elementMatchesConditions = (element, conditions) =>
  conditions.every((predicate) => elementMatchesCondition(element, predicate));
399
/**
 * Evaluates one XPath predicate (the text between `[` and `]`) against an element.
 *
 * Supported forms:
 *   - `3`                          positional index; always accepted here because the
 *                                  position is applied by the caller
 *   - `@attr="v"`                  exact attribute match
 *   - `contains(@attr, "v")`       substring match on the attribute value (including @class)
 *   - `text()="v"`                 trimmed textContent equals v
 *   - `contains(text(), "v")`      trimmed textContent contains v
 *   - `count(*)=N`                 element has exactly N child elements
 * String literals may use either quote style and may contain the other quote
 * character or be empty. Any predicate that is not recognised is treated as
 * satisfied, so an unsupported expression never filters elements out.
 *
 * @param {Element} element - Element under test.
 * @param {string} condition - Predicate text without the enclosing brackets.
 * @returns {boolean} whether the element satisfies the predicate.
 */
const elementMatchesCondition = (element, condition) => {
  const predicate = condition.trim();

  // The literal is captured in one of two adjacent groups, depending on the quote used.
  const quotedValue = (match, firstGroup) =>
    match[firstGroup] !== undefined ? match[firstGroup] : match[firstGroup + 1];
  const trimmedText = () => (element.textContent || "").trim();

  // Positional predicates are resolved by the caller.
  if (/^\d+$/.test(predicate)) {
    return true;
  }

  // @attribute="value"
  const attrEquals = predicate.match(/^@([^=]+?)\s*=\s*(?:"([^"]*)"|'([^']*)')$/);
  if (attrEquals) {
    return element.getAttribute(attrEquals[1].trim()) === quotedValue(attrEquals, 2);
  }

  // contains(@attribute, 'value') — XPath contains() is a substring test, and that
  // holds for @class too: contains(@class, 'btn') must match class="btn-primary".
  const attrContains = predicate.match(/^contains\(@([^,]+),\s*(?:"([^"]*)"|'([^']*)')\)$/);
  if (attrContains) {
    const actual = element.getAttribute(attrContains[1].trim()) || "";
    return actual.includes(quotedValue(attrContains, 2));
  }

  // text()="value"
  const textEquals = predicate.match(/^text\(\)=(?:"([^"]*)"|'([^']*)')$/);
  if (textEquals) {
    return trimmedText() === quotedValue(textEquals, 1);
  }

  // contains(text(), 'value')
  const textContains = predicate.match(/^contains\(text\(\),\s*(?:"([^"]*)"|'([^']*)')\)$/);
  if (textContains) {
    return trimmedText().includes(quotedValue(textContains, 1));
  }

  // count(*)=N (N = 0 means "has no child elements")
  const childCount = predicate.match(/^count\(\*\)=(\d+)$/);
  if (childCount) {
    return element.children.length === parseInt(childCount[1], 10);
  }

  // Unsupported predicate: stay permissive rather than dropping the element.
  return true;
};
452
/**
 * Resolves an XPath expression to a single node.
 *
 * Regular mode (`isShadow` false) delegates to the browser's native XPath engine.
 * The root may be a Document, an Element or a ShadowRoot: only documents expose
 * `evaluate`, so the owning document is looked up and the root is passed as the
 * context node. (Calling `evaluate` directly on the root threw for every
 * non-document root and made those lookups always return null.)
 *
 * Shadow mode (`isShadow` true) walks the path manually, one step at a time,
 * using queryInsideContext and descending into any `shadowRoot` of matched
 * elements, because native XPath does not cross shadow boundaries.
 * Limitations of the manual walk: the path is split on "/", so a predicate that
 * itself contains "/" is not supported, and each step matches descendants at any depth.
 *
 * @param {Document|Element|ShadowRoot} rootElement - Context to evaluate against.
 * @param {string} xpath - XPath expression; may use the `(path)[n]rest` indexed form.
 * @param {boolean} [isShadow=false] - Use the manual, shadow-DOM-aware traversal.
 * @returns {Node|null} The first (or indexed) matching node, or null when nothing
 *   matches or evaluation fails (failures are logged with console.error).
 */
const evaluateXPath = (rootElement, xpath, isShadow = false) => {
  try {
    if (!isShadow) {
      const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
        ? rootElement
        : rootElement.ownerDocument;
      if (!ownerDoc) {
        return null;
      }
      return ownerDoc.evaluate(xpath, rootElement, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
    }

    // "(//ul/li)[2]/a" -> path "//ul/li/a", with the index applied to the final result set.
    const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/);
    const cleanPath = indexedMatch ? indexedMatch[1] + indexedMatch[3] : xpath;
    const pathParts = cleanPath
      .replace(/^\/\//, "")
      .split("/")
      .map((p) => p.trim())
      .filter((p) => p.length > 0);

    let currentContexts = [rootElement];
    for (const part of pathParts) {
      // A bare "tag[n]" step selects the n-th match inside each context.
      const positionalMatch = part.match(/^([^[]+)\[(\d+)\]$/);
      const partWithoutPosition = positionalMatch ? positionalMatch[1] : part;
      const requestedPosition = positionalMatch
        ? parseInt(positionalMatch[2], 10)
        : null;

      const nextContexts = [];
      for (const ctx of currentContexts) {
        const matched = queryInsideContext(ctx, partWithoutPosition);
        let elementsToAdd = matched;
        if (requestedPosition !== null) {
          const index = requestedPosition - 1; // XPath is 1-based, arrays are 0-based
          if (index >= 0 && index < matched.length) {
            elementsToAdd = [matched[index]];
          }
          else {
            console.warn(`Position ${requestedPosition} out of range (${matched.length} elements found)`);
            elementsToAdd = [];
          }
        }
        for (const el of elementsToAdd) {
          nextContexts.push(el);
          // Let the next step also search inside the element's shadow tree.
          if (el.shadowRoot) {
            nextContexts.push(el.shadowRoot);
          }
        }
      }
      if (nextContexts.length === 0) {
        return null;
      }
      currentContexts = nextContexts;
    }

    if (indexedMatch) {
      const requestedIndex = parseInt(indexedMatch[2], 10) - 1;
      if (requestedIndex >= 0 && requestedIndex < currentContexts.length) {
        return currentContexts[requestedIndex];
      }
      console.warn(`Requested index ${requestedIndex + 1} out of range (${currentContexts.length} elements found)`);
      return null;
    }
    return currentContexts[0];
  }
  catch (err) {
    console.error("Critical XPath failure:", xpath, err);
    return null;
  }
};
@@ -394,33 +544,41 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
394
544
  return elements;
395
545
  }
396
546
  catch (error) {
397
- console.warn('XPath evaluation failed:', xpath, error);
547
+ console.warn("XPath evaluation failed:", xpath, error);
398
548
  return [];
399
549
  }
400
550
  };
401
- // Enhanced query function to handle iframe, frame, shadow DOM, and XPath
551
/**
 * Decides whether a selector string should be treated as XPath rather than CSS.
 *
 * Recognised XPath shapes: absolute ("/html", "//div"), relative ("./a", ".//a")
 * and parenthesised/indexed ("(//li)[2]/a"). A CSS selector can never start with
 * "/", "./" or "(", so these prefixes are unambiguous.
 *
 * @param {string} selector - Selector text to classify.
 * @returns {boolean} true for XPath; false for CSS or for non-string input.
 */
const isXPathSelector = (selector) => {
  if (typeof selector !== "string") {
    return false;
  }
  const candidate = selector.trim();
  // "/" also covers "//"; "(" covers the indexed form built by createIndexedXPath.
  return (candidate.startsWith("/") ||
    candidate.startsWith("./") ||
    candidate.startsWith("("));
};
557
+ // Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
402
558
  const queryElement = (rootElement, selector) => {
403
- if (!selector.includes('>>') && !selector.includes(':>>')) {
559
+ if (!selector.includes(">>") && !selector.includes(":>>")) {
404
560
  // Check if it's an XPath selector
405
- if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
561
+ if (isXPathSelector(selector)) {
406
562
  return evaluateXPath(rootElement, selector);
407
563
  }
408
564
  else {
409
565
  return rootElement.querySelector(selector);
410
566
  }
411
567
  }
412
- const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
568
+ const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
413
569
  let currentElement = rootElement;
414
570
  for (let i = 0; i < parts.length; i++) {
415
571
  if (!currentElement)
416
572
  return null;
417
573
  // Handle iframe and frame traversal
418
- if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
574
+ if (currentElement.tagName === "IFRAME" ||
575
+ currentElement.tagName === "FRAME") {
419
576
  try {
420
- const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
577
+ const frameDoc = currentElement.contentDocument ||
578
+ currentElement.contentWindow.document;
421
579
  if (!frameDoc)
422
580
  return null;
423
- if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
581
+ if (isXPathSelector(parts[i])) {
424
582
  currentElement = evaluateXPath(frameDoc, parts[i]);
425
583
  }
426
584
  else {
@@ -434,9 +592,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
434
592
  }
435
593
  }
436
594
  let nextElement = null;
437
- if ('querySelector' in currentElement) {
438
- // Handle XPath vs CSS selector
439
- if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
595
+ // Try regular DOM first
596
+ if ("querySelector" in currentElement) {
597
+ if (isXPathSelector(parts[i])) {
440
598
  nextElement = evaluateXPath(currentElement, parts[i]);
441
599
  }
442
600
  else {
@@ -444,8 +602,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
444
602
  }
445
603
  }
446
604
  // Try shadow DOM if not found
447
- if (!nextElement && 'shadowRoot' in currentElement && currentElement.shadowRoot) {
448
- if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
605
+ if (!nextElement &&
606
+ "shadowRoot" in currentElement &&
607
+ currentElement.shadowRoot) {
608
+ if (isXPathSelector(parts[i])) {
449
609
  nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
450
610
  }
451
611
  else {
@@ -453,11 +613,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
453
613
  }
454
614
  }
455
615
  // Check children's shadow roots if still not found
456
- if (!nextElement && 'children' in currentElement) {
616
+ if (!nextElement && "children" in currentElement) {
457
617
  const children = Array.from(currentElement.children || []);
458
618
  for (const child of children) {
459
619
  if (child.shadowRoot) {
460
- if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
620
+ if (isXPathSelector(parts[i])) {
461
621
  nextElement = evaluateXPath(child.shadowRoot, parts[i]);
462
622
  }
463
623
  else {
@@ -472,28 +632,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
472
632
  }
473
633
  return currentElement;
474
634
  };
475
- // Enhanced query all function for XPath and CSS selectors
635
+ // Enhanced query all function for both contexts
476
636
  const queryElementAll = (rootElement, selector) => {
477
- if (!selector.includes('>>') && !selector.includes(':>>')) {
478
- // Check if it's an XPath selector
479
- if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
637
+ if (!selector.includes(">>") && !selector.includes(":>>")) {
638
+ if (isXPathSelector(selector)) {
480
639
  return evaluateXPathAll(rootElement, selector);
481
640
  }
482
641
  else {
483
642
  return Array.from(rootElement.querySelectorAll(selector));
484
643
  }
485
644
  }
486
- const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
645
+ const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
487
646
  let currentElements = [rootElement];
488
647
  for (const part of parts) {
489
648
  const nextElements = [];
490
649
  for (const element of currentElements) {
491
650
  // Handle iframe and frame traversal
492
- if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
651
+ if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
493
652
  try {
494
653
  const frameDoc = element.contentDocument || element.contentWindow.document;
495
654
  if (frameDoc) {
496
- if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
655
+ if (isXPathSelector(part)) {
497
656
  nextElements.push(...evaluateXPathAll(frameDoc, part));
498
657
  }
499
658
  else {
@@ -509,7 +668,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
509
668
  else {
510
669
  // Regular DOM elements
511
670
  if (element.querySelectorAll) {
512
- if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
671
+ if (isXPathSelector(part)) {
513
672
  nextElements.push(...evaluateXPathAll(element, part));
514
673
  }
515
674
  else {
@@ -518,7 +677,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
518
677
  }
519
678
  // Shadow DOM elements
520
679
  if (element.shadowRoot) {
521
- if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
680
+ if (isXPathSelector(part)) {
522
681
  nextElements.push(...evaluateXPathAll(element.shadowRoot, part));
523
682
  }
524
683
  else {
@@ -529,7 +688,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
529
688
  const children = Array.from(element.children || []);
530
689
  for (const child of children) {
531
690
  if (child.shadowRoot) {
532
- if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
691
+ if (isXPathSelector(part)) {
533
692
  nextElements.push(...evaluateXPathAll(child.shadowRoot, part));
534
693
  }
535
694
  else {
@@ -545,7 +704,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
545
704
  };
546
705
  // Enhanced value extraction with context awareness
547
706
  const extractValue = (element, attribute) => {
548
- var _a, _b, _c, _d, _e, _f;
707
+ var _a, _b, _c, _d, _e;
549
708
  if (!element)
550
709
  return null;
551
710
  // Get context-aware base URL
@@ -557,17 +716,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
557
716
  return shadowContent.trim();
558
717
  }
559
718
  }
560
- if (attribute === 'innerText') {
719
+ if (attribute === "innerText") {
561
720
  // First try standard innerText/textContent
562
721
  let textContent = ((_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim()) || ((_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim());
563
722
  // If empty, check for common data attributes that might contain the text
564
723
  if (!textContent) {
565
724
  const dataAttributes = [
566
- 'data-600',
567
- 'data-text',
568
- 'data-label',
569
- 'data-value',
570
- 'data-content',
725
+ "data-600",
726
+ "data-text",
727
+ "data-label",
728
+ "data-value",
729
+ "data-content",
571
730
  ];
572
731
  for (const attr of dataAttributes) {
573
732
  const dataValue = element.getAttribute(attr);
@@ -579,140 +738,437 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
579
738
  }
580
739
  return textContent || null;
581
740
  }
582
- else if (attribute === 'innerHTML') {
741
+ else if (attribute === "innerHTML") {
583
742
  return ((_e = element.innerHTML) === null || _e === void 0 ? void 0 : _e.trim()) || null;
584
743
  }
585
- else if (attribute === 'href') {
586
- // For href, we need to find the anchor tag if the current element isn't one
587
- let anchorElement = element;
588
- // If current element is not an anchor, look for parent anchor
589
- if (element.tagName !== 'A') {
590
- anchorElement = element.closest('a') || ((_f = element.parentElement) === null || _f === void 0 ? void 0 : _f.closest('a')) || element;
744
+ else if (attribute === "src" || attribute === "href") {
745
+ if (attribute === "href" && element.tagName !== "A") {
746
+ const parentElement = element.parentElement;
747
+ if (parentElement && parentElement.tagName === "A") {
748
+ const parentHref = parentElement.getAttribute("href");
749
+ if (parentHref) {
750
+ try {
751
+ return new URL(parentHref, baseURL).href;
752
+ }
753
+ catch (e) {
754
+ return parentHref;
755
+ }
756
+ }
757
+ }
591
758
  }
592
- const hrefValue = anchorElement.getAttribute('href');
593
- if (!hrefValue || hrefValue.trim() === '') {
759
+ const attrValue = element.getAttribute(attribute);
760
+ const dataAttr = attrValue || element.getAttribute("data-" + attribute);
761
+ if (!dataAttr || dataAttr.trim() === "") {
762
+ if (attribute === "src") {
763
+ const style = window.getComputedStyle(element);
764
+ const bgImage = style.backgroundImage;
765
+ if (bgImage && bgImage !== "none") {
766
+ const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
767
+ return matches ? new URL(matches[1], baseURL).href : null;
768
+ }
769
+ }
594
770
  return null;
595
771
  }
596
772
  try {
597
- return new URL(hrefValue, baseURL).href;
773
+ return new URL(dataAttr, baseURL).href;
598
774
  }
599
775
  catch (e) {
600
- console.warn('Error creating URL from', hrefValue, e);
601
- return hrefValue;
776
+ console.warn("Error creating URL from", dataAttr, e);
777
+ return dataAttr;
602
778
  }
603
779
  }
604
- else if (attribute === 'src') {
605
- const attrValue = element.getAttribute(attribute);
606
- const dataAttr = attrValue || element.getAttribute('data-' + attribute);
607
- if (!dataAttr || dataAttr.trim() === '') {
608
- const style = window.getComputedStyle(element);
609
- const bgImage = style.backgroundImage;
610
- if (bgImage && bgImage !== 'none') {
611
- const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
612
- return matches ? new URL(matches[1], baseURL).href : null;
780
+ return element.getAttribute(attribute);
781
+ };
782
// [review note] findTableAncestor(element) walks outward from `element` looking for a table cell or row.
// Returns { type: "TD" | "TR", element } for the first TD/TR reached, or null when none is found
// before the loop ends, or when reading an iframe's contentDocument throws.
// Details visible in the code below:
//   - The shadow-root hop runs BEFORE the tag check and uses `continue`, so it does not advance
//     `depth`, and a node whose root is a ShadowRoot is never itself tested for TD/TR.
//   - Only the non-shadow iterations count toward MAX_DEPTH (5).
//   - For IFRAME/FRAME the walk moves to contentDocument.body (i.e. into the frame), not to the parent.
// The "-" and two-number context lines are remnants of the removed `src` branch of extractValue.
+ // Enhanced table ancestor finding with context support
783
+ const findTableAncestor = (element) => {
784
+ let currentElement = element;
785
+ const MAX_DEPTH = 5;
786
+ let depth = 0;
787
+ while (currentElement && depth < MAX_DEPTH) {
788
+ // Handle shadow DOM
789
+ if (currentElement.getRootNode() instanceof ShadowRoot) {
790
+ currentElement = currentElement.getRootNode().host;
791
+ continue;
792
+ }
793
+ if (currentElement.tagName === "TD") {
794
+ return { type: "TD", element: currentElement };
795
+ }
796
+ else if (currentElement.tagName === "TR") {
797
+ return { type: "TR", element: currentElement };
798
+ }
799
+ // Handle iframe and frame crossing
800
+ if (currentElement.tagName === "IFRAME" ||
801
+ currentElement.tagName === "FRAME") {
802
+ try {
803
+ currentElement = currentElement.contentDocument.body;
613
804
  }
614
- return null;
805
+ catch (e) {
806
+ return null;
807
+ }
808
+ }
809
+ else {
810
+ currentElement = currentElement.parentElement;
615
811
  }
812
+ depth++;
813
+ }
814
+ return null;
815
+ };
816
/**
 * Computes the position of a table cell.
 *
 * When the cell lives inside a shadow root, the position is its index among all
 * `td` elements of that shadow root; otherwise it is the number of element
 * siblings that precede it.
 *
 * @param {Element} td - The cell to locate.
 * @returns {number} Zero-based index (-1 if the cell is not found in its shadow root).
 */
const getCellIndex = (td) => {
  const root = td.getRootNode();
  if (root instanceof ShadowRoot) {
    return Array.from(root.querySelectorAll("td")).indexOf(td);
  }
  let precedingSiblings = 0;
  for (let prev = td.previousElementSibling; prev; prev = prev.previousElementSibling) {
    precedingSiblings += 1;
  }
  return precedingSiblings;
};
830
/**
 * Reports whether any table field, resolved inside `row`, sits within a TH cell.
 *
 * For each field the selector is resolved with queryElement and the result is
 * walked outward until `row` is reached: shadow-root members hop to their host,
 * IFRAME/FRAME nodes hop to their contentDocument body, everything else moves to
 * its parent element. The walk for a field is abandoned if frame access throws.
 *
 * @param {Element} row - Row used both as query root and as the stop node.
 * @param {Object} tableFields - Map of label -> field descriptor with a `selector`.
 * @returns {boolean} true as soon as a TH is met on any walk, otherwise false.
 */
const hasThElement = (row, tableFields) => {
  for (const { selector } of Object.values(tableFields)) {
    let node = queryElement(row, selector);
    while (node && node !== row) {
      if (node.getRootNode() instanceof ShadowRoot) {
        node = node.getRootNode().host;
      }
      else if (node.tagName === "TH") {
        return true;
      }
      else if (node.tagName === "IFRAME" || node.tagName === "FRAME") {
        try {
          node = node.contentDocument.body;
        }
        catch (e) {
          break;
        }
      }
      else {
        node = node.parentElement;
      }
    }
  }
  return false;
};
859
/**
 * Chooses which table rows to scrape.
 *
 * If any row has a field that resolves inside a TH (per hasThElement), the rows
 * are returned untouched. Otherwise rows containing a TH — either as a
 * descendant or inside the row's own shadow root — are dropped.
 *
 * @param {Element[]} rows - Candidate rows.
 * @param {Object} tableFields - Map of label -> field descriptor.
 * @returns {Element[]} The original array, or a filtered copy without TH rows.
 */
const filterRowsBasedOnTag = (rows, tableFields) => {
  const fieldsTargetHeader = rows.some((row) => hasThElement(row, tableFields));
  if (fieldsTargetHeader) {
    return rows;
  }
  return rows.filter((row) => {
    const noLightTh = row.getElementsByTagName("TH").length === 0;
    const noShadowTh = !row.shadowRoot || row.shadowRoot.querySelector("th") === null;
    return noLightTh && noShadowTh;
  });
};
874
/**
 * Jaccard similarity of two class lists: |A ∩ B| / |A ∪ B|.
 *
 * @param {Iterable<string>} classList1 - Class names of the first element.
 * @param {Iterable<string>} classList2 - Class names of the second element.
 * @returns {number} A value in [0, 1]. Two empty lists yield 0 (no evidence of
 *   similarity) rather than NaN from 0 / 0, so threshold comparisons stay meaningful.
 */
const calculateClassSimilarity = (classList1, classList2) => {
  const set1 = new Set(classList1);
  const set2 = new Set(classList2);
  const union = new Set([...set1, ...set2]);
  if (union.size === 0) {
    return 0;
  }
  const sharedCount = [...set1].filter((name) => set2.has(name)).length;
  return sharedCount / union.size;
};
882
// [review note] findSimilarElements(baseElement, similarityThreshold = 0.7) gathers elements with the
// same tag name as baseElement and keeps those whose class list is similar enough.
//   - Returns [] immediately when baseElement has no classes.
//   - Candidates come from: the main document, the shadow host of baseElement (when its root is a
//     ShadowRoot), and the document of every iframe/frame that can be accessed; inaccessible frames
//     are skipped with a console.warn.
//   - baseElement itself is excluded; the rest are kept when
//     calculateClassSimilarity(baseClasses, element classes) >= similarityThreshold.
// NOTE(review): shadowHost.getElementsByTagName presumably searches the host's regular descendants,
// not the shadow tree itself, and those descendants may already be in the document-wide list
// (possible duplicates) — confirm the intent.
// The "-" and two-number context lines are remnants of the removed tail of extractValue.
+ // Enhanced similar elements finding with context support
883
+ const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
884
+ const baseClasses = Array.from(baseElement.classList);
885
+ if (baseClasses.length === 0)
886
+ return [];
887
+ const allElements = [];
888
+ // Get elements from main document
889
+ allElements.push(...document.getElementsByTagName(baseElement.tagName));
890
+ // Get elements from shadow DOM
891
+ if (baseElement.getRootNode() instanceof ShadowRoot) {
892
+ const shadowHost = baseElement.getRootNode().host;
893
+ allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
894
+ }
895
+ // Get elements from iframes and frames
896
+ const frames = [
897
+ ...Array.from(document.getElementsByTagName("iframe")),
898
+ ...Array.from(document.getElementsByTagName("frame")),
899
+ ];
900
+ for (const frame of frames) {
616
901
  try {
617
- return new URL(dataAttr, baseURL).href;
902
+ const frameDoc = frame.contentDocument || frame.contentWindow.document;
903
+ allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
618
904
  }
619
905
  catch (e) {
620
- console.warn('Error creating URL from', dataAttr, e);
621
- return dataAttr;
906
+ console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
622
907
  }
623
908
  }
624
- return element.getAttribute(attribute);
909
+ return allElements.filter((element) => {
910
+ if (element === baseElement)
911
+ return false;
912
+ const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
913
+ return similarity >= similarityThreshold;
914
+ });
915
+ };
916
/**
 * Resolves a selector, relaxing its first `nth-child(n)` when the exact one misses.
 *
 * Order of attempts: the selector as given; then, if it mentions nth-child, the
 * same selector with the first nth-child index lowered step by step from n-1 to 1;
 * finally the selector with the first `:nth-child(n)` removed altogether.
 *
 * @param {Element|Document} rootElement - Root passed through to queryElement.
 * @param {string} originalSelector - Selector to resolve.
 * @returns {Element|null|undefined} The first element found by any attempt, or the
 *   falsy result of the last attempt.
 */
const tryFallbackSelector = (rootElement, originalSelector) => {
  const direct = queryElement(rootElement, originalSelector);
  if (direct || !originalSelector.includes("nth-child")) {
    return direct;
  }
  const nthMatch = originalSelector.match(/nth-child\((\d+)\)/);
  if (!nthMatch) {
    return direct;
  }
  const startPosition = parseInt(nthMatch[1], 10);
  for (let candidate = startPosition - 1; candidate >= 1; candidate--) {
    const relaxedSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${candidate})`);
    const found = queryElement(rootElement, relaxedSelector);
    if (found) {
      return found;
    }
  }
  // Last resort: drop the positional constraint entirely.
  return queryElement(rootElement, originalSelector.replace(/\:nth-child\(\d+\)/, ""));
};
626
936
  // Create indexed XPath for specific container instance
  // [review note] createIndexedXPath(childSelector, listSelector, containerIndex) returns a string.
  //   - If childSelector contains listSelector (minus its first "//"), that occurrence of
  //     "//<listPattern>" is replaced by "(<listSelector>)[<containerIndex>]".
  //   - Otherwise the result is "(<listSelector>)[<containerIndex>]" followed by childSelector with
  //     its first "//" turned into "/".
  // String.prototype.replace with a string pattern only touches the FIRST occurrence, so later
  // "//" sequences in either selector are left as they are.
  // The visible caller passes containerIndex + 1, i.e. a 1-based index as XPath expects.
  // The "-" lines are debug console.log/console.warn calls removed in this release; the "-"/"+"
  // pairs differ only in quote style.
627
937
  const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
628
- console.log(`Creating indexed XPath for container ${containerIndex}`);
629
- console.log(`Child selector: ${childSelector}`);
630
- console.log(`List selector: ${listSelector}`);
631
938
  // Check if the child selector contains the list selector pattern
632
- if (childSelector.includes(listSelector.replace('//', ''))) {
939
+ if (childSelector.includes(listSelector.replace("//", ""))) {
633
940
  // Replace the list selector part with indexed version
634
- const listPattern = listSelector.replace('//', '');
941
+ const listPattern = listSelector.replace("//", "");
635
942
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;
636
943
  const indexedSelector = childSelector.replace(`//${listPattern}`, indexedListSelector);
637
- console.log(`Generated indexed selector: ${indexedSelector}`);
638
944
  return indexedSelector;
639
945
  }
640
946
  else {
641
947
  // If pattern doesn't match, create a more generic indexed selector
642
- console.warn(`Pattern doesn't match, using fallback approach`);
643
- return `(${listSelector})[${containerIndex}]${childSelector.replace('//', '/')}`;
948
+ return `(${listSelector})[${containerIndex}]${childSelector.replace("//", "/")}`;
644
949
  }
645
950
  };
646
- // Main scraping logic
647
- console.log('🚀 Starting list data extraction');
648
- console.log('List Selector:', listSelector);
649
- console.log('Fields:', fields);
650
- // Step 1: Get all container elements matching the list selector
651
- const containers = queryElementAll(document, listSelector);
652
- console.log(`📦 Found ${containers.length} list containers`);
951
+ // Main scraping logic with unified support for both CSS and XPath
952
+ console.log("🚀 Starting unified list data extraction");
953
+ console.log("List Selector:", listSelector);
954
+ console.log("Fields:", fields);
955
+ let containers = queryElementAll(document, listSelector);
956
+ containers = Array.from(containers);
653
957
  if (containers.length === 0) {
654
- console.warn('❌ No containers found for listSelector:', listSelector);
958
+ console.warn("❌ No containers found for listSelector:", listSelector);
655
959
  return [];
656
960
  }
657
- // Step 2: Extract data from each container up to the limit
658
- const extractedData = [];
659
- const containersToProcess = Math.min(containers.length, limit);
660
- console.log(`🔄 Processing ${containersToProcess} containers...`);
661
- for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
662
- const container = containers[containerIndex];
663
- const record = {};
664
- console.log(`\n📋 Processing container ${containerIndex + 1}/${containersToProcess}`);
665
- // Step 3: For each field, extract data from the current container
666
- for (const [label, field] of Object.entries(fields)) {
667
- console.log(`\n 🔍 Extracting field "${label}"`);
668
- console.log(` Original selector: ${field.selector}`);
669
- console.log(` Attribute: ${field.attribute}`);
670
- let element = null;
671
- // Handle XPath selectors with container indexing
672
- if (field.selector.startsWith('//')) {
673
- // Create indexed absolute XPath
674
- const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
675
- console.log(` 📍 Indexed selector: ${indexedSelector}`);
676
- element = evaluateXPath(document, indexedSelector);
677
- console.log(` 📍 Indexed XPath result: ${element ? 'FOUND' : 'NOT FOUND'}`);
961
+ console.log(`📦 Found ${containers.length} list containers`);
962
+ // For CSS selectors, try to find similar containers if needed
963
+ if (!isXPathSelector(listSelector) &&
964
+ limit > 1 &&
965
+ containers.length === 1) {
966
+ const baseContainer = containers[0];
967
+ const similarContainers = findSimilarElements(baseContainer);
968
+ if (similarContainers.length > 0) {
969
+ const newContainers = similarContainers.filter((container) => !container.matches(listSelector));
970
+ containers = [...containers, ...newContainers];
971
+ }
972
+ }
973
+ const containerFields = containers.map(() => ({
974
+ tableFields: {},
975
+ nonTableFields: {},
976
+ }));
977
+ // For XPath selectors, use the new approach
978
+ if (isXPathSelector(listSelector)) {
979
+ const extractedData = [];
980
+ const containersToProcess = Math.min(containers.length, limit);
981
+ for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
982
+ const record = {};
983
+ for (const [label, field] of Object.entries(fields)) {
984
+ let element = null;
985
+ if (isXPathSelector(field.selector)) {
986
+ // Create indexed absolute XPath
987
+ const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
988
+ element = evaluateXPath(document, indexedSelector, field.isShadow);
989
+ }
990
+ else {
991
+ // Fallback for CSS selectors within XPath containers
992
+ const container = containers[containerIndex];
993
+ element = queryElement(container, field.selector);
994
+ }
678
995
  if (element) {
679
- console.log(` 📍 Found element text: "${(_b = element.textContent) === null || _b === void 0 ? void 0 : _b.trim()}"`);
996
+ const value = extractValue(element, field.attribute);
997
+ if (value !== null && value !== "") {
998
+ record[label] = value;
999
+ }
1000
+ else {
1001
+ record[label] = "";
1002
+ }
1003
+ }
1004
+ else {
1005
+ record[label] = "";
680
1006
  }
681
1007
  }
682
- else {
683
- // Fallback for non-XPath selectors - search within container
684
- element = queryElement(container, field.selector);
1008
+ if (Object.values(record).some((value) => value !== "")) {
1009
+ extractedData.push(record);
685
1010
  }
686
- // Step 4: Extract the value from the found element
687
- if (element) {
688
- const value = extractValue(element, field.attribute);
689
- if (value !== null && value !== '') {
690
- record[label] = value;
691
- console.log(` ✅ Extracted "${label}": "${value}"`);
1011
+ }
1012
+ console.log(`📊 Total records extracted: ${extractedData.length}`);
1013
+ return extractedData;
1014
+ }
1015
+ // For CSS selectors, use the original table-aware approach
1016
+ containers.forEach((container, containerIndex) => {
1017
+ for (const [label, field] of Object.entries(fields)) {
1018
+ const sampleElement = queryElement(container, field.selector);
1019
+ if (sampleElement) {
1020
+ const ancestor = findTableAncestor(sampleElement);
1021
+ if (ancestor) {
1022
+ containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1 });
692
1023
  }
693
1024
  else {
694
- console.warn(` ⚠️ Empty value for "${label}"`);
695
- record[label] = '';
1025
+ containerFields[containerIndex].nonTableFields[label] = field;
696
1026
  }
697
1027
  }
698
1028
  else {
699
- console.warn(` ❌ Element not found for "${label}"`);
700
- record[label] = '';
1029
+ containerFields[containerIndex].nonTableFields[label] = field;
701
1030
  }
702
1031
  }
703
- // Step 5: Add record if it has any non-empty values
704
- if (Object.values(record).some(value => value !== '')) {
705
- extractedData.push(record);
706
- console.log(` ✅ Added record ${containerIndex + 1}:`, record);
1032
+ });
1033
+ const tableData = [];
1034
+ const nonTableData = [];
1035
+ // Process table data with support for iframes, frames, and shadow DOM
1036
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
1037
+ const container = containers[containerIndex];
1038
+ const { tableFields } = containerFields[containerIndex];
1039
+ if (Object.keys(tableFields).length > 0) {
1040
+ const firstField = Object.values(tableFields)[0];
1041
+ const firstElement = queryElement(container, firstField.selector);
1042
+ let tableContext = firstElement;
1043
+ // Find table context including iframe, frame and shadow DOM
1044
+ while (tableContext &&
1045
+ tableContext.tagName !== "TABLE" &&
1046
+ tableContext !== container) {
1047
+ if (tableContext.getRootNode() instanceof ShadowRoot) {
1048
+ tableContext = tableContext.getRootNode().host;
1049
+ continue;
1050
+ }
1051
+ if (tableContext.tagName === "IFRAME" ||
1052
+ tableContext.tagName === "FRAME") {
1053
+ try {
1054
+ tableContext = tableContext.contentDocument.body;
1055
+ }
1056
+ catch (e) {
1057
+ break;
1058
+ }
1059
+ }
1060
+ else {
1061
+ tableContext = tableContext.parentElement;
1062
+ }
1063
+ }
1064
+ if (tableContext) {
1065
+ // Get rows from all contexts
1066
+ const rows = [];
1067
+ // Get rows from regular DOM
1068
+ rows.push(...tableContext.getElementsByTagName("TR"));
1069
+ // Get rows from shadow DOM
1070
+ if (tableContext.shadowRoot) {
1071
+ rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
1072
+ }
1073
+ // Get rows from iframes and frames
1074
+ if (tableContext.tagName === "IFRAME" ||
1075
+ tableContext.tagName === "FRAME") {
1076
+ try {
1077
+ const frameDoc = tableContext.contentDocument ||
1078
+ tableContext.contentWindow.document;
1079
+ rows.push(...frameDoc.getElementsByTagName("TR"));
1080
+ }
1081
+ catch (e) {
1082
+ console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
1083
+ }
1084
+ }
1085
+ const processedRows = filterRowsBasedOnTag(rows, tableFields);
1086
+ for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
1087
+ const record = {};
1088
+ const currentRow = processedRows[rowIndex];
1089
+ for (const [label, { selector, attribute, cellIndex },] of Object.entries(tableFields)) {
1090
+ let element = null;
1091
+ if (cellIndex >= 0) {
1092
+ // Get TD element considering both contexts
1093
+ let td = currentRow.children[cellIndex];
1094
+ // Check shadow DOM for td
1095
+ if (!td && currentRow.shadowRoot) {
1096
+ const shadowCells = currentRow.shadowRoot.children;
1097
+ if (shadowCells && shadowCells.length > cellIndex) {
1098
+ td = shadowCells[cellIndex];
1099
+ }
1100
+ }
1101
+ if (td) {
1102
+ element = queryElement(td, selector);
1103
+ if (!element &&
1104
+ selector
1105
+ .split(/(?:>>|:>>)/)
1106
+ .pop()
1107
+ .includes("td:nth-child")) {
1108
+ element = td;
1109
+ }
1110
+ if (!element) {
1111
+ const tagOnlySelector = selector.split(".")[0];
1112
+ element = queryElement(td, tagOnlySelector);
1113
+ }
1114
+ if (!element) {
1115
+ let currentElement = td;
1116
+ while (currentElement &&
1117
+ currentElement.children.length > 0) {
1118
+ let foundContentChild = false;
1119
+ for (const child of currentElement.children) {
1120
+ if (extractValue(child, attribute)) {
1121
+ currentElement = child;
1122
+ foundContentChild = true;
1123
+ break;
1124
+ }
1125
+ }
1126
+ if (!foundContentChild)
1127
+ break;
1128
+ }
1129
+ element = currentElement;
1130
+ }
1131
+ }
1132
+ }
1133
+ else {
1134
+ element = queryElement(currentRow, selector);
1135
+ }
1136
+ if (element) {
1137
+ record[label] = extractValue(element, attribute);
1138
+ }
1139
+ }
1140
+ if (Object.keys(record).length > 0) {
1141
+ tableData.push(record);
1142
+ }
1143
+ }
1144
+ }
707
1145
  }
708
- else {
709
- console.warn(` ⚠️ Skipping empty record for container ${containerIndex + 1}`);
1146
+ }
1147
+ // Process non-table data with all contexts support
1148
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
1149
+ if (nonTableData.length >= limit)
1150
+ break;
1151
+ const container = containers[containerIndex];
1152
+ const { nonTableFields } = containerFields[containerIndex];
1153
+ if (Object.keys(nonTableFields).length > 0) {
1154
+ const record = {};
1155
+ for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
1156
+ // Get the last part of the selector after any context delimiter
1157
+ const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
1158
+ const element = tryFallbackSelector(container, relativeSelector);
1159
+ if (element) {
1160
+ record[label] = extractValue(element, attribute);
1161
+ }
1162
+ }
1163
+ if (Object.keys(record).length > 0) {
1164
+ nonTableData.push(record);
1165
+ }
710
1166
  }
711
1167
  }
712
- console.log('\n🎉 Extraction complete!');
713
- console.log(`📊 Total records extracted: ${extractedData.length}`);
714
- console.log('📋 All records:', extractedData);
715
- return extractedData;
1168
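+ // Merge table and non-table results; `limit` was already applied while collecting each list, so the combined array is not truncated here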
1169
+ const scrapedData = [...tableData, ...nonTableData];
1170
+ console.log(`📊 Total records extracted: ${scrapedData.length}`);
1171
+ return scrapedData;
716
1172
  });
717
1173
  };
718
1174
  /**
@@ -829,9 +829,9 @@ class Interpreter extends events_1.EventEmitter {
829
829
  if (checkLimit())
830
830
  return allResults;
831
831
  let loadMoreCounter = 0;
832
- let previousResultCount = allResults.length;
833
- let noNewItemsCounter = 0;
834
- const MAX_NO_NEW_ITEMS = 2;
832
+ // let previousResultCount = allResults.length;
833
+ // let noNewItemsCounter = 0;
834
+ // const MAX_NO_NEW_ITEMS = 2;
835
835
  while (true) {
836
836
  // Find working button with retry mechanism
837
837
  const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
@@ -888,20 +888,19 @@ class Interpreter extends events_1.EventEmitter {
888
888
  const heightChanged = currentHeight !== previousHeight;
889
889
  previousHeight = currentHeight;
890
890
  yield scrapeCurrentPage();
891
- const currentResultCount = allResults.length;
892
- const newItemsAdded = currentResultCount > previousResultCount;
893
- if (!newItemsAdded) {
894
- noNewItemsCounter++;
895
- debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
896
- if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
897
- debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
898
- return allResults;
899
- }
900
- }
901
- else {
902
- noNewItemsCounter = 0;
903
- previousResultCount = currentResultCount;
904
- }
891
+ // const currentResultCount = allResults.length;
892
+ // const newItemsAdded = currentResultCount > previousResultCount;
893
+ // if (!newItemsAdded) {
894
+ // noNewItemsCounter++;
895
+ // debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
896
+ // if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
897
+ // debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
898
+ // return allResults;
899
+ // }
900
+ // } else {
901
+ // noNewItemsCounter = 0;
902
+ // previousResultCount = currentResultCount;
903
+ // }
905
904
  if (checkLimit())
906
905
  return allResults;
907
906
  if (!heightChanged) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.11",
3
+ "version": "0.0.13",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",