mx-cloud 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -359,7 +359,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
359
359
  */
360
360
  window.scrapeList = function (_a) {
361
361
  return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
362
- var _b;
363
362
  // XPath evaluation functions
364
363
  const evaluateXPath = (rootElement, xpath) => {
365
364
  try {
@@ -372,7 +371,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
372
371
  return result.singleNodeValue;
373
372
  }
374
373
  catch (error) {
375
- console.warn('XPath evaluation failed:', xpath, error);
374
+ console.warn("XPath evaluation failed:", xpath, error);
376
375
  return null;
377
376
  }
378
377
  };
@@ -394,33 +393,41 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
394
393
  return elements;
395
394
  }
396
395
  catch (error) {
397
- console.warn('XPath evaluation failed:', xpath, error);
396
+ console.warn("XPath evaluation failed:", xpath, error);
398
397
  return [];
399
398
  }
400
399
  };
401
- // Enhanced query function to handle iframe, frame, shadow DOM, and XPath
400
+ // Helper function to detect selector type
401
+ const isXPathSelector = (selector) => {
402
+ return (selector.startsWith("//") ||
403
+ selector.startsWith("/") ||
404
+ selector.startsWith("./"));
405
+ };
406
+ // Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
402
407
  const queryElement = (rootElement, selector) => {
403
- if (!selector.includes('>>') && !selector.includes(':>>')) {
408
+ if (!selector.includes(">>") && !selector.includes(":>>")) {
404
409
  // Check if it's an XPath selector
405
- if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
410
+ if (isXPathSelector(selector)) {
406
411
  return evaluateXPath(rootElement, selector);
407
412
  }
408
413
  else {
409
414
  return rootElement.querySelector(selector);
410
415
  }
411
416
  }
412
- const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
417
+ const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
413
418
  let currentElement = rootElement;
414
419
  for (let i = 0; i < parts.length; i++) {
415
420
  if (!currentElement)
416
421
  return null;
417
422
  // Handle iframe and frame traversal
418
- if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
423
+ if (currentElement.tagName === "IFRAME" ||
424
+ currentElement.tagName === "FRAME") {
419
425
  try {
420
- const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
426
+ const frameDoc = currentElement.contentDocument ||
427
+ currentElement.contentWindow.document;
421
428
  if (!frameDoc)
422
429
  return null;
423
- if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
430
+ if (isXPathSelector(parts[i])) {
424
431
  currentElement = evaluateXPath(frameDoc, parts[i]);
425
432
  }
426
433
  else {
@@ -434,9 +441,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
434
441
  }
435
442
  }
436
443
  let nextElement = null;
437
- if ('querySelector' in currentElement) {
438
- // Handle XPath vs CSS selector
439
- if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
444
+ // Try regular DOM first
445
+ if ("querySelector" in currentElement) {
446
+ if (isXPathSelector(parts[i])) {
440
447
  nextElement = evaluateXPath(currentElement, parts[i]);
441
448
  }
442
449
  else {
@@ -444,8 +451,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
444
451
  }
445
452
  }
446
453
  // Try shadow DOM if not found
447
- if (!nextElement && 'shadowRoot' in currentElement && currentElement.shadowRoot) {
448
- if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
454
+ if (!nextElement &&
455
+ "shadowRoot" in currentElement &&
456
+ currentElement.shadowRoot) {
457
+ if (isXPathSelector(parts[i])) {
449
458
  nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
450
459
  }
451
460
  else {
@@ -453,11 +462,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
453
462
  }
454
463
  }
455
464
  // Check children's shadow roots if still not found
456
- if (!nextElement && 'children' in currentElement) {
465
+ if (!nextElement && "children" in currentElement) {
457
466
  const children = Array.from(currentElement.children || []);
458
467
  for (const child of children) {
459
468
  if (child.shadowRoot) {
460
- if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
469
+ if (isXPathSelector(parts[i])) {
461
470
  nextElement = evaluateXPath(child.shadowRoot, parts[i]);
462
471
  }
463
472
  else {
@@ -472,28 +481,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
472
481
  }
473
482
  return currentElement;
474
483
  };
475
- // Enhanced query all function for XPath and CSS selectors
484
+ // Enhanced query all function for both contexts
476
485
  const queryElementAll = (rootElement, selector) => {
477
- if (!selector.includes('>>') && !selector.includes(':>>')) {
478
- // Check if it's an XPath selector
479
- if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
486
+ if (!selector.includes(">>") && !selector.includes(":>>")) {
487
+ if (isXPathSelector(selector)) {
480
488
  return evaluateXPathAll(rootElement, selector);
481
489
  }
482
490
  else {
483
491
  return Array.from(rootElement.querySelectorAll(selector));
484
492
  }
485
493
  }
486
- const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
494
+ const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
487
495
  let currentElements = [rootElement];
488
496
  for (const part of parts) {
489
497
  const nextElements = [];
490
498
  for (const element of currentElements) {
491
499
  // Handle iframe and frame traversal
492
- if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
500
+ if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
493
501
  try {
494
502
  const frameDoc = element.contentDocument || element.contentWindow.document;
495
503
  if (frameDoc) {
496
- if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
504
+ if (isXPathSelector(part)) {
497
505
  nextElements.push(...evaluateXPathAll(frameDoc, part));
498
506
  }
499
507
  else {
@@ -509,7 +517,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
509
517
  else {
510
518
  // Regular DOM elements
511
519
  if (element.querySelectorAll) {
512
- if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
520
+ if (isXPathSelector(part)) {
513
521
  nextElements.push(...evaluateXPathAll(element, part));
514
522
  }
515
523
  else {
@@ -518,7 +526,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
518
526
  }
519
527
  // Shadow DOM elements
520
528
  if (element.shadowRoot) {
521
- if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
529
+ if (isXPathSelector(part)) {
522
530
  nextElements.push(...evaluateXPathAll(element.shadowRoot, part));
523
531
  }
524
532
  else {
@@ -529,7 +537,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
529
537
  const children = Array.from(element.children || []);
530
538
  for (const child of children) {
531
539
  if (child.shadowRoot) {
532
- if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
540
+ if (isXPathSelector(part)) {
533
541
  nextElements.push(...evaluateXPathAll(child.shadowRoot, part));
534
542
  }
535
543
  else {
@@ -545,7 +553,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
545
553
  };
546
554
  // Enhanced value extraction with context awareness
547
555
  const extractValue = (element, attribute) => {
548
- var _a, _b, _c, _d, _e, _f;
556
+ var _a, _b, _c, _d, _e;
549
557
  if (!element)
550
558
  return null;
551
559
  // Get context-aware base URL
@@ -557,17 +565,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
557
565
  return shadowContent.trim();
558
566
  }
559
567
  }
560
- if (attribute === 'innerText') {
568
+ if (attribute === "innerText") {
561
569
  // First try standard innerText/textContent
562
570
  let textContent = ((_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim()) || ((_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim());
563
571
  // If empty, check for common data attributes that might contain the text
564
572
  if (!textContent) {
565
573
  const dataAttributes = [
566
- 'data-600',
567
- 'data-text',
568
- 'data-label',
569
- 'data-value',
570
- 'data-content',
574
+ "data-600",
575
+ "data-text",
576
+ "data-label",
577
+ "data-value",
578
+ "data-content",
571
579
  ];
572
580
  for (const attr of dataAttributes) {
573
581
  const dataValue = element.getAttribute(attr);
@@ -579,140 +587,437 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
579
587
  }
580
588
  return textContent || null;
581
589
  }
582
- else if (attribute === 'innerHTML') {
590
+ else if (attribute === "innerHTML") {
583
591
  return ((_e = element.innerHTML) === null || _e === void 0 ? void 0 : _e.trim()) || null;
584
592
  }
585
- else if (attribute === 'href') {
586
- // For href, we need to find the anchor tag if the current element isn't one
587
- let anchorElement = element;
588
- // If current element is not an anchor, look for parent anchor
589
- if (element.tagName !== 'A') {
590
- anchorElement = element.closest('a') || ((_f = element.parentElement) === null || _f === void 0 ? void 0 : _f.closest('a')) || element;
593
+ else if (attribute === "src" || attribute === "href") {
594
+ if (attribute === "href" && element.tagName !== "A") {
595
+ const parentElement = element.parentElement;
596
+ if (parentElement && parentElement.tagName === "A") {
597
+ const parentHref = parentElement.getAttribute("href");
598
+ if (parentHref) {
599
+ try {
600
+ return new URL(parentHref, baseURL).href;
601
+ }
602
+ catch (e) {
603
+ return parentHref;
604
+ }
605
+ }
606
+ }
591
607
  }
592
- const hrefValue = anchorElement.getAttribute('href');
593
- if (!hrefValue || hrefValue.trim() === '') {
608
+ const attrValue = element.getAttribute(attribute);
609
+ const dataAttr = attrValue || element.getAttribute("data-" + attribute);
610
+ if (!dataAttr || dataAttr.trim() === "") {
611
+ if (attribute === "src") {
612
+ const style = window.getComputedStyle(element);
613
+ const bgImage = style.backgroundImage;
614
+ if (bgImage && bgImage !== "none") {
615
+ const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
616
+ return matches ? new URL(matches[1], baseURL).href : null;
617
+ }
618
+ }
594
619
  return null;
595
620
  }
596
621
  try {
597
- return new URL(hrefValue, baseURL).href;
622
+ return new URL(dataAttr, baseURL).href;
598
623
  }
599
624
  catch (e) {
600
- console.warn('Error creating URL from', hrefValue, e);
601
- return hrefValue;
625
+ console.warn("Error creating URL from", dataAttr, e);
626
+ return dataAttr;
602
627
  }
603
628
  }
604
- else if (attribute === 'src') {
605
- const attrValue = element.getAttribute(attribute);
606
- const dataAttr = attrValue || element.getAttribute('data-' + attribute);
607
- if (!dataAttr || dataAttr.trim() === '') {
608
- const style = window.getComputedStyle(element);
609
- const bgImage = style.backgroundImage;
610
- if (bgImage && bgImage !== 'none') {
611
- const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
612
- return matches ? new URL(matches[1], baseURL).href : null;
629
+ return element.getAttribute(attribute);
630
+ };
631
+ // Enhanced table ancestor finding with context support
632
+ const findTableAncestor = (element) => {
633
+ let currentElement = element;
634
+ const MAX_DEPTH = 5;
635
+ let depth = 0;
636
+ while (currentElement && depth < MAX_DEPTH) {
637
+ // Handle shadow DOM
638
+ if (currentElement.getRootNode() instanceof ShadowRoot) {
639
+ currentElement = currentElement.getRootNode().host;
640
+ continue;
641
+ }
642
+ if (currentElement.tagName === "TD") {
643
+ return { type: "TD", element: currentElement };
644
+ }
645
+ else if (currentElement.tagName === "TR") {
646
+ return { type: "TR", element: currentElement };
647
+ }
648
+ // Handle iframe and frame crossing
649
+ if (currentElement.tagName === "IFRAME" ||
650
+ currentElement.tagName === "FRAME") {
651
+ try {
652
+ currentElement = currentElement.contentDocument.body;
613
653
  }
614
- return null;
654
+ catch (e) {
655
+ return null;
656
+ }
657
+ }
658
+ else {
659
+ currentElement = currentElement.parentElement;
660
+ }
661
+ depth++;
662
+ }
663
+ return null;
664
+ };
665
+ // Helper function to get cell index
666
+ const getCellIndex = (td) => {
667
+ if (td.getRootNode() instanceof ShadowRoot) {
668
+ const shadowRoot = td.getRootNode();
669
+ const allCells = Array.from(shadowRoot.querySelectorAll("td"));
670
+ return allCells.indexOf(td);
671
+ }
672
+ let index = 0;
673
+ let sibling = td;
674
+ while ((sibling = sibling.previousElementSibling)) {
675
+ index++;
676
+ }
677
+ return index;
678
+ };
679
+ // Helper function to check for TH elements
680
+ const hasThElement = (row, tableFields) => {
681
+ for (const [_, { selector }] of Object.entries(tableFields)) {
682
+ const element = queryElement(row, selector);
683
+ if (element) {
684
+ let current = element;
685
+ while (current && current !== row) {
686
+ if (current.getRootNode() instanceof ShadowRoot) {
687
+ current = current.getRootNode().host;
688
+ continue;
689
+ }
690
+ if (current.tagName === "TH")
691
+ return true;
692
+ if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
693
+ try {
694
+ current = current.contentDocument.body;
695
+ }
696
+ catch (e) {
697
+ break;
698
+ }
699
+ }
700
+ else {
701
+ current = current.parentElement;
702
+ }
703
+ }
704
+ }
705
+ }
706
+ return false;
707
+ };
708
+ // Helper function to filter rows
709
+ const filterRowsBasedOnTag = (rows, tableFields) => {
710
+ for (const row of rows) {
711
+ if (hasThElement(row, tableFields)) {
712
+ return rows;
615
713
  }
714
+ }
715
+ return rows.filter((row) => {
716
+ const directTH = row.getElementsByTagName("TH").length === 0;
717
+ const shadowTH = row.shadowRoot
718
+ ? row.shadowRoot.querySelector("th") === null
719
+ : true;
720
+ return directTH && shadowTH;
721
+ });
722
+ };
723
+ // Class similarity comparison functions
724
+ const calculateClassSimilarity = (classList1, classList2) => {
725
+ const set1 = new Set(classList1);
726
+ const set2 = new Set(classList2);
727
+ const intersection = new Set([...set1].filter((x) => set2.has(x)));
728
+ const union = new Set([...set1, ...set2]);
729
+ return intersection.size / union.size;
730
+ };
731
+ // Enhanced similar elements finding with context support
732
+ const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
733
+ const baseClasses = Array.from(baseElement.classList);
734
+ if (baseClasses.length === 0)
735
+ return [];
736
+ const allElements = [];
737
+ // Get elements from main document
738
+ allElements.push(...document.getElementsByTagName(baseElement.tagName));
739
+ // Get elements from shadow DOM
740
+ if (baseElement.getRootNode() instanceof ShadowRoot) {
741
+ const shadowHost = baseElement.getRootNode().host;
742
+ allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
743
+ }
744
+ // Get elements from iframes and frames
745
+ const frames = [
746
+ ...Array.from(document.getElementsByTagName("iframe")),
747
+ ...Array.from(document.getElementsByTagName("frame")),
748
+ ];
749
+ for (const frame of frames) {
616
750
  try {
617
- return new URL(dataAttr, baseURL).href;
751
+ const frameDoc = frame.contentDocument || frame.contentWindow.document;
752
+ allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
618
753
  }
619
754
  catch (e) {
620
- console.warn('Error creating URL from', dataAttr, e);
621
- return dataAttr;
755
+ console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
622
756
  }
623
757
  }
624
- return element.getAttribute(attribute);
758
+ return allElements.filter((element) => {
759
+ if (element === baseElement)
760
+ return false;
761
+ const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
762
+ return similarity >= similarityThreshold;
763
+ });
764
+ };
765
+ const tryFallbackSelector = (rootElement, originalSelector) => {
766
+ let element = queryElement(rootElement, originalSelector);
767
+ if (!element && originalSelector.includes("nth-child")) {
768
+ const match = originalSelector.match(/nth-child\((\d+)\)/);
769
+ if (match) {
770
+ const position = parseInt(match[1], 10);
771
+ for (let i = position - 1; i >= 1; i--) {
772
+ const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
773
+ element = queryElement(rootElement, fallbackSelector);
774
+ if (element)
775
+ break;
776
+ }
777
+ if (!element) {
778
+ const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, "");
779
+ element = queryElement(rootElement, baseSelector);
780
+ }
781
+ }
782
+ }
783
+ return element;
625
784
  };
626
785
  // Create indexed XPath for specific container instance
627
786
  const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
628
- console.log(`Creating indexed XPath for container ${containerIndex}`);
629
- console.log(`Child selector: ${childSelector}`);
630
- console.log(`List selector: ${listSelector}`);
631
787
  // Check if the child selector contains the list selector pattern
632
- if (childSelector.includes(listSelector.replace('//', ''))) {
788
+ if (childSelector.includes(listSelector.replace("//", ""))) {
633
789
  // Replace the list selector part with indexed version
634
- const listPattern = listSelector.replace('//', '');
790
+ const listPattern = listSelector.replace("//", "");
635
791
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;
636
792
  const indexedSelector = childSelector.replace(`//${listPattern}`, indexedListSelector);
637
- console.log(`Generated indexed selector: ${indexedSelector}`);
638
793
  return indexedSelector;
639
794
  }
640
795
  else {
641
796
  // If pattern doesn't match, create a more generic indexed selector
642
- console.warn(`Pattern doesn't match, using fallback approach`);
643
- return `(${listSelector})[${containerIndex}]${childSelector.replace('//', '/')}`;
797
+ return `(${listSelector})[${containerIndex}]${childSelector.replace("//", "/")}`;
644
798
  }
645
799
  };
646
- // Main scraping logic
647
- console.log('🚀 Starting list data extraction');
648
- console.log('List Selector:', listSelector);
649
- console.log('Fields:', fields);
650
- // Step 1: Get all container elements matching the list selector
651
- const containers = queryElementAll(document, listSelector);
652
- console.log(`📦 Found ${containers.length} list containers`);
800
+ // Main scraping logic with unified support for both CSS and XPath
801
+ console.log("🚀 Starting unified list data extraction");
802
+ console.log("List Selector:", listSelector);
803
+ console.log("Fields:", fields);
804
+ let containers = queryElementAll(document, listSelector);
805
+ containers = Array.from(containers);
653
806
  if (containers.length === 0) {
654
- console.warn('❌ No containers found for listSelector:', listSelector);
807
+ console.warn("❌ No containers found for listSelector:", listSelector);
655
808
  return [];
656
809
  }
657
- // Step 2: Extract data from each container up to the limit
658
- const extractedData = [];
659
- const containersToProcess = Math.min(containers.length, limit);
660
- console.log(`🔄 Processing ${containersToProcess} containers...`);
661
- for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
662
- const container = containers[containerIndex];
663
- const record = {};
664
- console.log(`\n📋 Processing container ${containerIndex + 1}/${containersToProcess}`);
665
- // Step 3: For each field, extract data from the current container
666
- for (const [label, field] of Object.entries(fields)) {
667
- console.log(`\n 🔍 Extracting field "${label}"`);
668
- console.log(` Original selector: ${field.selector}`);
669
- console.log(` Attribute: ${field.attribute}`);
670
- let element = null;
671
- // Handle XPath selectors with container indexing
672
- if (field.selector.startsWith('//')) {
673
- // Create indexed absolute XPath
674
- const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
675
- console.log(` 📍 Indexed selector: ${indexedSelector}`);
676
- element = evaluateXPath(document, indexedSelector);
677
- console.log(` 📍 Indexed XPath result: ${element ? 'FOUND' : 'NOT FOUND'}`);
810
+ console.log(`📦 Found ${containers.length} list containers`);
811
+ // For CSS selectors, try to find similar containers if needed
812
+ if (!isXPathSelector(listSelector) &&
813
+ limit > 1 &&
814
+ containers.length === 1) {
815
+ const baseContainer = containers[0];
816
+ const similarContainers = findSimilarElements(baseContainer);
817
+ if (similarContainers.length > 0) {
818
+ const newContainers = similarContainers.filter((container) => !container.matches(listSelector));
819
+ containers = [...containers, ...newContainers];
820
+ }
821
+ }
822
+ const containerFields = containers.map(() => ({
823
+ tableFields: {},
824
+ nonTableFields: {},
825
+ }));
826
+ // For XPath selectors, use the new approach
827
+ if (isXPathSelector(listSelector)) {
828
+ const extractedData = [];
829
+ const containersToProcess = Math.min(containers.length, limit);
830
+ for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
831
+ const record = {};
832
+ for (const [label, field] of Object.entries(fields)) {
833
+ let element = null;
834
+ if (isXPathSelector(field.selector)) {
835
+ // Create indexed absolute XPath
836
+ const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
837
+ element = evaluateXPath(document, indexedSelector);
838
+ }
839
+ else {
840
+ // Fallback for CSS selectors within XPath containers
841
+ const container = containers[containerIndex];
842
+ element = queryElement(container, field.selector);
843
+ }
678
844
  if (element) {
679
- console.log(` 📍 Found element text: "${(_b = element.textContent) === null || _b === void 0 ? void 0 : _b.trim()}"`);
845
+ const value = extractValue(element, field.attribute);
846
+ if (value !== null && value !== "") {
847
+ record[label] = value;
848
+ }
849
+ else {
850
+ record[label] = "";
851
+ }
852
+ }
853
+ else {
854
+ record[label] = "";
680
855
  }
681
856
  }
682
- else {
683
- // Fallback for non-XPath selectors - search within container
684
- element = queryElement(container, field.selector);
857
+ if (Object.values(record).some((value) => value !== "")) {
858
+ extractedData.push(record);
685
859
  }
686
- // Step 4: Extract the value from the found element
687
- if (element) {
688
- const value = extractValue(element, field.attribute);
689
- if (value !== null && value !== '') {
690
- record[label] = value;
691
- console.log(` ✅ Extracted "${label}": "${value}"`);
860
+ }
861
+ console.log(`📊 Total records extracted: ${extractedData.length}`);
862
+ return extractedData;
863
+ }
864
+ // For CSS selectors, use the original table-aware approach
865
+ containers.forEach((container, containerIndex) => {
866
+ for (const [label, field] of Object.entries(fields)) {
867
+ const sampleElement = queryElement(container, field.selector);
868
+ if (sampleElement) {
869
+ const ancestor = findTableAncestor(sampleElement);
870
+ if (ancestor) {
871
+ containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1 });
692
872
  }
693
873
  else {
694
- console.warn(` ⚠️ Empty value for "${label}"`);
695
- record[label] = '';
874
+ containerFields[containerIndex].nonTableFields[label] = field;
696
875
  }
697
876
  }
698
877
  else {
699
- console.warn(` ❌ Element not found for "${label}"`);
700
- record[label] = '';
878
+ containerFields[containerIndex].nonTableFields[label] = field;
701
879
  }
702
880
  }
703
- // Step 5: Add record if it has any non-empty values
704
- if (Object.values(record).some(value => value !== '')) {
705
- extractedData.push(record);
706
- console.log(` ✅ Added record ${containerIndex + 1}:`, record);
881
+ });
882
+ const tableData = [];
883
+ const nonTableData = [];
884
+ // Process table data with support for iframes, frames, and shadow DOM
885
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
886
+ const container = containers[containerIndex];
887
+ const { tableFields } = containerFields[containerIndex];
888
+ if (Object.keys(tableFields).length > 0) {
889
+ const firstField = Object.values(tableFields)[0];
890
+ const firstElement = queryElement(container, firstField.selector);
891
+ let tableContext = firstElement;
892
+ // Find table context including iframe, frame and shadow DOM
893
+ while (tableContext &&
894
+ tableContext.tagName !== "TABLE" &&
895
+ tableContext !== container) {
896
+ if (tableContext.getRootNode() instanceof ShadowRoot) {
897
+ tableContext = tableContext.getRootNode().host;
898
+ continue;
899
+ }
900
+ if (tableContext.tagName === "IFRAME" ||
901
+ tableContext.tagName === "FRAME") {
902
+ try {
903
+ tableContext = tableContext.contentDocument.body;
904
+ }
905
+ catch (e) {
906
+ break;
907
+ }
908
+ }
909
+ else {
910
+ tableContext = tableContext.parentElement;
911
+ }
912
+ }
913
+ if (tableContext) {
914
+ // Get rows from all contexts
915
+ const rows = [];
916
+ // Get rows from regular DOM
917
+ rows.push(...tableContext.getElementsByTagName("TR"));
918
+ // Get rows from shadow DOM
919
+ if (tableContext.shadowRoot) {
920
+ rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
921
+ }
922
+ // Get rows from iframes and frames
923
+ if (tableContext.tagName === "IFRAME" ||
924
+ tableContext.tagName === "FRAME") {
925
+ try {
926
+ const frameDoc = tableContext.contentDocument ||
927
+ tableContext.contentWindow.document;
928
+ rows.push(...frameDoc.getElementsByTagName("TR"));
929
+ }
930
+ catch (e) {
931
+ console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
932
+ }
933
+ }
934
+ const processedRows = filterRowsBasedOnTag(rows, tableFields);
935
+ for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
936
+ const record = {};
937
+ const currentRow = processedRows[rowIndex];
938
+ for (const [label, { selector, attribute, cellIndex },] of Object.entries(tableFields)) {
939
+ let element = null;
940
+ if (cellIndex >= 0) {
941
+ // Get TD element considering both contexts
942
+ let td = currentRow.children[cellIndex];
943
+ // Check shadow DOM for td
944
+ if (!td && currentRow.shadowRoot) {
945
+ const shadowCells = currentRow.shadowRoot.children;
946
+ if (shadowCells && shadowCells.length > cellIndex) {
947
+ td = shadowCells[cellIndex];
948
+ }
949
+ }
950
+ if (td) {
951
+ element = queryElement(td, selector);
952
+ if (!element &&
953
+ selector
954
+ .split(/(?:>>|:>>)/)
955
+ .pop()
956
+ .includes("td:nth-child")) {
957
+ element = td;
958
+ }
959
+ if (!element) {
960
+ const tagOnlySelector = selector.split(".")[0];
961
+ element = queryElement(td, tagOnlySelector);
962
+ }
963
+ if (!element) {
964
+ let currentElement = td;
965
+ while (currentElement &&
966
+ currentElement.children.length > 0) {
967
+ let foundContentChild = false;
968
+ for (const child of currentElement.children) {
969
+ if (extractValue(child, attribute)) {
970
+ currentElement = child;
971
+ foundContentChild = true;
972
+ break;
973
+ }
974
+ }
975
+ if (!foundContentChild)
976
+ break;
977
+ }
978
+ element = currentElement;
979
+ }
980
+ }
981
+ }
982
+ else {
983
+ element = queryElement(currentRow, selector);
984
+ }
985
+ if (element) {
986
+ record[label] = extractValue(element, attribute);
987
+ }
988
+ }
989
+ if (Object.keys(record).length > 0) {
990
+ tableData.push(record);
991
+ }
992
+ }
993
+ }
707
994
  }
708
- else {
709
- console.warn(` ⚠️ Skipping empty record for container ${containerIndex + 1}`);
995
+ }
996
+ // Process non-table data with all contexts support
997
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
998
+ if (nonTableData.length >= limit)
999
+ break;
1000
+ const container = containers[containerIndex];
1001
+ const { nonTableFields } = containerFields[containerIndex];
1002
+ if (Object.keys(nonTableFields).length > 0) {
1003
+ const record = {};
1004
+ for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
1005
+ // Get the last part of the selector after any context delimiter
1006
+ const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
1007
+ const element = tryFallbackSelector(container, relativeSelector);
1008
+ if (element) {
1009
+ record[label] = extractValue(element, attribute);
1010
+ }
1011
+ }
1012
+ if (Object.keys(record).length > 0) {
1013
+ nonTableData.push(record);
1014
+ }
710
1015
  }
711
1016
  }
712
- console.log('\n🎉 Extraction complete!');
713
- console.log(`📊 Total records extracted: ${extractedData.length}`);
714
- console.log('📋 All records:', extractedData);
715
- return extractedData;
1017
+ // Merge and limit the results
1018
+ const scrapedData = [...tableData, ...nonTableData];
1019
+ console.log(`📊 Total records extracted: ${scrapedData.length}`);
1020
+ return scrapedData;
716
1021
  });
717
1022
  };
718
1023
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.11",
3
+ "version": "0.0.12",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",