mx-cloud 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -359,10 +359,55 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
359
359
  */
360
360
  window.scrapeList = function (_a) {
361
361
  return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
362
- // Enhanced query function to handle iframe, frame and shadow DOM
362
+ var _b;
363
+ // XPath evaluation functions
364
+ const evaluateXPath = (rootElement, xpath) => {
365
+ try {
366
+ const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
367
+ ? rootElement
368
+ : rootElement.ownerDocument;
369
+ if (!ownerDoc)
370
+ return null;
371
+ const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
372
+ return result.singleNodeValue;
373
+ }
374
+ catch (error) {
375
+ console.warn('XPath evaluation failed:', xpath, error);
376
+ return null;
377
+ }
378
+ };
379
+ const evaluateXPathAll = (rootElement, xpath) => {
380
+ try {
381
+ const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
382
+ ? rootElement
383
+ : rootElement.ownerDocument;
384
+ if (!ownerDoc)
385
+ return [];
386
+ const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
387
+ const elements = [];
388
+ for (let i = 0; i < result.snapshotLength; i++) {
389
+ const node = result.snapshotItem(i);
390
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
391
+ elements.push(node);
392
+ }
393
+ }
394
+ return elements;
395
+ }
396
+ catch (error) {
397
+ console.warn('XPath evaluation failed:', xpath, error);
398
+ return [];
399
+ }
400
+ };
401
+ // Enhanced query function to handle iframe, frame, shadow DOM, and XPath
363
402
  const queryElement = (rootElement, selector) => {
364
403
  if (!selector.includes('>>') && !selector.includes(':>>')) {
365
- return rootElement.querySelector(selector);
404
+ // Check if it's an XPath selector
405
+ if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
406
+ return evaluateXPath(rootElement, selector);
407
+ }
408
+ else {
409
+ return rootElement.querySelector(selector);
410
+ }
366
411
  }
367
412
  const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
368
413
  let currentElement = rootElement;
@@ -373,7 +418,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
373
418
  if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
374
419
  try {
375
420
  const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
376
- currentElement = frameDoc.querySelector(parts[i]);
421
+ if (!frameDoc)
422
+ return null;
423
+ if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
424
+ currentElement = evaluateXPath(frameDoc, parts[i]);
425
+ }
426
+ else {
427
+ currentElement = frameDoc.querySelector(parts[i]);
428
+ }
377
429
  continue;
378
430
  }
379
431
  catch (e) {
@@ -381,18 +433,36 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
381
433
  return null;
382
434
  }
383
435
  }
384
- // Try regular DOM first
385
- let nextElement = currentElement.querySelector(parts[i]);
436
+ let nextElement = null;
437
+ if ('querySelector' in currentElement) {
438
+ // Handle XPath vs CSS selector
439
+ if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
440
+ nextElement = evaluateXPath(currentElement, parts[i]);
441
+ }
442
+ else {
443
+ nextElement = currentElement.querySelector(parts[i]);
444
+ }
445
+ }
386
446
  // Try shadow DOM if not found
387
- if (!nextElement && currentElement.shadowRoot) {
388
- nextElement = currentElement.shadowRoot.querySelector(parts[i]);
447
+ if (!nextElement && 'shadowRoot' in currentElement && currentElement.shadowRoot) {
448
+ if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
449
+ nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
450
+ }
451
+ else {
452
+ nextElement = currentElement.shadowRoot.querySelector(parts[i]);
453
+ }
389
454
  }
390
455
  // Check children's shadow roots if still not found
391
- if (!nextElement) {
456
+ if (!nextElement && 'children' in currentElement) {
392
457
  const children = Array.from(currentElement.children || []);
393
458
  for (const child of children) {
394
459
  if (child.shadowRoot) {
395
- nextElement = child.shadowRoot.querySelector(parts[i]);
460
+ if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
461
+ nextElement = evaluateXPath(child.shadowRoot, parts[i]);
462
+ }
463
+ else {
464
+ nextElement = child.shadowRoot.querySelector(parts[i]);
465
+ }
396
466
  if (nextElement)
397
467
  break;
398
468
  }
@@ -402,10 +472,16 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
402
472
  }
403
473
  return currentElement;
404
474
  };
405
- // Enhanced query all function for both contexts
475
+ // Enhanced query all function for XPath and CSS selectors
406
476
  const queryElementAll = (rootElement, selector) => {
407
477
  if (!selector.includes('>>') && !selector.includes(':>>')) {
408
- return rootElement.querySelectorAll(selector);
478
+ // Check if it's an XPath selector
479
+ if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
480
+ return evaluateXPathAll(rootElement, selector);
481
+ }
482
+ else {
483
+ return Array.from(rootElement.querySelectorAll(selector));
484
+ }
409
485
  }
410
486
  const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
411
487
  let currentElements = [rootElement];
@@ -416,7 +492,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
416
492
  if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
417
493
  try {
418
494
  const frameDoc = element.contentDocument || element.contentWindow.document;
419
- nextElements.push(...frameDoc.querySelectorAll(part));
495
+ if (frameDoc) {
496
+ if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
497
+ nextElements.push(...evaluateXPathAll(frameDoc, part));
498
+ }
499
+ else {
500
+ nextElements.push(...Array.from(frameDoc.querySelectorAll(part)));
501
+ }
502
+ }
420
503
  }
421
504
  catch (e) {
422
505
  console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
@@ -426,17 +509,32 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
426
509
  else {
427
510
  // Regular DOM elements
428
511
  if (element.querySelectorAll) {
429
- nextElements.push(...element.querySelectorAll(part));
512
+ if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
513
+ nextElements.push(...evaluateXPathAll(element, part));
514
+ }
515
+ else {
516
+ nextElements.push(...Array.from(element.querySelectorAll(part)));
517
+ }
430
518
  }
431
519
  // Shadow DOM elements
432
520
  if (element.shadowRoot) {
433
- nextElements.push(...element.shadowRoot.querySelectorAll(part));
521
+ if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
522
+ nextElements.push(...evaluateXPathAll(element.shadowRoot, part));
523
+ }
524
+ else {
525
+ nextElements.push(...Array.from(element.shadowRoot.querySelectorAll(part)));
526
+ }
434
527
  }
435
528
  // Check children's shadow roots
436
529
  const children = Array.from(element.children || []);
437
530
  for (const child of children) {
438
531
  if (child.shadowRoot) {
439
- nextElements.push(...child.shadowRoot.querySelectorAll(part));
532
+ if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
533
+ nextElements.push(...evaluateXPathAll(child.shadowRoot, part));
534
+ }
535
+ else {
536
+ nextElements.push(...Array.from(child.shadowRoot.querySelectorAll(part)));
537
+ }
440
538
  }
441
539
  }
442
540
  }
@@ -446,8 +544,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
446
544
  return currentElements;
447
545
  };
448
546
  // Enhanced value extraction with context awareness
449
- function extractValue(element, attribute) {
450
- var _a, _b;
547
+ const extractValue = (element, attribute) => {
548
+ var _a, _b, _c, _d, _e, _f;
451
549
  if (!element)
452
550
  return null;
453
551
  // Get context-aware base URL
@@ -460,36 +558,58 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
460
558
  }
461
559
  }
462
560
  if (attribute === 'innerText') {
463
- return element.innerText.trim();
561
+ // First try standard innerText/textContent
562
+ let textContent = ((_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim()) || ((_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim());
563
+ // If empty, check for common data attributes that might contain the text
564
+ if (!textContent) {
565
+ const dataAttributes = [
566
+ 'data-600',
567
+ 'data-text',
568
+ 'data-label',
569
+ 'data-value',
570
+ 'data-content',
571
+ ];
572
+ for (const attr of dataAttributes) {
573
+ const dataValue = element.getAttribute(attr);
574
+ if (dataValue && dataValue.trim()) {
575
+ textContent = dataValue.trim();
576
+ break;
577
+ }
578
+ }
579
+ }
580
+ return textContent || null;
464
581
  }
465
582
  else if (attribute === 'innerHTML') {
466
- return element.innerHTML.trim();
583
+ return ((_e = element.innerHTML) === null || _e === void 0 ? void 0 : _e.trim()) || null;
467
584
  }
468
- else if (attribute === 'src' || attribute === 'href') {
469
- if (attribute === 'href' && element.tagName !== 'A') {
470
- const parentElement = element.parentElement;
471
- if (parentElement && parentElement.tagName === 'A') {
472
- const parentHref = parentElement.getAttribute('href');
473
- if (parentHref) {
474
- try {
475
- return new URL(parentHref, baseURL).href;
476
- }
477
- catch (e) {
478
- return parentHref;
479
- }
480
- }
481
- }
585
+ else if (attribute === 'href') {
586
+ // For href, we need to find the anchor tag if the current element isn't one
587
+ let anchorElement = element;
588
+ // If current element is not an anchor, look for parent anchor
589
+ if (element.tagName !== 'A') {
590
+ anchorElement = element.closest('a') || ((_f = element.parentElement) === null || _f === void 0 ? void 0 : _f.closest('a')) || element;
591
+ }
592
+ const hrefValue = anchorElement.getAttribute('href');
593
+ if (!hrefValue || hrefValue.trim() === '') {
594
+ return null;
595
+ }
596
+ try {
597
+ return new URL(hrefValue, baseURL).href;
598
+ }
599
+ catch (e) {
600
+ console.warn('Error creating URL from', hrefValue, e);
601
+ return hrefValue;
482
602
  }
603
+ }
604
+ else if (attribute === 'src') {
483
605
  const attrValue = element.getAttribute(attribute);
484
606
  const dataAttr = attrValue || element.getAttribute('data-' + attribute);
485
607
  if (!dataAttr || dataAttr.trim() === '') {
486
- if (attribute === 'src') {
487
- const style = window.getComputedStyle(element);
488
- const bgImage = style.backgroundImage;
489
- if (bgImage && bgImage !== 'none') {
490
- const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
491
- return matches ? new URL(matches[1], baseURL).href : null;
492
- }
608
+ const style = window.getComputedStyle(element);
609
+ const bgImage = style.backgroundImage;
610
+ if (bgImage && bgImage !== 'none') {
611
+ const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
612
+ return matches ? new URL(matches[1], baseURL).href : null;
493
613
  }
494
614
  return null;
495
615
  }
@@ -498,327 +618,101 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
498
618
  }
499
619
  catch (e) {
500
620
  console.warn('Error creating URL from', dataAttr, e);
501
- return dataAttr; // Return the original value if URL construction fails
621
+ return dataAttr;
502
622
  }
503
623
  }
504
624
  return element.getAttribute(attribute);
505
- }
506
- // Enhanced table ancestor finding with context support
507
- function findTableAncestor(element) {
508
- let currentElement = element;
509
- const MAX_DEPTH = 5;
510
- let depth = 0;
511
- while (currentElement && depth < MAX_DEPTH) {
512
- // Handle shadow DOM
513
- if (currentElement.getRootNode() instanceof ShadowRoot) {
514
- currentElement = currentElement.getRootNode().host;
515
- continue;
516
- }
517
- if (currentElement.tagName === 'TD') {
518
- return { type: 'TD', element: currentElement };
519
- }
520
- else if (currentElement.tagName === 'TR') {
521
- return { type: 'TR', element: currentElement };
522
- }
523
- // Handle iframe and frame crossing
524
- if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
525
- try {
526
- currentElement = currentElement.contentDocument.body;
527
- }
528
- catch (e) {
529
- return null;
530
- }
531
- }
532
- else {
533
- currentElement = currentElement.parentElement;
534
- }
535
- depth++;
536
- }
537
- return null;
538
- }
539
- // Helper function to get cell index
540
- function getCellIndex(td) {
541
- if (td.getRootNode() instanceof ShadowRoot) {
542
- const shadowRoot = td.getRootNode();
543
- const allCells = Array.from(shadowRoot.querySelectorAll('td'));
544
- return allCells.indexOf(td);
545
- }
546
- let index = 0;
547
- let sibling = td;
548
- while (sibling = sibling.previousElementSibling) {
549
- index++;
550
- }
551
- return index;
552
- }
553
- // Helper function to check for TH elements
554
- function hasThElement(row, tableFields) {
555
- for (const [_, { selector }] of Object.entries(tableFields)) {
556
- const element = queryElement(row, selector);
557
- if (element) {
558
- let current = element;
559
- while (current && current !== row) {
560
- if (current.getRootNode() instanceof ShadowRoot) {
561
- current = current.getRootNode().host;
562
- continue;
563
- }
564
- if (current.tagName === 'TH')
565
- return true;
566
- if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
567
- try {
568
- current = current.contentDocument.body;
569
- }
570
- catch (e) {
571
- break;
572
- }
573
- }
574
- else {
575
- current = current.parentElement;
576
- }
577
- }
578
- }
579
- }
580
- return false;
581
- }
582
- // Helper function to filter rows
583
- function filterRowsBasedOnTag(rows, tableFields) {
584
- for (const row of rows) {
585
- if (hasThElement(row, tableFields)) {
586
- return rows;
587
- }
588
- }
589
- // Include shadow DOM in TH search
590
- return rows.filter(row => {
591
- const directTH = row.getElementsByTagName('TH').length === 0;
592
- const shadowTH = row.shadowRoot ?
593
- row.shadowRoot.querySelector('th') === null : true;
594
- return directTH && shadowTH;
595
- });
596
- }
597
- // Class similarity comparison functions
598
- function calculateClassSimilarity(classList1, classList2) {
599
- const set1 = new Set(classList1);
600
- const set2 = new Set(classList2);
601
- const intersection = new Set([...set1].filter(x => set2.has(x)));
602
- const union = new Set([...set1, ...set2]);
603
- return intersection.size / union.size;
604
- }
605
- // Enhanced similar elements finding with context support
606
- function findSimilarElements(baseElement, similarityThreshold = 0.7) {
607
- const baseClasses = Array.from(baseElement.classList);
608
- if (baseClasses.length === 0)
609
- return [];
610
- const allElements = [];
611
- // Get elements from main document
612
- allElements.push(...document.getElementsByTagName(baseElement.tagName));
613
- // Get elements from shadow DOM
614
- if (baseElement.getRootNode() instanceof ShadowRoot) {
615
- const shadowHost = baseElement.getRootNode().host;
616
- allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
625
+ };
626
+ // Create indexed XPath for specific container instance
627
+ const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
628
+ console.log(`Creating indexed XPath for container ${containerIndex}`);
629
+ console.log(`Child selector: ${childSelector}`);
630
+ console.log(`List selector: ${listSelector}`);
631
+ // Check if the child selector contains the list selector pattern
632
+ if (childSelector.includes(listSelector.replace('//', ''))) {
633
+ // Replace the list selector part with indexed version
634
+ const listPattern = listSelector.replace('//', '');
635
+ const indexedListSelector = `(${listSelector})[${containerIndex}]`;
636
+ const indexedSelector = childSelector.replace(`//${listPattern}`, indexedListSelector);
637
+ console.log(`Generated indexed selector: ${indexedSelector}`);
638
+ return indexedSelector;
617
639
  }
618
- // Get elements from iframes and frames
619
- const frames = [
620
- ...Array.from(document.getElementsByTagName('iframe')),
621
- ...Array.from(document.getElementsByTagName('frame'))
622
- ];
623
- for (const frame of frames) {
624
- try {
625
- const frameDoc = frame.contentDocument || frame.contentWindow.document;
626
- allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
627
- }
628
- catch (e) {
629
- console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
630
- }
640
+ else {
641
+ // If pattern doesn't match, create a more generic indexed selector
642
+ console.warn(`Pattern doesn't match, using fallback approach`);
643
+ return `(${listSelector})[${containerIndex}]${childSelector.replace('//', '/')}`;
631
644
  }
632
- return allElements.filter(element => {
633
- if (element === baseElement)
634
- return false;
635
- const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
636
- return similarity >= similarityThreshold;
637
- });
638
- }
639
- function tryFallbackSelector(rootElement, originalSelector) {
640
- let element = queryElement(rootElement, originalSelector);
641
- if (!element && originalSelector.includes('nth-child')) {
642
- const match = originalSelector.match(/nth-child\((\d+)\)/);
643
- if (match) {
644
- const position = parseInt(match[1], 10);
645
- for (let i = position - 1; i >= 1; i--) {
646
- const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
647
- element = queryElement(rootElement, fallbackSelector);
648
- if (element)
649
- break;
650
- }
651
- if (!element) {
652
- const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
653
- element = queryElement(rootElement, baseSelector);
654
- }
655
- }
656
- }
657
- return element;
658
- }
659
- // Main scraping logic with context support
660
- let containers = queryElementAll(document, listSelector);
661
- containers = Array.from(containers);
662
- if (containers.length === 0)
645
+ };
646
+ // Main scraping logic
647
+ console.log('🚀 Starting list data extraction');
648
+ console.log('List Selector:', listSelector);
649
+ console.log('Fields:', fields);
650
+ // Step 1: Get all container elements matching the list selector
651
+ const containers = queryElementAll(document, listSelector);
652
+ console.log(`📦 Found ${containers.length} list containers`);
653
+ if (containers.length === 0) {
654
+ console.warn('❌ No containers found for listSelector:', listSelector);
663
655
  return [];
664
- if (limit > 1 && containers.length === 1) {
665
- const baseContainer = containers[0];
666
- const similarContainers = findSimilarElements(baseContainer);
667
- if (similarContainers.length > 0) {
668
- const newContainers = similarContainers.filter(container => !container.matches(listSelector));
669
- containers = [...containers, ...newContainers];
670
- }
671
656
  }
672
- const containerFields = containers.map(() => ({
673
- tableFields: {},
674
- nonTableFields: {}
675
- }));
676
- // Classify fields
677
- containers.forEach((container, containerIndex) => {
657
+ // Step 2: Extract data from each container up to the limit
658
+ const extractedData = [];
659
+ const containersToProcess = Math.min(containers.length, limit);
660
+ console.log(`🔄 Processing ${containersToProcess} containers...`);
661
+ for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
662
+ const container = containers[containerIndex];
663
+ const record = {};
664
+ console.log(`\n📋 Processing container ${containerIndex + 1}/${containersToProcess}`);
665
+ // Step 3: For each field, extract data from the current container
678
666
  for (const [label, field] of Object.entries(fields)) {
679
- const sampleElement = queryElement(container, field.selector);
680
- if (sampleElement) {
681
- const ancestor = findTableAncestor(sampleElement);
682
- if (ancestor) {
683
- containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 });
684
- }
685
- else {
686
- containerFields[containerIndex].nonTableFields[label] = field;
667
+ console.log(`\n 🔍 Extracting field "${label}"`);
668
+ console.log(` Original selector: ${field.selector}`);
669
+ console.log(` Attribute: ${field.attribute}`);
670
+ let element = null;
671
+ // Handle XPath selectors with container indexing
672
+ if (field.selector.startsWith('//')) {
673
+ // Create indexed absolute XPath
674
+ const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
675
+ console.log(` 📍 Indexed selector: ${indexedSelector}`);
676
+ element = evaluateXPath(document, indexedSelector);
677
+ console.log(` 📍 Indexed XPath result: ${element ? 'FOUND' : 'NOT FOUND'}`);
678
+ if (element) {
679
+ console.log(` 📍 Found element text: "${(_b = element.textContent) === null || _b === void 0 ? void 0 : _b.trim()}"`);
687
680
  }
688
681
  }
689
682
  else {
690
- containerFields[containerIndex].nonTableFields[label] = field;
683
+ // Fallback for non-XPath selectors - search within container
684
+ element = queryElement(container, field.selector);
691
685
  }
692
- }
693
- });
694
- const tableData = [];
695
- const nonTableData = [];
696
- // Process table data with support for iframes, frames, and shadow DOM
697
- for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
698
- const container = containers[containerIndex];
699
- const { tableFields } = containerFields[containerIndex];
700
- if (Object.keys(tableFields).length > 0) {
701
- const firstField = Object.values(tableFields)[0];
702
- const firstElement = queryElement(container, firstField.selector);
703
- let tableContext = firstElement;
704
- // Find table context including iframe, frame and shadow DOM
705
- while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
706
- if (tableContext.getRootNode() instanceof ShadowRoot) {
707
- tableContext = tableContext.getRootNode().host;
708
- continue;
709
- }
710
- if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
711
- try {
712
- tableContext = tableContext.contentDocument.body;
713
- }
714
- catch (e) {
715
- break;
716
- }
686
+ // Step 4: Extract the value from the found element
687
+ if (element) {
688
+ const value = extractValue(element, field.attribute);
689
+ if (value !== null && value !== '') {
690
+ record[label] = value;
691
+ console.log(` ✅ Extracted "${label}": "${value}"`);
717
692
  }
718
693
  else {
719
- tableContext = tableContext.parentElement;
694
+ console.warn(` ⚠️ Empty value for "${label}"`);
695
+ record[label] = '';
720
696
  }
721
697
  }
722
- if (tableContext) {
723
- // Get rows from all contexts
724
- const rows = [];
725
- // Get rows from regular DOM
726
- rows.push(...tableContext.getElementsByTagName('TR'));
727
- // Get rows from shadow DOM
728
- if (tableContext.shadowRoot) {
729
- rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
730
- }
731
- // Get rows from iframes and frames
732
- if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
733
- try {
734
- const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
735
- rows.push(...frameDoc.getElementsByTagName('TR'));
736
- }
737
- catch (e) {
738
- console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
739
- }
740
- }
741
- const processedRows = filterRowsBasedOnTag(rows, tableFields);
742
- for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
743
- const record = {};
744
- const currentRow = processedRows[rowIndex];
745
- for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
746
- let element = null;
747
- if (cellIndex >= 0) {
748
- // Get TD element considering both contexts
749
- let td = currentRow.children[cellIndex];
750
- // Check shadow DOM for td
751
- if (!td && currentRow.shadowRoot) {
752
- const shadowCells = currentRow.shadowRoot.children;
753
- if (shadowCells && shadowCells.length > cellIndex) {
754
- td = shadowCells[cellIndex];
755
- }
756
- }
757
- if (td) {
758
- element = queryElement(td, selector);
759
- if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
760
- element = td;
761
- }
762
- if (!element) {
763
- const tagOnlySelector = selector.split('.')[0];
764
- element = queryElement(td, tagOnlySelector);
765
- }
766
- if (!element) {
767
- let currentElement = td;
768
- while (currentElement && currentElement.children.length > 0) {
769
- let foundContentChild = false;
770
- for (const child of currentElement.children) {
771
- if (extractValue(child, attribute)) {
772
- currentElement = child;
773
- foundContentChild = true;
774
- break;
775
- }
776
- }
777
- if (!foundContentChild)
778
- break;
779
- }
780
- element = currentElement;
781
- }
782
- }
783
- }
784
- else {
785
- element = queryElement(currentRow, selector);
786
- }
787
- if (element) {
788
- record[label] = extractValue(element, attribute);
789
- }
790
- }
791
- if (Object.keys(record).length > 0) {
792
- tableData.push(record);
793
- }
794
- }
698
+ else {
699
+ console.warn(` ❌ Element not found for "${label}"`);
700
+ record[label] = '';
795
701
  }
796
702
  }
797
- }
798
- // Process non-table data with all contexts support
799
- for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
800
- if (nonTableData.length >= limit)
801
- break;
802
- const container = containers[containerIndex];
803
- const { nonTableFields } = containerFields[containerIndex];
804
- if (Object.keys(nonTableFields).length > 0) {
805
- const record = {};
806
- for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
807
- // Get the last part of the selector after any context delimiter
808
- const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
809
- const element = tryFallbackSelector(container, relativeSelector);
810
- if (element) {
811
- record[label] = extractValue(element, attribute);
812
- }
813
- }
814
- if (Object.keys(record).length > 0) {
815
- nonTableData.push(record);
816
- }
703
+ // Step 5: Add record if it has any non-empty values
704
+ if (Object.values(record).some(value => value !== '')) {
705
+ extractedData.push(record);
706
+ console.log(` ✅ Added record ${containerIndex + 1}:`, record);
707
+ }
708
+ else {
709
+ console.warn(` ⚠️ Skipping empty record for container ${containerIndex + 1}`);
817
710
  }
818
711
  }
819
- // Merge and limit the results
820
- const scrapedData = [...tableData, ...nonTableData];
821
- return scrapedData;
712
+ console.log('\n🎉 Extraction complete!');
713
+ console.log(`📊 Total records extracted: ${extractedData.length}`);
714
+ console.log('📋 All records:', extractedData);
715
+ return extractedData;
822
716
  });
823
717
  };
824
718
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.9",
3
+ "version": "0.0.10",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",