maxun-core 0.0.17 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -359,21 +359,80 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
359
359
  */
360
360
  window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
361
361
  return __awaiter(this, void 0, void 0, function* () {
362
- // Enhanced query function to handle iframe, frame and shadow DOM
362
+ // XPath evaluation functions
363
+ const evaluateXPath = (rootElement, xpath) => {
364
+ try {
365
+ const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
366
+ ? rootElement
367
+ : rootElement.ownerDocument;
368
+ if (!ownerDoc)
369
+ return null;
370
+ const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
371
+ return result.singleNodeValue;
372
+ }
373
+ catch (error) {
374
+ console.warn("XPath evaluation failed:", xpath, error);
375
+ return null;
376
+ }
377
+ };
378
+ const evaluateXPathAll = (rootElement, xpath) => {
379
+ try {
380
+ const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
381
+ ? rootElement
382
+ : rootElement.ownerDocument;
383
+ if (!ownerDoc)
384
+ return [];
385
+ const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
386
+ const elements = [];
387
+ for (let i = 0; i < result.snapshotLength; i++) {
388
+ const node = result.snapshotItem(i);
389
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
390
+ elements.push(node);
391
+ }
392
+ }
393
+ return elements;
394
+ }
395
+ catch (error) {
396
+ console.warn("XPath evaluation failed:", xpath, error);
397
+ return [];
398
+ }
399
+ };
400
+ // Helper function to detect selector type
401
+ const isXPathSelector = (selector) => {
402
+ return (selector.startsWith("//") ||
403
+ selector.startsWith("/") ||
404
+ selector.startsWith("./"));
405
+ };
406
+ // Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
363
407
  const queryElement = (rootElement, selector) => {
364
- if (!selector.includes('>>') && !selector.includes(':>>')) {
365
- return rootElement.querySelector(selector);
408
+ if (!selector.includes(">>") && !selector.includes(":>>")) {
409
+ // Check if it's an XPath selector
410
+ if (isXPathSelector(selector)) {
411
+ return evaluateXPath(rootElement, selector);
412
+ }
413
+ else {
414
+ return rootElement.querySelector(selector);
415
+ }
366
416
  }
367
- const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
417
+ const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
368
418
  let currentElement = rootElement;
369
419
  for (let i = 0; i < parts.length; i++) {
370
420
  if (!currentElement)
371
421
  return null;
372
422
  // Handle iframe and frame traversal
373
- if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
423
+ if (currentElement.tagName === "IFRAME" ||
424
+ currentElement.tagName === "FRAME") {
374
425
  try {
375
- const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
376
- currentElement = frameDoc.querySelector(parts[i]);
426
+ const frameDoc = currentElement.contentDocument ||
427
+ currentElement.contentWindow.document;
428
+ if (!frameDoc)
429
+ return null;
430
+ if (isXPathSelector(parts[i])) {
431
+ currentElement = evaluateXPath(frameDoc, parts[i]);
432
+ }
433
+ else {
434
+ currentElement = frameDoc.querySelector(parts[i]);
435
+ }
377
436
  continue;
378
437
  }
379
438
  catch (e) {
@@ -381,18 +440,38 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
381
440
  return null;
382
441
  }
383
442
  }
443
+ let nextElement = null;
384
444
  // Try regular DOM first
385
- let nextElement = currentElement.querySelector(parts[i]);
445
+ if ("querySelector" in currentElement) {
446
+ if (isXPathSelector(parts[i])) {
447
+ nextElement = evaluateXPath(currentElement, parts[i]);
448
+ }
449
+ else {
450
+ nextElement = currentElement.querySelector(parts[i]);
451
+ }
452
+ }
386
453
  // Try shadow DOM if not found
387
- if (!nextElement && currentElement.shadowRoot) {
388
- nextElement = currentElement.shadowRoot.querySelector(parts[i]);
454
+ if (!nextElement &&
455
+ "shadowRoot" in currentElement &&
456
+ currentElement.shadowRoot) {
457
+ if (isXPathSelector(parts[i])) {
458
+ nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
459
+ }
460
+ else {
461
+ nextElement = currentElement.shadowRoot.querySelector(parts[i]);
462
+ }
389
463
  }
390
464
  // Check children's shadow roots if still not found
391
- if (!nextElement) {
465
+ if (!nextElement && "children" in currentElement) {
392
466
  const children = Array.from(currentElement.children || []);
393
467
  for (const child of children) {
394
468
  if (child.shadowRoot) {
395
- nextElement = child.shadowRoot.querySelector(parts[i]);
469
+ if (isXPathSelector(parts[i])) {
470
+ nextElement = evaluateXPath(child.shadowRoot, parts[i]);
471
+ }
472
+ else {
473
+ nextElement = child.shadowRoot.querySelector(parts[i]);
474
+ }
396
475
  if (nextElement)
397
476
  break;
398
477
  }
@@ -404,19 +483,31 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
404
483
  };
405
484
  // Enhanced query all function for both contexts
406
485
  const queryElementAll = (rootElement, selector) => {
407
- if (!selector.includes('>>') && !selector.includes(':>>')) {
408
- return rootElement.querySelectorAll(selector);
486
+ if (!selector.includes(">>") && !selector.includes(":>>")) {
487
+ if (isXPathSelector(selector)) {
488
+ return evaluateXPathAll(rootElement, selector);
489
+ }
490
+ else {
491
+ return Array.from(rootElement.querySelectorAll(selector));
492
+ }
409
493
  }
410
- const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
494
+ const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
411
495
  let currentElements = [rootElement];
412
496
  for (const part of parts) {
413
497
  const nextElements = [];
414
498
  for (const element of currentElements) {
415
499
  // Handle iframe and frame traversal
416
- if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
500
+ if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
417
501
  try {
418
502
  const frameDoc = element.contentDocument || element.contentWindow.document;
419
- nextElements.push(...frameDoc.querySelectorAll(part));
503
+ if (frameDoc) {
504
+ if (isXPathSelector(part)) {
505
+ nextElements.push(...evaluateXPathAll(frameDoc, part));
506
+ }
507
+ else {
508
+ nextElements.push(...Array.from(frameDoc.querySelectorAll(part)));
509
+ }
510
+ }
420
511
  }
421
512
  catch (e) {
422
513
  console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
@@ -426,17 +517,32 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
426
517
  else {
427
518
  // Regular DOM elements
428
519
  if (element.querySelectorAll) {
429
- nextElements.push(...element.querySelectorAll(part));
520
+ if (isXPathSelector(part)) {
521
+ nextElements.push(...evaluateXPathAll(element, part));
522
+ }
523
+ else {
524
+ nextElements.push(...Array.from(element.querySelectorAll(part)));
525
+ }
430
526
  }
431
527
  // Shadow DOM elements
432
528
  if (element.shadowRoot) {
433
- nextElements.push(...element.shadowRoot.querySelectorAll(part));
529
+ if (isXPathSelector(part)) {
530
+ nextElements.push(...evaluateXPathAll(element.shadowRoot, part));
531
+ }
532
+ else {
533
+ nextElements.push(...Array.from(element.shadowRoot.querySelectorAll(part)));
534
+ }
434
535
  }
435
536
  // Check children's shadow roots
436
537
  const children = Array.from(element.children || []);
437
538
  for (const child of children) {
438
539
  if (child.shadowRoot) {
439
- nextElements.push(...child.shadowRoot.querySelectorAll(part));
540
+ if (isXPathSelector(part)) {
541
+ nextElements.push(...evaluateXPathAll(child.shadowRoot, part));
542
+ }
543
+ else {
544
+ nextElements.push(...Array.from(child.shadowRoot.querySelectorAll(part)));
545
+ }
440
546
  }
441
547
  }
442
548
  }
@@ -446,8 +552,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
446
552
  return currentElements;
447
553
  };
448
554
  // Enhanced value extraction with context awareness
449
- function extractValue(element, attribute) {
450
- var _a, _b;
555
+ const extractValue = (element, attribute) => {
556
+ var _a, _b, _c, _d, _e;
451
557
  if (!element)
452
558
  return null;
453
559
  // Get context-aware base URL
@@ -459,17 +565,36 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
459
565
  return shadowContent.trim();
460
566
  }
461
567
  }
462
- if (attribute === 'innerText') {
463
- return element.innerText.trim();
568
+ if (attribute === "innerText") {
569
+ // First try standard innerText/textContent
570
+ let textContent = ((_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim()) || ((_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim());
571
+ // If empty, check for common data attributes that might contain the text
572
+ if (!textContent) {
573
+ const dataAttributes = [
574
+ "data-600",
575
+ "data-text",
576
+ "data-label",
577
+ "data-value",
578
+ "data-content",
579
+ ];
580
+ for (const attr of dataAttributes) {
581
+ const dataValue = element.getAttribute(attr);
582
+ if (dataValue && dataValue.trim()) {
583
+ textContent = dataValue.trim();
584
+ break;
585
+ }
586
+ }
587
+ }
588
+ return textContent || null;
464
589
  }
465
- else if (attribute === 'innerHTML') {
466
- return element.innerHTML.trim();
590
+ else if (attribute === "innerHTML") {
591
+ return ((_e = element.innerHTML) === null || _e === void 0 ? void 0 : _e.trim()) || null;
467
592
  }
468
- else if (attribute === 'src' || attribute === 'href') {
469
- if (attribute === 'href' && element.tagName !== 'A') {
593
+ else if (attribute === "src" || attribute === "href") {
594
+ if (attribute === "href" && element.tagName !== "A") {
470
595
  const parentElement = element.parentElement;
471
- if (parentElement && parentElement.tagName === 'A') {
472
- const parentHref = parentElement.getAttribute('href');
596
+ if (parentElement && parentElement.tagName === "A") {
597
+ const parentHref = parentElement.getAttribute("href");
473
598
  if (parentHref) {
474
599
  try {
475
600
  return new URL(parentHref, baseURL).href;
@@ -481,12 +606,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
481
606
  }
482
607
  }
483
608
  const attrValue = element.getAttribute(attribute);
484
- const dataAttr = attrValue || element.getAttribute('data-' + attribute);
485
- if (!dataAttr || dataAttr.trim() === '') {
486
- if (attribute === 'src') {
609
+ const dataAttr = attrValue || element.getAttribute("data-" + attribute);
610
+ if (!dataAttr || dataAttr.trim() === "") {
611
+ if (attribute === "src") {
487
612
  const style = window.getComputedStyle(element);
488
613
  const bgImage = style.backgroundImage;
489
- if (bgImage && bgImage !== 'none') {
614
+ if (bgImage && bgImage !== "none") {
490
615
  const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
491
616
  return matches ? new URL(matches[1], baseURL).href : null;
492
617
  }
@@ -497,14 +622,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
497
622
  return new URL(dataAttr, baseURL).href;
498
623
  }
499
624
  catch (e) {
500
- console.warn('Error creating URL from', dataAttr, e);
501
- return dataAttr; // Return the original value if URL construction fails
625
+ console.warn("Error creating URL from", dataAttr, e);
626
+ return dataAttr;
502
627
  }
503
628
  }
504
629
  return element.getAttribute(attribute);
505
- }
630
+ };
506
631
  // Enhanced table ancestor finding with context support
507
- function findTableAncestor(element) {
632
+ const findTableAncestor = (element) => {
508
633
  let currentElement = element;
509
634
  const MAX_DEPTH = 5;
510
635
  let depth = 0;
@@ -514,14 +639,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
514
639
  currentElement = currentElement.getRootNode().host;
515
640
  continue;
516
641
  }
517
- if (currentElement.tagName === 'TD') {
518
- return { type: 'TD', element: currentElement };
642
+ if (currentElement.tagName === "TD") {
643
+ return { type: "TD", element: currentElement };
519
644
  }
520
- else if (currentElement.tagName === 'TR') {
521
- return { type: 'TR', element: currentElement };
645
+ else if (currentElement.tagName === "TR") {
646
+ return { type: "TR", element: currentElement };
522
647
  }
523
648
  // Handle iframe and frame crossing
524
- if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
649
+ if (currentElement.tagName === "IFRAME" ||
650
+ currentElement.tagName === "FRAME") {
525
651
  try {
526
652
  currentElement = currentElement.contentDocument.body;
527
653
  }
@@ -535,23 +661,23 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
535
661
  depth++;
536
662
  }
537
663
  return null;
538
- }
664
+ };
539
665
  // Helper function to get cell index
540
- function getCellIndex(td) {
666
+ const getCellIndex = (td) => {
541
667
  if (td.getRootNode() instanceof ShadowRoot) {
542
668
  const shadowRoot = td.getRootNode();
543
- const allCells = Array.from(shadowRoot.querySelectorAll('td'));
669
+ const allCells = Array.from(shadowRoot.querySelectorAll("td"));
544
670
  return allCells.indexOf(td);
545
671
  }
546
672
  let index = 0;
547
673
  let sibling = td;
548
- while (sibling = sibling.previousElementSibling) {
674
+ while ((sibling = sibling.previousElementSibling)) {
549
675
  index++;
550
676
  }
551
677
  return index;
552
- }
678
+ };
553
679
  // Helper function to check for TH elements
554
- function hasThElement(row, tableFields) {
680
+ const hasThElement = (row, tableFields) => {
555
681
  for (const [_, { selector }] of Object.entries(tableFields)) {
556
682
  const element = queryElement(row, selector);
557
683
  if (element) {
@@ -561,9 +687,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
561
687
  current = current.getRootNode().host;
562
688
  continue;
563
689
  }
564
- if (current.tagName === 'TH')
690
+ if (current.tagName === "TH")
565
691
  return true;
566
- if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
692
+ if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
567
693
  try {
568
694
  current = current.contentDocument.body;
569
695
  }
@@ -578,32 +704,32 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
578
704
  }
579
705
  }
580
706
  return false;
581
- }
707
+ };
582
708
  // Helper function to filter rows
583
- function filterRowsBasedOnTag(rows, tableFields) {
709
+ const filterRowsBasedOnTag = (rows, tableFields) => {
584
710
  for (const row of rows) {
585
711
  if (hasThElement(row, tableFields)) {
586
712
  return rows;
587
713
  }
588
714
  }
589
- // Include shadow DOM in TH search
590
- return rows.filter(row => {
591
- const directTH = row.getElementsByTagName('TH').length === 0;
592
- const shadowTH = row.shadowRoot ?
593
- row.shadowRoot.querySelector('th') === null : true;
715
+ return rows.filter((row) => {
716
+ const directTH = row.getElementsByTagName("TH").length === 0;
717
+ const shadowTH = row.shadowRoot
718
+ ? row.shadowRoot.querySelector("th") === null
719
+ : true;
594
720
  return directTH && shadowTH;
595
721
  });
596
- }
722
+ };
597
723
  // Class similarity comparison functions
598
- function calculateClassSimilarity(classList1, classList2) {
724
+ const calculateClassSimilarity = (classList1, classList2) => {
599
725
  const set1 = new Set(classList1);
600
726
  const set2 = new Set(classList2);
601
- const intersection = new Set([...set1].filter(x => set2.has(x)));
727
+ const intersection = new Set([...set1].filter((x) => set2.has(x)));
602
728
  const union = new Set([...set1, ...set2]);
603
729
  return intersection.size / union.size;
604
- }
730
+ };
605
731
  // Enhanced similar elements finding with context support
606
- function findSimilarElements(baseElement, similarityThreshold = 0.7) {
732
+ const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
607
733
  const baseClasses = Array.from(baseElement.classList);
608
734
  if (baseClasses.length === 0)
609
735
  return [];
@@ -617,8 +743,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
617
743
  }
618
744
  // Get elements from iframes and frames
619
745
  const frames = [
620
- ...Array.from(document.getElementsByTagName('iframe')),
621
- ...Array.from(document.getElementsByTagName('frame'))
746
+ ...Array.from(document.getElementsByTagName("iframe")),
747
+ ...Array.from(document.getElementsByTagName("frame")),
622
748
  ];
623
749
  for (const frame of frames) {
624
750
  try {
@@ -629,16 +755,16 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
629
755
  console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
630
756
  }
631
757
  }
632
- return allElements.filter(element => {
758
+ return allElements.filter((element) => {
633
759
  if (element === baseElement)
634
760
  return false;
635
761
  const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
636
762
  return similarity >= similarityThreshold;
637
763
  });
638
- }
639
- function tryFallbackSelector(rootElement, originalSelector) {
764
+ };
765
+ const tryFallbackSelector = (rootElement, originalSelector) => {
640
766
  let element = queryElement(rootElement, originalSelector);
641
- if (!element && originalSelector.includes('nth-child')) {
767
+ if (!element && originalSelector.includes("nth-child")) {
642
768
  const match = originalSelector.match(/nth-child\((\d+)\)/);
643
769
  if (match) {
644
770
  const position = parseInt(match[1], 10);
@@ -649,38 +775,100 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
649
775
  break;
650
776
  }
651
777
  if (!element) {
652
- const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
778
+ const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, "");
653
779
  element = queryElement(rootElement, baseSelector);
654
780
  }
655
781
  }
656
782
  }
657
783
  return element;
658
- }
659
- // Main scraping logic with context support
784
+ };
785
+ // Create indexed XPath for specific container instance
786
+ const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
787
+ // Check if the child selector contains the list selector pattern
788
+ if (childSelector.includes(listSelector.replace("//", ""))) {
789
+ // Replace the list selector part with indexed version
790
+ const listPattern = listSelector.replace("//", "");
791
+ const indexedListSelector = `(${listSelector})[${containerIndex}]`;
792
+ const indexedSelector = childSelector.replace(`//${listPattern}`, indexedListSelector);
793
+ return indexedSelector;
794
+ }
795
+ else {
796
+ // If pattern doesn't match, create a more generic indexed selector
797
+ return `(${listSelector})[${containerIndex}]${childSelector.replace("//", "/")}`;
798
+ }
799
+ };
800
+ // Main scraping logic with unified support for both CSS and XPath
801
+ console.log("🚀 Starting unified list data extraction");
802
+ console.log("List Selector:", listSelector);
803
+ console.log("Fields:", fields);
660
804
  let containers = queryElementAll(document, listSelector);
661
805
  containers = Array.from(containers);
662
- if (containers.length === 0)
806
+ if (containers.length === 0) {
807
+ console.warn("❌ No containers found for listSelector:", listSelector);
663
808
  return [];
664
- if (limit > 1 && containers.length === 1) {
809
+ }
810
+ console.log(`📦 Found ${containers.length} list containers`);
811
+ // For CSS selectors, try to find similar containers if needed
812
+ if (!isXPathSelector(listSelector) &&
813
+ limit > 1 &&
814
+ containers.length === 1) {
665
815
  const baseContainer = containers[0];
666
816
  const similarContainers = findSimilarElements(baseContainer);
667
817
  if (similarContainers.length > 0) {
668
- const newContainers = similarContainers.filter(container => !container.matches(listSelector));
818
+ const newContainers = similarContainers.filter((container) => !container.matches(listSelector));
669
819
  containers = [...containers, ...newContainers];
670
820
  }
671
821
  }
672
822
  const containerFields = containers.map(() => ({
673
823
  tableFields: {},
674
- nonTableFields: {}
824
+ nonTableFields: {},
675
825
  }));
676
- // Classify fields
826
+ // For XPath selectors, use the new approach
827
+ if (isXPathSelector(listSelector)) {
828
+ const extractedData = [];
829
+ const containersToProcess = Math.min(containers.length, limit);
830
+ for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
831
+ const record = {};
832
+ for (const [label, field] of Object.entries(fields)) {
833
+ let element = null;
834
+ if (isXPathSelector(field.selector)) {
835
+ // Create indexed absolute XPath
836
+ const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
837
+ element = evaluateXPath(document, indexedSelector);
838
+ }
839
+ else {
840
+ // Fallback for CSS selectors within XPath containers
841
+ const container = containers[containerIndex];
842
+ element = queryElement(container, field.selector);
843
+ }
844
+ if (element) {
845
+ const value = extractValue(element, field.attribute);
846
+ if (value !== null && value !== "") {
847
+ record[label] = value;
848
+ }
849
+ else {
850
+ record[label] = "";
851
+ }
852
+ }
853
+ else {
854
+ record[label] = "";
855
+ }
856
+ }
857
+ if (Object.values(record).some((value) => value !== "")) {
858
+ extractedData.push(record);
859
+ }
860
+ }
861
+ console.log(`📊 Total records extracted: ${extractedData.length}`);
862
+ return extractedData;
863
+ }
864
+ // For CSS selectors, use the original table-aware approach
677
865
  containers.forEach((container, containerIndex) => {
678
866
  for (const [label, field] of Object.entries(fields)) {
679
867
  const sampleElement = queryElement(container, field.selector);
680
868
  if (sampleElement) {
681
869
  const ancestor = findTableAncestor(sampleElement);
682
870
  if (ancestor) {
683
- containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 });
871
+ containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1 });
684
872
  }
685
873
  else {
686
874
  containerFields[containerIndex].nonTableFields[label] = field;
@@ -702,12 +890,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
702
890
  const firstElement = queryElement(container, firstField.selector);
703
891
  let tableContext = firstElement;
704
892
  // Find table context including iframe, frame and shadow DOM
705
- while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
893
+ while (tableContext &&
894
+ tableContext.tagName !== "TABLE" &&
895
+ tableContext !== container) {
706
896
  if (tableContext.getRootNode() instanceof ShadowRoot) {
707
897
  tableContext = tableContext.getRootNode().host;
708
898
  continue;
709
899
  }
710
- if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
900
+ if (tableContext.tagName === "IFRAME" ||
901
+ tableContext.tagName === "FRAME") {
711
902
  try {
712
903
  tableContext = tableContext.contentDocument.body;
713
904
  }
@@ -723,16 +914,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
723
914
  // Get rows from all contexts
724
915
  const rows = [];
725
916
  // Get rows from regular DOM
726
- rows.push(...tableContext.getElementsByTagName('TR'));
917
+ rows.push(...tableContext.getElementsByTagName("TR"));
727
918
  // Get rows from shadow DOM
728
919
  if (tableContext.shadowRoot) {
729
- rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
920
+ rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
730
921
  }
731
922
  // Get rows from iframes and frames
732
- if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
923
+ if (tableContext.tagName === "IFRAME" ||
924
+ tableContext.tagName === "FRAME") {
733
925
  try {
734
- const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
735
- rows.push(...frameDoc.getElementsByTagName('TR'));
926
+ const frameDoc = tableContext.contentDocument ||
927
+ tableContext.contentWindow.document;
928
+ rows.push(...frameDoc.getElementsByTagName("TR"));
736
929
  }
737
930
  catch (e) {
738
931
  console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
@@ -742,7 +935,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
742
935
  for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
743
936
  const record = {};
744
937
  const currentRow = processedRows[rowIndex];
745
- for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
938
+ for (const [label, { selector, attribute, cellIndex },] of Object.entries(tableFields)) {
746
939
  let element = null;
747
940
  if (cellIndex >= 0) {
748
941
  // Get TD element considering both contexts
@@ -756,16 +949,21 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
756
949
  }
757
950
  if (td) {
758
951
  element = queryElement(td, selector);
759
- if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
952
+ if (!element &&
953
+ selector
954
+ .split(/(?:>>|:>>)/)
955
+ .pop()
956
+ .includes("td:nth-child")) {
760
957
  element = td;
761
958
  }
762
959
  if (!element) {
763
- const tagOnlySelector = selector.split('.')[0];
960
+ const tagOnlySelector = selector.split(".")[0];
764
961
  element = queryElement(td, tagOnlySelector);
765
962
  }
766
963
  if (!element) {
767
964
  let currentElement = td;
768
- while (currentElement && currentElement.children.length > 0) {
965
+ while (currentElement &&
966
+ currentElement.children.length > 0) {
769
967
  let foundContentChild = false;
770
968
  for (const child of currentElement.children) {
771
969
  if (extractValue(child, attribute)) {
@@ -818,6 +1016,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
818
1016
  }
819
1017
  // Merge and limit the results
820
1018
  const scrapedData = [...tableData, ...nonTableData];
1019
+ console.log(`📊 Total records extracted: ${scrapedData.length}`);
821
1020
  return scrapedData;
822
1021
  });
823
1022
  };
@@ -41,6 +41,7 @@ interface InterpreterOptions {
41
41
  activeId: (id: number) => void;
42
42
  debugMessage: (msg: string) => void;
43
43
  setActionType: (type: string) => void;
44
+ incrementScrapeListIndex: () => void;
44
45
  }>;
45
46
  }
46
47
  /**
@@ -385,7 +385,7 @@ class Interpreter extends events_1.EventEmitter {
385
385
  yield this.options.serializableCallback([mergedResult]);
386
386
  }),
387
387
  scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
388
- var _f;
388
+ var _f, _g;
389
389
  if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
390
390
  this.options.debugChannel.setActionType('scrapeList');
391
391
  }
@@ -394,6 +394,9 @@ class Interpreter extends events_1.EventEmitter {
394
394
  return;
395
395
  }
396
396
  yield this.ensureScriptsLoaded(page);
397
+ if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.incrementScrapeListIndex) {
398
+ this.options.debugChannel.incrementScrapeListIndex();
399
+ }
397
400
  if (!config.pagination) {
398
401
  const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
399
402
  yield this.options.serializableCallback(scrapeResults);
@@ -404,8 +407,8 @@ class Interpreter extends events_1.EventEmitter {
404
407
  }
405
408
  }),
406
409
  scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
407
- var _g;
408
- if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.setActionType) {
410
+ var _h;
411
+ if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
409
412
  this.options.debugChannel.setActionType('scrapeListAuto');
410
413
  }
411
414
  yield this.ensureScriptsLoaded(page);
@@ -415,8 +418,8 @@ class Interpreter extends events_1.EventEmitter {
415
418
  yield this.options.serializableCallback(scrapeResults);
416
419
  }),
417
420
  scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
418
- var _h;
419
- if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
421
+ var _j;
422
+ if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
420
423
  this.options.debugChannel.setActionType('scroll');
421
424
  }
422
425
  yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
@@ -427,8 +430,8 @@ class Interpreter extends events_1.EventEmitter {
427
430
  }), pages !== null && pages !== void 0 ? pages : 1);
428
431
  }),
429
432
  script: (code) => __awaiter(this, void 0, void 0, function* () {
430
- var _j;
431
- if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
433
+ var _k;
434
+ if ((_k = this.options.debugChannel) === null || _k === void 0 ? void 0 : _k.setActionType) {
432
435
  this.options.debugChannel.setActionType('script');
433
436
  }
434
437
  const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
@@ -530,6 +533,7 @@ class Interpreter extends events_1.EventEmitter {
530
533
  });
531
534
  allResults = allResults.concat(newResults);
532
535
  debugLog("Results collected:", allResults.length);
536
+ yield this.options.serializableCallback(allResults);
533
537
  });
534
538
  const checkLimit = () => {
535
539
  if (config.limit && allResults.length >= config.limit) {
@@ -674,10 +678,47 @@ class Interpreter extends events_1.EventEmitter {
674
678
  }
675
679
  let retryCount = 0;
676
680
  let paginationSuccess = false;
677
- // Capture basic content signature before click
681
+ // Capture basic content signature before click - with XPath support
678
682
  const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
679
- return yield page.evaluate((selector) => {
680
- const items = document.querySelectorAll(selector);
683
+ return yield page.evaluate((listSelector) => {
684
+ const isXPath = (selector) => {
685
+ return selector.startsWith('//') || selector.startsWith('./') || selector.includes('::');
686
+ };
687
+ let items = [];
688
+ if (isXPath(listSelector)) {
689
+ try {
690
+ // Use XPath to find elements
691
+ const xpathResult = document.evaluate(listSelector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
692
+ items = [];
693
+ for (let i = 0; i < xpathResult.snapshotLength; i++) {
694
+ const node = xpathResult.snapshotItem(i);
695
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
696
+ items.push(node);
697
+ }
698
+ }
699
+ }
700
+ catch (xpathError) {
701
+ console.warn('XPath evaluation failed, trying CSS selector as fallback:', xpathError);
702
+ // Fallback to CSS selector
703
+ try {
704
+ items = document.querySelectorAll(listSelector);
705
+ }
706
+ catch (cssError) {
707
+ console.warn('CSS selector fallback also failed:', cssError);
708
+ items = [];
709
+ }
710
+ }
711
+ }
712
+ else {
713
+ try {
714
+ // Use CSS selector
715
+ items = document.querySelectorAll(listSelector);
716
+ }
717
+ catch (cssError) {
718
+ console.warn('CSS selector failed:', cssError);
719
+ items = [];
720
+ }
721
+ }
681
722
  return {
682
723
  url: window.location.href,
683
724
  itemCount: items.length,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.17",
3
+ "version": "0.0.19",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",