mx-cloud 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +426 -121
- package/package.json +1 -1
|
@@ -359,7 +359,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
359
359
|
*/
|
|
360
360
|
window.scrapeList = function (_a) {
|
|
361
361
|
return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
|
|
362
|
-
var _b;
|
|
363
362
|
// XPath evaluation functions
|
|
364
363
|
const evaluateXPath = (rootElement, xpath) => {
|
|
365
364
|
try {
|
|
@@ -372,7 +371,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
372
371
|
return result.singleNodeValue;
|
|
373
372
|
}
|
|
374
373
|
catch (error) {
|
|
375
|
-
console.warn(
|
|
374
|
+
console.warn("XPath evaluation failed:", xpath, error);
|
|
376
375
|
return null;
|
|
377
376
|
}
|
|
378
377
|
};
|
|
@@ -394,33 +393,41 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
394
393
|
return elements;
|
|
395
394
|
}
|
|
396
395
|
catch (error) {
|
|
397
|
-
console.warn(
|
|
396
|
+
console.warn("XPath evaluation failed:", xpath, error);
|
|
398
397
|
return [];
|
|
399
398
|
}
|
|
400
399
|
};
|
|
401
|
-
//
|
|
400
|
+
// Helper function to detect selector type
|
|
401
|
+
const isXPathSelector = (selector) => {
|
|
402
|
+
return (selector.startsWith("//") ||
|
|
403
|
+
selector.startsWith("/") ||
|
|
404
|
+
selector.startsWith("./"));
|
|
405
|
+
};
|
|
406
|
+
// Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
|
|
402
407
|
const queryElement = (rootElement, selector) => {
|
|
403
|
-
if (!selector.includes(
|
|
408
|
+
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
|
404
409
|
// Check if it's an XPath selector
|
|
405
|
-
if (
|
|
410
|
+
if (isXPathSelector(selector)) {
|
|
406
411
|
return evaluateXPath(rootElement, selector);
|
|
407
412
|
}
|
|
408
413
|
else {
|
|
409
414
|
return rootElement.querySelector(selector);
|
|
410
415
|
}
|
|
411
416
|
}
|
|
412
|
-
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
|
417
|
+
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
|
413
418
|
let currentElement = rootElement;
|
|
414
419
|
for (let i = 0; i < parts.length; i++) {
|
|
415
420
|
if (!currentElement)
|
|
416
421
|
return null;
|
|
417
422
|
// Handle iframe and frame traversal
|
|
418
|
-
if (currentElement.tagName ===
|
|
423
|
+
if (currentElement.tagName === "IFRAME" ||
|
|
424
|
+
currentElement.tagName === "FRAME") {
|
|
419
425
|
try {
|
|
420
|
-
const frameDoc = currentElement.contentDocument ||
|
|
426
|
+
const frameDoc = currentElement.contentDocument ||
|
|
427
|
+
currentElement.contentWindow.document;
|
|
421
428
|
if (!frameDoc)
|
|
422
429
|
return null;
|
|
423
|
-
if (
|
|
430
|
+
if (isXPathSelector(parts[i])) {
|
|
424
431
|
currentElement = evaluateXPath(frameDoc, parts[i]);
|
|
425
432
|
}
|
|
426
433
|
else {
|
|
@@ -434,9 +441,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
434
441
|
}
|
|
435
442
|
}
|
|
436
443
|
let nextElement = null;
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
if (
|
|
444
|
+
// Try regular DOM first
|
|
445
|
+
if ("querySelector" in currentElement) {
|
|
446
|
+
if (isXPathSelector(parts[i])) {
|
|
440
447
|
nextElement = evaluateXPath(currentElement, parts[i]);
|
|
441
448
|
}
|
|
442
449
|
else {
|
|
@@ -444,8 +451,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
444
451
|
}
|
|
445
452
|
}
|
|
446
453
|
// Try shadow DOM if not found
|
|
447
|
-
if (!nextElement &&
|
|
448
|
-
|
|
454
|
+
if (!nextElement &&
|
|
455
|
+
"shadowRoot" in currentElement &&
|
|
456
|
+
currentElement.shadowRoot) {
|
|
457
|
+
if (isXPathSelector(parts[i])) {
|
|
449
458
|
nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
|
|
450
459
|
}
|
|
451
460
|
else {
|
|
@@ -453,11 +462,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
453
462
|
}
|
|
454
463
|
}
|
|
455
464
|
// Check children's shadow roots if still not found
|
|
456
|
-
if (!nextElement &&
|
|
465
|
+
if (!nextElement && "children" in currentElement) {
|
|
457
466
|
const children = Array.from(currentElement.children || []);
|
|
458
467
|
for (const child of children) {
|
|
459
468
|
if (child.shadowRoot) {
|
|
460
|
-
if (
|
|
469
|
+
if (isXPathSelector(parts[i])) {
|
|
461
470
|
nextElement = evaluateXPath(child.shadowRoot, parts[i]);
|
|
462
471
|
}
|
|
463
472
|
else {
|
|
@@ -472,28 +481,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
472
481
|
}
|
|
473
482
|
return currentElement;
|
|
474
483
|
};
|
|
475
|
-
// Enhanced query all function for
|
|
484
|
+
// Enhanced query all function for both contexts
|
|
476
485
|
const queryElementAll = (rootElement, selector) => {
|
|
477
|
-
if (!selector.includes(
|
|
478
|
-
|
|
479
|
-
if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
|
|
486
|
+
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
|
487
|
+
if (isXPathSelector(selector)) {
|
|
480
488
|
return evaluateXPathAll(rootElement, selector);
|
|
481
489
|
}
|
|
482
490
|
else {
|
|
483
491
|
return Array.from(rootElement.querySelectorAll(selector));
|
|
484
492
|
}
|
|
485
493
|
}
|
|
486
|
-
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
|
494
|
+
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
|
487
495
|
let currentElements = [rootElement];
|
|
488
496
|
for (const part of parts) {
|
|
489
497
|
const nextElements = [];
|
|
490
498
|
for (const element of currentElements) {
|
|
491
499
|
// Handle iframe and frame traversal
|
|
492
|
-
if (element.tagName ===
|
|
500
|
+
if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
|
|
493
501
|
try {
|
|
494
502
|
const frameDoc = element.contentDocument || element.contentWindow.document;
|
|
495
503
|
if (frameDoc) {
|
|
496
|
-
if (
|
|
504
|
+
if (isXPathSelector(part)) {
|
|
497
505
|
nextElements.push(...evaluateXPathAll(frameDoc, part));
|
|
498
506
|
}
|
|
499
507
|
else {
|
|
@@ -509,7 +517,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
509
517
|
else {
|
|
510
518
|
// Regular DOM elements
|
|
511
519
|
if (element.querySelectorAll) {
|
|
512
|
-
if (
|
|
520
|
+
if (isXPathSelector(part)) {
|
|
513
521
|
nextElements.push(...evaluateXPathAll(element, part));
|
|
514
522
|
}
|
|
515
523
|
else {
|
|
@@ -518,7 +526,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
518
526
|
}
|
|
519
527
|
// Shadow DOM elements
|
|
520
528
|
if (element.shadowRoot) {
|
|
521
|
-
if (
|
|
529
|
+
if (isXPathSelector(part)) {
|
|
522
530
|
nextElements.push(...evaluateXPathAll(element.shadowRoot, part));
|
|
523
531
|
}
|
|
524
532
|
else {
|
|
@@ -529,7 +537,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
529
537
|
const children = Array.from(element.children || []);
|
|
530
538
|
for (const child of children) {
|
|
531
539
|
if (child.shadowRoot) {
|
|
532
|
-
if (
|
|
540
|
+
if (isXPathSelector(part)) {
|
|
533
541
|
nextElements.push(...evaluateXPathAll(child.shadowRoot, part));
|
|
534
542
|
}
|
|
535
543
|
else {
|
|
@@ -545,7 +553,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
545
553
|
};
|
|
546
554
|
// Enhanced value extraction with context awareness
|
|
547
555
|
const extractValue = (element, attribute) => {
|
|
548
|
-
var _a, _b, _c, _d, _e
|
|
556
|
+
var _a, _b, _c, _d, _e;
|
|
549
557
|
if (!element)
|
|
550
558
|
return null;
|
|
551
559
|
// Get context-aware base URL
|
|
@@ -557,17 +565,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
557
565
|
return shadowContent.trim();
|
|
558
566
|
}
|
|
559
567
|
}
|
|
560
|
-
if (attribute ===
|
|
568
|
+
if (attribute === "innerText") {
|
|
561
569
|
// First try standard innerText/textContent
|
|
562
570
|
let textContent = ((_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim()) || ((_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim());
|
|
563
571
|
// If empty, check for common data attributes that might contain the text
|
|
564
572
|
if (!textContent) {
|
|
565
573
|
const dataAttributes = [
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
574
|
+
"data-600",
|
|
575
|
+
"data-text",
|
|
576
|
+
"data-label",
|
|
577
|
+
"data-value",
|
|
578
|
+
"data-content",
|
|
571
579
|
];
|
|
572
580
|
for (const attr of dataAttributes) {
|
|
573
581
|
const dataValue = element.getAttribute(attr);
|
|
@@ -579,140 +587,437 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
579
587
|
}
|
|
580
588
|
return textContent || null;
|
|
581
589
|
}
|
|
582
|
-
else if (attribute ===
|
|
590
|
+
else if (attribute === "innerHTML") {
|
|
583
591
|
return ((_e = element.innerHTML) === null || _e === void 0 ? void 0 : _e.trim()) || null;
|
|
584
592
|
}
|
|
585
|
-
else if (attribute ===
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
593
|
+
else if (attribute === "src" || attribute === "href") {
|
|
594
|
+
if (attribute === "href" && element.tagName !== "A") {
|
|
595
|
+
const parentElement = element.parentElement;
|
|
596
|
+
if (parentElement && parentElement.tagName === "A") {
|
|
597
|
+
const parentHref = parentElement.getAttribute("href");
|
|
598
|
+
if (parentHref) {
|
|
599
|
+
try {
|
|
600
|
+
return new URL(parentHref, baseURL).href;
|
|
601
|
+
}
|
|
602
|
+
catch (e) {
|
|
603
|
+
return parentHref;
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
}
|
|
591
607
|
}
|
|
592
|
-
const
|
|
593
|
-
|
|
608
|
+
const attrValue = element.getAttribute(attribute);
|
|
609
|
+
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
|
|
610
|
+
if (!dataAttr || dataAttr.trim() === "") {
|
|
611
|
+
if (attribute === "src") {
|
|
612
|
+
const style = window.getComputedStyle(element);
|
|
613
|
+
const bgImage = style.backgroundImage;
|
|
614
|
+
if (bgImage && bgImage !== "none") {
|
|
615
|
+
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
|
616
|
+
return matches ? new URL(matches[1], baseURL).href : null;
|
|
617
|
+
}
|
|
618
|
+
}
|
|
594
619
|
return null;
|
|
595
620
|
}
|
|
596
621
|
try {
|
|
597
|
-
return new URL(
|
|
622
|
+
return new URL(dataAttr, baseURL).href;
|
|
598
623
|
}
|
|
599
624
|
catch (e) {
|
|
600
|
-
console.warn(
|
|
601
|
-
return
|
|
625
|
+
console.warn("Error creating URL from", dataAttr, e);
|
|
626
|
+
return dataAttr;
|
|
602
627
|
}
|
|
603
628
|
}
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
629
|
+
return element.getAttribute(attribute);
|
|
630
|
+
};
|
|
631
|
+
// Enhanced table ancestor finding with context support
|
|
632
|
+
const findTableAncestor = (element) => {
|
|
633
|
+
let currentElement = element;
|
|
634
|
+
const MAX_DEPTH = 5;
|
|
635
|
+
let depth = 0;
|
|
636
|
+
while (currentElement && depth < MAX_DEPTH) {
|
|
637
|
+
// Handle shadow DOM
|
|
638
|
+
if (currentElement.getRootNode() instanceof ShadowRoot) {
|
|
639
|
+
currentElement = currentElement.getRootNode().host;
|
|
640
|
+
continue;
|
|
641
|
+
}
|
|
642
|
+
if (currentElement.tagName === "TD") {
|
|
643
|
+
return { type: "TD", element: currentElement };
|
|
644
|
+
}
|
|
645
|
+
else if (currentElement.tagName === "TR") {
|
|
646
|
+
return { type: "TR", element: currentElement };
|
|
647
|
+
}
|
|
648
|
+
// Handle iframe and frame crossing
|
|
649
|
+
if (currentElement.tagName === "IFRAME" ||
|
|
650
|
+
currentElement.tagName === "FRAME") {
|
|
651
|
+
try {
|
|
652
|
+
currentElement = currentElement.contentDocument.body;
|
|
613
653
|
}
|
|
614
|
-
|
|
654
|
+
catch (e) {
|
|
655
|
+
return null;
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
else {
|
|
659
|
+
currentElement = currentElement.parentElement;
|
|
660
|
+
}
|
|
661
|
+
depth++;
|
|
662
|
+
}
|
|
663
|
+
return null;
|
|
664
|
+
};
|
|
665
|
+
// Helper function to get cell index
|
|
666
|
+
const getCellIndex = (td) => {
|
|
667
|
+
if (td.getRootNode() instanceof ShadowRoot) {
|
|
668
|
+
const shadowRoot = td.getRootNode();
|
|
669
|
+
const allCells = Array.from(shadowRoot.querySelectorAll("td"));
|
|
670
|
+
return allCells.indexOf(td);
|
|
671
|
+
}
|
|
672
|
+
let index = 0;
|
|
673
|
+
let sibling = td;
|
|
674
|
+
while ((sibling = sibling.previousElementSibling)) {
|
|
675
|
+
index++;
|
|
676
|
+
}
|
|
677
|
+
return index;
|
|
678
|
+
};
|
|
679
|
+
// Helper function to check for TH elements
|
|
680
|
+
const hasThElement = (row, tableFields) => {
|
|
681
|
+
for (const [_, { selector }] of Object.entries(tableFields)) {
|
|
682
|
+
const element = queryElement(row, selector);
|
|
683
|
+
if (element) {
|
|
684
|
+
let current = element;
|
|
685
|
+
while (current && current !== row) {
|
|
686
|
+
if (current.getRootNode() instanceof ShadowRoot) {
|
|
687
|
+
current = current.getRootNode().host;
|
|
688
|
+
continue;
|
|
689
|
+
}
|
|
690
|
+
if (current.tagName === "TH")
|
|
691
|
+
return true;
|
|
692
|
+
if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
|
|
693
|
+
try {
|
|
694
|
+
current = current.contentDocument.body;
|
|
695
|
+
}
|
|
696
|
+
catch (e) {
|
|
697
|
+
break;
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
else {
|
|
701
|
+
current = current.parentElement;
|
|
702
|
+
}
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
}
|
|
706
|
+
return false;
|
|
707
|
+
};
|
|
708
|
+
// Helper function to filter rows
|
|
709
|
+
const filterRowsBasedOnTag = (rows, tableFields) => {
|
|
710
|
+
for (const row of rows) {
|
|
711
|
+
if (hasThElement(row, tableFields)) {
|
|
712
|
+
return rows;
|
|
615
713
|
}
|
|
714
|
+
}
|
|
715
|
+
return rows.filter((row) => {
|
|
716
|
+
const directTH = row.getElementsByTagName("TH").length === 0;
|
|
717
|
+
const shadowTH = row.shadowRoot
|
|
718
|
+
? row.shadowRoot.querySelector("th") === null
|
|
719
|
+
: true;
|
|
720
|
+
return directTH && shadowTH;
|
|
721
|
+
});
|
|
722
|
+
};
|
|
723
|
+
// Class similarity comparison functions
|
|
724
|
+
const calculateClassSimilarity = (classList1, classList2) => {
|
|
725
|
+
const set1 = new Set(classList1);
|
|
726
|
+
const set2 = new Set(classList2);
|
|
727
|
+
const intersection = new Set([...set1].filter((x) => set2.has(x)));
|
|
728
|
+
const union = new Set([...set1, ...set2]);
|
|
729
|
+
return intersection.size / union.size;
|
|
730
|
+
};
|
|
731
|
+
// Enhanced similar elements finding with context support
|
|
732
|
+
const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
|
|
733
|
+
const baseClasses = Array.from(baseElement.classList);
|
|
734
|
+
if (baseClasses.length === 0)
|
|
735
|
+
return [];
|
|
736
|
+
const allElements = [];
|
|
737
|
+
// Get elements from main document
|
|
738
|
+
allElements.push(...document.getElementsByTagName(baseElement.tagName));
|
|
739
|
+
// Get elements from shadow DOM
|
|
740
|
+
if (baseElement.getRootNode() instanceof ShadowRoot) {
|
|
741
|
+
const shadowHost = baseElement.getRootNode().host;
|
|
742
|
+
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
|
743
|
+
}
|
|
744
|
+
// Get elements from iframes and frames
|
|
745
|
+
const frames = [
|
|
746
|
+
...Array.from(document.getElementsByTagName("iframe")),
|
|
747
|
+
...Array.from(document.getElementsByTagName("frame")),
|
|
748
|
+
];
|
|
749
|
+
for (const frame of frames) {
|
|
616
750
|
try {
|
|
617
|
-
|
|
751
|
+
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
|
752
|
+
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
|
618
753
|
}
|
|
619
754
|
catch (e) {
|
|
620
|
-
console.warn(
|
|
621
|
-
return dataAttr;
|
|
755
|
+
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
|
622
756
|
}
|
|
623
757
|
}
|
|
624
|
-
return
|
|
758
|
+
return allElements.filter((element) => {
|
|
759
|
+
if (element === baseElement)
|
|
760
|
+
return false;
|
|
761
|
+
const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
|
|
762
|
+
return similarity >= similarityThreshold;
|
|
763
|
+
});
|
|
764
|
+
};
|
|
765
|
+
const tryFallbackSelector = (rootElement, originalSelector) => {
|
|
766
|
+
let element = queryElement(rootElement, originalSelector);
|
|
767
|
+
if (!element && originalSelector.includes("nth-child")) {
|
|
768
|
+
const match = originalSelector.match(/nth-child\((\d+)\)/);
|
|
769
|
+
if (match) {
|
|
770
|
+
const position = parseInt(match[1], 10);
|
|
771
|
+
for (let i = position - 1; i >= 1; i--) {
|
|
772
|
+
const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
|
|
773
|
+
element = queryElement(rootElement, fallbackSelector);
|
|
774
|
+
if (element)
|
|
775
|
+
break;
|
|
776
|
+
}
|
|
777
|
+
if (!element) {
|
|
778
|
+
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, "");
|
|
779
|
+
element = queryElement(rootElement, baseSelector);
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
return element;
|
|
625
784
|
};
|
|
626
785
|
// Create indexed XPath for specific container instance
|
|
627
786
|
const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
|
|
628
|
-
console.log(`Creating indexed XPath for container ${containerIndex}`);
|
|
629
|
-
console.log(`Child selector: ${childSelector}`);
|
|
630
|
-
console.log(`List selector: ${listSelector}`);
|
|
631
787
|
// Check if the child selector contains the list selector pattern
|
|
632
|
-
if (childSelector.includes(listSelector.replace(
|
|
788
|
+
if (childSelector.includes(listSelector.replace("//", ""))) {
|
|
633
789
|
// Replace the list selector part with indexed version
|
|
634
|
-
const listPattern = listSelector.replace(
|
|
790
|
+
const listPattern = listSelector.replace("//", "");
|
|
635
791
|
const indexedListSelector = `(${listSelector})[${containerIndex}]`;
|
|
636
792
|
const indexedSelector = childSelector.replace(`//${listPattern}`, indexedListSelector);
|
|
637
|
-
console.log(`Generated indexed selector: ${indexedSelector}`);
|
|
638
793
|
return indexedSelector;
|
|
639
794
|
}
|
|
640
795
|
else {
|
|
641
796
|
// If pattern doesn't match, create a more generic indexed selector
|
|
642
|
-
|
|
643
|
-
return `(${listSelector})[${containerIndex}]${childSelector.replace('//', '/')}`;
|
|
797
|
+
return `(${listSelector})[${containerIndex}]${childSelector.replace("//", "/")}`;
|
|
644
798
|
}
|
|
645
799
|
};
|
|
646
|
-
// Main scraping logic
|
|
647
|
-
console.log(
|
|
648
|
-
console.log(
|
|
649
|
-
console.log(
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
console.log(`📦 Found ${containers.length} list containers`);
|
|
800
|
+
// Main scraping logic with unified support for both CSS and XPath
|
|
801
|
+
console.log("🚀 Starting unified list data extraction");
|
|
802
|
+
console.log("List Selector:", listSelector);
|
|
803
|
+
console.log("Fields:", fields);
|
|
804
|
+
let containers = queryElementAll(document, listSelector);
|
|
805
|
+
containers = Array.from(containers);
|
|
653
806
|
if (containers.length === 0) {
|
|
654
|
-
console.warn(
|
|
807
|
+
console.warn("❌ No containers found for listSelector:", listSelector);
|
|
655
808
|
return [];
|
|
656
809
|
}
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
const
|
|
663
|
-
const
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
810
|
+
console.log(`📦 Found ${containers.length} list containers`);
|
|
811
|
+
// For CSS selectors, try to find similar containers if needed
|
|
812
|
+
if (!isXPathSelector(listSelector) &&
|
|
813
|
+
limit > 1 &&
|
|
814
|
+
containers.length === 1) {
|
|
815
|
+
const baseContainer = containers[0];
|
|
816
|
+
const similarContainers = findSimilarElements(baseContainer);
|
|
817
|
+
if (similarContainers.length > 0) {
|
|
818
|
+
const newContainers = similarContainers.filter((container) => !container.matches(listSelector));
|
|
819
|
+
containers = [...containers, ...newContainers];
|
|
820
|
+
}
|
|
821
|
+
}
|
|
822
|
+
const containerFields = containers.map(() => ({
|
|
823
|
+
tableFields: {},
|
|
824
|
+
nonTableFields: {},
|
|
825
|
+
}));
|
|
826
|
+
// For XPath selectors, use the new approach
|
|
827
|
+
if (isXPathSelector(listSelector)) {
|
|
828
|
+
const extractedData = [];
|
|
829
|
+
const containersToProcess = Math.min(containers.length, limit);
|
|
830
|
+
for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
|
|
831
|
+
const record = {};
|
|
832
|
+
for (const [label, field] of Object.entries(fields)) {
|
|
833
|
+
let element = null;
|
|
834
|
+
if (isXPathSelector(field.selector)) {
|
|
835
|
+
// Create indexed absolute XPath
|
|
836
|
+
const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
|
|
837
|
+
element = evaluateXPath(document, indexedSelector);
|
|
838
|
+
}
|
|
839
|
+
else {
|
|
840
|
+
// Fallback for CSS selectors within XPath containers
|
|
841
|
+
const container = containers[containerIndex];
|
|
842
|
+
element = queryElement(container, field.selector);
|
|
843
|
+
}
|
|
678
844
|
if (element) {
|
|
679
|
-
|
|
845
|
+
const value = extractValue(element, field.attribute);
|
|
846
|
+
if (value !== null && value !== "") {
|
|
847
|
+
record[label] = value;
|
|
848
|
+
}
|
|
849
|
+
else {
|
|
850
|
+
record[label] = "";
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
else {
|
|
854
|
+
record[label] = "";
|
|
680
855
|
}
|
|
681
856
|
}
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
element = queryElement(container, field.selector);
|
|
857
|
+
if (Object.values(record).some((value) => value !== "")) {
|
|
858
|
+
extractedData.push(record);
|
|
685
859
|
}
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
860
|
+
}
|
|
861
|
+
console.log(`📊 Total records extracted: ${extractedData.length}`);
|
|
862
|
+
return extractedData;
|
|
863
|
+
}
|
|
864
|
+
// For CSS selectors, use the original table-aware approach
|
|
865
|
+
containers.forEach((container, containerIndex) => {
|
|
866
|
+
for (const [label, field] of Object.entries(fields)) {
|
|
867
|
+
const sampleElement = queryElement(container, field.selector);
|
|
868
|
+
if (sampleElement) {
|
|
869
|
+
const ancestor = findTableAncestor(sampleElement);
|
|
870
|
+
if (ancestor) {
|
|
871
|
+
containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1 });
|
|
692
872
|
}
|
|
693
873
|
else {
|
|
694
|
-
|
|
695
|
-
record[label] = '';
|
|
874
|
+
containerFields[containerIndex].nonTableFields[label] = field;
|
|
696
875
|
}
|
|
697
876
|
}
|
|
698
877
|
else {
|
|
699
|
-
|
|
700
|
-
record[label] = '';
|
|
878
|
+
containerFields[containerIndex].nonTableFields[label] = field;
|
|
701
879
|
}
|
|
702
880
|
}
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
881
|
+
});
|
|
882
|
+
const tableData = [];
|
|
883
|
+
const nonTableData = [];
|
|
884
|
+
// Process table data with support for iframes, frames, and shadow DOM
|
|
885
|
+
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
886
|
+
const container = containers[containerIndex];
|
|
887
|
+
const { tableFields } = containerFields[containerIndex];
|
|
888
|
+
if (Object.keys(tableFields).length > 0) {
|
|
889
|
+
const firstField = Object.values(tableFields)[0];
|
|
890
|
+
const firstElement = queryElement(container, firstField.selector);
|
|
891
|
+
let tableContext = firstElement;
|
|
892
|
+
// Find table context including iframe, frame and shadow DOM
|
|
893
|
+
while (tableContext &&
|
|
894
|
+
tableContext.tagName !== "TABLE" &&
|
|
895
|
+
tableContext !== container) {
|
|
896
|
+
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
|
897
|
+
tableContext = tableContext.getRootNode().host;
|
|
898
|
+
continue;
|
|
899
|
+
}
|
|
900
|
+
if (tableContext.tagName === "IFRAME" ||
|
|
901
|
+
tableContext.tagName === "FRAME") {
|
|
902
|
+
try {
|
|
903
|
+
tableContext = tableContext.contentDocument.body;
|
|
904
|
+
}
|
|
905
|
+
catch (e) {
|
|
906
|
+
break;
|
|
907
|
+
}
|
|
908
|
+
}
|
|
909
|
+
else {
|
|
910
|
+
tableContext = tableContext.parentElement;
|
|
911
|
+
}
|
|
912
|
+
}
|
|
913
|
+
if (tableContext) {
|
|
914
|
+
// Get rows from all contexts
|
|
915
|
+
const rows = [];
|
|
916
|
+
// Get rows from regular DOM
|
|
917
|
+
rows.push(...tableContext.getElementsByTagName("TR"));
|
|
918
|
+
// Get rows from shadow DOM
|
|
919
|
+
if (tableContext.shadowRoot) {
|
|
920
|
+
rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
|
|
921
|
+
}
|
|
922
|
+
// Get rows from iframes and frames
|
|
923
|
+
if (tableContext.tagName === "IFRAME" ||
|
|
924
|
+
tableContext.tagName === "FRAME") {
|
|
925
|
+
try {
|
|
926
|
+
const frameDoc = tableContext.contentDocument ||
|
|
927
|
+
tableContext.contentWindow.document;
|
|
928
|
+
rows.push(...frameDoc.getElementsByTagName("TR"));
|
|
929
|
+
}
|
|
930
|
+
catch (e) {
|
|
931
|
+
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
|
932
|
+
}
|
|
933
|
+
}
|
|
934
|
+
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
|
935
|
+
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
|
|
936
|
+
const record = {};
|
|
937
|
+
const currentRow = processedRows[rowIndex];
|
|
938
|
+
for (const [label, { selector, attribute, cellIndex },] of Object.entries(tableFields)) {
|
|
939
|
+
let element = null;
|
|
940
|
+
if (cellIndex >= 0) {
|
|
941
|
+
// Get TD element considering both contexts
|
|
942
|
+
let td = currentRow.children[cellIndex];
|
|
943
|
+
// Check shadow DOM for td
|
|
944
|
+
if (!td && currentRow.shadowRoot) {
|
|
945
|
+
const shadowCells = currentRow.shadowRoot.children;
|
|
946
|
+
if (shadowCells && shadowCells.length > cellIndex) {
|
|
947
|
+
td = shadowCells[cellIndex];
|
|
948
|
+
}
|
|
949
|
+
}
|
|
950
|
+
if (td) {
|
|
951
|
+
element = queryElement(td, selector);
|
|
952
|
+
if (!element &&
|
|
953
|
+
selector
|
|
954
|
+
.split(/(?:>>|:>>)/)
|
|
955
|
+
.pop()
|
|
956
|
+
.includes("td:nth-child")) {
|
|
957
|
+
element = td;
|
|
958
|
+
}
|
|
959
|
+
if (!element) {
|
|
960
|
+
const tagOnlySelector = selector.split(".")[0];
|
|
961
|
+
element = queryElement(td, tagOnlySelector);
|
|
962
|
+
}
|
|
963
|
+
if (!element) {
|
|
964
|
+
let currentElement = td;
|
|
965
|
+
while (currentElement &&
|
|
966
|
+
currentElement.children.length > 0) {
|
|
967
|
+
let foundContentChild = false;
|
|
968
|
+
for (const child of currentElement.children) {
|
|
969
|
+
if (extractValue(child, attribute)) {
|
|
970
|
+
currentElement = child;
|
|
971
|
+
foundContentChild = true;
|
|
972
|
+
break;
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
if (!foundContentChild)
|
|
976
|
+
break;
|
|
977
|
+
}
|
|
978
|
+
element = currentElement;
|
|
979
|
+
}
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
else {
|
|
983
|
+
element = queryElement(currentRow, selector);
|
|
984
|
+
}
|
|
985
|
+
if (element) {
|
|
986
|
+
record[label] = extractValue(element, attribute);
|
|
987
|
+
}
|
|
988
|
+
}
|
|
989
|
+
if (Object.keys(record).length > 0) {
|
|
990
|
+
tableData.push(record);
|
|
991
|
+
}
|
|
992
|
+
}
|
|
993
|
+
}
|
|
707
994
|
}
|
|
708
|
-
|
|
709
|
-
|
|
995
|
+
}
|
|
996
|
+
// Process non-table data with all contexts support
|
|
997
|
+
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
998
|
+
if (nonTableData.length >= limit)
|
|
999
|
+
break;
|
|
1000
|
+
const container = containers[containerIndex];
|
|
1001
|
+
const { nonTableFields } = containerFields[containerIndex];
|
|
1002
|
+
if (Object.keys(nonTableFields).length > 0) {
|
|
1003
|
+
const record = {};
|
|
1004
|
+
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
|
1005
|
+
// Get the last part of the selector after any context delimiter
|
|
1006
|
+
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
1007
|
+
const element = tryFallbackSelector(container, relativeSelector);
|
|
1008
|
+
if (element) {
|
|
1009
|
+
record[label] = extractValue(element, attribute);
|
|
1010
|
+
}
|
|
1011
|
+
}
|
|
1012
|
+
if (Object.keys(record).length > 0) {
|
|
1013
|
+
nonTableData.push(record);
|
|
1014
|
+
}
|
|
710
1015
|
}
|
|
711
1016
|
}
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
console.log(
|
|
715
|
-
return
|
|
1017
|
+
// Merge and limit the results
|
|
1018
|
+
const scrapedData = [...tableData, ...nonTableData];
|
|
1019
|
+
console.log(`📊 Total records extracted: ${scrapedData.length}`);
|
|
1020
|
+
return scrapedData;
|
|
716
1021
|
});
|
|
717
1022
|
};
|
|
718
1023
|
/**
|