maxun-core 0.0.17 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +288 -89
- package/build/interpret.d.ts +1 -0
- package/build/interpret.js +51 -10
- package/package.json +1 -1
|
@@ -359,21 +359,80 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
359
359
|
*/
|
|
360
360
|
window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
|
|
361
361
|
return __awaiter(this, void 0, void 0, function* () {
|
|
362
|
-
//
|
|
362
|
+
// XPath evaluation functions
|
|
363
|
+
const evaluateXPath = (rootElement, xpath) => {
|
|
364
|
+
try {
|
|
365
|
+
const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
|
|
366
|
+
? rootElement
|
|
367
|
+
: rootElement.ownerDocument;
|
|
368
|
+
if (!ownerDoc)
|
|
369
|
+
return null;
|
|
370
|
+
const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
|
|
371
|
+
return result.singleNodeValue;
|
|
372
|
+
}
|
|
373
|
+
catch (error) {
|
|
374
|
+
console.warn("XPath evaluation failed:", xpath, error);
|
|
375
|
+
return null;
|
|
376
|
+
}
|
|
377
|
+
};
|
|
378
|
+
const evaluateXPathAll = (rootElement, xpath) => {
|
|
379
|
+
try {
|
|
380
|
+
const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
|
|
381
|
+
? rootElement
|
|
382
|
+
: rootElement.ownerDocument;
|
|
383
|
+
if (!ownerDoc)
|
|
384
|
+
return [];
|
|
385
|
+
const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
386
|
+
const elements = [];
|
|
387
|
+
for (let i = 0; i < result.snapshotLength; i++) {
|
|
388
|
+
const node = result.snapshotItem(i);
|
|
389
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
390
|
+
elements.push(node);
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
return elements;
|
|
394
|
+
}
|
|
395
|
+
catch (error) {
|
|
396
|
+
console.warn("XPath evaluation failed:", xpath, error);
|
|
397
|
+
return [];
|
|
398
|
+
}
|
|
399
|
+
};
|
|
400
|
+
// Helper function to detect selector type
|
|
401
|
+
const isXPathSelector = (selector) => {
|
|
402
|
+
return (selector.startsWith("//") ||
|
|
403
|
+
selector.startsWith("/") ||
|
|
404
|
+
selector.startsWith("./"));
|
|
405
|
+
};
|
|
406
|
+
// Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
|
|
363
407
|
const queryElement = (rootElement, selector) => {
|
|
364
|
-
if (!selector.includes(
|
|
365
|
-
|
|
408
|
+
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
|
409
|
+
// Check if it's an XPath selector
|
|
410
|
+
if (isXPathSelector(selector)) {
|
|
411
|
+
return evaluateXPath(rootElement, selector);
|
|
412
|
+
}
|
|
413
|
+
else {
|
|
414
|
+
return rootElement.querySelector(selector);
|
|
415
|
+
}
|
|
366
416
|
}
|
|
367
|
-
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
|
417
|
+
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
|
368
418
|
let currentElement = rootElement;
|
|
369
419
|
for (let i = 0; i < parts.length; i++) {
|
|
370
420
|
if (!currentElement)
|
|
371
421
|
return null;
|
|
372
422
|
// Handle iframe and frame traversal
|
|
373
|
-
if (currentElement.tagName ===
|
|
423
|
+
if (currentElement.tagName === "IFRAME" ||
|
|
424
|
+
currentElement.tagName === "FRAME") {
|
|
374
425
|
try {
|
|
375
|
-
const frameDoc = currentElement.contentDocument ||
|
|
376
|
-
|
|
426
|
+
const frameDoc = currentElement.contentDocument ||
|
|
427
|
+
currentElement.contentWindow.document;
|
|
428
|
+
if (!frameDoc)
|
|
429
|
+
return null;
|
|
430
|
+
if (isXPathSelector(parts[i])) {
|
|
431
|
+
currentElement = evaluateXPath(frameDoc, parts[i]);
|
|
432
|
+
}
|
|
433
|
+
else {
|
|
434
|
+
currentElement = frameDoc.querySelector(parts[i]);
|
|
435
|
+
}
|
|
377
436
|
continue;
|
|
378
437
|
}
|
|
379
438
|
catch (e) {
|
|
@@ -381,18 +440,38 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
381
440
|
return null;
|
|
382
441
|
}
|
|
383
442
|
}
|
|
443
|
+
let nextElement = null;
|
|
384
444
|
// Try regular DOM first
|
|
385
|
-
|
|
445
|
+
if ("querySelector" in currentElement) {
|
|
446
|
+
if (isXPathSelector(parts[i])) {
|
|
447
|
+
nextElement = evaluateXPath(currentElement, parts[i]);
|
|
448
|
+
}
|
|
449
|
+
else {
|
|
450
|
+
nextElement = currentElement.querySelector(parts[i]);
|
|
451
|
+
}
|
|
452
|
+
}
|
|
386
453
|
// Try shadow DOM if not found
|
|
387
|
-
if (!nextElement &&
|
|
388
|
-
|
|
454
|
+
if (!nextElement &&
|
|
455
|
+
"shadowRoot" in currentElement &&
|
|
456
|
+
currentElement.shadowRoot) {
|
|
457
|
+
if (isXPathSelector(parts[i])) {
|
|
458
|
+
nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
|
|
459
|
+
}
|
|
460
|
+
else {
|
|
461
|
+
nextElement = currentElement.shadowRoot.querySelector(parts[i]);
|
|
462
|
+
}
|
|
389
463
|
}
|
|
390
464
|
// Check children's shadow roots if still not found
|
|
391
|
-
if (!nextElement) {
|
|
465
|
+
if (!nextElement && "children" in currentElement) {
|
|
392
466
|
const children = Array.from(currentElement.children || []);
|
|
393
467
|
for (const child of children) {
|
|
394
468
|
if (child.shadowRoot) {
|
|
395
|
-
|
|
469
|
+
if (isXPathSelector(parts[i])) {
|
|
470
|
+
nextElement = evaluateXPath(child.shadowRoot, parts[i]);
|
|
471
|
+
}
|
|
472
|
+
else {
|
|
473
|
+
nextElement = child.shadowRoot.querySelector(parts[i]);
|
|
474
|
+
}
|
|
396
475
|
if (nextElement)
|
|
397
476
|
break;
|
|
398
477
|
}
|
|
@@ -404,19 +483,31 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
404
483
|
};
|
|
405
484
|
// Enhanced query all function for both contexts
|
|
406
485
|
const queryElementAll = (rootElement, selector) => {
|
|
407
|
-
if (!selector.includes(
|
|
408
|
-
|
|
486
|
+
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
|
487
|
+
if (isXPathSelector(selector)) {
|
|
488
|
+
return evaluateXPathAll(rootElement, selector);
|
|
489
|
+
}
|
|
490
|
+
else {
|
|
491
|
+
return Array.from(rootElement.querySelectorAll(selector));
|
|
492
|
+
}
|
|
409
493
|
}
|
|
410
|
-
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
|
494
|
+
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
|
411
495
|
let currentElements = [rootElement];
|
|
412
496
|
for (const part of parts) {
|
|
413
497
|
const nextElements = [];
|
|
414
498
|
for (const element of currentElements) {
|
|
415
499
|
// Handle iframe and frame traversal
|
|
416
|
-
if (element.tagName ===
|
|
500
|
+
if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
|
|
417
501
|
try {
|
|
418
502
|
const frameDoc = element.contentDocument || element.contentWindow.document;
|
|
419
|
-
|
|
503
|
+
if (frameDoc) {
|
|
504
|
+
if (isXPathSelector(part)) {
|
|
505
|
+
nextElements.push(...evaluateXPathAll(frameDoc, part));
|
|
506
|
+
}
|
|
507
|
+
else {
|
|
508
|
+
nextElements.push(...Array.from(frameDoc.querySelectorAll(part)));
|
|
509
|
+
}
|
|
510
|
+
}
|
|
420
511
|
}
|
|
421
512
|
catch (e) {
|
|
422
513
|
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
|
@@ -426,17 +517,32 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
426
517
|
else {
|
|
427
518
|
// Regular DOM elements
|
|
428
519
|
if (element.querySelectorAll) {
|
|
429
|
-
|
|
520
|
+
if (isXPathSelector(part)) {
|
|
521
|
+
nextElements.push(...evaluateXPathAll(element, part));
|
|
522
|
+
}
|
|
523
|
+
else {
|
|
524
|
+
nextElements.push(...Array.from(element.querySelectorAll(part)));
|
|
525
|
+
}
|
|
430
526
|
}
|
|
431
527
|
// Shadow DOM elements
|
|
432
528
|
if (element.shadowRoot) {
|
|
433
|
-
|
|
529
|
+
if (isXPathSelector(part)) {
|
|
530
|
+
nextElements.push(...evaluateXPathAll(element.shadowRoot, part));
|
|
531
|
+
}
|
|
532
|
+
else {
|
|
533
|
+
nextElements.push(...Array.from(element.shadowRoot.querySelectorAll(part)));
|
|
534
|
+
}
|
|
434
535
|
}
|
|
435
536
|
// Check children's shadow roots
|
|
436
537
|
const children = Array.from(element.children || []);
|
|
437
538
|
for (const child of children) {
|
|
438
539
|
if (child.shadowRoot) {
|
|
439
|
-
|
|
540
|
+
if (isXPathSelector(part)) {
|
|
541
|
+
nextElements.push(...evaluateXPathAll(child.shadowRoot, part));
|
|
542
|
+
}
|
|
543
|
+
else {
|
|
544
|
+
nextElements.push(...Array.from(child.shadowRoot.querySelectorAll(part)));
|
|
545
|
+
}
|
|
440
546
|
}
|
|
441
547
|
}
|
|
442
548
|
}
|
|
@@ -446,8 +552,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
446
552
|
return currentElements;
|
|
447
553
|
};
|
|
448
554
|
// Enhanced value extraction with context awareness
|
|
449
|
-
|
|
450
|
-
var _a, _b;
|
|
555
|
+
const extractValue = (element, attribute) => {
|
|
556
|
+
var _a, _b, _c, _d, _e;
|
|
451
557
|
if (!element)
|
|
452
558
|
return null;
|
|
453
559
|
// Get context-aware base URL
|
|
@@ -459,17 +565,36 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
459
565
|
return shadowContent.trim();
|
|
460
566
|
}
|
|
461
567
|
}
|
|
462
|
-
if (attribute ===
|
|
463
|
-
|
|
568
|
+
if (attribute === "innerText") {
|
|
569
|
+
// First try standard innerText/textContent
|
|
570
|
+
let textContent = ((_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim()) || ((_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim());
|
|
571
|
+
// If empty, check for common data attributes that might contain the text
|
|
572
|
+
if (!textContent) {
|
|
573
|
+
const dataAttributes = [
|
|
574
|
+
"data-600",
|
|
575
|
+
"data-text",
|
|
576
|
+
"data-label",
|
|
577
|
+
"data-value",
|
|
578
|
+
"data-content",
|
|
579
|
+
];
|
|
580
|
+
for (const attr of dataAttributes) {
|
|
581
|
+
const dataValue = element.getAttribute(attr);
|
|
582
|
+
if (dataValue && dataValue.trim()) {
|
|
583
|
+
textContent = dataValue.trim();
|
|
584
|
+
break;
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
return textContent || null;
|
|
464
589
|
}
|
|
465
|
-
else if (attribute ===
|
|
466
|
-
return element.innerHTML.trim();
|
|
590
|
+
else if (attribute === "innerHTML") {
|
|
591
|
+
return ((_e = element.innerHTML) === null || _e === void 0 ? void 0 : _e.trim()) || null;
|
|
467
592
|
}
|
|
468
|
-
else if (attribute ===
|
|
469
|
-
if (attribute ===
|
|
593
|
+
else if (attribute === "src" || attribute === "href") {
|
|
594
|
+
if (attribute === "href" && element.tagName !== "A") {
|
|
470
595
|
const parentElement = element.parentElement;
|
|
471
|
-
if (parentElement && parentElement.tagName ===
|
|
472
|
-
const parentHref = parentElement.getAttribute(
|
|
596
|
+
if (parentElement && parentElement.tagName === "A") {
|
|
597
|
+
const parentHref = parentElement.getAttribute("href");
|
|
473
598
|
if (parentHref) {
|
|
474
599
|
try {
|
|
475
600
|
return new URL(parentHref, baseURL).href;
|
|
@@ -481,12 +606,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
481
606
|
}
|
|
482
607
|
}
|
|
483
608
|
const attrValue = element.getAttribute(attribute);
|
|
484
|
-
const dataAttr = attrValue || element.getAttribute(
|
|
485
|
-
if (!dataAttr || dataAttr.trim() ===
|
|
486
|
-
if (attribute ===
|
|
609
|
+
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
|
|
610
|
+
if (!dataAttr || dataAttr.trim() === "") {
|
|
611
|
+
if (attribute === "src") {
|
|
487
612
|
const style = window.getComputedStyle(element);
|
|
488
613
|
const bgImage = style.backgroundImage;
|
|
489
|
-
if (bgImage && bgImage !==
|
|
614
|
+
if (bgImage && bgImage !== "none") {
|
|
490
615
|
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
|
491
616
|
return matches ? new URL(matches[1], baseURL).href : null;
|
|
492
617
|
}
|
|
@@ -497,14 +622,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
497
622
|
return new URL(dataAttr, baseURL).href;
|
|
498
623
|
}
|
|
499
624
|
catch (e) {
|
|
500
|
-
console.warn(
|
|
501
|
-
return dataAttr;
|
|
625
|
+
console.warn("Error creating URL from", dataAttr, e);
|
|
626
|
+
return dataAttr;
|
|
502
627
|
}
|
|
503
628
|
}
|
|
504
629
|
return element.getAttribute(attribute);
|
|
505
|
-
}
|
|
630
|
+
};
|
|
506
631
|
// Enhanced table ancestor finding with context support
|
|
507
|
-
|
|
632
|
+
const findTableAncestor = (element) => {
|
|
508
633
|
let currentElement = element;
|
|
509
634
|
const MAX_DEPTH = 5;
|
|
510
635
|
let depth = 0;
|
|
@@ -514,14 +639,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
514
639
|
currentElement = currentElement.getRootNode().host;
|
|
515
640
|
continue;
|
|
516
641
|
}
|
|
517
|
-
if (currentElement.tagName ===
|
|
518
|
-
return { type:
|
|
642
|
+
if (currentElement.tagName === "TD") {
|
|
643
|
+
return { type: "TD", element: currentElement };
|
|
519
644
|
}
|
|
520
|
-
else if (currentElement.tagName ===
|
|
521
|
-
return { type:
|
|
645
|
+
else if (currentElement.tagName === "TR") {
|
|
646
|
+
return { type: "TR", element: currentElement };
|
|
522
647
|
}
|
|
523
648
|
// Handle iframe and frame crossing
|
|
524
|
-
if (currentElement.tagName ===
|
|
649
|
+
if (currentElement.tagName === "IFRAME" ||
|
|
650
|
+
currentElement.tagName === "FRAME") {
|
|
525
651
|
try {
|
|
526
652
|
currentElement = currentElement.contentDocument.body;
|
|
527
653
|
}
|
|
@@ -535,23 +661,23 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
535
661
|
depth++;
|
|
536
662
|
}
|
|
537
663
|
return null;
|
|
538
|
-
}
|
|
664
|
+
};
|
|
539
665
|
// Helper function to get cell index
|
|
540
|
-
|
|
666
|
+
const getCellIndex = (td) => {
|
|
541
667
|
if (td.getRootNode() instanceof ShadowRoot) {
|
|
542
668
|
const shadowRoot = td.getRootNode();
|
|
543
|
-
const allCells = Array.from(shadowRoot.querySelectorAll(
|
|
669
|
+
const allCells = Array.from(shadowRoot.querySelectorAll("td"));
|
|
544
670
|
return allCells.indexOf(td);
|
|
545
671
|
}
|
|
546
672
|
let index = 0;
|
|
547
673
|
let sibling = td;
|
|
548
|
-
while (sibling = sibling.previousElementSibling) {
|
|
674
|
+
while ((sibling = sibling.previousElementSibling)) {
|
|
549
675
|
index++;
|
|
550
676
|
}
|
|
551
677
|
return index;
|
|
552
|
-
}
|
|
678
|
+
};
|
|
553
679
|
// Helper function to check for TH elements
|
|
554
|
-
|
|
680
|
+
const hasThElement = (row, tableFields) => {
|
|
555
681
|
for (const [_, { selector }] of Object.entries(tableFields)) {
|
|
556
682
|
const element = queryElement(row, selector);
|
|
557
683
|
if (element) {
|
|
@@ -561,9 +687,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
561
687
|
current = current.getRootNode().host;
|
|
562
688
|
continue;
|
|
563
689
|
}
|
|
564
|
-
if (current.tagName ===
|
|
690
|
+
if (current.tagName === "TH")
|
|
565
691
|
return true;
|
|
566
|
-
if (current.tagName ===
|
|
692
|
+
if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
|
|
567
693
|
try {
|
|
568
694
|
current = current.contentDocument.body;
|
|
569
695
|
}
|
|
@@ -578,32 +704,32 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
578
704
|
}
|
|
579
705
|
}
|
|
580
706
|
return false;
|
|
581
|
-
}
|
|
707
|
+
};
|
|
582
708
|
// Helper function to filter rows
|
|
583
|
-
|
|
709
|
+
const filterRowsBasedOnTag = (rows, tableFields) => {
|
|
584
710
|
for (const row of rows) {
|
|
585
711
|
if (hasThElement(row, tableFields)) {
|
|
586
712
|
return rows;
|
|
587
713
|
}
|
|
588
714
|
}
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
const
|
|
592
|
-
|
|
593
|
-
|
|
715
|
+
return rows.filter((row) => {
|
|
716
|
+
const directTH = row.getElementsByTagName("TH").length === 0;
|
|
717
|
+
const shadowTH = row.shadowRoot
|
|
718
|
+
? row.shadowRoot.querySelector("th") === null
|
|
719
|
+
: true;
|
|
594
720
|
return directTH && shadowTH;
|
|
595
721
|
});
|
|
596
|
-
}
|
|
722
|
+
};
|
|
597
723
|
// Class similarity comparison functions
|
|
598
|
-
|
|
724
|
+
const calculateClassSimilarity = (classList1, classList2) => {
|
|
599
725
|
const set1 = new Set(classList1);
|
|
600
726
|
const set2 = new Set(classList2);
|
|
601
|
-
const intersection = new Set([...set1].filter(x => set2.has(x)));
|
|
727
|
+
const intersection = new Set([...set1].filter((x) => set2.has(x)));
|
|
602
728
|
const union = new Set([...set1, ...set2]);
|
|
603
729
|
return intersection.size / union.size;
|
|
604
|
-
}
|
|
730
|
+
};
|
|
605
731
|
// Enhanced similar elements finding with context support
|
|
606
|
-
|
|
732
|
+
const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
|
|
607
733
|
const baseClasses = Array.from(baseElement.classList);
|
|
608
734
|
if (baseClasses.length === 0)
|
|
609
735
|
return [];
|
|
@@ -617,8 +743,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
617
743
|
}
|
|
618
744
|
// Get elements from iframes and frames
|
|
619
745
|
const frames = [
|
|
620
|
-
...Array.from(document.getElementsByTagName(
|
|
621
|
-
...Array.from(document.getElementsByTagName(
|
|
746
|
+
...Array.from(document.getElementsByTagName("iframe")),
|
|
747
|
+
...Array.from(document.getElementsByTagName("frame")),
|
|
622
748
|
];
|
|
623
749
|
for (const frame of frames) {
|
|
624
750
|
try {
|
|
@@ -629,16 +755,16 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
629
755
|
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
|
630
756
|
}
|
|
631
757
|
}
|
|
632
|
-
return allElements.filter(element => {
|
|
758
|
+
return allElements.filter((element) => {
|
|
633
759
|
if (element === baseElement)
|
|
634
760
|
return false;
|
|
635
761
|
const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
|
|
636
762
|
return similarity >= similarityThreshold;
|
|
637
763
|
});
|
|
638
|
-
}
|
|
639
|
-
|
|
764
|
+
};
|
|
765
|
+
const tryFallbackSelector = (rootElement, originalSelector) => {
|
|
640
766
|
let element = queryElement(rootElement, originalSelector);
|
|
641
|
-
if (!element && originalSelector.includes(
|
|
767
|
+
if (!element && originalSelector.includes("nth-child")) {
|
|
642
768
|
const match = originalSelector.match(/nth-child\((\d+)\)/);
|
|
643
769
|
if (match) {
|
|
644
770
|
const position = parseInt(match[1], 10);
|
|
@@ -649,38 +775,100 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
649
775
|
break;
|
|
650
776
|
}
|
|
651
777
|
if (!element) {
|
|
652
|
-
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/,
|
|
778
|
+
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, "");
|
|
653
779
|
element = queryElement(rootElement, baseSelector);
|
|
654
780
|
}
|
|
655
781
|
}
|
|
656
782
|
}
|
|
657
783
|
return element;
|
|
658
|
-
}
|
|
659
|
-
//
|
|
784
|
+
};
|
|
785
|
+
// Create indexed XPath for specific container instance
|
|
786
|
+
const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
|
|
787
|
+
// Check if the child selector contains the list selector pattern
|
|
788
|
+
if (childSelector.includes(listSelector.replace("//", ""))) {
|
|
789
|
+
// Replace the list selector part with indexed version
|
|
790
|
+
const listPattern = listSelector.replace("//", "");
|
|
791
|
+
const indexedListSelector = `(${listSelector})[${containerIndex}]`;
|
|
792
|
+
const indexedSelector = childSelector.replace(`//${listPattern}`, indexedListSelector);
|
|
793
|
+
return indexedSelector;
|
|
794
|
+
}
|
|
795
|
+
else {
|
|
796
|
+
// If pattern doesn't match, create a more generic indexed selector
|
|
797
|
+
return `(${listSelector})[${containerIndex}]${childSelector.replace("//", "/")}`;
|
|
798
|
+
}
|
|
799
|
+
};
|
|
800
|
+
// Main scraping logic with unified support for both CSS and XPath
|
|
801
|
+
console.log("🚀 Starting unified list data extraction");
|
|
802
|
+
console.log("List Selector:", listSelector);
|
|
803
|
+
console.log("Fields:", fields);
|
|
660
804
|
let containers = queryElementAll(document, listSelector);
|
|
661
805
|
containers = Array.from(containers);
|
|
662
|
-
if (containers.length === 0)
|
|
806
|
+
if (containers.length === 0) {
|
|
807
|
+
console.warn("❌ No containers found for listSelector:", listSelector);
|
|
663
808
|
return [];
|
|
664
|
-
|
|
809
|
+
}
|
|
810
|
+
console.log(`📦 Found ${containers.length} list containers`);
|
|
811
|
+
// For CSS selectors, try to find similar containers if needed
|
|
812
|
+
if (!isXPathSelector(listSelector) &&
|
|
813
|
+
limit > 1 &&
|
|
814
|
+
containers.length === 1) {
|
|
665
815
|
const baseContainer = containers[0];
|
|
666
816
|
const similarContainers = findSimilarElements(baseContainer);
|
|
667
817
|
if (similarContainers.length > 0) {
|
|
668
|
-
const newContainers = similarContainers.filter(container => !container.matches(listSelector));
|
|
818
|
+
const newContainers = similarContainers.filter((container) => !container.matches(listSelector));
|
|
669
819
|
containers = [...containers, ...newContainers];
|
|
670
820
|
}
|
|
671
821
|
}
|
|
672
822
|
const containerFields = containers.map(() => ({
|
|
673
823
|
tableFields: {},
|
|
674
|
-
nonTableFields: {}
|
|
824
|
+
nonTableFields: {},
|
|
675
825
|
}));
|
|
676
|
-
//
|
|
826
|
+
// For XPath selectors, use the new approach
|
|
827
|
+
if (isXPathSelector(listSelector)) {
|
|
828
|
+
const extractedData = [];
|
|
829
|
+
const containersToProcess = Math.min(containers.length, limit);
|
|
830
|
+
for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
|
|
831
|
+
const record = {};
|
|
832
|
+
for (const [label, field] of Object.entries(fields)) {
|
|
833
|
+
let element = null;
|
|
834
|
+
if (isXPathSelector(field.selector)) {
|
|
835
|
+
// Create indexed absolute XPath
|
|
836
|
+
const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
|
|
837
|
+
element = evaluateXPath(document, indexedSelector);
|
|
838
|
+
}
|
|
839
|
+
else {
|
|
840
|
+
// Fallback for CSS selectors within XPath containers
|
|
841
|
+
const container = containers[containerIndex];
|
|
842
|
+
element = queryElement(container, field.selector);
|
|
843
|
+
}
|
|
844
|
+
if (element) {
|
|
845
|
+
const value = extractValue(element, field.attribute);
|
|
846
|
+
if (value !== null && value !== "") {
|
|
847
|
+
record[label] = value;
|
|
848
|
+
}
|
|
849
|
+
else {
|
|
850
|
+
record[label] = "";
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
else {
|
|
854
|
+
record[label] = "";
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
if (Object.values(record).some((value) => value !== "")) {
|
|
858
|
+
extractedData.push(record);
|
|
859
|
+
}
|
|
860
|
+
}
|
|
861
|
+
console.log(`📊 Total records extracted: ${extractedData.length}`);
|
|
862
|
+
return extractedData;
|
|
863
|
+
}
|
|
864
|
+
// For CSS selectors, use the original table-aware approach
|
|
677
865
|
containers.forEach((container, containerIndex) => {
|
|
678
866
|
for (const [label, field] of Object.entries(fields)) {
|
|
679
867
|
const sampleElement = queryElement(container, field.selector);
|
|
680
868
|
if (sampleElement) {
|
|
681
869
|
const ancestor = findTableAncestor(sampleElement);
|
|
682
870
|
if (ancestor) {
|
|
683
|
-
containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type ===
|
|
871
|
+
containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1 });
|
|
684
872
|
}
|
|
685
873
|
else {
|
|
686
874
|
containerFields[containerIndex].nonTableFields[label] = field;
|
|
@@ -702,12 +890,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
702
890
|
const firstElement = queryElement(container, firstField.selector);
|
|
703
891
|
let tableContext = firstElement;
|
|
704
892
|
// Find table context including iframe, frame and shadow DOM
|
|
705
|
-
while (tableContext &&
|
|
893
|
+
while (tableContext &&
|
|
894
|
+
tableContext.tagName !== "TABLE" &&
|
|
895
|
+
tableContext !== container) {
|
|
706
896
|
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
|
707
897
|
tableContext = tableContext.getRootNode().host;
|
|
708
898
|
continue;
|
|
709
899
|
}
|
|
710
|
-
if (tableContext.tagName ===
|
|
900
|
+
if (tableContext.tagName === "IFRAME" ||
|
|
901
|
+
tableContext.tagName === "FRAME") {
|
|
711
902
|
try {
|
|
712
903
|
tableContext = tableContext.contentDocument.body;
|
|
713
904
|
}
|
|
@@ -723,16 +914,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
723
914
|
// Get rows from all contexts
|
|
724
915
|
const rows = [];
|
|
725
916
|
// Get rows from regular DOM
|
|
726
|
-
rows.push(...tableContext.getElementsByTagName(
|
|
917
|
+
rows.push(...tableContext.getElementsByTagName("TR"));
|
|
727
918
|
// Get rows from shadow DOM
|
|
728
919
|
if (tableContext.shadowRoot) {
|
|
729
|
-
rows.push(...tableContext.shadowRoot.getElementsByTagName(
|
|
920
|
+
rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
|
|
730
921
|
}
|
|
731
922
|
// Get rows from iframes and frames
|
|
732
|
-
if (tableContext.tagName ===
|
|
923
|
+
if (tableContext.tagName === "IFRAME" ||
|
|
924
|
+
tableContext.tagName === "FRAME") {
|
|
733
925
|
try {
|
|
734
|
-
const frameDoc = tableContext.contentDocument ||
|
|
735
|
-
|
|
926
|
+
const frameDoc = tableContext.contentDocument ||
|
|
927
|
+
tableContext.contentWindow.document;
|
|
928
|
+
rows.push(...frameDoc.getElementsByTagName("TR"));
|
|
736
929
|
}
|
|
737
930
|
catch (e) {
|
|
738
931
|
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
|
@@ -742,7 +935,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
742
935
|
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
|
|
743
936
|
const record = {};
|
|
744
937
|
const currentRow = processedRows[rowIndex];
|
|
745
|
-
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
|
|
938
|
+
for (const [label, { selector, attribute, cellIndex },] of Object.entries(tableFields)) {
|
|
746
939
|
let element = null;
|
|
747
940
|
if (cellIndex >= 0) {
|
|
748
941
|
// Get TD element considering both contexts
|
|
@@ -756,16 +949,21 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
756
949
|
}
|
|
757
950
|
if (td) {
|
|
758
951
|
element = queryElement(td, selector);
|
|
759
|
-
if (!element &&
|
|
952
|
+
if (!element &&
|
|
953
|
+
selector
|
|
954
|
+
.split(/(?:>>|:>>)/)
|
|
955
|
+
.pop()
|
|
956
|
+
.includes("td:nth-child")) {
|
|
760
957
|
element = td;
|
|
761
958
|
}
|
|
762
959
|
if (!element) {
|
|
763
|
-
const tagOnlySelector = selector.split(
|
|
960
|
+
const tagOnlySelector = selector.split(".")[0];
|
|
764
961
|
element = queryElement(td, tagOnlySelector);
|
|
765
962
|
}
|
|
766
963
|
if (!element) {
|
|
767
964
|
let currentElement = td;
|
|
768
|
-
while (currentElement &&
|
|
965
|
+
while (currentElement &&
|
|
966
|
+
currentElement.children.length > 0) {
|
|
769
967
|
let foundContentChild = false;
|
|
770
968
|
for (const child of currentElement.children) {
|
|
771
969
|
if (extractValue(child, attribute)) {
|
|
@@ -818,6 +1016,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
818
1016
|
}
|
|
819
1017
|
// Merge and limit the results
|
|
820
1018
|
const scrapedData = [...tableData, ...nonTableData];
|
|
1019
|
+
console.log(`📊 Total records extracted: ${scrapedData.length}`);
|
|
821
1020
|
return scrapedData;
|
|
822
1021
|
});
|
|
823
1022
|
};
|
package/build/interpret.d.ts
CHANGED
package/build/interpret.js
CHANGED
|
@@ -385,7 +385,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
385
385
|
yield this.options.serializableCallback([mergedResult]);
|
|
386
386
|
}),
|
|
387
387
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
388
|
-
var _f;
|
|
388
|
+
var _f, _g;
|
|
389
389
|
if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
|
|
390
390
|
this.options.debugChannel.setActionType('scrapeList');
|
|
391
391
|
}
|
|
@@ -394,6 +394,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
394
394
|
return;
|
|
395
395
|
}
|
|
396
396
|
yield this.ensureScriptsLoaded(page);
|
|
397
|
+
if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.incrementScrapeListIndex) {
|
|
398
|
+
this.options.debugChannel.incrementScrapeListIndex();
|
|
399
|
+
}
|
|
397
400
|
if (!config.pagination) {
|
|
398
401
|
const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
399
402
|
yield this.options.serializableCallback(scrapeResults);
|
|
@@ -404,8 +407,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
404
407
|
}
|
|
405
408
|
}),
|
|
406
409
|
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
407
|
-
var
|
|
408
|
-
if ((
|
|
410
|
+
var _h;
|
|
411
|
+
if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
|
|
409
412
|
this.options.debugChannel.setActionType('scrapeListAuto');
|
|
410
413
|
}
|
|
411
414
|
yield this.ensureScriptsLoaded(page);
|
|
@@ -415,8 +418,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
415
418
|
yield this.options.serializableCallback(scrapeResults);
|
|
416
419
|
}),
|
|
417
420
|
scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
|
|
418
|
-
var
|
|
419
|
-
if ((
|
|
421
|
+
var _j;
|
|
422
|
+
if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
|
|
420
423
|
this.options.debugChannel.setActionType('scroll');
|
|
421
424
|
}
|
|
422
425
|
yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -427,8 +430,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
427
430
|
}), pages !== null && pages !== void 0 ? pages : 1);
|
|
428
431
|
}),
|
|
429
432
|
script: (code) => __awaiter(this, void 0, void 0, function* () {
|
|
430
|
-
var
|
|
431
|
-
if ((
|
|
433
|
+
var _k;
|
|
434
|
+
if ((_k = this.options.debugChannel) === null || _k === void 0 ? void 0 : _k.setActionType) {
|
|
432
435
|
this.options.debugChannel.setActionType('script');
|
|
433
436
|
}
|
|
434
437
|
const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
|
|
@@ -530,6 +533,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
530
533
|
});
|
|
531
534
|
allResults = allResults.concat(newResults);
|
|
532
535
|
debugLog("Results collected:", allResults.length);
|
|
536
|
+
yield this.options.serializableCallback(allResults);
|
|
533
537
|
});
|
|
534
538
|
const checkLimit = () => {
|
|
535
539
|
if (config.limit && allResults.length >= config.limit) {
|
|
@@ -674,10 +678,47 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
674
678
|
}
|
|
675
679
|
let retryCount = 0;
|
|
676
680
|
let paginationSuccess = false;
|
|
677
|
-
// Capture basic content signature before click
|
|
681
|
+
// Capture basic content signature before click - with XPath support
|
|
678
682
|
const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
|
|
679
|
-
return yield page.evaluate((
|
|
680
|
-
const
|
|
683
|
+
return yield page.evaluate((listSelector) => {
|
|
684
|
+
const isXPath = (selector) => {
|
|
685
|
+
return selector.startsWith('//') || selector.startsWith('./') || selector.includes('::');
|
|
686
|
+
};
|
|
687
|
+
let items = [];
|
|
688
|
+
if (isXPath(listSelector)) {
|
|
689
|
+
try {
|
|
690
|
+
// Use XPath to find elements
|
|
691
|
+
const xpathResult = document.evaluate(listSelector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
692
|
+
items = [];
|
|
693
|
+
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
694
|
+
const node = xpathResult.snapshotItem(i);
|
|
695
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
696
|
+
items.push(node);
|
|
697
|
+
}
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
catch (xpathError) {
|
|
701
|
+
console.warn('XPath evaluation failed, trying CSS selector as fallback:', xpathError);
|
|
702
|
+
// Fallback to CSS selector
|
|
703
|
+
try {
|
|
704
|
+
items = document.querySelectorAll(listSelector);
|
|
705
|
+
}
|
|
706
|
+
catch (cssError) {
|
|
707
|
+
console.warn('CSS selector fallback also failed:', cssError);
|
|
708
|
+
items = [];
|
|
709
|
+
}
|
|
710
|
+
}
|
|
711
|
+
}
|
|
712
|
+
else {
|
|
713
|
+
try {
|
|
714
|
+
// Use CSS selector
|
|
715
|
+
items = document.querySelectorAll(listSelector);
|
|
716
|
+
}
|
|
717
|
+
catch (cssError) {
|
|
718
|
+
console.warn('CSS selector failed:', cssError);
|
|
719
|
+
items = [];
|
|
720
|
+
}
|
|
721
|
+
}
|
|
681
722
|
return {
|
|
682
723
|
url: window.location.href,
|
|
683
724
|
itemCount: items.length,
|