mx-cloud 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +235 -341
- package/package.json +1 -1
|
@@ -359,10 +359,55 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
359
359
|
*/
|
|
360
360
|
window.scrapeList = function (_a) {
|
|
361
361
|
return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
|
|
362
|
-
|
|
362
|
+
var _b;
|
|
363
|
+
// XPath evaluation functions
|
|
364
|
+
const evaluateXPath = (rootElement, xpath) => {
|
|
365
|
+
try {
|
|
366
|
+
const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
|
|
367
|
+
? rootElement
|
|
368
|
+
: rootElement.ownerDocument;
|
|
369
|
+
if (!ownerDoc)
|
|
370
|
+
return null;
|
|
371
|
+
const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
|
|
372
|
+
return result.singleNodeValue;
|
|
373
|
+
}
|
|
374
|
+
catch (error) {
|
|
375
|
+
console.warn('XPath evaluation failed:', xpath, error);
|
|
376
|
+
return null;
|
|
377
|
+
}
|
|
378
|
+
};
|
|
379
|
+
const evaluateXPathAll = (rootElement, xpath) => {
|
|
380
|
+
try {
|
|
381
|
+
const ownerDoc = rootElement.nodeType === Node.DOCUMENT_NODE
|
|
382
|
+
? rootElement
|
|
383
|
+
: rootElement.ownerDocument;
|
|
384
|
+
if (!ownerDoc)
|
|
385
|
+
return [];
|
|
386
|
+
const result = ownerDoc.evaluate(xpath, rootElement, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
387
|
+
const elements = [];
|
|
388
|
+
for (let i = 0; i < result.snapshotLength; i++) {
|
|
389
|
+
const node = result.snapshotItem(i);
|
|
390
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
391
|
+
elements.push(node);
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
return elements;
|
|
395
|
+
}
|
|
396
|
+
catch (error) {
|
|
397
|
+
console.warn('XPath evaluation failed:', xpath, error);
|
|
398
|
+
return [];
|
|
399
|
+
}
|
|
400
|
+
};
|
|
401
|
+
// Enhanced query function to handle iframe, frame, shadow DOM, and XPath
|
|
363
402
|
const queryElement = (rootElement, selector) => {
|
|
364
403
|
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
|
365
|
-
|
|
404
|
+
// Check if it's an XPath selector
|
|
405
|
+
if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
|
|
406
|
+
return evaluateXPath(rootElement, selector);
|
|
407
|
+
}
|
|
408
|
+
else {
|
|
409
|
+
return rootElement.querySelector(selector);
|
|
410
|
+
}
|
|
366
411
|
}
|
|
367
412
|
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
|
368
413
|
let currentElement = rootElement;
|
|
@@ -373,7 +418,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
373
418
|
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
|
374
419
|
try {
|
|
375
420
|
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
|
376
|
-
|
|
421
|
+
if (!frameDoc)
|
|
422
|
+
return null;
|
|
423
|
+
if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
|
|
424
|
+
currentElement = evaluateXPath(frameDoc, parts[i]);
|
|
425
|
+
}
|
|
426
|
+
else {
|
|
427
|
+
currentElement = frameDoc.querySelector(parts[i]);
|
|
428
|
+
}
|
|
377
429
|
continue;
|
|
378
430
|
}
|
|
379
431
|
catch (e) {
|
|
@@ -381,18 +433,36 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
381
433
|
return null;
|
|
382
434
|
}
|
|
383
435
|
}
|
|
384
|
-
|
|
385
|
-
|
|
436
|
+
let nextElement = null;
|
|
437
|
+
if ('querySelector' in currentElement) {
|
|
438
|
+
// Handle XPath vs CSS selector
|
|
439
|
+
if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
|
|
440
|
+
nextElement = evaluateXPath(currentElement, parts[i]);
|
|
441
|
+
}
|
|
442
|
+
else {
|
|
443
|
+
nextElement = currentElement.querySelector(parts[i]);
|
|
444
|
+
}
|
|
445
|
+
}
|
|
386
446
|
// Try shadow DOM if not found
|
|
387
|
-
if (!nextElement && currentElement.shadowRoot) {
|
|
388
|
-
|
|
447
|
+
if (!nextElement && 'shadowRoot' in currentElement && currentElement.shadowRoot) {
|
|
448
|
+
if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
|
|
449
|
+
nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
|
|
450
|
+
}
|
|
451
|
+
else {
|
|
452
|
+
nextElement = currentElement.shadowRoot.querySelector(parts[i]);
|
|
453
|
+
}
|
|
389
454
|
}
|
|
390
455
|
// Check children's shadow roots if still not found
|
|
391
|
-
if (!nextElement) {
|
|
456
|
+
if (!nextElement && 'children' in currentElement) {
|
|
392
457
|
const children = Array.from(currentElement.children || []);
|
|
393
458
|
for (const child of children) {
|
|
394
459
|
if (child.shadowRoot) {
|
|
395
|
-
|
|
460
|
+
if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
|
|
461
|
+
nextElement = evaluateXPath(child.shadowRoot, parts[i]);
|
|
462
|
+
}
|
|
463
|
+
else {
|
|
464
|
+
nextElement = child.shadowRoot.querySelector(parts[i]);
|
|
465
|
+
}
|
|
396
466
|
if (nextElement)
|
|
397
467
|
break;
|
|
398
468
|
}
|
|
@@ -402,10 +472,16 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
402
472
|
}
|
|
403
473
|
return currentElement;
|
|
404
474
|
};
|
|
405
|
-
// Enhanced query all function for
|
|
475
|
+
// Enhanced query all function for XPath and CSS selectors
|
|
406
476
|
const queryElementAll = (rootElement, selector) => {
|
|
407
477
|
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
|
408
|
-
|
|
478
|
+
// Check if it's an XPath selector
|
|
479
|
+
if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
|
|
480
|
+
return evaluateXPathAll(rootElement, selector);
|
|
481
|
+
}
|
|
482
|
+
else {
|
|
483
|
+
return Array.from(rootElement.querySelectorAll(selector));
|
|
484
|
+
}
|
|
409
485
|
}
|
|
410
486
|
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
|
411
487
|
let currentElements = [rootElement];
|
|
@@ -416,7 +492,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
416
492
|
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
|
417
493
|
try {
|
|
418
494
|
const frameDoc = element.contentDocument || element.contentWindow.document;
|
|
419
|
-
|
|
495
|
+
if (frameDoc) {
|
|
496
|
+
if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
|
|
497
|
+
nextElements.push(...evaluateXPathAll(frameDoc, part));
|
|
498
|
+
}
|
|
499
|
+
else {
|
|
500
|
+
nextElements.push(...Array.from(frameDoc.querySelectorAll(part)));
|
|
501
|
+
}
|
|
502
|
+
}
|
|
420
503
|
}
|
|
421
504
|
catch (e) {
|
|
422
505
|
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
|
@@ -426,17 +509,32 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
426
509
|
else {
|
|
427
510
|
// Regular DOM elements
|
|
428
511
|
if (element.querySelectorAll) {
|
|
429
|
-
|
|
512
|
+
if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
|
|
513
|
+
nextElements.push(...evaluateXPathAll(element, part));
|
|
514
|
+
}
|
|
515
|
+
else {
|
|
516
|
+
nextElements.push(...Array.from(element.querySelectorAll(part)));
|
|
517
|
+
}
|
|
430
518
|
}
|
|
431
519
|
// Shadow DOM elements
|
|
432
520
|
if (element.shadowRoot) {
|
|
433
|
-
|
|
521
|
+
if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
|
|
522
|
+
nextElements.push(...evaluateXPathAll(element.shadowRoot, part));
|
|
523
|
+
}
|
|
524
|
+
else {
|
|
525
|
+
nextElements.push(...Array.from(element.shadowRoot.querySelectorAll(part)));
|
|
526
|
+
}
|
|
434
527
|
}
|
|
435
528
|
// Check children's shadow roots
|
|
436
529
|
const children = Array.from(element.children || []);
|
|
437
530
|
for (const child of children) {
|
|
438
531
|
if (child.shadowRoot) {
|
|
439
|
-
|
|
532
|
+
if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
|
|
533
|
+
nextElements.push(...evaluateXPathAll(child.shadowRoot, part));
|
|
534
|
+
}
|
|
535
|
+
else {
|
|
536
|
+
nextElements.push(...Array.from(child.shadowRoot.querySelectorAll(part)));
|
|
537
|
+
}
|
|
440
538
|
}
|
|
441
539
|
}
|
|
442
540
|
}
|
|
@@ -446,8 +544,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
446
544
|
return currentElements;
|
|
447
545
|
};
|
|
448
546
|
// Enhanced value extraction with context awareness
|
|
449
|
-
|
|
450
|
-
var _a, _b;
|
|
547
|
+
const extractValue = (element, attribute) => {
|
|
548
|
+
var _a, _b, _c, _d, _e, _f;
|
|
451
549
|
if (!element)
|
|
452
550
|
return null;
|
|
453
551
|
// Get context-aware base URL
|
|
@@ -460,36 +558,58 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
460
558
|
}
|
|
461
559
|
}
|
|
462
560
|
if (attribute === 'innerText') {
|
|
463
|
-
|
|
561
|
+
// First try standard innerText/textContent
|
|
562
|
+
let textContent = ((_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim()) || ((_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim());
|
|
563
|
+
// If empty, check for common data attributes that might contain the text
|
|
564
|
+
if (!textContent) {
|
|
565
|
+
const dataAttributes = [
|
|
566
|
+
'data-600',
|
|
567
|
+
'data-text',
|
|
568
|
+
'data-label',
|
|
569
|
+
'data-value',
|
|
570
|
+
'data-content',
|
|
571
|
+
];
|
|
572
|
+
for (const attr of dataAttributes) {
|
|
573
|
+
const dataValue = element.getAttribute(attr);
|
|
574
|
+
if (dataValue && dataValue.trim()) {
|
|
575
|
+
textContent = dataValue.trim();
|
|
576
|
+
break;
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
return textContent || null;
|
|
464
581
|
}
|
|
465
582
|
else if (attribute === 'innerHTML') {
|
|
466
|
-
return element.innerHTML.trim();
|
|
583
|
+
return ((_e = element.innerHTML) === null || _e === void 0 ? void 0 : _e.trim()) || null;
|
|
467
584
|
}
|
|
468
|
-
else if (attribute === '
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
585
|
+
else if (attribute === 'href') {
|
|
586
|
+
// For href, we need to find the anchor tag if the current element isn't one
|
|
587
|
+
let anchorElement = element;
|
|
588
|
+
// If current element is not an anchor, look for parent anchor
|
|
589
|
+
if (element.tagName !== 'A') {
|
|
590
|
+
anchorElement = element.closest('a') || ((_f = element.parentElement) === null || _f === void 0 ? void 0 : _f.closest('a')) || element;
|
|
591
|
+
}
|
|
592
|
+
const hrefValue = anchorElement.getAttribute('href');
|
|
593
|
+
if (!hrefValue || hrefValue.trim() === '') {
|
|
594
|
+
return null;
|
|
595
|
+
}
|
|
596
|
+
try {
|
|
597
|
+
return new URL(hrefValue, baseURL).href;
|
|
598
|
+
}
|
|
599
|
+
catch (e) {
|
|
600
|
+
console.warn('Error creating URL from', hrefValue, e);
|
|
601
|
+
return hrefValue;
|
|
482
602
|
}
|
|
603
|
+
}
|
|
604
|
+
else if (attribute === 'src') {
|
|
483
605
|
const attrValue = element.getAttribute(attribute);
|
|
484
606
|
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
|
485
607
|
if (!dataAttr || dataAttr.trim() === '') {
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
return matches ? new URL(matches[1], baseURL).href : null;
|
|
492
|
-
}
|
|
608
|
+
const style = window.getComputedStyle(element);
|
|
609
|
+
const bgImage = style.backgroundImage;
|
|
610
|
+
if (bgImage && bgImage !== 'none') {
|
|
611
|
+
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
|
612
|
+
return matches ? new URL(matches[1], baseURL).href : null;
|
|
493
613
|
}
|
|
494
614
|
return null;
|
|
495
615
|
}
|
|
@@ -498,327 +618,101 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
498
618
|
}
|
|
499
619
|
catch (e) {
|
|
500
620
|
console.warn('Error creating URL from', dataAttr, e);
|
|
501
|
-
return dataAttr;
|
|
621
|
+
return dataAttr;
|
|
502
622
|
}
|
|
503
623
|
}
|
|
504
624
|
return element.getAttribute(attribute);
|
|
505
|
-
}
|
|
506
|
-
//
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
}
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
}
|
|
520
|
-
else if (currentElement.tagName === 'TR') {
|
|
521
|
-
return { type: 'TR', element: currentElement };
|
|
522
|
-
}
|
|
523
|
-
// Handle iframe and frame crossing
|
|
524
|
-
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
|
525
|
-
try {
|
|
526
|
-
currentElement = currentElement.contentDocument.body;
|
|
527
|
-
}
|
|
528
|
-
catch (e) {
|
|
529
|
-
return null;
|
|
530
|
-
}
|
|
531
|
-
}
|
|
532
|
-
else {
|
|
533
|
-
currentElement = currentElement.parentElement;
|
|
534
|
-
}
|
|
535
|
-
depth++;
|
|
536
|
-
}
|
|
537
|
-
return null;
|
|
538
|
-
}
|
|
539
|
-
// Helper function to get cell index
|
|
540
|
-
function getCellIndex(td) {
|
|
541
|
-
if (td.getRootNode() instanceof ShadowRoot) {
|
|
542
|
-
const shadowRoot = td.getRootNode();
|
|
543
|
-
const allCells = Array.from(shadowRoot.querySelectorAll('td'));
|
|
544
|
-
return allCells.indexOf(td);
|
|
545
|
-
}
|
|
546
|
-
let index = 0;
|
|
547
|
-
let sibling = td;
|
|
548
|
-
while (sibling = sibling.previousElementSibling) {
|
|
549
|
-
index++;
|
|
550
|
-
}
|
|
551
|
-
return index;
|
|
552
|
-
}
|
|
553
|
-
// Helper function to check for TH elements
|
|
554
|
-
function hasThElement(row, tableFields) {
|
|
555
|
-
for (const [_, { selector }] of Object.entries(tableFields)) {
|
|
556
|
-
const element = queryElement(row, selector);
|
|
557
|
-
if (element) {
|
|
558
|
-
let current = element;
|
|
559
|
-
while (current && current !== row) {
|
|
560
|
-
if (current.getRootNode() instanceof ShadowRoot) {
|
|
561
|
-
current = current.getRootNode().host;
|
|
562
|
-
continue;
|
|
563
|
-
}
|
|
564
|
-
if (current.tagName === 'TH')
|
|
565
|
-
return true;
|
|
566
|
-
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
|
567
|
-
try {
|
|
568
|
-
current = current.contentDocument.body;
|
|
569
|
-
}
|
|
570
|
-
catch (e) {
|
|
571
|
-
break;
|
|
572
|
-
}
|
|
573
|
-
}
|
|
574
|
-
else {
|
|
575
|
-
current = current.parentElement;
|
|
576
|
-
}
|
|
577
|
-
}
|
|
578
|
-
}
|
|
579
|
-
}
|
|
580
|
-
return false;
|
|
581
|
-
}
|
|
582
|
-
// Helper function to filter rows
|
|
583
|
-
function filterRowsBasedOnTag(rows, tableFields) {
|
|
584
|
-
for (const row of rows) {
|
|
585
|
-
if (hasThElement(row, tableFields)) {
|
|
586
|
-
return rows;
|
|
587
|
-
}
|
|
588
|
-
}
|
|
589
|
-
// Include shadow DOM in TH search
|
|
590
|
-
return rows.filter(row => {
|
|
591
|
-
const directTH = row.getElementsByTagName('TH').length === 0;
|
|
592
|
-
const shadowTH = row.shadowRoot ?
|
|
593
|
-
row.shadowRoot.querySelector('th') === null : true;
|
|
594
|
-
return directTH && shadowTH;
|
|
595
|
-
});
|
|
596
|
-
}
|
|
597
|
-
// Class similarity comparison functions
|
|
598
|
-
function calculateClassSimilarity(classList1, classList2) {
|
|
599
|
-
const set1 = new Set(classList1);
|
|
600
|
-
const set2 = new Set(classList2);
|
|
601
|
-
const intersection = new Set([...set1].filter(x => set2.has(x)));
|
|
602
|
-
const union = new Set([...set1, ...set2]);
|
|
603
|
-
return intersection.size / union.size;
|
|
604
|
-
}
|
|
605
|
-
// Enhanced similar elements finding with context support
|
|
606
|
-
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
|
|
607
|
-
const baseClasses = Array.from(baseElement.classList);
|
|
608
|
-
if (baseClasses.length === 0)
|
|
609
|
-
return [];
|
|
610
|
-
const allElements = [];
|
|
611
|
-
// Get elements from main document
|
|
612
|
-
allElements.push(...document.getElementsByTagName(baseElement.tagName));
|
|
613
|
-
// Get elements from shadow DOM
|
|
614
|
-
if (baseElement.getRootNode() instanceof ShadowRoot) {
|
|
615
|
-
const shadowHost = baseElement.getRootNode().host;
|
|
616
|
-
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
|
625
|
+
};
|
|
626
|
+
// Create indexed XPath for specific container instance
|
|
627
|
+
const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
|
|
628
|
+
console.log(`Creating indexed XPath for container ${containerIndex}`);
|
|
629
|
+
console.log(`Child selector: ${childSelector}`);
|
|
630
|
+
console.log(`List selector: ${listSelector}`);
|
|
631
|
+
// Check if the child selector contains the list selector pattern
|
|
632
|
+
if (childSelector.includes(listSelector.replace('//', ''))) {
|
|
633
|
+
// Replace the list selector part with indexed version
|
|
634
|
+
const listPattern = listSelector.replace('//', '');
|
|
635
|
+
const indexedListSelector = `(${listSelector})[${containerIndex}]`;
|
|
636
|
+
const indexedSelector = childSelector.replace(`//${listPattern}`, indexedListSelector);
|
|
637
|
+
console.log(`Generated indexed selector: ${indexedSelector}`);
|
|
638
|
+
return indexedSelector;
|
|
617
639
|
}
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
];
|
|
623
|
-
for (const frame of frames) {
|
|
624
|
-
try {
|
|
625
|
-
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
|
626
|
-
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
|
627
|
-
}
|
|
628
|
-
catch (e) {
|
|
629
|
-
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
|
630
|
-
}
|
|
640
|
+
else {
|
|
641
|
+
// If pattern doesn't match, create a more generic indexed selector
|
|
642
|
+
console.warn(`Pattern doesn't match, using fallback approach`);
|
|
643
|
+
return `(${listSelector})[${containerIndex}]${childSelector.replace('//', '/')}`;
|
|
631
644
|
}
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
const match = originalSelector.match(/nth-child\((\d+)\)/);
|
|
643
|
-
if (match) {
|
|
644
|
-
const position = parseInt(match[1], 10);
|
|
645
|
-
for (let i = position - 1; i >= 1; i--) {
|
|
646
|
-
const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
|
|
647
|
-
element = queryElement(rootElement, fallbackSelector);
|
|
648
|
-
if (element)
|
|
649
|
-
break;
|
|
650
|
-
}
|
|
651
|
-
if (!element) {
|
|
652
|
-
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
|
|
653
|
-
element = queryElement(rootElement, baseSelector);
|
|
654
|
-
}
|
|
655
|
-
}
|
|
656
|
-
}
|
|
657
|
-
return element;
|
|
658
|
-
}
|
|
659
|
-
// Main scraping logic with context support
|
|
660
|
-
let containers = queryElementAll(document, listSelector);
|
|
661
|
-
containers = Array.from(containers);
|
|
662
|
-
if (containers.length === 0)
|
|
645
|
+
};
|
|
646
|
+
// Main scraping logic
|
|
647
|
+
console.log('🚀 Starting list data extraction');
|
|
648
|
+
console.log('List Selector:', listSelector);
|
|
649
|
+
console.log('Fields:', fields);
|
|
650
|
+
// Step 1: Get all container elements matching the list selector
|
|
651
|
+
const containers = queryElementAll(document, listSelector);
|
|
652
|
+
console.log(`📦 Found ${containers.length} list containers`);
|
|
653
|
+
if (containers.length === 0) {
|
|
654
|
+
console.warn('❌ No containers found for listSelector:', listSelector);
|
|
663
655
|
return [];
|
|
664
|
-
if (limit > 1 && containers.length === 1) {
|
|
665
|
-
const baseContainer = containers[0];
|
|
666
|
-
const similarContainers = findSimilarElements(baseContainer);
|
|
667
|
-
if (similarContainers.length > 0) {
|
|
668
|
-
const newContainers = similarContainers.filter(container => !container.matches(listSelector));
|
|
669
|
-
containers = [...containers, ...newContainers];
|
|
670
|
-
}
|
|
671
656
|
}
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
})
|
|
676
|
-
|
|
677
|
-
|
|
657
|
+
// Step 2: Extract data from each container up to the limit
|
|
658
|
+
const extractedData = [];
|
|
659
|
+
const containersToProcess = Math.min(containers.length, limit);
|
|
660
|
+
console.log(`🔄 Processing ${containersToProcess} containers...`);
|
|
661
|
+
for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
|
|
662
|
+
const container = containers[containerIndex];
|
|
663
|
+
const record = {};
|
|
664
|
+
console.log(`\n📋 Processing container ${containerIndex + 1}/${containersToProcess}`);
|
|
665
|
+
// Step 3: For each field, extract data from the current container
|
|
678
666
|
for (const [label, field] of Object.entries(fields)) {
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
667
|
+
console.log(`\n 🔍 Extracting field "${label}"`);
|
|
668
|
+
console.log(` Original selector: ${field.selector}`);
|
|
669
|
+
console.log(` Attribute: ${field.attribute}`);
|
|
670
|
+
let element = null;
|
|
671
|
+
// Handle XPath selectors with container indexing
|
|
672
|
+
if (field.selector.startsWith('//')) {
|
|
673
|
+
// Create indexed absolute XPath
|
|
674
|
+
const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
|
|
675
|
+
console.log(` 📍 Indexed selector: ${indexedSelector}`);
|
|
676
|
+
element = evaluateXPath(document, indexedSelector);
|
|
677
|
+
console.log(` 📍 Indexed XPath result: ${element ? 'FOUND' : 'NOT FOUND'}`);
|
|
678
|
+
if (element) {
|
|
679
|
+
console.log(` 📍 Found element text: "${(_b = element.textContent) === null || _b === void 0 ? void 0 : _b.trim()}"`);
|
|
687
680
|
}
|
|
688
681
|
}
|
|
689
682
|
else {
|
|
690
|
-
|
|
683
|
+
// Fallback for non-XPath selectors - search within container
|
|
684
|
+
element = queryElement(container, field.selector);
|
|
691
685
|
}
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
const container = containers[containerIndex];
|
|
699
|
-
const { tableFields } = containerFields[containerIndex];
|
|
700
|
-
if (Object.keys(tableFields).length > 0) {
|
|
701
|
-
const firstField = Object.values(tableFields)[0];
|
|
702
|
-
const firstElement = queryElement(container, firstField.selector);
|
|
703
|
-
let tableContext = firstElement;
|
|
704
|
-
// Find table context including iframe, frame and shadow DOM
|
|
705
|
-
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
|
706
|
-
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
|
707
|
-
tableContext = tableContext.getRootNode().host;
|
|
708
|
-
continue;
|
|
709
|
-
}
|
|
710
|
-
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
|
711
|
-
try {
|
|
712
|
-
tableContext = tableContext.contentDocument.body;
|
|
713
|
-
}
|
|
714
|
-
catch (e) {
|
|
715
|
-
break;
|
|
716
|
-
}
|
|
686
|
+
// Step 4: Extract the value from the found element
|
|
687
|
+
if (element) {
|
|
688
|
+
const value = extractValue(element, field.attribute);
|
|
689
|
+
if (value !== null && value !== '') {
|
|
690
|
+
record[label] = value;
|
|
691
|
+
console.log(` ✅ Extracted "${label}": "${value}"`);
|
|
717
692
|
}
|
|
718
693
|
else {
|
|
719
|
-
|
|
694
|
+
console.warn(` ⚠️ Empty value for "${label}"`);
|
|
695
|
+
record[label] = '';
|
|
720
696
|
}
|
|
721
697
|
}
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
// Get rows from regular DOM
|
|
726
|
-
rows.push(...tableContext.getElementsByTagName('TR'));
|
|
727
|
-
// Get rows from shadow DOM
|
|
728
|
-
if (tableContext.shadowRoot) {
|
|
729
|
-
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
|
730
|
-
}
|
|
731
|
-
// Get rows from iframes and frames
|
|
732
|
-
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
|
733
|
-
try {
|
|
734
|
-
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
|
735
|
-
rows.push(...frameDoc.getElementsByTagName('TR'));
|
|
736
|
-
}
|
|
737
|
-
catch (e) {
|
|
738
|
-
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
|
739
|
-
}
|
|
740
|
-
}
|
|
741
|
-
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
|
742
|
-
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
|
|
743
|
-
const record = {};
|
|
744
|
-
const currentRow = processedRows[rowIndex];
|
|
745
|
-
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
|
|
746
|
-
let element = null;
|
|
747
|
-
if (cellIndex >= 0) {
|
|
748
|
-
// Get TD element considering both contexts
|
|
749
|
-
let td = currentRow.children[cellIndex];
|
|
750
|
-
// Check shadow DOM for td
|
|
751
|
-
if (!td && currentRow.shadowRoot) {
|
|
752
|
-
const shadowCells = currentRow.shadowRoot.children;
|
|
753
|
-
if (shadowCells && shadowCells.length > cellIndex) {
|
|
754
|
-
td = shadowCells[cellIndex];
|
|
755
|
-
}
|
|
756
|
-
}
|
|
757
|
-
if (td) {
|
|
758
|
-
element = queryElement(td, selector);
|
|
759
|
-
if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
|
|
760
|
-
element = td;
|
|
761
|
-
}
|
|
762
|
-
if (!element) {
|
|
763
|
-
const tagOnlySelector = selector.split('.')[0];
|
|
764
|
-
element = queryElement(td, tagOnlySelector);
|
|
765
|
-
}
|
|
766
|
-
if (!element) {
|
|
767
|
-
let currentElement = td;
|
|
768
|
-
while (currentElement && currentElement.children.length > 0) {
|
|
769
|
-
let foundContentChild = false;
|
|
770
|
-
for (const child of currentElement.children) {
|
|
771
|
-
if (extractValue(child, attribute)) {
|
|
772
|
-
currentElement = child;
|
|
773
|
-
foundContentChild = true;
|
|
774
|
-
break;
|
|
775
|
-
}
|
|
776
|
-
}
|
|
777
|
-
if (!foundContentChild)
|
|
778
|
-
break;
|
|
779
|
-
}
|
|
780
|
-
element = currentElement;
|
|
781
|
-
}
|
|
782
|
-
}
|
|
783
|
-
}
|
|
784
|
-
else {
|
|
785
|
-
element = queryElement(currentRow, selector);
|
|
786
|
-
}
|
|
787
|
-
if (element) {
|
|
788
|
-
record[label] = extractValue(element, attribute);
|
|
789
|
-
}
|
|
790
|
-
}
|
|
791
|
-
if (Object.keys(record).length > 0) {
|
|
792
|
-
tableData.push(record);
|
|
793
|
-
}
|
|
794
|
-
}
|
|
698
|
+
else {
|
|
699
|
+
console.warn(` ❌ Element not found for "${label}"`);
|
|
700
|
+
record[label] = '';
|
|
795
701
|
}
|
|
796
702
|
}
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
if (Object.keys(nonTableFields).length > 0) {
|
|
805
|
-
const record = {};
|
|
806
|
-
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
|
807
|
-
// Get the last part of the selector after any context delimiter
|
|
808
|
-
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
809
|
-
const element = tryFallbackSelector(container, relativeSelector);
|
|
810
|
-
if (element) {
|
|
811
|
-
record[label] = extractValue(element, attribute);
|
|
812
|
-
}
|
|
813
|
-
}
|
|
814
|
-
if (Object.keys(record).length > 0) {
|
|
815
|
-
nonTableData.push(record);
|
|
816
|
-
}
|
|
703
|
+
// Step 5: Add record if it has any non-empty values
|
|
704
|
+
if (Object.values(record).some(value => value !== '')) {
|
|
705
|
+
extractedData.push(record);
|
|
706
|
+
console.log(` ✅ Added record ${containerIndex + 1}:`, record);
|
|
707
|
+
}
|
|
708
|
+
else {
|
|
709
|
+
console.warn(` ⚠️ Skipping empty record for container ${containerIndex + 1}`);
|
|
817
710
|
}
|
|
818
711
|
}
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
712
|
+
console.log('\n🎉 Extraction complete!');
|
|
713
|
+
console.log(`📊 Total records extracted: ${extractedData.length}`);
|
|
714
|
+
console.log('📋 All records:', extractedData);
|
|
715
|
+
return extractedData;
|
|
822
716
|
});
|
|
823
717
|
};
|
|
824
718
|
/**
|