mx-cloud 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +162 -11
- package/build/interpret.js +16 -17
- package/package.json +1 -1
|
@@ -360,18 +360,169 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
360
360
|
window.scrapeList = function (_a) {
|
|
361
361
|
return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
|
|
362
362
|
// XPath evaluation functions
|
|
363
|
-
const
|
|
363
|
+
/**
 * Finds the elements under `context` that satisfy a single XPath step.
 *
 * The step is broken into a tag name and predicate list by `parseXPathPart`;
 * every element returned by `context.querySelectorAll(tagName)` is kept only
 * when `elementMatchesConditions` accepts it.
 *
 * @param {Document|Element|ShadowRoot} context - Node exposing `querySelectorAll`.
 * @param {string} part - One XPath step, e.g. `div[@class='card']`.
 * @returns {Element[]} Matching elements; an empty array when nothing matches
 *   or when any error is thrown (the error is logged, not rethrown).
 */
const queryInsideContext = (context, part) => {
    try {
        const { tagName, conditions } = parseXPathPart(part);
        return Array.from(context.querySelectorAll(tagName)).filter((candidate) => elementMatchesConditions(candidate, conditions));
    }
    catch (err) {
        console.error("Error in queryInsideContext:", err);
        return [];
    }
};
|
|
380
|
+
/**
 * Splits one XPath step into its tag name and the raw text of each predicate.
 *
 * The tag name is the leading run of letters, digits and hyphens; when the
 * step does not start with such a run (for example `*[...]`) the wildcard
 * `"*"` is used. Each `[...]` predicate is returned without its surrounding
 * brackets, in source order.
 *
 * Predicates are scanned quote-aware: a `]` inside a single- or double-quoted
 * string (e.g. `@name="items[0]"`) does not terminate the predicate.
 *
 * @param {string} part - One XPath step, e.g. `div[@class='card'][2]`.
 * @returns {{ tagName: string, conditions: string[] }}
 */
const parseXPathPart = (part) => {
    const tagMatch = part.match(/^([a-zA-Z0-9-]+)/);
    const tagName = tagMatch ? tagMatch[1] : "*";
    // Inside the brackets accept whole quoted strings or any character that is
    // neither a quote nor the closing bracket.
    const conditionMatches = part.match(/\[((?:"[^"]*"|'[^']*'|[^\]"'])+)\]/g);
    const conditions = conditionMatches
        ? conditionMatches.map((c) => c.slice(1, -1))
        : [];
    return { tagName, conditions };
};
|
|
390
|
+
/**
 * Reports whether `element` passes every predicate in `conditions`.
 *
 * Evaluation stops at the first predicate rejected by
 * `elementMatchesCondition`; an empty list is trivially satisfied.
 *
 * @param {Element} element - Candidate element.
 * @param {string[]} conditions - Raw predicate strings (without brackets).
 * @returns {boolean} `true` when no predicate rejects the element.
 */
const elementMatchesConditions = (element, conditions) => conditions.every((predicate) => elementMatchesCondition(element, predicate));
|
|
399
|
+
/**
 * Evaluates a single XPath predicate (the text between `[` and `]`) against
 * an element, for the small predicate subset this scraper understands:
 *
 *   - `N` (purely numeric)          -> always true; position is not evaluated here
 *   - `@attr="value"`               -> attribute equals value
 *   - `contains(@class, "name")`    -> `classList` contains the token
 *   - `contains(@attr, "value")`    -> attribute value contains substring
 *   - `text()="value"`              -> trimmed `textContent` equals value
 *   - `contains(text(), "value")`   -> trimmed `textContent` contains substring
 *   - `count(*)=N`                  -> element has exactly N child elements
 *
 * Quoted values may be empty and may contain the *other* quote character
 * (e.g. `"it's"`); previously such predicates were not recognised and fell
 * through to the permissive default, matching every element.
 *
 * Any predicate outside this subset (including compound `and`/`or`
 * expressions) is deliberately treated as satisfied so that unsupported
 * syntax does not discard candidates.
 *
 * @param {Element} element - Element to test.
 * @param {string} condition - Raw predicate text without the brackets.
 * @returns {boolean} Whether the element satisfies the predicate.
 */
const elementMatchesCondition = (element, condition) => {
    const expr = condition.trim();
    // Positional predicates cannot be decided for one element in isolation.
    if (/^\d+$/.test(expr)) {
        return true;
    }
    const trimmedText = () => (element.textContent || "").trim();
    // In every pattern below the value may hold anything except the quote
    // character that delimits it, so compound expressions never match here.
    // Handle @attribute="value"
    const attrMatch = expr.match(/^@([^=\s]+)\s*=\s*(["'])((?:(?!\2)[\s\S])*)\2$/);
    if (attrMatch) {
        return element.getAttribute(attrMatch[1]) === attrMatch[3];
    }
    // Handle contains(@class, 'value')
    // NOTE(review): this is a class-token test, whereas XPath contains() is a
    // substring test; kept as-is because callers may rely on it - confirm.
    const classContainsMatch = expr.match(/^contains\(\s*@class\s*,\s*(["'])((?:(?!\1)[\s\S])*)\1\s*\)$/);
    if (classContainsMatch) {
        const className = classContainsMatch[2];
        // An empty needle was accepted before (via the fallthrough); keep that.
        return className === "" || element.classList.contains(className);
    }
    // Handle contains(@attribute, 'value')
    const attrContainsMatch = expr.match(/^contains\(\s*@([^,\s]+)\s*,\s*(["'])((?:(?!\2)[\s\S])*)\2\s*\)$/);
    if (attrContainsMatch) {
        const elementValue = element.getAttribute(attrContainsMatch[1]) || "";
        return elementValue.includes(attrContainsMatch[3]);
    }
    // Handle text()="value"
    const textMatch = expr.match(/^text\(\)\s*=\s*(["'])((?:(?!\1)[\s\S])*)\1$/);
    if (textMatch) {
        return trimmedText() === textMatch[2];
    }
    // Handle contains(text(), 'value')
    const textContainsMatch = expr.match(/^contains\(\s*text\(\)\s*,\s*(["'])((?:(?!\1)[\s\S])*)\1\s*\)$/);
    if (textContainsMatch) {
        return trimmedText().includes(textContainsMatch[2]);
    }
    // Handle count(*)=N, including N = 0 (element has no child elements)
    const countMatch = expr.match(/^count\(\*\)\s*=\s*(\d+)$/);
    if (countMatch) {
        return element.children.length === parseInt(countMatch[1], 10);
    }
    // Unsupported predicate: stay permissive.
    return true;
};
|
|
452
|
+
/**
 * Resolves an XPath expression to a single node.
 *
 * Without `isShadow` the expression is handed to the native
 * `document.evaluate` and its first ordered node (or `null`) is returned.
 *
 * With `isShadow` the native evaluator is not used. The path is walked step
 * by step with `queryInsideContext`, and after every step the shadow root of
 * each matched element is added as an extra search context, so later steps
 * can match inside shadow trees. Every step searches all descendants of its
 * contexts (both `/` and `//` are treated alike).
 *
 * Shadow-mode details:
 *   - `(path)[n]rest` selects the n-th element matched by `path` first and
 *     only then evaluates `rest` beneath it (and its shadow root).
 *   - A trailing `[n]` on a step picks the n-th match per context, also when
 *     other predicates precede it (`div[@a='x'][2]`).
 *   - Slashes inside predicates or quoted strings do not split a step.
 *   - Only elements are returned or counted; shadow roots are search
 *     contexts, never results.
 *
 * @param {Document} document - Document to evaluate against.
 * @param {string} xpath - XPath expression.
 * @param {boolean} [isShadow=false] - Use the shadow-DOM-aware traversal.
 * @returns {Node|null} The matched node, or `null` when nothing matches or an
 *   error occurs (errors are logged, not rethrown).
 */
const evaluateXPath = (document, xpath, isShadow = false) => {
    // Splits a location path on "/" that is outside [...] and outside quotes.
    const splitSteps = (path) => {
        const steps = [];
        let current = "";
        let quote = null;
        let bracketDepth = 0;
        for (const ch of path) {
            if (quote !== null) {
                if (ch === quote) {
                    quote = null;
                }
            }
            else if (ch === '"' || ch === "'") {
                quote = ch;
            }
            else if (ch === "[") {
                bracketDepth++;
            }
            else if (ch === "]") {
                bracketDepth = Math.max(0, bracketDepth - 1);
            }
            else if (ch === "/" && bracketDepth === 0) {
                steps.push(current);
                current = "";
                continue;
            }
            current += ch;
        }
        steps.push(current);
        return steps.map((s) => s.trim()).filter((s) => s.length > 0);
    };
    // Each element followed by its shadow root, when it has one.
    const withShadowRoots = (elements) => {
        const contexts = [];
        elements.forEach((el) => {
            contexts.push(el);
            if (el.shadowRoot) {
                contexts.push(el.shadowRoot);
            }
        });
        return contexts;
    };
    // Walks every step of `path` from `startContexts`; returns the elements
    // matched by the final step (empty when any step matches nothing).
    const walk = (startContexts, path) => {
        let contexts = startContexts;
        let matchedElements = [];
        for (const step of splitSteps(path)) {
            const positionalMatch = step.match(/^(.+)\[(\d+)\]$/);
            const stepWithoutPosition = positionalMatch ? positionalMatch[1] : step;
            const requestedPosition = positionalMatch ? parseInt(positionalMatch[2], 10) : null;
            // A Set drops duplicates produced by nested contexts.
            const found = new Set();
            for (const ctx of contexts) {
                const matched = queryInsideContext(ctx, stepWithoutPosition);
                if (requestedPosition === null) {
                    matched.forEach((el) => found.add(el));
                }
                else if (requestedPosition >= 1 && requestedPosition <= matched.length) {
                    found.add(matched[requestedPosition - 1]); // XPath is 1-based, arrays are 0-based
                }
                else {
                    console.warn(`Position ${requestedPosition} out of range (${matched.length} elements found)`);
                }
            }
            if (found.size === 0) {
                return [];
            }
            matchedElements = Array.from(found);
            contexts = withShadowRoots(matchedElements);
        }
        return matchedElements;
    };
    try {
        if (!isShadow) {
            return document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        }
        const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/);
        if (!indexedMatch) {
            const matches = walk([document], xpath);
            return matches.length > 0 ? matches[0] : null;
        }
        const [, innerPath, indexText, remainder] = indexedMatch;
        const requestedIndex = parseInt(indexText, 10);
        const candidates = walk([document], innerPath);
        if (requestedIndex < 1 || requestedIndex > candidates.length) {
            console.warn(`Requested index ${requestedIndex} out of range (${candidates.length} elements found)`);
            return null;
        }
        const anchor = candidates[requestedIndex - 1];
        if (remainder.trim().length === 0) {
            return anchor;
        }
        const matches = walk(withShadowRoots([anchor]), remainder);
        return matches.length > 0 ? matches[0] : null;
    }
    catch (err) {
        console.error("Critical XPath failure:", xpath, err);
        return null;
    }
};
|
|
@@ -834,7 +985,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
834
985
|
if (isXPathSelector(field.selector)) {
|
|
835
986
|
// Create indexed absolute XPath
|
|
836
987
|
const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
|
|
837
|
-
element = evaluateXPath(document, indexedSelector);
|
|
988
|
+
element = evaluateXPath(document, indexedSelector, field.isShadow);
|
|
838
989
|
}
|
|
839
990
|
else {
|
|
840
991
|
// Fallback for CSS selectors within XPath containers
|
package/build/interpret.js
CHANGED
|
@@ -829,9 +829,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
829
829
|
if (checkLimit())
|
|
830
830
|
return allResults;
|
|
831
831
|
let loadMoreCounter = 0;
|
|
832
|
-
let previousResultCount = allResults.length;
|
|
833
|
-
let noNewItemsCounter = 0;
|
|
834
|
-
const MAX_NO_NEW_ITEMS = 2;
|
|
832
|
+
// let previousResultCount = allResults.length;
|
|
833
|
+
// let noNewItemsCounter = 0;
|
|
834
|
+
// const MAX_NO_NEW_ITEMS = 2;
|
|
835
835
|
while (true) {
|
|
836
836
|
// Find working button with retry mechanism
|
|
837
837
|
const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
@@ -888,20 +888,19 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
888
888
|
const heightChanged = currentHeight !== previousHeight;
|
|
889
889
|
previousHeight = currentHeight;
|
|
890
890
|
yield scrapeCurrentPage();
|
|
891
|
-
const currentResultCount = allResults.length;
|
|
892
|
-
const newItemsAdded = currentResultCount > previousResultCount;
|
|
893
|
-
if (!newItemsAdded) {
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
}
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
}
|
|
891
|
+
// const currentResultCount = allResults.length;
|
|
892
|
+
// const newItemsAdded = currentResultCount > previousResultCount;
|
|
893
|
+
// if (!newItemsAdded) {
|
|
894
|
+
// noNewItemsCounter++;
|
|
895
|
+
// debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
|
|
896
|
+
// if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
|
|
897
|
+
// debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
|
|
898
|
+
// return allResults;
|
|
899
|
+
// }
|
|
900
|
+
// } else {
|
|
901
|
+
// noNewItemsCounter = 0;
|
|
902
|
+
// previousResultCount = currentResultCount;
|
|
903
|
+
// }
|
|
905
904
|
if (checkLimit())
|
|
906
905
|
return allResults;
|
|
907
906
|
if (!heightChanged) {
|