mx-cloud 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -360,18 +360,169 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
360
360
  window.scrapeList = function (_a) {
361
361
  return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
362
362
  // XPath evaluation functions
363
- const evaluateXPath = (rootElement, xpath) => {
363
// Finds the descendants of `context` that satisfy one XPath step.
// `part` is a single step such as `div[@id="x"]`: the tag name is used as a
// CSS selector for querySelectorAll, and every bracketed predicate is then
// checked by elementMatchesConditions. Any failure (for example a context
// without querySelectorAll) is logged and reported as "no matches".
const queryInsideContext = (context, part) => {
    try {
        const step = parseXPathPart(part);
        const descendants = Array.from(context.querySelectorAll(step.tagName));
        // Filtering an empty list already yields an empty list, so no
        // separate early exit is required.
        return descendants.filter((node) => elementMatchesConditions(node, step.conditions));
    }
    catch (err) {
        console.error("Error in queryInsideContext:", err);
        return [];
    }
};
380
+ // Helper function to parse XPath part
381
// Splits a single XPath step (e.g. `div[@id="x"][2]`) into its tag name and
// the raw text of each bracketed predicate.
//
// Returns `{ tagName, conditions }`:
//   - tagName: the leading run of letters, digits and hyphens, or "*" when the
//     step does not start with such a name (e.g. `*[@id='x']`).
//   - conditions: predicate bodies without their surrounding brackets, in
//     source order.
//
// Quoted string literals inside a predicate are consumed as a unit, so a `]`
// inside a literal (`[@title="a]b"]`) no longer cuts the predicate short.
// A predicate whose quotes are unbalanced is not valid XPath and is skipped.
const parseXPathPart = (part) => {
    const tagMatch = part.match(/^([a-zA-Z0-9-]+)/);
    const tagName = tagMatch ? tagMatch[1] : "*";
    // Alternatives are mutually exclusive on their first character, so the
    // pattern cannot backtrack pathologically on long inputs.
    const predicates = part.match(/\[((?:"[^"]*"|'[^']*'|[^\]"'])+)\]/g) || [];
    const conditions = predicates.map((predicate) => predicate.slice(1, -1));
    return { tagName, conditions };
};
390
+ // Helper function to check if element matches all conditions
391
// True when `element` satisfies every predicate in `conditions`.
// An empty predicate list is trivially satisfied; evaluation stops at the
// first predicate that fails.
const elementMatchesConditions = (element, conditions) => {
    return conditions.every((predicate) => elementMatchesCondition(element, predicate));
};
399
+ // Helper function to check if element matches a single condition
400
// Checks one XPath predicate (the text between `[` and `]`) against `element`.
//
// Supported predicate forms:
//   N                         - bare position; always true here because
//                               positions are resolved by the caller
//   @attr="value"             - attribute equals the literal
//   contains(@attr, "value")  - attribute contains the literal as a substring
//   text()="value"            - trimmed textContent equals the literal
//   contains(text(), "value") - trimmed textContent contains the literal
//   count(*)=N                - element has exactly N element children
//
// Literals may use single or double quotes, may be empty, and may contain the
// other quote character. Any predicate that is not recognised is treated as
// satisfied (best-effort matching), exactly as before.
//
// Returns a boolean.
const elementMatchesCondition = (element, condition) => {
    const expr = condition.trim();
    // Picks whichever of the two quote alternatives (double / single) matched.
    const literal = (match, index) => (match[index] !== undefined ? match[index] : match[index + 1]);
    const textOf = () => (element.textContent || "").trim();

    if (/^\d+$/.test(expr)) {
        return true;
    }

    const attrEquals = expr.match(/^@([^=]+)=(?:"([^"]*)"|'([^']*)')$/);
    if (attrEquals) {
        return element.getAttribute(attrEquals[1].trim()) === literal(attrEquals, 2);
    }

    // XPath contains() is a substring test for every attribute, @class
    // included, which keeps this in line with native document.evaluate.
    const attrContains = expr.match(/^contains\(@([^,]+),\s*(?:"([^"]*)"|'([^']*)')\)$/);
    if (attrContains) {
        const actual = element.getAttribute(attrContains[1].trim()) || "";
        return actual.includes(literal(attrContains, 2));
    }

    const textEquals = expr.match(/^text\(\)=(?:"([^"]*)"|'([^']*)')$/);
    if (textEquals) {
        return textOf() === literal(textEquals, 1);
    }

    const textContains = expr.match(/^contains\(text\(\),\s*(?:"([^"]*)"|'([^']*)')\)$/);
    if (textContains) {
        return textOf().includes(literal(textContains, 1));
    }

    const childCount = expr.match(/^count\(\*\)=(\d+)$/);
    if (childCount) {
        return element.children.length === Number.parseInt(childCount[1], 10);
    }

    // Unknown predicate syntax: do not exclude the element.
    return true;
};
452
// Resolves `xpath` to a single node, or null when nothing matches or an
// error occurs (errors are logged, never thrown).
//
// Parameters:
//   document - root to search; must provide `evaluate` when isShadow is false
//   xpath    - selector such as `//div/span[2]` or `(//my-el)[3]/a`
//   isShadow - when false (default) the browser's native XPath engine is used;
//              when true the path is walked step by step with
//              queryInsideContext so that open shadow roots are searched too
//
// Shadow-mode notes:
//   - Each step is matched against all descendants of the current contexts
//     (queryInsideContext is descendant-based), not only direct children.
//   - A trailing `[N]` on a step picks the N-th match within each context.
//   - For `(path)[N]rest` the index N is applied to the final result list.
//   - Shadow roots are used only as search contexts for the next step; they
//     are never counted or returned as results, and each element is counted
//     once even if it is reachable from several contexts.
//   - The native engine is not invoked at all, since it cannot see into
//     shadow trees and its result would be discarded.
const evaluateXPath = (document, xpath, isShadow = false) => {
    // Splits a location path on "/" while ignoring slashes that appear inside
    // predicates or quoted literals (e.g. `a[@href='/x/y']`).
    const splitSteps = (path) => {
        const steps = [];
        let current = "";
        let depth = 0;
        let quote = null;
        for (const ch of path) {
            if (quote !== null) {
                if (ch === quote) {
                    quote = null;
                }
            }
            else if (ch === '"' || ch === "'") {
                quote = ch;
            }
            else if (ch === "[") {
                depth += 1;
            }
            else if (ch === "]") {
                depth = Math.max(0, depth - 1);
            }
            else if (ch === "/" && depth === 0) {
                steps.push(current);
                current = "";
                continue;
            }
            current += ch;
        }
        steps.push(current);
        return steps.map((step) => step.trim()).filter((step) => step.length > 0);
    };
    try {
        if (!isShadow) {
            return document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        }
        const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/);
        const cleanPath = indexedMatch ? indexedMatch[1] + indexedMatch[3] : xpath;
        const steps = splitSteps(cleanPath);
        // `contexts` are the nodes searched by the next step (elements plus
        // their shadow roots); `results` are the elements matched so far.
        let contexts = [document];
        let results = [document];
        for (const step of steps) {
            // A trailing [N] is positional even when other predicates precede it.
            const positional = step.match(/^(.+?)\[(\d+)\]$/);
            const stepSelector = positional ? positional[1] : step;
            const requestedPosition = positional ? Number.parseInt(positional[2], 10) : null;
            const unique = new Set();
            for (const ctx of contexts) {
                const matched = queryInsideContext(ctx, stepSelector);
                let selected = matched;
                if (requestedPosition !== null) {
                    const index = requestedPosition - 1; // XPath is 1-based, arrays are 0-based
                    if (index >= 0 && index < matched.length) {
                        selected = [matched[index]];
                    }
                    else {
                        console.warn(`Position ${requestedPosition} out of range (${matched.length} elements found)`);
                        selected = [];
                    }
                }
                selected.forEach((el) => unique.add(el));
            }
            if (unique.size === 0) {
                return null;
            }
            results = Array.from(unique);
            contexts = [];
            for (const el of results) {
                contexts.push(el);
                if (el.shadowRoot) {
                    contexts.push(el.shadowRoot);
                }
            }
        }
        if (indexedMatch) {
            const requestedIndex = Number.parseInt(indexedMatch[2], 10) - 1;
            if (requestedIndex >= 0 && requestedIndex < results.length) {
                return results[requestedIndex];
            }
            console.warn(`Requested index ${requestedIndex + 1} out of range (${results.length} elements found)`);
            return null;
        }
        return results[0];
    }
    catch (err) {
        console.error("Critical XPath failure:", xpath, err);
        return null;
    }
};
@@ -834,7 +985,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
834
985
  if (isXPathSelector(field.selector)) {
835
986
  // Create indexed absolute XPath
836
987
  const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
837
- element = evaluateXPath(document, indexedSelector);
988
+ element = evaluateXPath(document, indexedSelector, field.isShadow);
838
989
  }
839
990
  else {
840
991
  // Fallback for CSS selectors within XPath containers
@@ -829,9 +829,9 @@ class Interpreter extends events_1.EventEmitter {
829
829
  if (checkLimit())
830
830
  return allResults;
831
831
  let loadMoreCounter = 0;
832
- let previousResultCount = allResults.length;
833
- let noNewItemsCounter = 0;
834
- const MAX_NO_NEW_ITEMS = 2;
832
+ // let previousResultCount = allResults.length;
833
+ // let noNewItemsCounter = 0;
834
+ // const MAX_NO_NEW_ITEMS = 2;
835
835
  while (true) {
836
836
  // Find working button with retry mechanism
837
837
  const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
@@ -888,20 +888,19 @@ class Interpreter extends events_1.EventEmitter {
888
888
  const heightChanged = currentHeight !== previousHeight;
889
889
  previousHeight = currentHeight;
890
890
  yield scrapeCurrentPage();
891
- const currentResultCount = allResults.length;
892
- const newItemsAdded = currentResultCount > previousResultCount;
893
- if (!newItemsAdded) {
894
- noNewItemsCounter++;
895
- debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
896
- if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
897
- debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
898
- return allResults;
899
- }
900
- }
901
- else {
902
- noNewItemsCounter = 0;
903
- previousResultCount = currentResultCount;
904
- }
891
+ // const currentResultCount = allResults.length;
892
+ // const newItemsAdded = currentResultCount > previousResultCount;
893
+ // if (!newItemsAdded) {
894
+ // noNewItemsCounter++;
895
+ // debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
896
+ // if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
897
+ // debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
898
+ // return allResults;
899
+ // }
900
+ // } else {
901
+ // noNewItemsCounter = 0;
902
+ // previousResultCount = currentResultCount;
903
+ // }
905
904
  if (checkLimit())
906
905
  return allResults;
907
906
  if (!heightChanged) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.12",
3
+ "version": "0.0.13",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",