mx-cloud 0.0.12 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +162 -11
- package/build/interpret.d.ts +40 -0
- package/build/interpret.js +317 -24
- package/build/preprocessor.js +20 -7
- package/build/selector.d.ts +27 -0
- package/build/selector.js +485 -1
- package/build/types/workflow.d.ts +2 -0
- package/package.json +1 -1
|
@@ -360,18 +360,169 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
360
360
|
window.scrapeList = function (_a) {
|
|
361
361
|
return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
|
|
362
362
|
// XPath evaluation functions
|
|
363
|
-
const
|
|
363
|
+
/**
 * Resolve a single XPath step against one context node using DOM APIs only.
 *
 * The step is parsed into a tag name plus predicate strings; every descendant
 * of `context` with that tag is collected through `querySelectorAll` and kept
 * only when it satisfies all predicates.
 *
 * @param {object} context - Node exposing `querySelectorAll`.
 * @param {string} part - One XPath step, e.g. `div[@class="item"]`.
 * @returns {Array} Matching elements; an empty array when nothing matches or
 *   when any error is thrown while querying (the error is logged).
 */
const queryInsideContext = (context, part) => {
    try {
        const parsed = parseXPathPart(part);
        const descendants = context.querySelectorAll(parsed.tagName);
        return Array.from(descendants).filter((node) => elementMatchesConditions(node, parsed.conditions));
    }
    catch (err) {
        console.error("Error in queryInsideContext:", err);
        return [];
    }
};
|
|
380
|
+
/**
 * Split one XPath step into its tag name and its predicate strings.
 *
 * Predicates are the top-level `[...]` groups of the step. Brackets that
 * appear inside a quoted value (e.g. `[@title='a]b']`) or nested inside
 * another predicate do not terminate the group.
 *
 * @param {string} part - One XPath step, e.g. `div[@class="x"][2]`.
 * @returns {{tagName: string, conditions: string[]}} `tagName` is `"*"` when
 *   the step does not start with a name; `conditions` holds each predicate
 *   without its surrounding brackets (empty `[]` groups are dropped).
 */
const parseXPathPart = (part) => {
    // Underscore is included so names such as `my_card-item` are not truncated.
    const tagMatch = part.match(/^([a-zA-Z0-9_-]+)/);
    const tagName = tagMatch ? tagMatch[1] : "*";
    const conditions = [];
    let depth = 0;
    let quote = null;
    let start = -1;
    for (let i = 0; i < part.length; i++) {
        const ch = part[i];
        if (quote !== null) {
            // Inside a quoted value: only the matching quote is significant.
            if (ch === quote) {
                quote = null;
            }
            continue;
        }
        if (depth > 0 && (ch === '"' || ch === "'")) {
            quote = ch;
        }
        else if (ch === "[") {
            if (depth === 0) {
                start = i + 1;
            }
            depth++;
        }
        else if (ch === "]" && depth > 0) {
            depth--;
            if (depth === 0 && i > start) {
                conditions.push(part.slice(start, i));
            }
        }
    }
    return { tagName, conditions };
};
|
|
390
|
+
/**
 * Check an element against a list of XPath predicate strings.
 *
 * @param {object} element - Candidate element.
 * @param {Iterable<string>} conditions - Predicates without their brackets.
 * @returns {boolean} `true` when every predicate matches (also for an empty
 *   list); evaluation stops at the first predicate that fails.
 */
const elementMatchesConditions = (element, conditions) => Array.from(conditions).every((predicate) => elementMatchesCondition(element, predicate));
|
|
399
|
+
/**
 * Evaluate one XPath predicate against an element without `document.evaluate`.
 *
 * Supported predicate forms:
 *   - `N`                         positional; always accepted here
 *   - `@attr="value"`             exact attribute match (empty value allowed)
 *   - `contains(@attr, "value")`  substring match on the attribute, including
 *                                 `@class`, as XPath `contains()` specifies
 *   - `text()="value"`            trimmed `textContent` equals the value
 *   - `contains(text(), "value")` trimmed `textContent` includes the value
 *   - `count(*)=N`                element has exactly N element children
 *
 * Any other predicate (for example one combining terms with `and` / `or`) is
 * not understood and is treated as matching, so unsupported syntax widens the
 * result instead of emptying it.
 *
 * @param {object} element - Object exposing `getAttribute`, `textContent`
 *   and `children`.
 * @param {string} condition - Predicate text without the surrounding brackets.
 * @returns {boolean} Whether the element satisfies the predicate.
 */
const elementMatchesCondition = (element, condition) => {
    const predicate = condition.trim();
    // A quoted literal in either quote style; exactly one capture group is set.
    const quoted = `(?:"([^"]*)"|'([^']*)')`;
    const literal = (match, index) => (match[index] !== undefined ? match[index] : match[index + 1]);
    const trimmedText = () => (element.textContent || "").trim();
    if (/^\d+$/.test(predicate)) {
        return true;
    }
    // Handle @attribute="value"
    const attrMatch = predicate.match(new RegExp(`^@([^=\\s]+)\\s*=\\s*${quoted}$`));
    if (attrMatch) {
        return element.getAttribute(attrMatch[1]) === literal(attrMatch, 2);
    }
    // Handle contains(@attribute, 'value'); @class is a plain substring test too
    const attrContainsMatch = predicate.match(new RegExp(`^contains\\(\\s*@([^,\\s]+)\\s*,\\s*${quoted}\\s*\\)$`));
    if (attrContainsMatch) {
        const elementValue = element.getAttribute(attrContainsMatch[1]) || "";
        return elementValue.includes(literal(attrContainsMatch, 2));
    }
    // Handle text()="value"
    const textMatch = predicate.match(new RegExp(`^text\\(\\)\\s*=\\s*${quoted}$`));
    if (textMatch) {
        return trimmedText() === literal(textMatch, 1);
    }
    // Handle contains(text(), 'value')
    const textContainsMatch = predicate.match(new RegExp(`^contains\\(\\s*text\\(\\)\\s*,\\s*${quoted}\\s*\\)$`));
    if (textContainsMatch) {
        return trimmedText().includes(literal(textContainsMatch, 1));
    }
    // Handle count(*)=N (N = 0 means the element has no element children)
    const countMatch = predicate.match(/^count\(\*\)\s*=\s*(\d+)$/);
    if (countMatch) {
        return element.children.length === parseInt(countMatch[1], 10);
    }
    // Unsupported predicate: accept rather than reject (see doc comment).
    return true;
};
|
|
452
|
+
/**
 * Resolve an XPath expression to a single node.
 *
 * Without `isShadow` the result of the native `document.evaluate` is returned
 * as-is (first node in document order, or `null`).
 *
 * With `isShadow` the expression is walked step by step through
 * `queryInsideContext`, descending into the `shadowRoot` of every matched
 * element. Each step is treated as a descendant search. Supported shapes:
 *   - `//a/b[@x="y"]/c`   plain step chains; `/` inside a predicate or a
 *                          quoted value does not split a step
 *   - `tag[...][n]`       a trailing positional predicate picks the n-th match
 *                          of that step within each context (1-based)
 *   - `(path)[n]/rest`    the n-th element matched by `path` is selected
 *                          first, then `rest` is resolved inside that element
 *
 * @param {object} document - Document-like root exposing `evaluate`.
 * @param {string} xpath - XPath expression.
 * @param {boolean} [isShadow=false] - Use the manual shadow-DOM aware walk.
 * @returns {object|null} The resolved node, or `null` when nothing matches,
 *   an index is out of range, or any error is thrown (the error is logged).
 */
const evaluateXPath = (document, xpath, isShadow = false) => {
    // Split a path on "/" that is outside predicates and quoted values.
    const splitSteps = (path) => {
        const steps = [];
        let current = "";
        let depth = 0;
        let quote = null;
        for (const ch of path) {
            if (quote !== null) {
                if (ch === quote) {
                    quote = null;
                }
                current += ch;
                continue;
            }
            if (ch === '"' || ch === "'") {
                quote = ch;
                current += ch;
                continue;
            }
            if (ch === "[") {
                depth++;
            }
            else if (ch === "]" && depth > 0) {
                depth--;
            }
            if (ch === "/" && depth === 0) {
                steps.push(current);
                current = "";
            }
            else {
                current += ch;
            }
        }
        steps.push(current);
        return steps.map((s) => s.trim()).filter((s) => s.length > 0);
    };
    try {
        // Evaluated in both modes, as before; NOTE(review): in shadow mode the
        // value is unused, the call presumably only surfaces malformed XPath.
        const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        if (!isShadow) {
            return result;
        }
        const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/);
        let steps;
        let groupIndex = null; // 0-based index requested for the (…) group
        let groupLastStep = -1; // step after which the group index applies
        if (indexedMatch) {
            const groupSteps = splitSteps(indexedMatch[1]);
            steps = groupSteps.concat(splitSteps(indexedMatch[3]));
            groupIndex = parseInt(indexedMatch[2], 10) - 1;
            groupLastStep = groupSteps.length > 0 ? groupSteps.length - 1 : steps.length - 1;
        }
        else {
            steps = splitSteps(xpath);
        }
        if (steps.length === 0) {
            // Nothing to walk: the path denotes the root itself.
            return groupIndex === null || groupIndex === 0 ? document : null;
        }
        let contexts = [document];
        let matched = [];
        for (let i = 0; i < steps.length; i++) {
            const step = steps[i];
            const positionalMatch = step.match(/^(.+)\[(\d+)\]$/);
            const stepWithoutPosition = positionalMatch ? positionalMatch[1] : step;
            const requestedPosition = positionalMatch ? parseInt(positionalMatch[2], 10) : null;
            const seen = new Set();
            matched = [];
            for (const ctx of contexts) {
                let found = queryInsideContext(ctx, stepWithoutPosition);
                if (requestedPosition !== null) {
                    const index = requestedPosition - 1; // XPath is 1-based, arrays are 0-based
                    if (index >= 0 && index < found.length) {
                        found = [found[index]];
                    }
                    else {
                        console.warn(`Position ${requestedPosition} out of range (${found.length} elements found)`);
                        found = [];
                    }
                }
                // Nested contexts can yield the same element twice; keep the first.
                for (const el of found) {
                    if (!seen.has(el)) {
                        seen.add(el);
                        matched.push(el);
                    }
                }
            }
            if (i === groupLastStep) {
                // Apply the (…)[n] index to the group's own elements, before the
                // remaining steps, so later steps stay inside the chosen element.
                const picked = matched[groupIndex];
                if (picked === undefined) {
                    console.warn(`Requested index ${groupIndex + 1} out of range (${matched.length} elements found)`);
                    return null;
                }
                matched = [picked];
            }
            if (matched.length === 0) {
                return null;
            }
            contexts = [];
            for (const el of matched) {
                contexts.push(el);
                if (el.shadowRoot) {
                    contexts.push(el.shadowRoot);
                }
            }
        }
        return matched[0];
    }
    catch (err) {
        console.error("Critical XPath failure:", xpath, err);
        return null;
    }
};
|
|
@@ -834,7 +985,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
834
985
|
if (isXPathSelector(field.selector)) {
|
|
835
986
|
// Create indexed absolute XPath
|
|
836
987
|
const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
|
|
837
|
-
element = evaluateXPath(document, indexedSelector);
|
|
988
|
+
element = evaluateXPath(document, indexedSelector, field.isShadow);
|
|
838
989
|
}
|
|
839
990
|
else {
|
|
840
991
|
// Fallback for CSS selectors within XPath containers
|
package/build/interpret.d.ts
CHANGED
|
@@ -11,12 +11,14 @@ declare global {
|
|
|
11
11
|
selector: string;
|
|
12
12
|
tag: string;
|
|
13
13
|
attribute: string;
|
|
14
|
+
listFallbackSelector?: string;
|
|
14
15
|
}>) => Record<string, any>;
|
|
15
16
|
scrapeList: (config: {
|
|
16
17
|
listSelector: string;
|
|
17
18
|
fields: any;
|
|
18
19
|
limit?: number;
|
|
19
20
|
pagination: any;
|
|
21
|
+
listFallbackSelector?: string;
|
|
20
22
|
}) => Record<string, any>[];
|
|
21
23
|
scrapeListAuto: (listSelector: string) => {
|
|
22
24
|
selector: string;
|
|
@@ -95,6 +97,44 @@ export default class Interpreter extends EventEmitter {
|
|
|
95
97
|
private generatePageNodeInformation;
|
|
96
98
|
private detectElementChanges;
|
|
97
99
|
private validateWorkflowAction;
|
|
100
|
+
/**
|
|
101
|
+
* Test if a selector is working on the current page
|
|
102
|
+
* @param {Page} page - Playwright page object
|
|
103
|
+
* @param {string} selector - Selector to test
|
|
104
|
+
* @param {boolean} isListSelector - Whether this should find multiple elements
|
|
105
|
+
* @returns {Promise<boolean>} - Whether the selector works
|
|
106
|
+
*/
|
|
107
|
+
private testSelectorWorks;
|
|
108
|
+
/**
|
|
109
|
+
* Generate new selector from fallback selector
|
|
110
|
+
* @param {Page} page - Playwright page object
|
|
111
|
+
* @param {string} fallbackSelector - Fallback selector to use
|
|
112
|
+
* @param {boolean} isListSelector - Whether this is a list selector
|
|
113
|
+
* @param {string} listContext - List selector context for field selectors
|
|
114
|
+
* @returns {Promise<string|null>} - New selector or null if failed
|
|
115
|
+
*/
|
|
116
|
+
private generateSelectorFromFallback;
|
|
117
|
+
/**
|
|
118
|
+
* Validate and fix scrapeList action selectors
|
|
119
|
+
* @param {Object} scrapeListConfig - ScrapeList configuration object
|
|
120
|
+
* @param {Page} page - Playwright page object
|
|
121
|
+
* @returns {Promise<boolean>} - Whether any changes were made
|
|
122
|
+
*/
|
|
123
|
+
private validateScrapeListAction;
|
|
124
|
+
/**
|
|
125
|
+
* Validate and fix scrapeSchema action selectors
|
|
126
|
+
* @param {Object} scrapeSchemaConfig - ScrapeSchema configuration object
|
|
127
|
+
* @param {Page} page - Playwright page object
|
|
128
|
+
* @returns {Promise<boolean>} - Whether any changes were made
|
|
129
|
+
*/
|
|
130
|
+
private validateScrapeSchemaAction;
|
|
131
|
+
/**
|
|
132
|
+
* Validate and fix selectors for a workflow action just before execution
|
|
133
|
+
* @param {Page} page - Playwright page object
|
|
134
|
+
* @param {WhereWhatPair} action - The action to validate
|
|
135
|
+
* @returns {Promise<WhereWhatPair>} - The potentially modified action
|
|
136
|
+
*/
|
|
137
|
+
private validateAndFixSelectors;
|
|
98
138
|
private runLoop;
|
|
99
139
|
private ensureScriptsLoaded;
|
|
100
140
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -559,7 +559,43 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
559
559
|
}
|
|
560
560
|
return false;
|
|
561
561
|
};
|
|
562
|
-
//
|
|
562
|
+
/**
 * Heuristically decide whether a selector string is XPath rather than CSS.
 *
 * A selector is treated as XPath when, after trimming, it starts with `/`,
 * `./`, `..` or an opening parenthesis followed by a path (`(//li)[2]`), or
 * when it contains XPath-only constructs (`contains(@`, `[count(`, `@class=`,
 * `@id=`, ` and `, ` or `).
 *
 * NOTE(review): the ` and ` / ` or ` checks also fire for CSS selectors whose
 * quoted attribute values contain those words; kept for backward
 * compatibility.
 *
 * @param {*} selector - Selector to classify.
 * @returns {boolean} `true` for XPath; `false` for CSS and for non-strings.
 */
const isXPathSelector = (selector) => {
    if (typeof selector !== 'string') {
        return false;
    }
    const candidate = selector.trim();
    return candidate.startsWith('/') || // also covers '//'
        candidate.startsWith('./') ||
        candidate.startsWith('..') ||
        /^\(+\s*[./]/.test(candidate) || // grouped / indexed paths
        candidate.includes('contains(@') ||
        candidate.includes('[count(') ||
        candidate.includes('@class=') ||
        candidate.includes('@id=') ||
        candidate.includes(' and ') ||
        candidate.includes(' or ');
};
|
|
574
|
+
/**
 * Wait until a CSS or XPath selector is attached to the page.
 *
 * XPath selectors (as classified by `isXPathSelector`) go through
 * `page.locator('xpath=…').first()`, so several matches resolve to the first
 * one instead of failing, which mirrors what `page.waitForSelector` does on
 * the CSS branch.
 *
 * @param {string} selector - CSS or XPath selector.
 * @param {{timeout?: number}} [options] - `timeout` in milliseconds; defaults
 *   to 10000 when omitted. An explicit `0` is passed through unchanged.
 * @returns {Promise<object|null>} Handle of the first matching element, or
 *   `null` when waiting fails for any reason (the error is deliberately not
 *   propagated so callers can move on to the next selector).
 */
const waitForSelectorUniversal = (selector_2, ...args_1) => __awaiter(this, [selector_2, ...args_1], void 0, function* (selector, options = {}) {
    try {
        // Only fall back when no timeout was supplied; `||` would discard 0.
        const timeout = options.timeout !== undefined && options.timeout !== null
            ? options.timeout
            : 10000;
        if (isXPathSelector(selector)) {
            // Use XPath locator, narrowed to the first match
            const locator = page.locator(`xpath=${selector}`).first();
            yield locator.waitFor({
                state: 'attached',
                timeout
            });
            return yield locator.elementHandle();
        }
        // Use CSS selector
        return yield page.waitForSelector(selector, {
            state: 'attached',
            timeout
        });
    }
    catch (error) {
        return null;
    }
});
|
|
598
|
+
// Enhanced button finder with retry mechanism for both CSS and XPath selectors
|
|
563
599
|
const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
|
|
564
600
|
let updatedSelectors = [...selectors];
|
|
565
601
|
for (let i = 0; i < selectors.length; i++) {
|
|
@@ -568,10 +604,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
568
604
|
let selectorSuccess = false;
|
|
569
605
|
while (retryCount < MAX_RETRIES && !selectorSuccess) {
|
|
570
606
|
try {
|
|
571
|
-
const button = yield
|
|
572
|
-
state: 'attached',
|
|
573
|
-
timeout: 10000
|
|
574
|
-
});
|
|
607
|
+
const button = yield waitForSelectorUniversal(selector, { timeout: 10000 });
|
|
575
608
|
if (button) {
|
|
576
609
|
debugLog('Found working selector:', selector);
|
|
577
610
|
return {
|
|
@@ -829,9 +862,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
829
862
|
if (checkLimit())
|
|
830
863
|
return allResults;
|
|
831
864
|
let loadMoreCounter = 0;
|
|
832
|
-
let previousResultCount = allResults.length;
|
|
833
|
-
let noNewItemsCounter = 0;
|
|
834
|
-
const MAX_NO_NEW_ITEMS = 2;
|
|
865
|
+
// let previousResultCount = allResults.length;
|
|
866
|
+
// let noNewItemsCounter = 0;
|
|
867
|
+
// const MAX_NO_NEW_ITEMS = 2;
|
|
835
868
|
while (true) {
|
|
836
869
|
// Find working button with retry mechanism
|
|
837
870
|
const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
@@ -888,20 +921,19 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
888
921
|
const heightChanged = currentHeight !== previousHeight;
|
|
889
922
|
previousHeight = currentHeight;
|
|
890
923
|
yield scrapeCurrentPage();
|
|
891
|
-
const currentResultCount = allResults.length;
|
|
892
|
-
const newItemsAdded = currentResultCount > previousResultCount;
|
|
893
|
-
if (!newItemsAdded) {
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
}
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
}
|
|
924
|
+
// const currentResultCount = allResults.length;
|
|
925
|
+
// const newItemsAdded = currentResultCount > previousResultCount;
|
|
926
|
+
// if (!newItemsAdded) {
|
|
927
|
+
// noNewItemsCounter++;
|
|
928
|
+
// debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
|
|
929
|
+
// if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
|
|
930
|
+
// debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
|
|
931
|
+
// return allResults;
|
|
932
|
+
// }
|
|
933
|
+
// } else {
|
|
934
|
+
// noNewItemsCounter = 0;
|
|
935
|
+
// previousResultCount = currentResultCount;
|
|
936
|
+
// }
|
|
905
937
|
if (checkLimit())
|
|
906
938
|
return allResults;
|
|
907
939
|
if (!heightChanged) {
|
|
@@ -1564,6 +1596,266 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1564
1596
|
return modifiedAction;
|
|
1565
1597
|
});
|
|
1566
1598
|
}
|
|
1599
|
+
/**
|
|
1600
|
+
* Test if a selector is working on the current page
|
|
1601
|
+
* @param {Page} page - Playwright page object
|
|
1602
|
+
* @param {string} selector - Selector to test
|
|
1603
|
+
* @param {boolean} isListSelector - Whether this should find multiple elements
|
|
1604
|
+
* @returns {Promise<boolean>} - Whether the selector works
|
|
1605
|
+
*/
|
|
1606
|
+
testSelectorWorks(page_1, selector_2) {
|
|
1607
|
+
return __awaiter(this, arguments, void 0, function* (page, selector, isListSelector = false) {
|
|
1608
|
+
try {
|
|
1609
|
+
if (!selector || selector.trim() === '') {
|
|
1610
|
+
return false;
|
|
1611
|
+
}
|
|
1612
|
+
const isXPath = selector.startsWith('//') ||
|
|
1613
|
+
selector.startsWith('/') ||
|
|
1614
|
+
selector.includes('contains(@') ||
|
|
1615
|
+
selector.includes('@class=') ||
|
|
1616
|
+
selector.includes('@id=');
|
|
1617
|
+
let count = 0;
|
|
1618
|
+
if (isXPath) {
|
|
1619
|
+
const locator = page.locator(`xpath=${selector}`);
|
|
1620
|
+
count = yield locator.count();
|
|
1621
|
+
}
|
|
1622
|
+
else {
|
|
1623
|
+
const elements = yield page.$$(selector);
|
|
1624
|
+
count = elements ? elements.length : 0;
|
|
1625
|
+
}
|
|
1626
|
+
// For list selectors, we need multiple elements
|
|
1627
|
+
if (isListSelector) {
|
|
1628
|
+
return count >= 2;
|
|
1629
|
+
}
|
|
1630
|
+
// For field selectors, we need at least one element
|
|
1631
|
+
return count >= 1;
|
|
1632
|
+
}
|
|
1633
|
+
catch (error) {
|
|
1634
|
+
return false;
|
|
1635
|
+
}
|
|
1636
|
+
});
|
|
1637
|
+
}
|
|
1638
|
+
/**
|
|
1639
|
+
* Generate new selector from fallback selector
|
|
1640
|
+
* @param {Page} page - Playwright page object
|
|
1641
|
+
* @param {string} fallbackSelector - Fallback selector to use
|
|
1642
|
+
* @param {boolean} isListSelector - Whether this is a list selector
|
|
1643
|
+
* @param {string} listContext - List selector context for field selectors
|
|
1644
|
+
* @returns {Promise<string|null>} - New selector or null if failed
|
|
1645
|
+
*/
|
|
1646
|
+
generateSelectorFromFallback(page_1, fallbackSelector_1) {
|
|
1647
|
+
return __awaiter(this, arguments, void 0, function* (page, fallbackSelector, isListSelector = false, listContext = '', isPagination = false) {
|
|
1648
|
+
var _a, _b;
|
|
1649
|
+
try {
|
|
1650
|
+
// First check if fallback selector works
|
|
1651
|
+
const fallbackWorks = yield this.testSelectorWorks(page, fallbackSelector, isListSelector);
|
|
1652
|
+
if (!fallbackWorks) {
|
|
1653
|
+
return null;
|
|
1654
|
+
}
|
|
1655
|
+
// Get element using fallback selector
|
|
1656
|
+
const isXPath = fallbackSelector.startsWith('//') ||
|
|
1657
|
+
fallbackSelector.startsWith('/') ||
|
|
1658
|
+
fallbackSelector.includes('contains(@');
|
|
1659
|
+
let element;
|
|
1660
|
+
if (isXPath) {
|
|
1661
|
+
element = yield page.locator(`xpath=${fallbackSelector}`).first().elementHandle();
|
|
1662
|
+
}
|
|
1663
|
+
else {
|
|
1664
|
+
element = yield page.$(fallbackSelector);
|
|
1665
|
+
}
|
|
1666
|
+
if (!element) {
|
|
1667
|
+
return null;
|
|
1668
|
+
}
|
|
1669
|
+
// Generate new selectors
|
|
1670
|
+
let newSelectors;
|
|
1671
|
+
if (isListSelector) {
|
|
1672
|
+
return yield (0, selector_1.generateListSelectorFromFallback)(page, fallbackSelector);
|
|
1673
|
+
}
|
|
1674
|
+
else if (listContext) {
|
|
1675
|
+
return yield (0, selector_1.generateListFieldSelectorFromFallback)(page, fallbackSelector, listContext);
|
|
1676
|
+
}
|
|
1677
|
+
else {
|
|
1678
|
+
newSelectors = yield (0, selector_1.generateFieldSelectorFromFallback)(page, fallbackSelector);
|
|
1679
|
+
if (isPagination) {
|
|
1680
|
+
// For pagination, chain selectors in priority order
|
|
1681
|
+
let chainedSelectors = [
|
|
1682
|
+
(_a = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.iframeSelector) === null || _a === void 0 ? void 0 : _a.full,
|
|
1683
|
+
(_b = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.shadowSelector) === null || _b === void 0 ? void 0 : _b.full,
|
|
1684
|
+
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.testIdSelector,
|
|
1685
|
+
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.id,
|
|
1686
|
+
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.hrefSelector,
|
|
1687
|
+
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.accessibilitySelector,
|
|
1688
|
+
newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.attrSelector,
|
|
1689
|
+
]
|
|
1690
|
+
.filter(selector => selector !== null && selector !== undefined)
|
|
1691
|
+
.join(',');
|
|
1692
|
+
return chainedSelectors;
|
|
1693
|
+
}
|
|
1694
|
+
else {
|
|
1695
|
+
// For non-pagination, use getBestSelector
|
|
1696
|
+
const tagName = yield element.evaluate(el => el.tagName.toLowerCase());
|
|
1697
|
+
return yield (0, utils_1.getBestSelector)({
|
|
1698
|
+
selectors: newSelectors,
|
|
1699
|
+
tagName: tagName
|
|
1700
|
+
});
|
|
1701
|
+
}
|
|
1702
|
+
}
|
|
1703
|
+
}
|
|
1704
|
+
catch (error) {
|
|
1705
|
+
console.error(`Failed to generate selector from fallback: ${error.message}`);
|
|
1706
|
+
return null;
|
|
1707
|
+
}
|
|
1708
|
+
});
|
|
1709
|
+
}
|
|
1710
|
+
/**
|
|
1711
|
+
* Validate and fix scrapeList action selectors
|
|
1712
|
+
* @param {Object} scrapeListConfig - ScrapeList configuration object
|
|
1713
|
+
* @param {Page} page - Playwright page object
|
|
1714
|
+
* @returns {Promise<boolean>} - Whether any changes were made
|
|
1715
|
+
*/
|
|
1716
|
+
validateScrapeListAction(scrapeListConfig, page) {
|
|
1717
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
1718
|
+
let hasChanges = false;
|
|
1719
|
+
try {
|
|
1720
|
+
// Validate listSelector
|
|
1721
|
+
const listSelectorWorks = yield this.testSelectorWorks(page, scrapeListConfig.listSelector, true);
|
|
1722
|
+
if (!listSelectorWorks && scrapeListConfig.listFallbackSelector) {
|
|
1723
|
+
console.log(`ListSelector "${scrapeListConfig.listSelector}" not working, trying fallback...`);
|
|
1724
|
+
const newListSelector = yield this.generateSelectorFromFallback(page, scrapeListConfig.listFallbackSelector, true);
|
|
1725
|
+
if (newListSelector) {
|
|
1726
|
+
console.log(`Updated listSelector: ${scrapeListConfig.listSelector} -> ${newListSelector}`);
|
|
1727
|
+
scrapeListConfig.listSelector = newListSelector;
|
|
1728
|
+
hasChanges = true;
|
|
1729
|
+
}
|
|
1730
|
+
}
|
|
1731
|
+
// Validate field selectors
|
|
1732
|
+
if (scrapeListConfig.fields) {
|
|
1733
|
+
for (const [fieldName, fieldConfig] of Object.entries(scrapeListConfig.fields)) {
|
|
1734
|
+
const fieldSelectorWorks = yield this.testSelectorWorks(page, fieldConfig.selector, false);
|
|
1735
|
+
if (!fieldSelectorWorks && fieldConfig.fallbackSelector) {
|
|
1736
|
+
console.log(`Field selector "${fieldConfig.selector}" for ${fieldName} not working, trying fallback...`);
|
|
1737
|
+
const newFieldSelector = yield this.generateSelectorFromFallback(page, fieldConfig.fallbackSelector, false, scrapeListConfig.listSelector);
|
|
1738
|
+
if (newFieldSelector) {
|
|
1739
|
+
console.log(`Updated field selector for ${fieldName}: ${fieldConfig.selector} -> ${newFieldSelector}`);
|
|
1740
|
+
fieldConfig.selector = newFieldSelector;
|
|
1741
|
+
hasChanges = true;
|
|
1742
|
+
}
|
|
1743
|
+
}
|
|
1744
|
+
}
|
|
1745
|
+
}
|
|
1746
|
+
// Validate pagination selector if it exists and is not empty
|
|
1747
|
+
if (scrapeListConfig.pagination &&
|
|
1748
|
+
scrapeListConfig.pagination.selector &&
|
|
1749
|
+
scrapeListConfig.pagination.selector.trim() !== '') {
|
|
1750
|
+
// Handle comma-separated pagination selectors
|
|
1751
|
+
const paginationSelectors = scrapeListConfig.pagination.selector.split(',').map(s => s.trim());
|
|
1752
|
+
let workingSelector = null;
|
|
1753
|
+
for (const selector of paginationSelectors) {
|
|
1754
|
+
const works = yield this.testSelectorWorks(page, selector, false);
|
|
1755
|
+
if (works) {
|
|
1756
|
+
workingSelector = selector;
|
|
1757
|
+
break;
|
|
1758
|
+
}
|
|
1759
|
+
}
|
|
1760
|
+
if (!workingSelector && scrapeListConfig.pagination.fallbackSelector) {
|
|
1761
|
+
console.log(`Pagination selector not working, trying fallback...`);
|
|
1762
|
+
const newPaginationSelector = yield this.generateSelectorFromFallback(page, scrapeListConfig.pagination.fallbackSelector, false, '', true);
|
|
1763
|
+
if (newPaginationSelector) {
|
|
1764
|
+
console.log(`Updated pagination selector: ${scrapeListConfig.pagination.selector} -> ${newPaginationSelector}`);
|
|
1765
|
+
scrapeListConfig.pagination.selector = newPaginationSelector;
|
|
1766
|
+
hasChanges = true;
|
|
1767
|
+
}
|
|
1768
|
+
}
|
|
1769
|
+
else if (workingSelector && workingSelector !== scrapeListConfig.pagination.selector) {
|
|
1770
|
+
scrapeListConfig.pagination.selector = workingSelector;
|
|
1771
|
+
hasChanges = true;
|
|
1772
|
+
}
|
|
1773
|
+
}
|
|
1774
|
+
}
|
|
1775
|
+
catch (error) {
|
|
1776
|
+
console.error(`Error validating scrapeList action: ${error.message}`);
|
|
1777
|
+
}
|
|
1778
|
+
return hasChanges;
|
|
1779
|
+
});
|
|
1780
|
+
}
|
|
1781
|
+
/**
|
|
1782
|
+
* Validate and fix scrapeSchema action selectors
|
|
1783
|
+
* @param {Object} scrapeSchemaConfig - ScrapeSchema configuration object
|
|
1784
|
+
* @param {Page} page - Playwright page object
|
|
1785
|
+
* @returns {Promise<boolean>} - Whether any changes were made
|
|
1786
|
+
*/
|
|
1787
|
+
validateScrapeSchemaAction(scrapeSchemaConfig, page) {
|
|
1788
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
1789
|
+
let hasChanges = false;
|
|
1790
|
+
try {
|
|
1791
|
+
for (const [fieldName, fieldConfig] of Object.entries(scrapeSchemaConfig)) {
|
|
1792
|
+
if (fieldConfig.selector) {
|
|
1793
|
+
const selectorWorks = yield this.testSelectorWorks(page, fieldConfig.selector, false);
|
|
1794
|
+
if (!selectorWorks && fieldConfig.fallbackSelector) {
|
|
1795
|
+
console.log(`Schema field selector "${fieldConfig.selector}" for ${fieldName} not working, trying fallback...`);
|
|
1796
|
+
const newSelector = yield this.generateSelectorFromFallback(page, fieldConfig.fallbackSelector, false);
|
|
1797
|
+
if (newSelector) {
|
|
1798
|
+
console.log(`Updated schema field selector for ${fieldName}: ${fieldConfig.selector} -> ${newSelector}`);
|
|
1799
|
+
fieldConfig.selector = newSelector;
|
|
1800
|
+
hasChanges = true;
|
|
1801
|
+
}
|
|
1802
|
+
}
|
|
1803
|
+
}
|
|
1804
|
+
}
|
|
1805
|
+
}
|
|
1806
|
+
catch (error) {
|
|
1807
|
+
console.error(`Error validating scrapeSchema action: ${error.message}`);
|
|
1808
|
+
}
|
|
1809
|
+
return hasChanges;
|
|
1810
|
+
});
|
|
1811
|
+
}
|
|
1812
|
+
/**
|
|
1813
|
+
* Validate and fix selectors for a workflow action just before execution
|
|
1814
|
+
* @param {Page} page - Playwright page object
|
|
1815
|
+
* @param {WhereWhatPair} action - The action to validate
|
|
1816
|
+
* @returns {Promise<WhereWhatPair>} - The potentially modified action
|
|
1817
|
+
*/
|
|
1818
|
+
validateAndFixSelectors(page, action) {
|
|
1819
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
1820
|
+
const modifiedAction = JSON.parse(JSON.stringify(action));
|
|
1821
|
+
let totalChanges = 0;
|
|
1822
|
+
try {
|
|
1823
|
+
// Process each action in the 'what' array
|
|
1824
|
+
for (let i = 0; i < modifiedAction.what.length; i++) {
|
|
1825
|
+
const whatAction = modifiedAction.what[i];
|
|
1826
|
+
// Handle scrapeList actions
|
|
1827
|
+
if (whatAction.action === 'scrapeList' && whatAction.args && whatAction.args[0]) {
|
|
1828
|
+
console.log(`Validating scrapeList action...`);
|
|
1829
|
+
const hasChanges = yield this.validateScrapeListAction(whatAction.args[0], page);
|
|
1830
|
+
if (hasChanges) {
|
|
1831
|
+
totalChanges++;
|
|
1832
|
+
console.log(`Fixed scrapeList selectors`);
|
|
1833
|
+
}
|
|
1834
|
+
}
|
|
1835
|
+
// Handle scrapeSchema actions
|
|
1836
|
+
if (whatAction.action === 'scrapeSchema' && whatAction.args && whatAction.args[0]) {
|
|
1837
|
+
console.log(`Validating scrapeSchema action...`);
|
|
1838
|
+
const hasChanges = yield this.validateScrapeSchemaAction(whatAction.args[0], page);
|
|
1839
|
+
if (hasChanges) {
|
|
1840
|
+
totalChanges++;
|
|
1841
|
+
console.log(`Fixed scrapeSchema selectors`);
|
|
1842
|
+
}
|
|
1843
|
+
}
|
|
1844
|
+
}
|
|
1845
|
+
if (totalChanges > 0) {
|
|
1846
|
+
console.log(`Selector validation completed: ${totalChanges} actions modified`);
|
|
1847
|
+
}
|
|
1848
|
+
else {
|
|
1849
|
+
console.log(`Selector validation completed: No changes needed`);
|
|
1850
|
+
}
|
|
1851
|
+
}
|
|
1852
|
+
catch (error) {
|
|
1853
|
+
console.error(`Error in selector validation: ${error.message}`);
|
|
1854
|
+
this.trackAutohealFailure(`Selector validation failed: ${error.message}`);
|
|
1855
|
+
}
|
|
1856
|
+
return modifiedAction;
|
|
1857
|
+
});
|
|
1858
|
+
}
|
|
1567
1859
|
runLoop(p, workflow) {
|
|
1568
1860
|
return __awaiter(this, void 0, void 0, function* () {
|
|
1569
1861
|
var _a, _b;
|
|
@@ -1661,8 +1953,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1661
1953
|
}
|
|
1662
1954
|
lastAction = action;
|
|
1663
1955
|
try {
|
|
1664
|
-
|
|
1665
|
-
|
|
1956
|
+
const validatedAction = yield this.validateAndFixSelectors(p, action);
|
|
1957
|
+
console.log("Carrying out:", validatedAction.what);
|
|
1958
|
+
yield this.carryOutSteps(p, validatedAction.what);
|
|
1666
1959
|
usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
|
|
1667
1960
|
workflowCopy.splice(actionId, 1);
|
|
1668
1961
|
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|
package/build/preprocessor.js
CHANGED
|
@@ -47,7 +47,7 @@ class Preprocessor {
|
|
|
47
47
|
*/
|
|
48
48
|
static getParams(workflow) {
|
|
49
49
|
const getParamsRecurse = (object) => {
|
|
50
|
-
if (typeof object === 'object') {
|
|
50
|
+
if (typeof object === 'object' && object !== null) {
|
|
51
51
|
// Recursion base case
|
|
52
52
|
if (object.$param) {
|
|
53
53
|
return [object.$param];
|
|
@@ -123,13 +123,26 @@ class Preprocessor {
|
|
|
123
123
|
const out = object;
|
|
124
124
|
// for every key (child) of the object
|
|
125
125
|
Object.keys(object).forEach((key) => {
|
|
126
|
-
|
|
127
|
-
if
|
|
128
|
-
|
|
129
|
-
|
|
126
|
+
const childValue = object[key];
|
|
127
|
+
// Skip if childValue is null, undefined, or not an object
|
|
128
|
+
if (!childValue || typeof childValue !== 'object') {
|
|
129
|
+
return; // Continue to next iteration
|
|
130
130
|
}
|
|
131
|
-
|
|
132
|
-
|
|
131
|
+
try {
|
|
132
|
+
const childKeys = Object.keys(childValue);
|
|
133
|
+
// if the field has only one key, which is `k`
|
|
134
|
+
if (childKeys.length === 1 && childValue[k]) {
|
|
135
|
+
// process the current special tag (init param, hydrate regex...)
|
|
136
|
+
out[key] = f(childValue[k]);
|
|
137
|
+
}
|
|
138
|
+
else {
|
|
139
|
+
// Recursively process the child object
|
|
140
|
+
initSpecialRecurse(childValue, k, f);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
catch (error) {
|
|
144
|
+
// If Object.keys fails or any other error, just continue
|
|
145
|
+
console.warn(`Error processing key "${key}" in initSpecialRecurse:`, error);
|
|
133
146
|
}
|
|
134
147
|
});
|
|
135
148
|
return out;
|
package/build/selector.d.ts
CHANGED
|
@@ -29,4 +29,31 @@ interface SelectorResult {
|
|
|
29
29
|
* @returns {Promise<Selectors|null|undefined>}
|
|
30
30
|
*/
|
|
31
31
|
export declare const generateNonUniqueSelectors: (page: Page, elementHandle: ElementHandle, listSelector?: string) => Promise<SelectorResult>;
|
|
32
|
+
/**
|
|
33
|
+
* Generate a new list selector from the elements matched by a fallback selector (based on the reference implementation)
|
|
34
|
+
* @param page - Playwright page object
|
|
35
|
+
* @param fallbackSelector - Fallback selector to use
|
|
36
|
+
* @returns New list selector or null if failed
|
|
37
|
+
*/
|
|
38
|
+
export declare const generateListSelectorFromFallback: (page: Page, fallbackSelector: string) => Promise<string | null>;
|
|
39
|
+
/**
|
|
40
|
+
* Generate new field selector from fallback selector (one field at a time)
|
|
41
|
+
* @param page - Playwright page object
|
|
42
|
+
* @param fallbackSelector - Fallback selector to use
|
|
43
|
+
* @param listSelector - The list selector context
|
|
44
|
+
* @returns New field selector or null if failed
|
|
45
|
+
*/
|
|
46
|
+
export declare const generateListFieldSelectorFromFallback: (page: Page, fallbackSelector: string, listSelector: string) => Promise<string | null>;
|
|
47
|
+
export declare const generateFieldSelectorFromFallback: (page: Page, fallbackSelector: string) => Promise<{
|
|
48
|
+
id: string | null;
|
|
49
|
+
generalSelector: string | null;
|
|
50
|
+
attrSelector: string | null;
|
|
51
|
+
testIdSelector: string | null;
|
|
52
|
+
text: string;
|
|
53
|
+
href?: string;
|
|
54
|
+
hrefSelector: string | null;
|
|
55
|
+
accessibilitySelector: string | null;
|
|
56
|
+
formSelector: string | null;
|
|
57
|
+
relSelector: string | null;
|
|
58
|
+
} | null>;
|
|
32
59
|
export {};
|
package/build/selector.js
CHANGED
|
@@ -9,7 +9,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
9
9
|
});
|
|
10
10
|
};
|
|
11
11
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
-
exports.generateNonUniqueSelectors = exports.generateSelectors = void 0;
|
|
12
|
+
exports.generateFieldSelectorFromFallback = exports.generateListFieldSelectorFromFallback = exports.generateListSelectorFromFallback = exports.generateNonUniqueSelectors = exports.generateSelectors = void 0;
|
|
13
13
|
const generateSelectors = (page, elementHandle) => __awaiter(void 0, void 0, void 0, function* () {
|
|
14
14
|
try {
|
|
15
15
|
const selectors = yield elementHandle.evaluate((element) => {
|
|
@@ -848,3 +848,487 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
848
848
|
}
|
|
849
849
|
});
|
|
850
850
|
exports.generateNonUniqueSelectors = generateNonUniqueSelectors;
|
|
851
|
+
/**
|
|
852
|
+
* Generate a new list selector from the elements matched by a fallback selector (based on the reference implementation)
|
|
853
|
+
* @param page - Playwright page object
|
|
854
|
+
* @param fallbackSelector - Fallback selector to use
|
|
855
|
+
* @returns New list selector or null if failed
|
|
856
|
+
*/
|
|
857
|
+
const generateListSelectorFromFallback = (page, fallbackSelector) => __awaiter(void 0, void 0, void 0, function* () {
|
|
858
|
+
try {
|
|
859
|
+
// Execute selector generation within the page context
|
|
860
|
+
const newSelector = yield page.evaluate((selector) => {
|
|
861
|
+
try {
|
|
862
|
+
// Check if selector is XPath
|
|
863
|
+
const isXPath = selector.startsWith('//') ||
|
|
864
|
+
selector.startsWith('/') ||
|
|
865
|
+
selector.includes('contains(@');
|
|
866
|
+
let elements;
|
|
867
|
+
if (isXPath) {
|
|
868
|
+
// Use XPath evaluation
|
|
869
|
+
const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
|
|
870
|
+
elements = [];
|
|
871
|
+
let node = xpathResult.iterateNext();
|
|
872
|
+
while (node && elements.length < 5) { // Limit to 5 elements for performance
|
|
873
|
+
if (node.nodeType === Node.ELEMENT_NODE) {
|
|
874
|
+
elements.push(node);
|
|
875
|
+
}
|
|
876
|
+
node = xpathResult.iterateNext();
|
|
877
|
+
}
|
|
878
|
+
}
|
|
879
|
+
else {
|
|
880
|
+
// Use CSS selector
|
|
881
|
+
const nodeList = document.querySelectorAll(selector);
|
|
882
|
+
elements = Array.from(nodeList).slice(0, 5); // Limit to 5 elements
|
|
883
|
+
}
|
|
884
|
+
if (!elements || elements.length === 0) {
|
|
885
|
+
return null;
|
|
886
|
+
}
|
|
887
|
+
// Extract element data for analysis
|
|
888
|
+
const elementData = elements.map((el) => ({
|
|
889
|
+
tagName: el.tagName.toLowerCase(),
|
|
890
|
+
className: el.getAttribute('class') || '',
|
|
891
|
+
attributes: Array.from(el.attributes).reduce((attrs, attr) => {
|
|
892
|
+
if (!['id', 'style', 'data-mx-id'].includes(attr.name)) {
|
|
893
|
+
attrs[attr.name] = attr.value;
|
|
894
|
+
}
|
|
895
|
+
return attrs;
|
|
896
|
+
}, {}),
|
|
897
|
+
childrenCount: el.children.length
|
|
898
|
+
}));
|
|
899
|
+
if (elementData.length === 0) {
|
|
900
|
+
return null;
|
|
901
|
+
}
|
|
902
|
+
const firstElement = elementData[0];
|
|
903
|
+
const tagName = firstElement.tagName;
|
|
904
|
+
// Check if all elements have the same tag name
|
|
905
|
+
const allSameTag = elementData.every((el) => el.tagName === tagName);
|
|
906
|
+
if (!allSameTag) {
|
|
907
|
+
console.warn("Inconsistent tag names in group, using first element's tag");
|
|
908
|
+
}
|
|
909
|
+
// Start building XPath - ALWAYS generate primary XPath
|
|
910
|
+
let xpath = `//${tagName}`;
|
|
911
|
+
const predicates = [];
|
|
912
|
+
// Get common classes
|
|
913
|
+
const allClasses = elementData.map((el) => el.className.split(/\s+/).filter(Boolean));
|
|
914
|
+
if (allClasses.length > 0 && allClasses[0].length > 0) {
|
|
915
|
+
// Find classes that appear in most elements (at least 60%)
|
|
916
|
+
const classFrequency = new Map();
|
|
917
|
+
allClasses.forEach((classes) => {
|
|
918
|
+
classes.forEach((cls) => {
|
|
919
|
+
classFrequency.set(cls, (classFrequency.get(cls) || 0) + 1);
|
|
920
|
+
});
|
|
921
|
+
});
|
|
922
|
+
const minFrequency = Math.ceil(allClasses.length * 0.6);
|
|
923
|
+
const commonClasses = Array.from(classFrequency.entries())
|
|
924
|
+
.filter(([_, count]) => count >= minFrequency)
|
|
925
|
+
.map(([cls, _]) => cls);
|
|
926
|
+
if (commonClasses.length > 0) {
|
|
927
|
+
predicates.push(...commonClasses.map((cls) => `contains(@class, '${cls}')`));
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
// Get common attributes (excluding id, style, data-mx-id)
|
|
931
|
+
if (elementData.length > 1) {
|
|
932
|
+
const commonAttributes = {};
|
|
933
|
+
const firstAttrs = firstElement.attributes;
|
|
934
|
+
for (const [attr, value] of Object.entries(firstAttrs)) {
|
|
935
|
+
const isCommon = elementData.every((el) => el.attributes[attr] === value);
|
|
936
|
+
if (isCommon) {
|
|
937
|
+
commonAttributes[attr] = value;
|
|
938
|
+
}
|
|
939
|
+
}
|
|
940
|
+
for (const [attr, value] of Object.entries(commonAttributes)) {
|
|
941
|
+
predicates.push(`@${attr}='${value}'`);
|
|
942
|
+
}
|
|
943
|
+
}
|
|
944
|
+
// Optional: Common child count (only if consistent across most elements)
|
|
945
|
+
const childCountFrequency = new Map();
|
|
946
|
+
elementData.forEach((el) => {
|
|
947
|
+
childCountFrequency.set(el.childrenCount, (childCountFrequency.get(el.childrenCount) || 0) + 1);
|
|
948
|
+
});
|
|
949
|
+
const mostCommonChildCount = Array.from(childCountFrequency.entries())
|
|
950
|
+
.sort((a, b) => b[1] - a[1])[0];
|
|
951
|
+
if (mostCommonChildCount && mostCommonChildCount[1] >= Math.ceil(elementData.length * 0.8)) {
|
|
952
|
+
predicates.push(`count(*)=${mostCommonChildCount[0]}`);
|
|
953
|
+
}
|
|
954
|
+
// Build final XPath
|
|
955
|
+
if (predicates.length > 0) {
|
|
956
|
+
xpath += `[${predicates.join(' and ')}]`;
|
|
957
|
+
}
|
|
958
|
+
console.log(`Generated list selector: ${xpath} from fallback: ${selector}`);
|
|
959
|
+
return xpath;
|
|
960
|
+
}
|
|
961
|
+
catch (error) {
|
|
962
|
+
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
963
|
+
console.error(`Failed to generate list selector from fallback: ${errorMessage}`);
|
|
964
|
+
return null;
|
|
965
|
+
}
|
|
966
|
+
}, fallbackSelector);
|
|
967
|
+
return newSelector;
|
|
968
|
+
}
|
|
969
|
+
catch (error) {
|
|
970
|
+
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
971
|
+
console.error(`Failed to execute selector generation: ${errorMessage}`);
|
|
972
|
+
return null;
|
|
973
|
+
}
|
|
974
|
+
});
|
|
975
|
+
exports.generateListSelectorFromFallback = generateListSelectorFromFallback;
|
|
976
|
+
/**
|
|
977
|
+
* Generate new field selector from fallback selector (one field at a time)
|
|
978
|
+
* @param page - Playwright page object
|
|
979
|
+
* @param fallbackSelector - Fallback selector to use
|
|
980
|
+
* @param listSelector - The list selector context
|
|
981
|
+
* @returns New field selector or null if failed
|
|
982
|
+
*/
|
|
983
|
+
const generateListFieldSelectorFromFallback = (page, fallbackSelector, listSelector) => __awaiter(void 0, void 0, void 0, function* () {
|
|
984
|
+
try {
|
|
985
|
+
// Execute field selector generation within the page context
|
|
986
|
+
const newSelector = yield page.evaluate(({ fallbackSel, listSel }) => {
|
|
987
|
+
// Helper function to check if selector is XPath
|
|
988
|
+
const isXPathSelector = (selector) => {
|
|
989
|
+
return selector.startsWith('//') ||
|
|
990
|
+
selector.startsWith('/') ||
|
|
991
|
+
selector.includes('contains(@') ||
|
|
992
|
+
selector.includes('@class=') ||
|
|
993
|
+
selector.includes('@id=');
|
|
994
|
+
};
|
|
995
|
+
// Helper function to evaluate XPath
|
|
996
|
+
const evaluateXPath = (xpath) => {
|
|
997
|
+
try {
|
|
998
|
+
const result = document.evaluate(xpath, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
999
|
+
const elements = [];
|
|
1000
|
+
for (let i = 0; i < result.snapshotLength; i++) {
|
|
1001
|
+
const node = result.snapshotItem(i);
|
|
1002
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
1003
|
+
elements.push(node);
|
|
1004
|
+
}
|
|
1005
|
+
}
|
|
1006
|
+
return elements;
|
|
1007
|
+
}
|
|
1008
|
+
catch (error) {
|
|
1009
|
+
return [];
|
|
1010
|
+
}
|
|
1011
|
+
};
|
|
1012
|
+
// Helper function to get sibling position
|
|
1013
|
+
const getSiblingPosition = (element, parent) => {
|
|
1014
|
+
const siblings = Array.from(parent.children || [])
|
|
1015
|
+
.filter((child) => child.tagName === element.tagName);
|
|
1016
|
+
return siblings.indexOf(element) + 1;
|
|
1017
|
+
};
|
|
1018
|
+
// Generate optimized structural step
|
|
1019
|
+
const generateOptimizedStructuralStep = (element, rootElement) => {
|
|
1020
|
+
const tagName = element.tagName.toLowerCase();
|
|
1021
|
+
const parent = element.parentElement;
|
|
1022
|
+
if (!parent) {
|
|
1023
|
+
return tagName;
|
|
1024
|
+
}
|
|
1025
|
+
// Use classes first
|
|
1026
|
+
const classes = Array.from(element.classList);
|
|
1027
|
+
if (classes.length > 0) {
|
|
1028
|
+
const classSelector = classes
|
|
1029
|
+
.map((cls) => `contains(@class, '${cls}')`)
|
|
1030
|
+
.join(" and ");
|
|
1031
|
+
return `${tagName}[${classSelector}]`;
|
|
1032
|
+
}
|
|
1033
|
+
// Try meaningful attributes
|
|
1034
|
+
const meaningfulAttrs = ["role", "type", "name", "src", "aria-label"];
|
|
1035
|
+
for (const attrName of meaningfulAttrs) {
|
|
1036
|
+
if (element.hasAttribute(attrName)) {
|
|
1037
|
+
const value = element.getAttribute(attrName).replace(/'/g, "\\'");
|
|
1038
|
+
return `${tagName}[@${attrName}='${value}']`;
|
|
1039
|
+
}
|
|
1040
|
+
}
|
|
1041
|
+
// Try test ID
|
|
1042
|
+
const testId = element.getAttribute("data-testid");
|
|
1043
|
+
if (testId) {
|
|
1044
|
+
return `${tagName}[@data-testid='${testId}']`;
|
|
1045
|
+
}
|
|
1046
|
+
// Try ID
|
|
1047
|
+
if (element.id && !element.id.match(/^\d/)) {
|
|
1048
|
+
return `${tagName}[@id='${element.id}']`;
|
|
1049
|
+
}
|
|
1050
|
+
// Try other data attributes
|
|
1051
|
+
for (const attr of Array.from(element.attributes)) {
|
|
1052
|
+
if (attr.name.startsWith("data-") &&
|
|
1053
|
+
attr.name !== "data-testid" &&
|
|
1054
|
+
attr.name !== "data-mx-id" &&
|
|
1055
|
+
attr.value) {
|
|
1056
|
+
return `${tagName}[@${attr.name}='${attr.value}']`;
|
|
1057
|
+
}
|
|
1058
|
+
}
|
|
1059
|
+
// Fallback to position
|
|
1060
|
+
const position = getSiblingPosition(element, parent);
|
|
1061
|
+
return `${tagName}[${position}]`;
|
|
1062
|
+
};
|
|
1063
|
+
// Get optimized structural path
|
|
1064
|
+
const getOptimizedStructuralPath = (targetElement, rootElement) => {
|
|
1065
|
+
if (!rootElement.contains(targetElement) || targetElement === rootElement) {
|
|
1066
|
+
return null;
|
|
1067
|
+
}
|
|
1068
|
+
const pathParts = [];
|
|
1069
|
+
let current = targetElement;
|
|
1070
|
+
// Build path from target up to root
|
|
1071
|
+
while (current && current !== rootElement) {
|
|
1072
|
+
const pathPart = generateOptimizedStructuralStep(current, rootElement);
|
|
1073
|
+
if (pathPart) {
|
|
1074
|
+
pathParts.unshift(pathPart);
|
|
1075
|
+
}
|
|
1076
|
+
current = current.parentElement;
|
|
1077
|
+
if (!current)
|
|
1078
|
+
break;
|
|
1079
|
+
}
|
|
1080
|
+
return pathParts.length > 0 ? "/" + pathParts.join("/") : null;
|
|
1081
|
+
};
|
|
1082
|
+
try {
|
|
1083
|
+
// Get the first element from fallback selector
|
|
1084
|
+
let targetElement = null;
|
|
1085
|
+
if (isXPathSelector(fallbackSel)) {
|
|
1086
|
+
const elements = evaluateXPath(fallbackSel);
|
|
1087
|
+
targetElement = elements[0] || null;
|
|
1088
|
+
}
|
|
1089
|
+
else {
|
|
1090
|
+
targetElement = document.querySelector(fallbackSel);
|
|
1091
|
+
}
|
|
1092
|
+
if (!targetElement) {
|
|
1093
|
+
return null;
|
|
1094
|
+
}
|
|
1095
|
+
// Get the list container elements
|
|
1096
|
+
const parentElements = evaluateXPath(listSel);
|
|
1097
|
+
let containingParent = null;
|
|
1098
|
+
for (const parent of parentElements) {
|
|
1099
|
+
if (parent.contains(targetElement)) {
|
|
1100
|
+
containingParent = parent;
|
|
1101
|
+
break;
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
if (!containingParent) {
|
|
1105
|
+
return null;
|
|
1106
|
+
}
|
|
1107
|
+
// Build structural path
|
|
1108
|
+
const structuralPath = getOptimizedStructuralPath(targetElement, containingParent);
|
|
1109
|
+
if (!structuralPath) {
|
|
1110
|
+
return null;
|
|
1111
|
+
}
|
|
1112
|
+
// Combine list selector with structural path
|
|
1113
|
+
const newSelector = listSel + structuralPath;
|
|
1114
|
+
console.log(`Generated field selector: ${newSelector} from fallback: ${fallbackSel}`);
|
|
1115
|
+
return newSelector;
|
|
1116
|
+
}
|
|
1117
|
+
catch (error) {
|
|
1118
|
+
console.error("Error generating field selector:", error);
|
|
1119
|
+
return null;
|
|
1120
|
+
}
|
|
1121
|
+
}, { fallbackSel: fallbackSelector, listSel: listSelector });
|
|
1122
|
+
return newSelector;
|
|
1123
|
+
}
|
|
1124
|
+
catch (error) {
|
|
1125
|
+
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
1126
|
+
console.error(`Failed to generate field selector: ${errorMessage}`);
|
|
1127
|
+
return null;
|
|
1128
|
+
}
|
|
1129
|
+
});
|
|
1130
|
+
exports.generateListFieldSelectorFromFallback = generateListFieldSelectorFromFallback;
|
|
1131
|
+
/** Generate all schema field selectors from fallback selector (CSS to CSS)
|
|
1132
|
+
* @param page - Playwright page object
|
|
1133
|
+
* @param fallbackSelector - CSS fallback selector to use
|
|
1134
|
+
* @returns Object containing all generated CSS selectors (plus the element's text and href), or null if failed
|
|
1135
|
+
*/
|
|
1136
|
+
const generateFieldSelectorFromFallback = (page, fallbackSelector) => __awaiter(void 0, void 0, void 0, function* () {
|
|
1137
|
+
try {
|
|
1138
|
+
// Execute schema field selector generation within the page context
|
|
1139
|
+
const selectors = yield page.evaluate((fallbackSel) => {
|
|
1140
|
+
// CSS escape function (simplified version from your reference)
|
|
1141
|
+
function cssesc(string, options = {}) {
|
|
1142
|
+
const { isIdentifier = false } = options;
|
|
1143
|
+
let output = '';
|
|
1144
|
+
for (let i = 0; i < string.length; i++) {
|
|
1145
|
+
const char = string.charAt(i);
|
|
1146
|
+
const code = char.charCodeAt(0);
|
|
1147
|
+
if (code < 0x20 || code > 0x7e) {
|
|
1148
|
+
output += '\\' + code.toString(16).toUpperCase() + ' ';
|
|
1149
|
+
}
|
|
1150
|
+
else if (/[\t\n\f\r\x0B]/.test(char)) {
|
|
1151
|
+
output += '\\' + code.toString(16).toUpperCase() + ' ';
|
|
1152
|
+
}
|
|
1153
|
+
else if (char === '\\' || (isIdentifier && /[ -,\.\/:-@\[\]\^`\{-~]/.test(char))) {
|
|
1154
|
+
output += '\\' + char;
|
|
1155
|
+
}
|
|
1156
|
+
else {
|
|
1157
|
+
output += char;
|
|
1158
|
+
}
|
|
1159
|
+
}
|
|
1160
|
+
if (isIdentifier && /\d/.test(string.charAt(0))) {
|
|
1161
|
+
output = '\\3' + string.charAt(0) + ' ' + output.slice(1);
|
|
1162
|
+
}
|
|
1163
|
+
return output;
|
|
1164
|
+
}
|
|
1165
|
+
// Main finder function (simplified version from your reference)
|
|
1166
|
+
function finder(input, options = {}) {
|
|
1167
|
+
if (input.nodeType !== Node.ELEMENT_NODE) {
|
|
1168
|
+
throw new Error("Can't generate CSS selector for non-element node type.");
|
|
1169
|
+
}
|
|
1170
|
+
if (input.tagName.toLowerCase() === 'html') {
|
|
1171
|
+
return 'html';
|
|
1172
|
+
}
|
|
1173
|
+
// If attr function is provided, use it to filter attributes
|
|
1174
|
+
if (options.attr) {
|
|
1175
|
+
const attrs = Array.from(input.attributes).filter(attr => options.attr(attr.name) && attr.name !== 'data-mx-id');
|
|
1176
|
+
if (attrs.length > 0) {
|
|
1177
|
+
const attr = attrs[0];
|
|
1178
|
+
return `[${cssesc(attr.name, { isIdentifier: true })}="${cssesc(attr.value)}"]`;
|
|
1179
|
+
}
|
|
1180
|
+
}
|
|
1181
|
+
// Try ID first
|
|
1182
|
+
const elementId = input.getAttribute('id');
|
|
1183
|
+
if (elementId && !elementId.match(/^\d/)) {
|
|
1184
|
+
return '#' + cssesc(elementId, { isIdentifier: true });
|
|
1185
|
+
}
|
|
1186
|
+
// Try classes
|
|
1187
|
+
const classes = Array.from(input.classList);
|
|
1188
|
+
if (classes.length > 0) {
|
|
1189
|
+
const classSelector = classes.map(cls => '.' + cssesc(cls, { isIdentifier: true })).join('');
|
|
1190
|
+
const tagName = input.tagName.toLowerCase();
|
|
1191
|
+
return tagName + classSelector;
|
|
1192
|
+
}
|
|
1193
|
+
// Try attributes
|
|
1194
|
+
const meaningfulAttrs = ['data-testid', 'data-test-id', 'data-testing', 'data-test', 'data-qa', 'data-cy', 'name', 'aria-label', 'alt', 'title', 'href', 'role', 'type'];
|
|
1195
|
+
for (const attrName of meaningfulAttrs) {
|
|
1196
|
+
if (input.hasAttribute(attrName) && attrName !== 'data-mx-id') {
|
|
1197
|
+
const value = input.getAttribute(attrName);
|
|
1198
|
+
if (value) {
|
|
1199
|
+
return `[${cssesc(attrName, { isIdentifier: true })}="${cssesc(value)}"]`;
|
|
1200
|
+
}
|
|
1201
|
+
}
|
|
1202
|
+
}
|
|
1203
|
+
// Fallback to tag name with nth-child if needed
|
|
1204
|
+
const tagName = input.tagName.toLowerCase();
|
|
1205
|
+
const parent = input.parentElement;
|
|
1206
|
+
if (parent) {
|
|
1207
|
+
const siblings = Array.from(parent.children).filter(child => child.tagName === input.tagName);
|
|
1208
|
+
if (siblings.length > 1) {
|
|
1209
|
+
const index = siblings.indexOf(input) + 1;
|
|
1210
|
+
return `${tagName}:nth-child(${index})`;
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
return tagName;
|
|
1214
|
+
}
|
|
1215
|
+
// Generate selectors for attributes
|
|
1216
|
+
function genSelectorForAttributes(element, attributes) {
|
|
1217
|
+
try {
|
|
1218
|
+
for (const attr of attributes) {
|
|
1219
|
+
if (element.hasAttribute(attr)) {
|
|
1220
|
+
const value = element.getAttribute(attr);
|
|
1221
|
+
if (value && value.length > 0) {
|
|
1222
|
+
if (attr === 'rel') {
|
|
1223
|
+
return `[rel="${value}"]`;
|
|
1224
|
+
}
|
|
1225
|
+
return `[${cssesc(attr, { isIdentifier: true })}="${cssesc(value)}"]`;
|
|
1226
|
+
}
|
|
1227
|
+
}
|
|
1228
|
+
}
|
|
1229
|
+
return null;
|
|
1230
|
+
}
|
|
1231
|
+
catch (e) {
|
|
1232
|
+
return null;
|
|
1233
|
+
}
|
|
1234
|
+
}
|
|
1235
|
+
// Check if character is number
|
|
1236
|
+
function isCharacterNumber(char) {
|
|
1237
|
+
return char && char.length === 1 && /[0-9]/.test(char);
|
|
1238
|
+
}
|
|
1239
|
+
// Generate attribute set
|
|
1240
|
+
function genAttributeSet(element, attributes) {
|
|
1241
|
+
return new Set(attributes.filter((attr) => {
|
|
1242
|
+
const attrValue = element.getAttribute(attr);
|
|
1243
|
+
return attrValue != null && attrValue.length > 0;
|
|
1244
|
+
}));
|
|
1245
|
+
}
|
|
1246
|
+
// Check if attributes are defined
|
|
1247
|
+
function isAttributesDefined(element, attributes) {
|
|
1248
|
+
return genAttributeSet(element, attributes).size > 0;
|
|
1249
|
+
}
|
|
1250
|
+
// Generate valid attribute filter
|
|
1251
|
+
function genValidAttributeFilter(element, attributes) {
|
|
1252
|
+
const attrSet = genAttributeSet(element, attributes);
|
|
1253
|
+
return (name) => attrSet.has(name);
|
|
1254
|
+
}
|
|
1255
|
+
// Main selector generation function (based on genSelectors from your reference)
|
|
1256
|
+
function genSelectors(element) {
|
|
1257
|
+
var _a;
|
|
1258
|
+
const href = element.getAttribute('href');
|
|
1259
|
+
let generalSelector = null;
|
|
1260
|
+
try {
|
|
1261
|
+
generalSelector = finder(element);
|
|
1262
|
+
}
|
|
1263
|
+
catch (e) {
|
|
1264
|
+
console.warn('Error generating general selector:', e);
|
|
1265
|
+
}
|
|
1266
|
+
let attrSelector = null;
|
|
1267
|
+
try {
|
|
1268
|
+
attrSelector = finder(element, { attr: () => true });
|
|
1269
|
+
}
|
|
1270
|
+
catch (e) {
|
|
1271
|
+
console.warn('Error generating attr selector:', e);
|
|
1272
|
+
}
|
|
1273
|
+
const relSelector = genSelectorForAttributes(element, ['rel']);
|
|
1274
|
+
const hrefSelector = genSelectorForAttributes(element, ['href']);
|
|
1275
|
+
const formSelector = genSelectorForAttributes(element, ['name', 'placeholder', 'for']);
|
|
1276
|
+
const accessibilitySelector = genSelectorForAttributes(element, ['aria-label', 'alt', 'title']);
|
|
1277
|
+
const testIdSelector = genSelectorForAttributes(element, [
|
|
1278
|
+
'data-testid', 'data-test-id', 'data-testing', 'data-test', 'data-qa', 'data-cy'
|
|
1279
|
+
]);
|
|
1280
|
+
// We won't use an id selector if the id is invalid (starts with a number)
|
|
1281
|
+
let idSelector = null;
|
|
1282
|
+
try {
|
|
1283
|
+
idSelector = isAttributesDefined(element, ['id']) &&
|
|
1284
|
+
!isCharacterNumber((_a = element.id) === null || _a === void 0 ? void 0 : _a[0])
|
|
1285
|
+
? finder(element, {
|
|
1286
|
+
attr: (name) => name === 'id',
|
|
1287
|
+
})
|
|
1288
|
+
: null;
|
|
1289
|
+
}
|
|
1290
|
+
catch (e) {
|
|
1291
|
+
console.warn('Error generating id selector:', e);
|
|
1292
|
+
}
|
|
1293
|
+
return {
|
|
1294
|
+
id: idSelector,
|
|
1295
|
+
generalSelector,
|
|
1296
|
+
attrSelector,
|
|
1297
|
+
testIdSelector,
|
|
1298
|
+
text: element.innerText,
|
|
1299
|
+
href: href !== null && href !== void 0 ? href : undefined,
|
|
1300
|
+
hrefSelector,
|
|
1301
|
+
accessibilitySelector,
|
|
1302
|
+
formSelector,
|
|
1303
|
+
relSelector,
|
|
1304
|
+
};
|
|
1305
|
+
}
|
|
1306
|
+
try {
|
|
1307
|
+
// Get the target element from CSS fallback selector
|
|
1308
|
+
const targetElement = document.querySelector(fallbackSel);
|
|
1309
|
+
if (!targetElement) {
|
|
1310
|
+
console.warn('Target element not found with CSS fallback selector:', fallbackSel);
|
|
1311
|
+
return null;
|
|
1312
|
+
}
|
|
1313
|
+
// Prioritize Link logic (from your reference)
|
|
1314
|
+
const { parentElement } = targetElement;
|
|
1315
|
+
const element = (parentElement === null || parentElement === void 0 ? void 0 : parentElement.tagName) === 'A' ? parentElement : targetElement;
|
|
1316
|
+
// Generate all selectors using the same logic as your reference
|
|
1317
|
+
const generatedSelectors = genSelectors(element);
|
|
1318
|
+
console.log('Generated schema field CSS selectors:', generatedSelectors);
|
|
1319
|
+
return generatedSelectors;
|
|
1320
|
+
}
|
|
1321
|
+
catch (error) {
|
|
1322
|
+
console.error('Error in schema field CSS selector generation:', error);
|
|
1323
|
+
return null;
|
|
1324
|
+
}
|
|
1325
|
+
}, fallbackSelector);
|
|
1326
|
+
return selectors;
|
|
1327
|
+
}
|
|
1328
|
+
catch (error) {
|
|
1329
|
+
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
1330
|
+
console.error(`Failed to generate schema field CSS selectors: ${errorMessage}`);
|
|
1331
|
+
return null;
|
|
1332
|
+
}
|
|
1333
|
+
});
|
|
1334
|
+
exports.generateFieldSelectorFromFallback = generateFieldSelectorFromFallback;
|
|
@@ -83,6 +83,7 @@ export interface SchemaConfig {
|
|
|
83
83
|
nodeInfo: SerializedNode;
|
|
84
84
|
selector: string;
|
|
85
85
|
attribute: string;
|
|
86
|
+
fallbackSelector?: string;
|
|
86
87
|
coordinates: Coordinates;
|
|
87
88
|
elementMetadata: {
|
|
88
89
|
classList: string[];
|
|
@@ -93,6 +94,7 @@ export interface SchemaConfig {
|
|
|
93
94
|
}
|
|
94
95
|
export interface ScrapeListSchema {
|
|
95
96
|
listSelector: string;
|
|
97
|
+
listFallbackSelector?: string;
|
|
96
98
|
listSelectorInfo: {
|
|
97
99
|
nodeInfo: SerializedNode;
|
|
98
100
|
coordinates: Coordinates;
|