@odavl/guardian 0.1.0-rc1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +62 -0
- package/README.md +3 -3
- package/bin/guardian.js +212 -8
- package/package.json +6 -1
- package/src/guardian/attempt-engine.js +19 -5
- package/src/guardian/attempt.js +61 -39
- package/src/guardian/attempts-filter.js +63 -0
- package/src/guardian/baseline.js +44 -10
- package/src/guardian/browser-pool.js +131 -0
- package/src/guardian/browser.js +28 -1
- package/src/guardian/ci-mode.js +15 -0
- package/src/guardian/ci-output.js +37 -0
- package/src/guardian/cli-summary.js +117 -4
- package/src/guardian/data-guardian-detector.js +189 -0
- package/src/guardian/detection-layers.js +271 -0
- package/src/guardian/first-run.js +49 -0
- package/src/guardian/flag-validator.js +97 -0
- package/src/guardian/flow-executor.js +309 -44
- package/src/guardian/language-detection.js +99 -0
- package/src/guardian/market-reporter.js +16 -1
- package/src/guardian/parallel-executor.js +116 -0
- package/src/guardian/prerequisite-checker.js +101 -0
- package/src/guardian/preset-loader.js +18 -12
- package/src/guardian/profile-loader.js +96 -0
- package/src/guardian/reality.js +382 -46
- package/src/guardian/run-summary.js +20 -0
- package/src/guardian/semantic-contact-detection.js +255 -0
- package/src/guardian/semantic-contact-finder.js +200 -0
- package/src/guardian/semantic-targets.js +234 -0
- package/src/guardian/smoke.js +258 -0
- package/src/guardian/snapshot.js +23 -1
- package/src/guardian/success-evaluator.js +214 -0
- package/src/guardian/timeout-profiles.js +57 -0
- package/src/guardian/wait-for-outcome.js +120 -0
- package/src/guardian/watch-runner.js +185 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Contact Detection
|
|
3
|
+
*
|
|
4
|
+
* Deterministic, multilingual detection of contact links and elements.
|
|
5
|
+
* Returns ranked candidates with source, confidence, and matched tokens.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
const { getTokensForTarget, normalizeText, getMatchedToken } = require('./semantic-targets');
|
|
9
|
+
|
|
10
|
+
/**
 * Confidence levels attached to every detected candidate.
 *
 * The object is exported and shared between modules, so it is frozen to
 * keep any consumer from accidentally changing the level names.
 */
const CONFIDENCE = Object.freeze({
  HIGH: 'high',
  MEDIUM: 'medium',
  LOW: 'low'
});
|
|
18
|
+
|
|
19
|
+
/**
 * Detection sources: which signal on the element produced a candidate.
 *
 * The object is exported and shared between modules, so it is frozen to
 * keep any consumer from accidentally changing the source names.
 */
const DETECTION_SOURCE = Object.freeze({
  DATA_GUARDIAN: 'data-guardian',
  ARIA: 'aria',
  HREF: 'href',
  TEXT: 'text',
  NAV_FOOTER: 'nav/footer',
  HEURISTIC: 'heuristic'
});
|
|
30
|
+
|
|
31
|
+
/**
 * Detect contact candidates on page
 *
 * Snapshots every link/button-like element in the page, scores each snapshot
 * with evaluateElement(), and returns the resulting candidates ordered by
 * confidence with duplicates removed.
 *
 * Detection is best-effort: if anything throws, a warning is logged and the
 * candidates collected so far are returned instead of propagating the error.
 *
 * @param {Page} page - Playwright page object
 * @param {string} baseUrl - Base URL; passed through unchanged to evaluateElement()
 * @returns {Promise<Array>} Array of contact candidates, ranked by confidence
 */
async function detectContactCandidates(page, baseUrl = '') {
  // Lower rank sorts first.
  const confidenceRank = { high: 0, medium: 1, low: 2 };
  const candidates = [];

  try {
    // The callback runs inside the browser, so it can only hand back plain data.
    const pageData = await page.evaluate(() => {
      // Find all clickable/linkable elements
      const elements = document.querySelectorAll('a, button, [role="link"], [role="button"], [data-guardian], .nav a, footer a');

      return Array.from(elements, (el) => ({
        tagName: el.tagName.toLowerCase(),
        text: el.textContent?.trim() || '',
        // On SVG elements `href` and `className` are SVGAnimatedString objects,
        // not strings; fall back to the raw attribute in that case.
        href: (typeof el.href === 'string' && el.href) || el.getAttribute('href') || '',
        dataGuardian: el.getAttribute('data-guardian') || '',
        ariaLabel: el.getAttribute('aria-label') || '',
        title: el.getAttribute('title') || '',
        className: typeof el.className === 'string' ? el.className : el.getAttribute('class') || '',
        isInNav: !!el.closest('nav, [role="navigation"]'),
        isInFooter: !!el.closest('footer, [role="contentinfo"]')
      }));
    });

    // Process each element
    for (const element of pageData) {
      candidates.push(...evaluateElement(element, baseUrl));
    }

    // Sort by confidence (high > medium > low). Array.prototype.sort is stable,
    // so candidates of equal confidence keep their detection order.
    candidates.sort((a, b) => confidenceRank[a.confidence] - confidenceRank[b.confidence]);

    // Remove duplicates: candidates sharing BOTH matched text and href collapse
    // into the first (highest-confidence) occurrence.
    const seen = new Set();
    return candidates.filter((candidate) => {
      const key = `${candidate.matchedText}:${candidate.href}`;
      if (seen.has(key)) {
        return false;
      }
      seen.add(key);
      return true;
    });
  } catch (error) {
    console.warn(`Failed to detect contact candidates: ${error.message}`);
    return candidates;
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * Evaluate a single element for contact relevance
 *
 * Rules, applied in this order:
 *   A. data-guardian attribute containing "contact": high confidence; the
 *      element is returned immediately with this single candidate.
 *   B. href matching a contact token: high confidence.
 *   C. Visible text matching a contact token: medium confidence, raised to
 *      high when the element sits inside a nav or footer.
 *   D. aria-label or title matching a contact token: medium confidence.
 * Rules B-D are independent, so one element may yield several candidates.
 *
 * @param {Object} element - Plain snapshot of a DOM element (tagName, text,
 *   href, dataGuardian, ariaLabel, title, isInNav, isInFooter)
 * @param {string} baseUrl - Not used by the current rules; kept so the
 *   signature stays stable for callers
 * @returns {Array<Object>} Candidates with selector, matchedText,
 *   matchedToken, source, confidence, href and ariaLabel
 */
function evaluateElement(element, baseUrl = '') {
  const contactTokens = getTokensForTarget('contact');
  const candidates = [];

  // All candidates share the same shape; only the match details differ.
  const addCandidate = (matchedText, matchedToken, source, confidence) => {
    candidates.push({
      selector: buildSelector(element),
      matchedText,
      matchedToken,
      source,
      confidence,
      href: element.href,
      ariaLabel: element.ariaLabel
    });
  };
  const findToken = (value) => getMatchedToken(normalizeText(value), contactTokens);

  // Rule A: data-guardian attribute (highest priority)
  if (element.dataGuardian && normalizeText(element.dataGuardian).includes('contact')) {
    addCandidate(
      element.text || element.dataGuardian,
      'contact',
      DETECTION_SOURCE.DATA_GUARDIAN,
      CONFIDENCE.HIGH
    );
    return candidates;
  }

  // Rule B: href-based detection
  if (element.href) {
    const hrefToken = findToken(element.href);
    if (hrefToken) {
      addCandidate(element.text || element.href, hrefToken, DETECTION_SOURCE.HREF, CONFIDENCE.HIGH);
    }
  }

  // Rule C: visible text-based detection
  if (element.text) {
    const textToken = findToken(element.text);
    if (textToken) {
      // Higher confidence if in nav or footer
      const inNavOrFooter = element.isInNav || element.isInFooter;
      addCandidate(
        element.text,
        textToken,
        inNavOrFooter ? DETECTION_SOURCE.NAV_FOOTER : DETECTION_SOURCE.TEXT,
        inNavOrFooter ? CONFIDENCE.HIGH : CONFIDENCE.MEDIUM
      );
    }
  }

  // Rule D: aria-label or title attribute. aria-label is tried first, but a
  // non-matching aria-label must not hide a matching title.
  for (const label of [element.ariaLabel, element.title]) {
    if (!label) {
      continue;
    }
    const labelToken = findToken(label);
    if (labelToken) {
      addCandidate(label, labelToken, DETECTION_SOURCE.ARIA, CONFIDENCE.MEDIUM);
      break;
    }
  }

  return candidates;
}
|
|
188
|
+
|
|
189
|
+
/**
 * Escape a value so it can be embedded in a double-quoted CSS attribute
 * selector string. Backslashes and double quotes are backslash-escaped and
 * line breaks become the CSS escape `\a `, since a raw line break or an
 * unescaped quote would terminate the string and invalidate the selector.
 */
function escapeAttributeValue(value) {
  return String(value)
    .replace(/[\\"]/g, '\\$&')
    .replace(/\r\n|[\n\r\f]/g, '\\a ');
}

/**
 * Build a CSS selector for an element
 *
 * Preference order: the data-guardian marker, then (for anchors) the first
 * path segment of the href, then the aria-label, and finally the bare tag name.
 *
 * @param {Object} element - Plain element snapshot (tagName, href,
 *   dataGuardian, ariaLabel)
 * @returns {string} CSS selector
 */
function buildSelector(element) {
  // Prefer data-guardian if available
  if (element.dataGuardian) {
    return `[data-guardian="${escapeAttributeValue(element.dataGuardian)}"]`;
  }

  // For links, use the first path segment of the href
  if (element.tagName === 'a' && element.href) {
    const segment = normalizeHrefForSelector(element.href);
    // A substring selector with an empty value (`[href*=""]`) matches nothing,
    // so links without a path segment fall through to the next strategy.
    if (segment) {
      return `a[href*="${escapeAttributeValue(segment)}"]`;
    }
  }

  if (element.ariaLabel) {
    return `${element.tagName}[aria-label*="${escapeAttributeValue(element.ariaLabel)}"]`;
  }

  // Fallback
  return `${element.tagName}`;
}

/**
 * Normalize href for use in CSS selector
 *
 * @param {string} href - Absolute or relative URL
 * @returns {string} First non-empty path segment, or '' when there is none
 */
function normalizeHrefForSelector(href) {
  try {
    // The base is only there so that relative hrefs can be parsed.
    const { pathname } = new URL(href, 'http://localhost');
    return pathname.split('/').find(Boolean) || '';
  } catch {
    // If URL parsing fails, extract first path component
    return href.split('/')[1] || '';
  }
}
|
|
225
|
+
|
|
226
|
+
/**
 * Format detection result for human-readable output
 *
 * Produces one line of the form
 * `Contact detected, (lang=…, source=…, token=…, confidence=…)`.
 *
 * @param {Object} candidate - Candidate with source, matchedToken and confidence
 * @param {string} language - Language code; defaults to 'unknown'
 * @returns {string} Single-line summary
 */
function formatDetectionResult(candidate, language = 'unknown') {
  const { source, matchedToken, confidence } = candidate;

  return [
    `Contact detected`,
    `(lang=${language}`,
    `source=${source}`,
    `token=${matchedToken}`,
    `confidence=${confidence})`
  ].join(', ');
}
|
|
241
|
+
|
|
242
|
+
/**
 * Get hint message if contact not found
 *
 * @returns {string} Advice on how to make the contact link detectable
 */
function getNoContactFoundHint() {
  const hint = 'No contact found. Consider adding a stable marker like data-guardian="contact" or ensure contact link text/href is recognizable.';
  return hint;
}
|
|
248
|
+
|
|
249
|
+
module.exports = {
|
|
250
|
+
detectContactCandidates,
|
|
251
|
+
formatDetectionResult,
|
|
252
|
+
getNoContactFoundHint,
|
|
253
|
+
CONFIDENCE,
|
|
254
|
+
DETECTION_SOURCE
|
|
255
|
+
};
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Contact Finder Integration
|
|
3
|
+
*
|
|
4
|
+
* Integrates semantic contact detection into the scanning flow.
|
|
5
|
+
* Works with Playwright to find contact links/forms in real pages.
|
|
6
|
+
*
|
|
7
|
+
* Now includes Wave 1.2 detection layers with data-guardian attribute support.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const { detectLanguage, getPrimaryLanguage, getLanguageName } = require('./language-detection');
|
|
11
|
+
const { detectContactCandidates, formatDetectionResult, getNoContactFoundHint } = require('./semantic-contact-detection');
|
|
12
|
+
const { getTokensForTarget, normalizeText } = require('./semantic-targets');
|
|
13
|
+
const { detectByLayers, LAYER, CONFIDENCE } = require('./detection-layers');
|
|
14
|
+
|
|
15
|
+
/**
 * Find contact elements on a page using semantic detection
 *
 * Never throws: if language or candidate detection fails, a warning is
 * logged and the partially filled result is returned with an explanatory hint.
 *
 * @param {Page} page - Playwright page object
 * @param {string} baseUrl - Base URL for relative links
 * @returns {Promise<Object>} Detection result with language, languageName,
 *   target, candidates, primaryCandidate, found and hint
 */
async function findContactOnPage(page, baseUrl = '') {
  const result = {
    language: 'unknown',
    languageName: 'Unknown',
    // Same keys as the findElementByLayers() result, so both result kinds can
    // be passed to formatDetectionForReport(), which prints `target`.
    target: 'contact',
    candidates: [],
    primaryCandidate: null,
    found: false,
    hint: ''
  };

  try {
    // Detect language
    result.language = await detectLanguage(page);
    result.languageName = getLanguageName(result.language);

    // Find contact candidates
    const candidates = await detectContactCandidates(page, baseUrl);
    result.candidates = candidates;

    if (candidates.length > 0) {
      result.found = true;
      result.primaryCandidate = candidates[0]; // Highest confidence
    } else {
      result.hint = getNoContactFoundHint();
    }

    return result;
  } catch (error) {
    console.warn(`Contact detection failed: ${error.message}`);
    result.hint = `Contact detection failed: ${error.message}. Fallback to default selectors.`;
    return result;
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Generate Playwright selectors from semantic candidates
 * Returns a fallback selector chain compatible with attempt registry
 *
 * NOTE: this file declares generateSelectorsFromCandidates a second time
 * further down; being the later declaration, that one is the definition in
 * effect at runtime.
 *
 * @param {Array<Object>} candidates - Candidates ordered best-first, each
 *   optionally carrying a `selector` string
 * @returns {string|null} Up to three distinct selectors joined with ', ',
 *   or null when no usable selector exists
 */
function generateSelectorsFromCandidates(candidates) {
  if (!candidates || candidates.length === 0) {
    return null;
  }

  // Several candidates often point at the same element (e.g. an href match and
  // a text match on one link), so drop empty and repeated selectors BEFORE
  // taking the top 3; a Set keeps first-seen order, i.e. best candidate first.
  const distinctSelectors = new Set();
  for (const candidate of candidates) {
    if (candidate?.selector) {
      distinctSelectors.add(candidate.selector);
    }
  }

  // Build selector chain from the top 3 distinct selectors
  const chain = [...distinctSelectors].slice(0, 3).join(', ');

  return chain || null;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Find contact elements using Wave 1.2 detection layers
 * Respects priority: data-guardian > href > text > structure
 *
 * Never throws: if detection fails, a warning is logged and the result is
 * returned with `found: false`, a `reason` describing the error and a hint.
 *
 * @param {Page} page - Playwright page object
 * @param {string} target - Detection target (contact, form, submit, about)
 * @param {string} baseUrl - Base URL for relative links
 * @returns {Promise<Object>} Detection result with layer, confidence, reason
 */
async function findElementByLayers(page, target, baseUrl = '') {
  const result = {
    language: 'unknown',
    languageName: 'Unknown',
    target,
    found: false,
    layer: null,
    confidence: null,
    candidates: [],
    primaryCandidate: null,
    evidence: null,
    reason: '',
    hint: ''
  };

  try {
    // Detect language
    result.language = await detectLanguage(page);
    result.languageName = getLanguageName(result.language);

    // Use Wave 1.2 detection layers
    const layerResult = await detectByLayers(page, target, baseUrl);

    result.found = layerResult.found;
    result.layer = layerResult.layer;
    result.confidence = layerResult.confidence;
    // Keep `candidates` an array even if the layer result omits it, so
    // consumers can read `.length` without guarding.
    result.candidates = layerResult.candidates ?? [];
    result.primaryCandidate = layerResult.primaryCandidate;
    result.evidence = layerResult.evidence ?? null;
    result.reason = layerResult.reason;

    if (!result.found) {
      result.hint = `No ${target} detected. Consider adding data-guardian="${target}" attribute for guaranteed stability.`;
    }

    return result;
  } catch (error) {
    console.warn(`Detection by layers failed: ${error.message}`);
    result.reason = `Detection error: ${error.message}`;
    result.hint = 'Fallback to manual configuration or heuristic detection.';
    return result;
  }
}
|
|
127
|
+
|
|
128
|
+
/**
 * Generate Playwright selectors from semantic candidates
 * Returns a fallback selector chain compatible with attempt registry
 *
 * NOTE: this is the second declaration of generateSelectorsFromCandidates in
 * this file; being the later one, it is the definition in effect at runtime.
 *
 * @param {Array<Object>} candidates - Candidates ordered best-first, each
 *   optionally carrying a `selector` string
 * @returns {string|null} Up to three distinct selectors joined with ', ',
 *   or null when no usable selector exists
 */
function generateSelectorsFromCandidates(candidates) {
  if (!candidates || candidates.length === 0) {
    return null;
  }

  // Several candidates often point at the same element (e.g. an href match and
  // a text match on one link), so drop empty and repeated selectors BEFORE
  // taking the top 3; a Set keeps first-seen order, i.e. best candidate first.
  const distinctSelectors = new Set();
  for (const candidate of candidates) {
    if (candidate?.selector) {
      distinctSelectors.add(candidate.selector);
    }
  }

  // Build selector chain from the top 3 distinct selectors
  const chain = [...distinctSelectors].slice(0, 3).join(', ');

  return chain || null;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Format detection output for CLI reporting (Wave 1.2 enhanced)
 * Shows which layer was used and how to improve stability
 *
 * Accepts results with or without `target`, `layer` and `candidates`; a
 * missing target is printed as "target" and missing candidates count as none.
 *
 * @param {Object} detectionResult - Result object with language, languageName,
 *   found, and optionally target, layer, confidence, reason, candidates, hint
 * @returns {string} Multi-line report
 */
function formatDetectionForReport(detectionResult) {
  // Without the fallback the success heading would read "undefined Detection Results".
  const targetLabel = detectionResult.target || 'target';
  const candidates = Array.isArray(detectionResult.candidates) ? detectionResult.candidates : [];

  const lines = [
    `🌍 Language Detection: ${detectionResult.languageName}`,
    ` (lang=${detectionResult.language})`,
    ''
  ];

  if (detectionResult.found && candidates.length > 0) {
    // Show detection layer (Wave 1.2)
    if (detectionResult.layer) {
      lines.push(`📍 Detection Layer: ${detectionResult.layer} (confidence: ${detectionResult.confidence})`);
      lines.push(` ${detectionResult.reason}`);
      lines.push('');
    }

    lines.push(`✅ ${targetLabel} Detection Results (${candidates.length} candidate${candidates.length > 1 ? 's' : ''})`);

    candidates.forEach((candidate, idx) => {
      const formatted = formatDetectionResult(candidate, detectionResult.language);
      lines.push(` ${idx + 1}. ${formatted}`);
      if (candidate.matchedText) {
        lines.push(` Text: "${candidate.matchedText}"`);
      }
      if (candidate.href) {
        lines.push(` Link: ${candidate.href}`);
      }
    });
  } else {
    lines.push(`❌ No ${targetLabel} found`);
    if (detectionResult.reason) {
      lines.push(` Reason: ${detectionResult.reason}`);
    }
    if (detectionResult.hint) {
      lines.push(` 💡 ${detectionResult.hint}`);
    }
  }

  return lines.join('\n');
}
|
|
194
|
+
|
|
195
|
+
module.exports = {
|
|
196
|
+
findContactOnPage,
|
|
197
|
+
findElementByLayers,
|
|
198
|
+
generateSelectorsFromCandidates,
|
|
199
|
+
formatDetectionForReport
|
|
200
|
+
};
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Targets & Multilingual Dictionary
|
|
3
|
+
*
|
|
4
|
+
* Provides deterministic, language-independent detection of semantic targets
|
|
5
|
+
* (contact, about, etc.) using normalized tokens from multiple languages.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Multilingual dictionary for semantic targets
|
|
10
|
+
* Keys: target names, Values: arrays of normalized token variants
|
|
11
|
+
*/
|
|
12
|
+
const SEMANTIC_DICTIONARY = {
|
|
13
|
+
contact: [
|
|
14
|
+
// English
|
|
15
|
+
'contact', 'contactus', 'contact-us', 'contact us', 'get in touch', 'getintouch',
|
|
16
|
+
'reach out', 'reachout', 'contact form', 'contactform', 'contact page', 'contactpage',
|
|
17
|
+
'inquiry', 'inquiries', 'message us', 'messageus', 'write to us', 'writetus',
|
|
18
|
+
// German
|
|
19
|
+
'kontakt', 'kontaktieren', 'kontaktaufnahme', 'kontaktformular', 'kontakten',
|
|
20
|
+
'kontakts', 'kontakt formular', 'kontakt-formular', 'anfrage', 'anfragen',
|
|
21
|
+
// Spanish
|
|
22
|
+
'contacto', 'contactanos', 'contacta', 'formulario de contacto',
|
|
23
|
+
'pongase en contacto', 'ponte en contacto', 'escribenos', 'escriba',
|
|
24
|
+
// French
|
|
25
|
+
'contact', 'contactez', 'contactez-nous', 'formulaire de contact',
|
|
26
|
+
'nous contacter', 'nous ecrire',
|
|
27
|
+
// Portuguese
|
|
28
|
+
'contato', 'contacto', 'formulario de contato', 'entre em contato',
|
|
29
|
+
'fale conosco', 'escreva para nos',
|
|
30
|
+
// Italian
|
|
31
|
+
'contatti', 'contatto', 'contattaci', 'modulo di contatto',
|
|
32
|
+
'modulo contatti', 'mettersi in contatto',
|
|
33
|
+
// Dutch
|
|
34
|
+
'contact', 'contacteer', 'contact opnemen', 'contactformulier',
|
|
35
|
+
// Swedish
|
|
36
|
+
'kontakt', 'kontakta', 'kontaktformular',
|
|
37
|
+
// Arabic
|
|
38
|
+
'تواصل', 'اتصل', 'استفسار', 'استفسارات', 'نموذج الاتصال', 'نموذج تواصل',
|
|
39
|
+
// Chinese
|
|
40
|
+
'联系', '联系我们', '联系表单', '留言', '反馈'
|
|
41
|
+
],
|
|
42
|
+
about: [
|
|
43
|
+
// English
|
|
44
|
+
'about', 'about us', 'aboutus', 'our story', 'about-us',
|
|
45
|
+
'company', 'team', 'who we are', 'whoweare', 'more about us',
|
|
46
|
+
// German
|
|
47
|
+
'uber', 'über', 'ueber', 'uber uns', 'über uns', 'ueber uns',
|
|
48
|
+
'uber unsere', 'über unsere', 'ueber unsere', 'team', 'unternehmen',
|
|
49
|
+
// Spanish
|
|
50
|
+
'acerca', 'acerca de', 'acerca de nosotros', 'sobre nosotros',
|
|
51
|
+
'quienes somos', 'quiénes somos', 'nuestra empresa',
|
|
52
|
+
// French
|
|
53
|
+
'a propos', 'à propos', 'a propos de nous', 'à propos de nous',
|
|
54
|
+
'qui sommes nous', 'qui nous sommes', 'notre histoire',
|
|
55
|
+
// Portuguese
|
|
56
|
+
'sobre', 'sobre nos', 'sobre nós', 'quem somos',
|
|
57
|
+
// Italian
|
|
58
|
+
'chi siamo', 'chi siamo noi', 'la nostra storia',
|
|
59
|
+
// Dutch
|
|
60
|
+
'over', 'over ons', 'wie zijn we',
|
|
61
|
+
// Swedish
|
|
62
|
+
'om', 'om oss', 'var historia',
|
|
63
|
+
// Arabic
|
|
64
|
+
'عن', 'عننا', 'عن الشركة', 'فريقنا', 'قصتنا'
|
|
65
|
+
],
|
|
66
|
+
form: [
|
|
67
|
+
// English
|
|
68
|
+
'form', 'form submission', 'contact form', 'feedback form', 'inquiry form',
|
|
69
|
+
'form page', 'form element', 'form section', 'form area',
|
|
70
|
+
// German
|
|
71
|
+
'formular', 'form', 'kontaktformular', 'feedback formular',
|
|
72
|
+
// Spanish
|
|
73
|
+
'formulario', 'formulario de contacto', 'formulario de envio',
|
|
74
|
+
// French
|
|
75
|
+
'formulaire', 'formulaire de contact', 'formulaire d envoi',
|
|
76
|
+
// Portuguese
|
|
77
|
+
'formulario', 'formulario de contato', 'formulario de envio',
|
|
78
|
+
// Italian
|
|
79
|
+
'modulo', 'modulo di contatto', 'modulo di invio',
|
|
80
|
+
// Dutch
|
|
81
|
+
'formulier', 'contactformulier', 'formulier voor contact',
|
|
82
|
+
// Swedish
|
|
83
|
+
'formular', 'kontaktformular', 'feedback formular'
|
|
84
|
+
],
|
|
85
|
+
submit: [
|
|
86
|
+
// English
|
|
87
|
+
'submit', 'send', 'send message', 'send form', 'submit form',
|
|
88
|
+
'submit button', 'send button', 'post', 'publish', 'share',
|
|
89
|
+
// German
|
|
90
|
+
'senden', 'absenden', 'abschicken', 'submit', 'ubermitteln',
|
|
91
|
+
// Spanish
|
|
92
|
+
'enviar', 'enviar formulario', 'enviar mensaje', 'publicar',
|
|
93
|
+
// French
|
|
94
|
+
'envoyer', 'soumettre', 'publier', 'partager',
|
|
95
|
+
// Portuguese
|
|
96
|
+
'enviar', 'enviar formulario', 'enviar mensagem', 'publicar',
|
|
97
|
+
// Italian
|
|
98
|
+
'inviare', 'inviare modulo', 'inviare messaggio', 'pubblicare',
|
|
99
|
+
// Dutch
|
|
100
|
+
'verzenden', 'verstuur', 'verstuur formulier', 'publiceren',
|
|
101
|
+
// Swedish
|
|
102
|
+
'skicka', 'skicka formular', 'publicera'
|
|
103
|
+
]
|
|
104
|
+
};
|
|
105
|
+
|
|
106
|
+
/**
 * Normalize text for comparison
 * - Lowercase
 * - Remove diacritics (é → e, ü → u, etc.)
 * - Replace punctuation and symbols with spaces
 * - Collapse multiple spaces and trim
 *
 * Letters and digits of every script are kept. (An ASCII-only `\w` class
 * would erase Arabic, Chinese, Cyrillic, … text entirely, which made the
 * non-Latin dictionary tokens unmatchable.)
 *
 * @param {*} text - Value to normalize
 * @returns {string} Normalized text; '' for any non-string input
 */
function normalizeText(text) {
  if (typeof text !== 'string') {
    return '';
  }

  return text
    .toLowerCase()
    // NFD: decompose accented characters, then drop the combining diacritics
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    // Anything that is not a letter, mark, digit, underscore or whitespace
    // becomes a space (marks are kept so words are not split in two)
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
    // Collapse multiple spaces
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Decide whether one already-normalized token occurs in normalized text.
 *
 * Short tokens (<= 4 chars) must stand alone as a word so that e.g. "om"
 * does not match inside "komma"; longer tokens match as substrings. Tokens
 * written in scripts that do not separate words with spaces (Chinese,
 * Japanese, Thai) always match as substrings, because a word-boundary test
 * is meaningless there.
 */
function tokenMatches(normalizedText, normalizedToken) {
  if (!normalizedToken) {
    return false;
  }

  const isUnsegmentedScript =
    /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Thai}]/u.test(normalizedToken);

  if (normalizedToken.length > 4 || isUnsegmentedScript) {
    return normalizedText.includes(normalizedToken);
  }

  // Unicode-aware replacement for \b, which only recognizes ASCII word
  // characters. The token needs no regex escaping: normalizeText() leaves
  // only letters, marks, digits, underscores and single spaces.
  const wordChar = '[\\p{L}\\p{M}\\p{N}_]';
  const standaloneToken = new RegExp(`(?<!${wordChar})${normalizedToken}(?!${wordChar})`, 'u');
  return standaloneToken.test(normalizedText);
}

/**
 * Check if normalized text includes any token from the list
 * Matches whole words/tokens at word boundaries where appropriate
 *
 * @param {string} normalizedText - Text already passed through normalizeText()
 * @param {string[]} tokenList - Raw tokens; each is normalized before matching
 * @returns {boolean} true when at least one token matches
 */
function includesAnyToken(normalizedText, tokenList) {
  return getMatchedToken(normalizedText, tokenList) !== null;
}

/**
 * Get the best matching token from a list for a given text
 * Returns the token that was matched, or null
 *
 * "Best" means first: tokens are tried in list order and the first one that
 * matches is returned exactly as it appears in the list (not normalized).
 *
 * @param {string} normalizedText - Text already passed through normalizeText()
 * @param {string[]} tokenList - Raw tokens; each is normalized before matching
 * @returns {string|null} The matching token, or null
 */
function getMatchedToken(normalizedText, tokenList) {
  if (!normalizedText || !Array.isArray(tokenList)) {
    return null;
  }

  for (const token of tokenList) {
    if (tokenMatches(normalizedText, normalizeText(token))) {
      return token;
    }
  }

  return null;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Get all target names available in dictionary
 *
 * @returns {string[]} Keys of SEMANTIC_DICTIONARY, in declaration order
 */
function getAvailableTargets() {
  const targetNames = Object.keys(SEMANTIC_DICTIONARY);
  return targetNames;
}
|
|
211
|
+
|
|
212
|
+
/**
 * Check if a semantic target exists in dictionary
 *
 * Only the dictionary's own keys count; names inherited from
 * Object.prototype (e.g. "constructor", "toString") are not valid targets.
 *
 * @param {string} targetName - Target name to look up
 * @returns {boolean} true when the dictionary defines this target
 */
function isValidTarget(targetName) {
  return Object.prototype.hasOwnProperty.call(SEMANTIC_DICTIONARY, targetName);
}
|
|
218
|
+
|
|
219
|
+
/**
 * Get token list for a specific target
 *
 * Only the dictionary's own keys are consulted, so a name inherited from
 * Object.prototype (e.g. "constructor") yields an empty list rather than a
 * prototype member.
 *
 * @param {string} targetName - Target name to look up
 * @returns {string[]} The target's tokens, or [] for an unknown target
 */
function getTokensForTarget(targetName) {
  const isKnownTarget = Object.prototype.hasOwnProperty.call(SEMANTIC_DICTIONARY, targetName);
  return (isKnownTarget && SEMANTIC_DICTIONARY[targetName]) || [];
}
|
|
225
|
+
|
|
226
|
+
module.exports = {
|
|
227
|
+
SEMANTIC_DICTIONARY,
|
|
228
|
+
normalizeText,
|
|
229
|
+
includesAnyToken,
|
|
230
|
+
getMatchedToken,
|
|
231
|
+
getAvailableTargets,
|
|
232
|
+
isValidTarget,
|
|
233
|
+
getTokensForTarget
|
|
234
|
+
};
|