@odavl/guardian 0.1.0-rc1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +146 -0
- package/README.md +155 -97
- package/bin/guardian.js +1544 -55
- package/config/README.md +59 -0
- package/config/profiles/landing-demo.yaml +16 -0
- package/package.json +26 -11
- package/policies/landing-demo.json +22 -0
- package/src/enterprise/audit-logger.js +166 -0
- package/src/enterprise/pdf-exporter.js +267 -0
- package/src/enterprise/rbac-gate.js +142 -0
- package/src/enterprise/rbac.js +239 -0
- package/src/enterprise/site-manager.js +180 -0
- package/src/founder/feedback-system.js +156 -0
- package/src/founder/founder-tracker.js +213 -0
- package/src/founder/usage-signals.js +141 -0
- package/src/guardian/alert-ledger.js +121 -0
- package/src/guardian/attempt-engine.js +587 -12
- package/src/guardian/attempt-registry.js +42 -1
- package/src/guardian/attempt-relevance.js +106 -0
- package/src/guardian/attempt.js +85 -39
- package/src/guardian/attempts-filter.js +63 -0
- package/src/guardian/baseline.js +50 -8
- package/src/guardian/breakage-intelligence.js +1 -0
- package/src/guardian/browser-pool.js +131 -0
- package/src/guardian/browser.js +28 -1
- package/src/guardian/ci-cli.js +121 -0
- package/src/guardian/ci-mode.js +15 -0
- package/src/guardian/ci-output.js +38 -0
- package/src/guardian/cli-summary.js +167 -67
- package/src/guardian/config-loader.js +162 -0
- package/src/guardian/data-guardian-detector.js +189 -0
- package/src/guardian/detection-layers.js +271 -0
- package/src/guardian/drift-detector.js +100 -0
- package/src/guardian/enhanced-html-reporter.js +221 -4
- package/src/guardian/env-guard.js +127 -0
- package/src/guardian/failure-intelligence.js +173 -0
- package/src/guardian/first-run-profile.js +89 -0
- package/src/guardian/first-run.js +54 -0
- package/src/guardian/flag-validator.js +111 -0
- package/src/guardian/flow-executor.js +309 -44
- package/src/guardian/html-reporter.js +2 -0
- package/src/guardian/human-reporter.js +431 -0
- package/src/guardian/index.js +22 -19
- package/src/guardian/init-command.js +9 -5
- package/src/guardian/intent-detector.js +146 -0
- package/src/guardian/journey-definitions.js +132 -0
- package/src/guardian/journey-scan-cli.js +145 -0
- package/src/guardian/journey-scanner.js +583 -0
- package/src/guardian/junit-reporter.js +18 -1
- package/src/guardian/language-detection.js +99 -0
- package/src/guardian/live-cli.js +95 -0
- package/src/guardian/live-scheduler-runner.js +137 -0
- package/src/guardian/live-scheduler.js +146 -0
- package/src/guardian/market-reporter.js +357 -82
- package/src/guardian/parallel-executor.js +116 -0
- package/src/guardian/pattern-analyzer.js +348 -0
- package/src/guardian/policy.js +80 -3
- package/src/guardian/prerequisite-checker.js +101 -0
- package/src/guardian/preset-loader.js +27 -18
- package/src/guardian/profile-loader.js +96 -0
- package/src/guardian/reality.js +1612 -115
- package/src/guardian/reporter.js +27 -41
- package/src/guardian/run-artifacts.js +212 -0
- package/src/guardian/run-cleanup.js +207 -0
- package/src/guardian/run-latest.js +90 -0
- package/src/guardian/run-list.js +211 -0
- package/src/guardian/run-summary.js +20 -0
- package/src/guardian/scan-presets.js +100 -11
- package/src/guardian/selector-fallbacks.js +394 -0
- package/src/guardian/semantic-contact-detection.js +255 -0
- package/src/guardian/semantic-contact-finder.js +201 -0
- package/src/guardian/semantic-targets.js +234 -0
- package/src/guardian/site-introspection.js +257 -0
- package/src/guardian/smoke.js +258 -0
- package/src/guardian/snapshot-schema.js +25 -1
- package/src/guardian/snapshot.js +69 -3
- package/src/guardian/stability-scorer.js +169 -0
- package/src/guardian/success-evaluator.js +214 -0
- package/src/guardian/template-command.js +184 -0
- package/src/guardian/text-formatters.js +426 -0
- package/src/guardian/timeout-profiles.js +57 -0
- package/src/guardian/verdict.js +320 -0
- package/src/guardian/verdicts.js +74 -0
- package/src/guardian/wait-for-outcome.js +120 -0
- package/src/guardian/watch-runner.js +181 -0
- package/src/payments/stripe-checkout.js +169 -0
- package/src/plans/plan-definitions.js +148 -0
- package/src/plans/plan-manager.js +211 -0
- package/src/plans/usage-tracker.js +210 -0
- package/src/recipes/recipe-engine.js +188 -0
- package/src/recipes/recipe-failure-analysis.js +159 -0
- package/src/recipes/recipe-registry.js +134 -0
- package/src/recipes/recipe-runtime.js +507 -0
- package/src/recipes/recipe-store.js +410 -0
- package/guardian-contract-v1.md +0 -149
- /package/{guardian.config.json → config/guardian.config.json} +0 -0
- /package/{guardian.policy.json → config/guardian.policy.json} +0 -0
- /package/{guardian.profile.docs.yaml → config/profiles/docs.yaml} +0 -0
- /package/{guardian.profile.ecommerce.yaml → config/profiles/ecommerce.yaml} +0 -0
- /package/{guardian.profile.marketing.yaml → config/profiles/marketing.yaml} +0 -0
- /package/{guardian.profile.saas.yaml → config/profiles/saas.yaml} +0 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Contact Finder Integration
|
|
3
|
+
*
|
|
4
|
+
* Integrates semantic contact detection into the scanning flow.
|
|
5
|
+
* Works with Playwright to find contact links/forms in real pages.
|
|
6
|
+
*
|
|
7
|
+
* Now includes Wave 1.2 detection layers with data-guardian attribute support.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const { detectLanguage, getPrimaryLanguage, getLanguageName } = require('./language-detection');
|
|
11
|
+
const { detectContactCandidates, formatDetectionResult, getNoContactFoundHint } = require('./semantic-contact-detection');
|
|
12
|
+
const { getTokensForTarget, normalizeText } = require('./semantic-targets');
|
|
13
|
+
const { detectByLayers, LAYER, CONFIDENCE } = require('./detection-layers');
|
|
14
|
+
|
|
15
|
+
/**
 * Find contact elements on a page using semantic detection.
 *
 * Detects the page language, then asks detectContactCandidates() for contact
 * links/forms. This function never rejects: any error raised along the way is
 * logged with console.warn and reported through `hint`, leaving `found` false.
 *
 * @param {Page} page - Playwright page object
 * @param {string} baseUrl - Base URL for relative links
 * @returns {Promise<Object>} Result with `language`, `languageName`, `target`
 *   (always 'contact'), `candidates` (always an array), `found`,
 *   `primaryCandidate` (null when nothing was found) and `hint`
 */
async function findContactOnPage(page, baseUrl = '') {
  const result = {
    language: 'unknown',
    languageName: 'Unknown',
    // Same field findElementByLayers() reports; formatDetectionForReport()
    // reads it to label its output.
    target: 'contact',
    candidates: [],
    found: false,
    primaryCandidate: null,
    hint: ''
  };

  try {
    // Detect language
    result.language = await detectLanguage(page);
    result.languageName = getLanguageName(result.language);

    // Find contact candidates. A non-array answer is treated as "nothing
    // found" so that `candidates` keeps its documented array shape.
    const detected = await detectContactCandidates(page, baseUrl);
    result.candidates = Array.isArray(detected) ? detected : [];

    if (result.candidates.length > 0) {
      result.found = true;
      // Assumes the detector returns candidates ordered best-first — TODO confirm.
      result.primaryCandidate = result.candidates[0];
    } else {
      result.hint = getNoContactFoundHint();
    }
  } catch (error) {
    console.warn(`Contact detection failed: ${error.message}`);
    result.hint = `Contact detection failed: ${error.message}. Fallback to default selectors.`;
  }

  return result;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Generate Playwright selectors from semantic candidates.
 *
 * Looks at the first three candidates only, drops those without a selector and
 * joins the rest with ', '. Callers are expected to pass candidates ordered
 * best-first (assumption — the ordering is not enforced here).
 *
 * This is the single definition of the function; the module previously
 * declared it twice with identical bodies.
 *
 * @param {Array<Object>} candidates - Candidate objects carrying a `selector`
 * @returns {string|null} Comma-separated selector chain, or null when there
 *   are no candidates or none of the first three has a selector
 */
function generateSelectorsFromCandidates(candidates) {
  if (!candidates || candidates.length === 0) {
    return null;
  }

  const selectors = candidates
    .slice(0, 3)
    .map((candidate) => candidate.selector)
    .filter(Boolean);

  return selectors.length > 0 ? selectors.join(', ') : null;
}

/**
 * Find contact elements using Wave 1.2 detection layers
 * Respects priority: data-guardian > href > text > structure
 *
 * Never rejects: a failure in language detection or in detectByLayers() is
 * logged with console.warn and reported through `reason` and `hint`.
 *
 * @param {Page} page - Playwright page object
 * @param {string} target - Detection target (contact, form, submit, about)
 * @param {string} baseUrl - Base URL for relative links
 * @returns {Promise<Object>} Detection result with layer, confidence, reason
 */
async function findElementByLayers(page, target, baseUrl = '') {
  const result = {
    language: 'unknown',
    languageName: 'Unknown',
    target,
    found: false,
    layer: null,
    confidence: null,
    candidates: [],
    primaryCandidate: null,
    // Declared up front so the field exists on the error path too.
    evidence: null,
    reason: '',
    hint: ''
  };

  try {
    // Detect language
    result.language = await detectLanguage(page);
    result.languageName = getLanguageName(result.language);

    // Use Wave 1.2 detection layers
    const layerResult = await detectByLayers(page, target, baseUrl);

    result.found = layerResult.found;
    result.layer = layerResult.layer;
    result.confidence = layerResult.confidence;
    // Keep `candidates` an array even if the layer result omits it, so
    // consumers can read `.length` without checking first.
    result.candidates = Array.isArray(layerResult.candidates) ? layerResult.candidates : [];
    result.primaryCandidate = layerResult.primaryCandidate;
    result.evidence = layerResult.evidence;
    result.reason = layerResult.reason;

    if (!result.found) {
      result.hint = `No ${target} detected. Consider adding data-guardian="${target}" attribute for guaranteed stability.`;
    }
  } catch (error) {
    console.warn(`Detection by layers failed: ${error.message}`);
    result.reason = `Detection error: ${error.message}`;
    result.hint = 'Fallback to manual configuration or heuristic detection.';
  }

  return result;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Format detection output for CLI reporting (Wave 1.2 enhanced)
 * Shows which layer was used and how to improve stability
 *
 * Accepts results from both findContactOnPage() and findElementByLayers().
 * A result without `target` is labelled 'contact' in both the success and the
 * failure output, and a missing `candidates` list is treated as empty.
 *
 * @param {Object} detectionResult - Result object from one of the finders
 * @returns {string} Multi-line report, lines joined with '\n'
 */
function formatDetectionForReport(detectionResult) {
  const { language, languageName, layer, confidence, reason, hint } = detectionResult;
  const target = detectionResult.target || 'contact';
  const candidates = Array.isArray(detectionResult.candidates) ? detectionResult.candidates : [];

  const lines = [
    `🌍 Language Detection: ${languageName}`,
    ` (lang=${language})`,
    ''
  ];

  if (detectionResult.found && candidates.length > 0) {
    // Show detection layer (Wave 1.2)
    if (layer) {
      lines.push(`📍 Detection Layer: ${layer} (confidence: ${confidence})`);
      lines.push(` ${reason}`);
      lines.push('');
    }

    const plural = candidates.length > 1 ? 's' : '';
    lines.push(`✅ ${target} Detection Results (${candidates.length} candidate${plural})`);

    candidates.forEach((candidate, idx) => {
      lines.push(` ${idx + 1}. ${formatDetectionResult(candidate, language)}`);
      if (candidate.matchedText) {
        lines.push(` Text: "${candidate.matchedText}"`);
      }
      if (candidate.href) {
        lines.push(` Link: ${candidate.href}`);
      }
    });
  } else {
    // Clarify selector-based scope to avoid overstating discovery
    lines.push(`❌ No ${target} page/link discovered via selectors`);
    if (reason) {
      lines.push(` Reason: ${reason}`);
    }
    if (hint) {
      lines.push(` 💡 ${hint}`);
    }
  }

  return lines.join('\n');
}
|
|
195
|
+
|
|
196
|
+
module.exports = {
|
|
197
|
+
findContactOnPage,
|
|
198
|
+
findElementByLayers,
|
|
199
|
+
generateSelectorsFromCandidates,
|
|
200
|
+
formatDetectionForReport
|
|
201
|
+
};
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Targets & Multilingual Dictionary
|
|
3
|
+
*
|
|
4
|
+
* Provides deterministic, language-independent detection of semantic targets
|
|
5
|
+
* (contact, about, etc.) using normalized tokens from multiple languages.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Multilingual dictionary for semantic targets
 * Keys: target names, Values: arrays of token variants
 *
 * Tokens are normalized with normalizeText() at match time, so accented and
 * unaccented spellings of the same word are equivalent. Some tokens appear in
 * more than one language group on purpose: each group lists that language's
 * vocabulary, and repeats do not affect matching.
 */
const SEMANTIC_DICTIONARY = {
  contact: [
    // English
    'contact', 'contactus', 'contact-us', 'contact us', 'get in touch', 'getintouch',
    'reach out', 'reachout', 'contact form', 'contactform', 'contact page', 'contactpage',
    'inquiry', 'inquiries', 'message us', 'messageus', 'write to us', 'writetous',
    // German
    'kontakt', 'kontaktieren', 'kontaktaufnahme', 'kontaktformular', 'kontakten',
    'kontakts', 'kontakt formular', 'kontakt-formular', 'anfrage', 'anfragen',
    // Spanish
    'contacto', 'contactanos', 'contacta', 'formulario de contacto',
    'pongase en contacto', 'ponte en contacto', 'escribenos', 'escriba',
    // French
    'contact', 'contactez', 'contactez-nous', 'formulaire de contact',
    'nous contacter', 'nous ecrire',
    // Portuguese
    'contato', 'contacto', 'formulario de contato', 'entre em contato',
    'fale conosco', 'escreva para nos',
    // Italian
    'contatti', 'contatto', 'contattaci', 'modulo di contatto',
    'modulo contatti', 'mettersi in contatto',
    // Dutch
    'contact', 'contacteer', 'contact opnemen', 'contactformulier',
    // Swedish
    'kontakt', 'kontakta', 'kontaktformular',
    // Arabic
    'تواصل', 'اتصل', 'استفسار', 'استفسارات', 'نموذج الاتصال', 'نموذج تواصل',
    // Chinese
    '联系', '联系我们', '联系表单', '留言', '反馈'
  ],
  about: [
    // English
    'about', 'about us', 'aboutus', 'our story', 'about-us',
    'company', 'team', 'who we are', 'whoweare', 'more about us',
    // German
    'uber', 'über', 'ueber', 'uber uns', 'über uns', 'ueber uns',
    'uber unsere', 'über unsere', 'ueber unsere', 'team', 'unternehmen',
    // Spanish
    'acerca', 'acerca de', 'acerca de nosotros', 'sobre nosotros',
    'quienes somos', 'quiénes somos', 'nuestra empresa',
    // French
    'a propos', 'à propos', 'a propos de nous', 'à propos de nous',
    'qui sommes nous', 'qui nous sommes', 'notre histoire',
    // Portuguese
    'sobre', 'sobre nos', 'sobre nós', 'quem somos',
    // Italian
    'chi siamo', 'chi siamo noi', 'la nostra storia',
    // Dutch
    'over', 'over ons', 'wie zijn we',
    // Swedish
    'om', 'om oss', 'var historia',
    // Arabic
    'عن', 'عننا', 'عن الشركة', 'فريقنا', 'قصتنا'
  ],
  form: [
    // English
    'form', 'form submission', 'contact form', 'feedback form', 'inquiry form',
    'form page', 'form element', 'form section', 'form area',
    // German
    'formular', 'form', 'kontaktformular', 'feedback formular',
    // Spanish
    'formulario', 'formulario de contacto', 'formulario de envio',
    // French
    'formulaire', 'formulaire de contact', 'formulaire d envoi',
    // Portuguese
    'formulario', 'formulario de contato', 'formulario de envio',
    // Italian
    'modulo', 'modulo di contatto', 'modulo di invio',
    // Dutch
    'formulier', 'contactformulier', 'formulier voor contact',
    // Swedish
    'formular', 'kontaktformular', 'feedback formular'
  ],
  submit: [
    // English
    'submit', 'send', 'send message', 'send form', 'submit form',
    'submit button', 'send button', 'post', 'publish', 'share',
    // German
    'senden', 'absenden', 'abschicken', 'submit', 'ubermitteln',
    // Spanish
    'enviar', 'enviar formulario', 'enviar mensaje', 'publicar',
    // French
    'envoyer', 'soumettre', 'publier', 'partager',
    // Portuguese
    'enviar', 'enviar formulario', 'enviar mensagem', 'publicar',
    // Italian
    'inviare', 'inviare modulo', 'inviare messaggio', 'pubblicare',
    // Dutch
    'verzenden', 'verstuur', 'verstuur formulier', 'publiceren',
    // Swedish
    'skicka', 'skicka formular', 'publicera'
  ]
};
|
|
105
|
+
|
|
106
|
+
/**
 * Normalize text for comparison
 * - Lowercase
 * - Trim whitespace
 * - Remove diacritics (é → e, ü → u, etc.)
 * - Remove punctuation
 * - Collapse multiple spaces
 *
 * Letters and digits of every script are kept, so Arabic or Chinese text
 * survives normalization instead of being blanked out.
 *
 * @param {*} text - Value to normalize; anything that is not a string yields ''
 * @returns {string} Normalized text
 */
function normalizeText(text) {
  if (typeof text !== 'string') {
    return '';
  }

  return text
    .toLowerCase()
    // NFD splits an accented letter into base letter + combining mark; the
    // marks in U+0300–U+036F are then dropped.
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    // Replace punctuation and symbols with a space. \p{L}\p{M}\p{N} is used
    // rather than \w because \w only covers ASCII letters and digits.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
    // Collapse multiple spaces
    .replace(/\s+/g, ' ')
    .trim();
}

// Scripts written without spaces between words. Whole-word matching cannot
// work for them, so their tokens are matched as substrings whatever their length.
const UNSPACED_SCRIPT_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u;

// Characters that count as "inside a word" when checking token boundaries.
// For ASCII-only input this gives the same boundaries as \b.
const WORD_CHAR_CLASS = '[\\p{L}\\p{M}\\p{N}_]';

/**
 * Decide whether one normalized token occurs in the normalized text.
 *
 * Very short tokens (<= 4 chars) must stand alone as a word, which keeps
 * 'form' from matching 'platform'. Longer tokens, and tokens in scripts
 * without word separators, match as substrings.
 *
 * @param {string} normalizedText - Text already passed through normalizeText()
 * @param {string} normalizedToken - Non-empty token from normalizeText()
 * @returns {boolean} True when the token matches
 */
function matchesToken(normalizedText, normalizedToken) {
  if (normalizedToken.length > 4 || UNSPACED_SCRIPT_PATTERN.test(normalizedToken)) {
    return normalizedText.includes(normalizedToken);
  }

  // The token needs no regex escaping: normalizeText() leaves only letters,
  // marks, digits, underscores and single spaces in it.
  const wholeWord = new RegExp(`(?<!${WORD_CHAR_CLASS})${normalizedToken}(?!${WORD_CHAR_CLASS})`, 'u');
  return wholeWord.test(normalizedText);
}

/**
 * Check if normalized text includes any token from the list
 * Matches whole words/tokens at word boundaries where appropriate
 *
 * @param {string} normalizedText - Text already passed through normalizeText()
 * @param {Array<string>} tokenList - Raw tokens; each is normalized before matching
 * @returns {boolean} True when at least one token matches
 */
function includesAnyToken(normalizedText, tokenList) {
  return getMatchedToken(normalizedText, tokenList) !== null;
}

/**
 * Get the best matching token from a list for a given text
 * Returns the token that was matched, or null
 *
 * The first matching token in list order wins, and it is returned exactly as
 * written in the list (not in its normalized form).
 *
 * @param {string} normalizedText - Text already passed through normalizeText()
 * @param {Array<string>} tokenList - Raw tokens; each is normalized before matching
 * @returns {string|null} The matched token, or null
 */
function getMatchedToken(normalizedText, tokenList) {
  if (!normalizedText || !Array.isArray(tokenList)) {
    return null;
  }

  for (const token of tokenList) {
    const normalizedToken = normalizeText(token);

    // Tokens that normalize to nothing (non-strings, pure punctuation) are skipped.
    if (normalizedToken && matchesToken(normalizedText, normalizedToken)) {
      return token;
    }
  }

  return null;
}
|
|
204
|
+
|
|
205
|
+
/**
 * List the names of every semantic target defined in the dictionary.
 *
 * @returns {Array<string>} Target names, in dictionary key order
 */
function getAvailableTargets() {
  return Object.entries(SEMANTIC_DICTIONARY).map(([targetName]) => targetName);
}
|
|
211
|
+
|
|
212
|
+
/**
 * Check if a semantic target exists in dictionary
 *
 * Only the dictionary's own keys count. Names inherited from Object.prototype
 * (such as 'constructor' or 'toString') are not targets.
 *
 * @param {string} targetName - Candidate target name
 * @returns {boolean} True when the dictionary defines this target
 */
function isValidTarget(targetName) {
  return Object.prototype.hasOwnProperty.call(SEMANTIC_DICTIONARY, targetName);
}
|
|
218
|
+
|
|
219
|
+
/**
 * Get token list for a specific target
 *
 * Unknown names yield an empty array. This includes names that exist only on
 * Object.prototype (such as 'constructor'), which would otherwise return a
 * function instead of a token list.
 *
 * @param {string} targetName - Target name to look up
 * @returns {Array<string>} The dictionary's token array for the target, or []
 */
function getTokensForTarget(targetName) {
  if (!Object.prototype.hasOwnProperty.call(SEMANTIC_DICTIONARY, targetName)) {
    return [];
  }

  return SEMANTIC_DICTIONARY[targetName] || [];
}
|
|
225
|
+
|
|
226
|
+
module.exports = {
|
|
227
|
+
SEMANTIC_DICTIONARY,
|
|
228
|
+
normalizeText,
|
|
229
|
+
includesAnyToken,
|
|
230
|
+
getMatchedToken,
|
|
231
|
+
getAvailableTargets,
|
|
232
|
+
isValidTarget,
|
|
233
|
+
getTokensForTarget
|
|
234
|
+
};
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Site Introspection - DOM-based capability detection
|
|
3
|
+
* Deterministically identifies site features by inspecting the loaded page.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Browser-side check for login capability — STRONG SIGNALS ONLY.
 * Runs inside the page via page.evaluate(), so it may only use browser
 * globals and must not reference anything from this module.
 *
 * @returns {boolean} True when the page has a password input or links to a
 *   login route (/login, /signin, /sign-in, with or without a sub-path)
 */
function detectLoginSignals() {
  // Strong signal 1: password input on page (inside a form or not)
  if (document.querySelectorAll('input[type="password"]').length > 0) {
    return true;
  }

  // Strong signal 2: link to common auth routes
  const loginRoute = /\/(login|signin|sign-in)(\/|$)/;
  return Array.from(document.querySelectorAll('a')).some((anchor) => {
    const href = (anchor.getAttribute('href') || '').trim().toLowerCase();
    if (!href || href.startsWith('javascript:') || href.startsWith('#')) {
      return false;
    }
    try {
      // document.baseURI resolves relative links the way the browser does.
      return loginRoute.test(new URL(href, document.baseURI).pathname.toLowerCase());
    } catch (_) {
      return false;
    }
  });
}

/**
 * Browser-side check for signup capability — STRONG SIGNALS ONLY.
 * Runs inside the page via page.evaluate().
 *
 * @returns {boolean} True when a form has a password field plus signup
 *   wording, or the page links to a signup/register route
 */
function detectSignupSignals() {
  // Strong signal 1: form that contains a password field AND signup/register text
  const signupWording = /\b(sign ?up|register|create account|join|get started)\b/;
  const hasSignupForm = Array.from(document.querySelectorAll('form')).some((form) => {
    if (!form.querySelector('input[type="password"]')) {
      return false;
    }
    return signupWording.test((form.textContent || '').toLowerCase());
  });
  if (hasSignupForm) {
    return true;
  }

  // Strong signal 2: link to signup/register paths
  const signupRoute = /\/(signup|register|sign-up)(\/|$)/;
  return Array.from(document.querySelectorAll('a')).some((anchor) => {
    const href = (anchor.getAttribute('href') || '').trim().toLowerCase();
    if (!href || href.startsWith('javascript:') || href.startsWith('#')) {
      return false;
    }
    try {
      return signupRoute.test(new URL(href, document.baseURI).pathname.toLowerCase());
    } catch (_) {
      return false;
    }
  });
}

/**
 * Browser-side check for checkout/cart capability.
 * Runs inside the page via page.evaluate().
 *
 * @returns {boolean} True when the page links to a cart/checkout/basket
 *   route, has a commerce button, or has an element whose id/class contains
 *   "cart" or "basket" (substring match, case-insensitive)
 */
function detectCheckoutSignals() {
  // Signal 1: links to cart/checkout/basket routes
  const checkoutRoute = /\/(cart|checkout|basket)(\/|$)/;
  const hasRouteLink = Array.from(document.querySelectorAll('a')).some((anchor) => {
    const href = (anchor.getAttribute('href') || '').trim().toLowerCase();
    if (!href || href.startsWith('javascript:') || href.startsWith('#')) {
      return false;
    }
    try {
      return checkoutRoute.test(new URL(href, document.baseURI).pathname.toLowerCase());
    } catch (_) {
      return false;
    }
  });
  if (hasRouteLink) {
    return true;
  }

  // Signal 2: buttons with explicit commerce actions
  const commerceAction = /\b(add to cart|buy now|checkout|purchase)\b/;
  const hasCommerceButton = Array.from(document.querySelectorAll('button, input[type="submit"]')).some((button) =>
    commerceAction.test((button.textContent || button.value || '').toLowerCase())
  );
  if (hasCommerceButton) {
    return true;
  }

  // Signal 3: explicit cart identifiers
  return document.querySelectorAll('[id*="cart" i], [class*="cart" i], [class*="basket" i]').length > 0;
}

/**
 * Browser-side check for a newsletter signup.
 * Runs inside the page via page.evaluate().
 *
 * @returns {boolean} True when an email input looks newsletter-related or
 *   newsletter wording appears inside a form or div
 */
function detectNewsletterSignals() {
  // Check for newsletter-specific inputs
  const hasNewsletterInput = Array.from(document.querySelectorAll('input[type="email"]')).some((input) => {
    const placeholder = (input.placeholder || '').toLowerCase();
    const id = (input.id || '').toLowerCase();
    const name = (input.name || '').toLowerCase();
    return /newsletter|subscribe|email/.test(placeholder) ||
      /newsletter|subscribe/.test(id) ||
      /newsletter|subscribe/.test(name);
  });
  if (hasNewsletterInput) {
    return true;
  }

  // Check for newsletter text
  const newsletterWording = /\b(newsletter|subscribe to|stay updated|get updates)\b/;
  return Array.from(document.querySelectorAll('form, div')).some((element) =>
    newsletterWording.test((element.textContent || '').toLowerCase())
  );
}

/**
 * Browser-side check for a contact link or contact form.
 * Runs inside the page via page.evaluate().
 *
 * @returns {boolean} True when a link mentions contact (text or URL), or a
 *   form has contact wording plus name, email and message fields
 */
function detectContactFormSignals() {
  // Check links
  const hasContactLink = Array.from(document.querySelectorAll('a')).some((anchor) => {
    const text = (anchor.textContent || '').toLowerCase();
    // On SVG <a> elements `href` is an SVGAnimatedString, not a string; fall
    // back to the raw attribute so one such link cannot abort the inspection.
    const rawHref = typeof anchor.href === 'string' ? anchor.href : (anchor.getAttribute('href') || '');
    const href = rawHref.toLowerCase();
    return /\b(contact|contact us|get in touch)\b/.test(text) || /\/contact/.test(href);
  });
  if (hasContactLink) {
    return true;
  }

  // Check for forms with contact-related fields
  return Array.from(document.querySelectorAll('form')).some((form) => {
    const formText = (form.textContent || '').toLowerCase();
    return /contact|message|inquiry/.test(formText) &&
      form.querySelectorAll('input[name*="name"]').length > 0 &&
      form.querySelectorAll('input[type="email"]').length > 0 &&
      form.querySelectorAll('textarea').length > 0;
  });
}

/**
 * Browser-side check for a language switcher.
 * Runs inside the page via page.evaluate().
 *
 * @returns {boolean} True when a language <select>, a language link/button,
 *   or an element with a globe/lang/language class is present
 */
function detectLanguageSwitchSignals() {
  // Check for language selectors
  const languageName = /lang|language/;
  const hasLangSelect = Array.from(document.querySelectorAll('select')).some((select) =>
    languageName.test((select.id || '').toLowerCase()) ||
    languageName.test((select.name || '').toLowerCase())
  );
  if (hasLangSelect) {
    return true;
  }

  // Check for language links (common patterns)
  const hasLangLinks = Array.from(document.querySelectorAll('a, button')).some((element) => {
    const text = (element.textContent || '').toLowerCase().trim();
    const ariaLabel = (element.getAttribute('aria-label') || '').toLowerCase();
    // Common language codes
    return /^(en|es|fr|de|it|pt|ja|zh|ko|ru)$/i.test(text) ||
      /language|lang/.test(ariaLabel) ||
      /\b(english|español|français|deutsch)\b/i.test(text);
  });
  if (hasLangLinks) {
    return true;
  }

  // Check for globe icon (common language switch indicator)
  return document.querySelectorAll('[class*="globe"], [class*="lang"], [class*="language"]').length > 0;
}

/**
 * Browser-side check for generic content-site signals: many internal links
 * plus an article-like structure. Runs inside the page via page.evaluate().
 *
 * @returns {boolean} True when the page has at least 20 same-host links and
 *   at least 10 paragraphs inside <main>/<article>, or is on wikipedia.org
 */
function detectContentSignals() {
  try {
    const originHost = window.location.hostname;
    const internalLinks = Array.from(document.querySelectorAll('a')).filter((anchor) => {
      const href = (anchor.getAttribute('href') || '').trim().toLowerCase();
      if (!href || href.startsWith('javascript:') || href.startsWith('#')) {
        return false;
      }
      try {
        return new URL(href, document.baseURI).hostname === originHost;
      } catch (_) {
        return false;
      }
    });

    const mainEl = document.querySelector('main') || document.querySelector('article');
    const paragraphCount = mainEl ? mainEl.querySelectorAll('p').length : 0;

    const manyInternalLinks = internalLinks.length >= 20;
    const hasArticleStructure = paragraphCount >= 10;

    // Tiny special-case acceptable for Wikipedia (high confidence content site)
    const isWikipedia = /(^|\.)wikipedia\.org$/.test(window.location.hostname);

    return (manyInternalLinks && hasArticleStructure) || isWikipedia;
  } catch (_) {
    return false;
  }
}

// Capability flag -> browser-side detector, in the order they are evaluated.
const CAPABILITY_DETECTORS = [
  ['hasLogin', detectLoginSignals],
  ['hasSignup', detectSignupSignals],
  ['hasCheckout', detectCheckoutSignals],
  ['hasNewsletter', detectNewsletterSignals],
  ['hasContactForm', detectContactFormSignals],
  ['hasLanguageSwitch', detectLanguageSwitchSignals],
  ['hasContentSignals', detectContentSignals]
];

/**
 * Inspect the page for various capabilities
 *
 * Each capability is checked with its own page.evaluate() call, one after the
 * other. If a call fails, the error is logged with console.warn and inspection
 * stops: flags determined before the failure keep their values, the remaining
 * ones stay false. The function itself never rejects.
 *
 * @param {Page} page - Playwright page object (already loaded)
 * @returns {Promise<Object>} introspection results (hasLogin, hasSignup,
 *   hasCheckout, hasNewsletter, hasContactForm, hasLanguageSwitch,
 *   hasContentSignals)
 */
async function inspectSite(page) {
  const introspection = {
    hasLogin: false,
    hasSignup: false,
    hasCheckout: false,
    hasNewsletter: false,
    hasContactForm: false,
    hasLanguageSwitch: false,
    hasContentSignals: false
  };

  try {
    for (const [capability, detector] of CAPABILITY_DETECTORS) {
      introspection[capability] = await page.evaluate(detector);
    }
  } catch (error) {
    // Fail-safe: report what was determined so far instead of throwing.
    console.warn(`[Introspection] Error during site inspection: ${error.message}`);
  }

  return introspection;
}
|
|
227
|
+
|
|
228
|
+
/**
 * Map introspection flags to a site profile.
 *
 * Profiles are tried in a fixed priority order and the first one whose
 * condition holds wins: checkout signals beat auth signals, which beat the
 * generic content signals.
 *
 * @param {Object} introspection - Result from inspectSite()
 * @returns {string} Profile: 'ecommerce', 'saas', 'content', or 'unknown'
 */
function detectProfile(introspection) {
  const rankedProfiles = [
    // E-commerce: checkout/cart signals
    ['ecommerce', (site) => site.hasCheckout],
    // SaaS: login or signup signals (checkout was already ruled out above)
    ['saas', (site) => site.hasLogin || site.hasSignup],
    // Content: language switch, contact form or article-like structure
    ['content', (site) => site.hasLanguageSwitch || site.hasContactForm || site.hasContentSignals]
  ];

  const winner = rankedProfiles.find(([, applies]) => applies(introspection));

  // Nothing detected
  return winner ? winner[0] : 'unknown';
}
|
|
253
|
+
|
|
254
|
+
module.exports = {
|
|
255
|
+
inspectSite,
|
|
256
|
+
detectProfile
|
|
257
|
+
};
|