@odavl/guardian 0.1.0-rc1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/CHANGELOG.md +146 -0
  2. package/README.md +155 -97
  3. package/bin/guardian.js +1544 -55
  4. package/config/README.md +59 -0
  5. package/config/profiles/landing-demo.yaml +16 -0
  6. package/package.json +26 -11
  7. package/policies/landing-demo.json +22 -0
  8. package/src/enterprise/audit-logger.js +166 -0
  9. package/src/enterprise/pdf-exporter.js +267 -0
  10. package/src/enterprise/rbac-gate.js +142 -0
  11. package/src/enterprise/rbac.js +239 -0
  12. package/src/enterprise/site-manager.js +180 -0
  13. package/src/founder/feedback-system.js +156 -0
  14. package/src/founder/founder-tracker.js +213 -0
  15. package/src/founder/usage-signals.js +141 -0
  16. package/src/guardian/alert-ledger.js +121 -0
  17. package/src/guardian/attempt-engine.js +587 -12
  18. package/src/guardian/attempt-registry.js +42 -1
  19. package/src/guardian/attempt-relevance.js +106 -0
  20. package/src/guardian/attempt.js +85 -39
  21. package/src/guardian/attempts-filter.js +63 -0
  22. package/src/guardian/baseline.js +50 -8
  23. package/src/guardian/breakage-intelligence.js +1 -0
  24. package/src/guardian/browser-pool.js +131 -0
  25. package/src/guardian/browser.js +28 -1
  26. package/src/guardian/ci-cli.js +121 -0
  27. package/src/guardian/ci-mode.js +15 -0
  28. package/src/guardian/ci-output.js +38 -0
  29. package/src/guardian/cli-summary.js +167 -67
  30. package/src/guardian/config-loader.js +162 -0
  31. package/src/guardian/data-guardian-detector.js +189 -0
  32. package/src/guardian/detection-layers.js +271 -0
  33. package/src/guardian/drift-detector.js +100 -0
  34. package/src/guardian/enhanced-html-reporter.js +221 -4
  35. package/src/guardian/env-guard.js +127 -0
  36. package/src/guardian/failure-intelligence.js +173 -0
  37. package/src/guardian/first-run-profile.js +89 -0
  38. package/src/guardian/first-run.js +54 -0
  39. package/src/guardian/flag-validator.js +111 -0
  40. package/src/guardian/flow-executor.js +309 -44
  41. package/src/guardian/html-reporter.js +2 -0
  42. package/src/guardian/human-reporter.js +431 -0
  43. package/src/guardian/index.js +22 -19
  44. package/src/guardian/init-command.js +9 -5
  45. package/src/guardian/intent-detector.js +146 -0
  46. package/src/guardian/journey-definitions.js +132 -0
  47. package/src/guardian/journey-scan-cli.js +145 -0
  48. package/src/guardian/journey-scanner.js +583 -0
  49. package/src/guardian/junit-reporter.js +18 -1
  50. package/src/guardian/language-detection.js +99 -0
  51. package/src/guardian/live-cli.js +95 -0
  52. package/src/guardian/live-scheduler-runner.js +137 -0
  53. package/src/guardian/live-scheduler.js +146 -0
  54. package/src/guardian/market-reporter.js +357 -82
  55. package/src/guardian/parallel-executor.js +116 -0
  56. package/src/guardian/pattern-analyzer.js +348 -0
  57. package/src/guardian/policy.js +80 -3
  58. package/src/guardian/prerequisite-checker.js +101 -0
  59. package/src/guardian/preset-loader.js +27 -18
  60. package/src/guardian/profile-loader.js +96 -0
  61. package/src/guardian/reality.js +1612 -115
  62. package/src/guardian/reporter.js +27 -41
  63. package/src/guardian/run-artifacts.js +212 -0
  64. package/src/guardian/run-cleanup.js +207 -0
  65. package/src/guardian/run-latest.js +90 -0
  66. package/src/guardian/run-list.js +211 -0
  67. package/src/guardian/run-summary.js +20 -0
  68. package/src/guardian/scan-presets.js +100 -11
  69. package/src/guardian/selector-fallbacks.js +394 -0
  70. package/src/guardian/semantic-contact-detection.js +255 -0
  71. package/src/guardian/semantic-contact-finder.js +201 -0
  72. package/src/guardian/semantic-targets.js +234 -0
  73. package/src/guardian/site-introspection.js +257 -0
  74. package/src/guardian/smoke.js +258 -0
  75. package/src/guardian/snapshot-schema.js +25 -1
  76. package/src/guardian/snapshot.js +69 -3
  77. package/src/guardian/stability-scorer.js +169 -0
  78. package/src/guardian/success-evaluator.js +214 -0
  79. package/src/guardian/template-command.js +184 -0
  80. package/src/guardian/text-formatters.js +426 -0
  81. package/src/guardian/timeout-profiles.js +57 -0
  82. package/src/guardian/verdict.js +320 -0
  83. package/src/guardian/verdicts.js +74 -0
  84. package/src/guardian/wait-for-outcome.js +120 -0
  85. package/src/guardian/watch-runner.js +181 -0
  86. package/src/payments/stripe-checkout.js +169 -0
  87. package/src/plans/plan-definitions.js +148 -0
  88. package/src/plans/plan-manager.js +211 -0
  89. package/src/plans/usage-tracker.js +210 -0
  90. package/src/recipes/recipe-engine.js +188 -0
  91. package/src/recipes/recipe-failure-analysis.js +159 -0
  92. package/src/recipes/recipe-registry.js +134 -0
  93. package/src/recipes/recipe-runtime.js +507 -0
  94. package/src/recipes/recipe-store.js +410 -0
  95. package/guardian-contract-v1.md +0 -149
  96. /package/{guardian.config.json → config/guardian.config.json} +0 -0
  97. /package/{guardian.policy.json → config/guardian.policy.json} +0 -0
  98. /package/{guardian.profile.docs.yaml → config/profiles/docs.yaml} +0 -0
  99. /package/{guardian.profile.ecommerce.yaml → config/profiles/ecommerce.yaml} +0 -0
  100. /package/{guardian.profile.marketing.yaml → config/profiles/marketing.yaml} +0 -0
  101. /package/{guardian.profile.saas.yaml → config/profiles/saas.yaml} +0 -0
@@ -0,0 +1,201 @@
1
+ /**
2
+ * Semantic Contact Finder Integration
3
+ *
4
+ * Integrates semantic contact detection into the scanning flow.
5
+ * Works with Playwright to find contact links/forms in real pages.
6
+ *
7
+ * Now includes Wave 1.2 detection layers with data-guardian attribute support.
8
+ */
9
+
10
+ const { detectLanguage, getPrimaryLanguage, getLanguageName } = require('./language-detection');
11
+ const { detectContactCandidates, formatDetectionResult, getNoContactFoundHint } = require('./semantic-contact-detection');
12
+ const { getTokensForTarget, normalizeText } = require('./semantic-targets');
13
+ const { detectByLayers, LAYER, CONFIDENCE } = require('./detection-layers');
14
+
15
/**
 * Find contact elements on a page using semantic detection.
 *
 * Detects the page language first, then collects contact candidates. Errors
 * thrown by either detector are caught, logged through console.warn and
 * surfaced in `hint`; this function does not reject because of them.
 *
 * @param {Page} page - Playwright page object
 * @param {string} baseUrl - Base URL for relative links
 * @returns {Promise<Object>} Detection result with `language`, `languageName`,
 *   `target` (always 'contact'), `candidates`, `primaryCandidate` (first
 *   candidate, or null when nothing was found), `found` and `hint`.
 */
async function findContactOnPage(page, baseUrl = '') {
  const result = {
    language: 'unknown',
    languageName: 'Unknown',
    // Mirrors the result shape of findElementByLayers() so that
    // formatDetectionForReport() has a target name to print.
    target: 'contact',
    candidates: [],
    primaryCandidate: null,
    found: false,
    hint: ''
  };

  try {
    // Detect language
    result.language = await detectLanguage(page);
    result.languageName = getLanguageName(result.language);

    // Find contact candidates
    const candidates = await detectContactCandidates(page, baseUrl);
    result.candidates = candidates;

    if (candidates.length > 0) {
      result.found = true;
      // Assumes detectContactCandidates() returns candidates best-first
      // (the original code labels index 0 "highest confidence") — confirm.
      result.primaryCandidate = candidates[0];
    } else {
      result.hint = getNoContactFoundHint();
    }

    return result;
  } catch (error) {
    console.warn(`Contact detection failed: ${error.message}`);
    result.hint = `Contact detection failed: ${error.message}. Fallback to default selectors.`;
    return result;
  }
}
54
+
55
/**
 * Generate a Playwright selector chain from semantic candidates.
 *
 * Uses the first three candidates in the order given (callers are expected to
 * pass them best-first), keeps the non-empty `selector` values and joins them
 * with ", " so the result can be used as a fallback selector list.
 *
 * This function used to be declared twice in this file with identical bodies;
 * the later declaration silently replaced the earlier one. Only this single
 * definition remains.
 *
 * @param {Array<Object>} candidates - Candidate objects carrying a `selector`
 * @returns {string|null} Comma-separated selector chain, or null when there is
 *   no usable selector (missing/empty input, or no candidate has a selector)
 */
function generateSelectorsFromCandidates(candidates) {
  if (!Array.isArray(candidates) || candidates.length === 0) {
    return null;
  }

  const selectors = candidates
    .slice(0, 3)
    // Tolerate null/undefined entries instead of throwing on `.selector`.
    .map((candidate) => (candidate ? candidate.selector : null))
    .filter(Boolean)
    .join(', ');

  return selectors || null;
}

/**
 * Find contact elements using Wave 1.2 detection layers
 * Respects priority: data-guardian > href > text > structure
 * (the priority order is implemented by detectByLayers(), not here).
 *
 * Errors thrown by the detectors are caught, logged through console.warn and
 * reported in `reason`/`hint`; the defaults below are returned in that case.
 *
 * @param {Page} page - Playwright page object
 * @param {string} target - Detection target (contact, form, submit, about)
 * @param {string} baseUrl - Base URL for relative links
 * @returns {Promise<Object>} Detection result with layer, confidence, reason
 */
async function findElementByLayers(page, target, baseUrl = '') {
  const result = {
    language: 'unknown',
    languageName: 'Unknown',
    target: target,
    found: false,
    layer: null,
    confidence: null,
    candidates: [],
    primaryCandidate: null,
    reason: '',
    hint: ''
  };

  try {
    // Detect language
    result.language = await detectLanguage(page);
    result.languageName = getLanguageName(result.language);

    // Use Wave 1.2 detection layers
    const layerResult = await detectByLayers(page, target, baseUrl);

    result.found = layerResult.found;
    result.layer = layerResult.layer;
    result.confidence = layerResult.confidence;
    // Keep `candidates` an array even if the layer result omits it, so
    // consumers can read `.length` without guarding.
    result.candidates = Array.isArray(layerResult.candidates) ? layerResult.candidates : [];
    result.primaryCandidate = layerResult.primaryCandidate;
    result.evidence = layerResult.evidence;
    result.reason = layerResult.reason;

    if (!result.found) {
      result.hint = `No ${target} detected. Consider adding data-guardian="${target}" attribute for guaranteed stability.`;
    }

    return result;
  } catch (error) {
    console.warn(`Detection by layers failed: ${error.message}`);
    result.reason = `Detection error: ${error.message}`;
    result.hint = 'Fallback to manual configuration or heuristic detection.';
    return result;
  }
}
148
+
149
/**
 * Format detection output for CLI reporting (Wave 1.2 enhanced).
 *
 * Renders the language line, then either the candidate list (with the
 * detection layer, when one is present) or a "nothing discovered" section
 * with the reason and hint.
 *
 * Accepts results from both findContactOnPage() and findElementByLayers():
 * a missing `target` is reported as "contact" and a missing `candidates`
 * list is treated as empty.
 *
 * @param {Object} detectionResult - Result object produced by the finders
 * @returns {string} Multi-line report text joined with "\n"
 */
function formatDetectionForReport(detectionResult) {
  const candidates = Array.isArray(detectionResult.candidates) ? detectionResult.candidates : [];
  // Same fallback the "not found" branch already used; without it the success
  // header printed "undefined Detection Results" for results with no target.
  const targetLabel = detectionResult.target || 'contact';

  const lines = [
    `🌍 Language Detection: ${detectionResult.languageName}`,
    ` (lang=${detectionResult.language})`,
    ''
  ];

  if (detectionResult.found && candidates.length > 0) {
    // Show detection layer (Wave 1.2)
    if (detectionResult.layer) {
      lines.push(`📍 Detection Layer: ${detectionResult.layer} (confidence: ${detectionResult.confidence})`);
      lines.push(` ${detectionResult.reason}`);
      lines.push('');
    }

    const plural = candidates.length > 1 ? 's' : '';
    lines.push(`✅ ${targetLabel} Detection Results (${candidates.length} candidate${plural})`);

    candidates.forEach((candidate, idx) => {
      const formatted = formatDetectionResult(candidate, detectionResult.language);
      lines.push(` ${idx + 1}. ${formatted}`);
      if (candidate.matchedText) {
        lines.push(` Text: "${candidate.matchedText}"`);
      }
      if (candidate.href) {
        lines.push(` Link: ${candidate.href}`);
      }
    });
  } else {
    // Clarify selector-based scope to avoid overstating discovery
    lines.push(`❌ No ${targetLabel} page/link discovered via selectors`);
    if (detectionResult.reason) {
      lines.push(` Reason: ${detectionResult.reason}`);
    }
    if (detectionResult.hint) {
      lines.push(` 💡 ${detectionResult.hint}`);
    }
  }

  return lines.join('\n');
}
195
+
196
+ module.exports = {
197
+ findContactOnPage,
198
+ findElementByLayers,
199
+ generateSelectorsFromCandidates,
200
+ formatDetectionForReport
201
+ };
@@ -0,0 +1,234 @@
1
+ /**
2
+ * Semantic Targets & Multilingual Dictionary
3
+ *
4
+ * Provides deterministic, language-independent detection of semantic targets
5
+ * (contact, about, etc.) using normalized tokens from multiple languages.
6
+ */
7
+
8
/**
 * Multilingual dictionary for semantic targets.
 *
 * Keys: target names. Values: arrays of token variants, grouped by language.
 * Tokens are stored as written (some carry diacritics or hyphens); matching
 * code is expected to normalize them before comparing.
 *
 * A token is listed once per target: when several languages share the same
 * spelling it appears under the first language only, and the later language
 * groups note the shared spelling in a comment.
 */
const SEMANTIC_DICTIONARY = {
  contact: [
    // English
    'contact', 'contactus', 'contact-us', 'contact us', 'get in touch', 'getintouch',
    'reach out', 'reachout', 'contact form', 'contactform', 'contact page', 'contactpage',
    'inquiry', 'inquiries', 'message us', 'messageus', 'write to us', 'writetous',
    // German
    'kontakt', 'kontaktieren', 'kontaktaufnahme', 'kontaktformular', 'kontakten',
    'kontakts', 'kontakt formular', 'kontakt-formular', 'anfrage', 'anfragen',
    // Spanish
    'contacto', 'contactanos', 'contacta', 'formulario de contacto',
    'pongase en contacto', 'ponte en contacto', 'escribenos', 'escriba',
    // French ('contact' is shared with English)
    'contactez', 'contactez-nous', 'formulaire de contact',
    'nous contacter', 'nous ecrire',
    // Portuguese ('contacto' is shared with Spanish)
    'contato', 'formulario de contato', 'entre em contato',
    'fale conosco', 'escreva para nos',
    // Italian
    'contatti', 'contatto', 'contattaci', 'modulo di contatto',
    'modulo contatti', 'mettersi in contatto',
    // Dutch ('contact' is shared with English)
    'contacteer', 'contact opnemen', 'contactformulier',
    // Swedish ('kontakt' and 'kontaktformular' are shared with German)
    'kontakta',
    // Arabic
    'تواصل', 'اتصل', 'استفسار', 'استفسارات', 'نموذج الاتصال', 'نموذج تواصل',
    // Chinese
    '联系', '联系我们', '联系表单', '留言', '反馈'
  ],
  about: [
    // English
    'about', 'about us', 'aboutus', 'our story', 'about-us',
    'company', 'team', 'who we are', 'whoweare', 'more about us',
    // German ('team' is shared with English)
    'uber', 'über', 'ueber', 'uber uns', 'über uns', 'ueber uns',
    'uber unsere', 'über unsere', 'ueber unsere', 'unternehmen',
    // Spanish
    'acerca', 'acerca de', 'acerca de nosotros', 'sobre nosotros',
    'quienes somos', 'quiénes somos', 'nuestra empresa',
    // French
    'a propos', 'à propos', 'a propos de nous', 'à propos de nous',
    'qui sommes nous', 'qui nous sommes', 'notre histoire',
    // Portuguese
    'sobre', 'sobre nos', 'sobre nós', 'quem somos',
    // Italian
    'chi siamo', 'chi siamo noi', 'la nostra storia',
    // Dutch
    'over', 'over ons', 'wie zijn we',
    // Swedish
    'om', 'om oss', 'var historia',
    // Arabic
    'عن', 'عننا', 'عن الشركة', 'فريقنا', 'قصتنا'
  ],
  form: [
    // English
    'form', 'form submission', 'contact form', 'feedback form', 'inquiry form',
    'form page', 'form element', 'form section', 'form area',
    // German ('form' is shared with English)
    'formular', 'kontaktformular', 'feedback formular',
    // Spanish
    'formulario', 'formulario de contacto', 'formulario de envio',
    // French
    'formulaire', 'formulaire de contact', 'formulaire d envoi',
    // Portuguese ('formulario' and 'formulario de envio' are shared with Spanish)
    'formulario de contato',
    // Italian
    'modulo', 'modulo di contatto', 'modulo di invio',
    // Dutch
    'formulier', 'contactformulier', 'formulier voor contact'
    // Swedish: 'formular', 'kontaktformular' and 'feedback formular' are
    // shared with German
  ],
  submit: [
    // English
    'submit', 'send', 'send message', 'send form', 'submit form',
    'submit button', 'send button', 'post', 'publish', 'share',
    // German ('submit' is shared with English)
    'senden', 'absenden', 'abschicken', 'ubermitteln',
    // Spanish
    'enviar', 'enviar formulario', 'enviar mensaje', 'publicar',
    // French
    'envoyer', 'soumettre', 'publier', 'partager',
    // Portuguese ('enviar', 'enviar formulario' and 'publicar' are shared
    // with Spanish)
    'enviar mensagem',
    // Italian
    'inviare', 'inviare modulo', 'inviare messaggio', 'pubblicare',
    // Dutch
    'verzenden', 'verstuur', 'verstuur formulier', 'publiceren',
    // Swedish
    'skicka', 'skicka formular', 'publicera'
  ]
};
105
+
106
/**
 * Normalize text for comparison
 * - Lowercase
 * - Remove diacritics (é → e, ü → u, etc.)
 * - Replace punctuation and symbols with spaces
 * - Collapse multiple spaces and trim
 *
 * Letters and digits of every script are kept. The previous ASCII-only
 * `\w` filter turned all non-Latin text (for example the Arabic and Chinese
 * dictionary tokens) into an empty string, so those tokens could never match.
 * Requires Unicode property escapes in RegExp (ES2018).
 *
 * @param {*} text - Text to normalize; non-string input yields ''
 * @returns {string} Normalized text (possibly empty)
 */
function normalizeText(text) {
  if (typeof text !== 'string') {
    return '';
  }

  return text
    .toLowerCase()
    // NFD splits accented characters into base letter + combining mark,
    // so dropping the marks leaves the bare letter.
    .normalize('NFD')
    .replace(/\p{M}/gu, '')
    // Anything that is not a letter, digit, underscore or whitespace
    // becomes a separator.
    .replace(/[^\p{L}\p{N}_\s]/gu, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

// Scripts written without spaces between words; a word-boundary test is
// meaningless for them, so short tokens in these scripts match as substrings.
const UNSEGMENTED_SCRIPT_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Thai}]/u;

// Character class source for "not a word character" in the Unicode sense
// (the counterpart of \W, which only knows ASCII).
const NON_WORD_CHAR_SOURCE = '[^\\p{L}\\p{N}_]';

/**
 * Decide whether one already-normalized token occurs in normalized text.
 * Private helper shared by includesAnyToken() and getMatchedToken().
 *
 * Tokens of up to 4 characters must stand alone as a word (so "om" does not
 * match inside "home"); longer tokens, and tokens in unsegmented scripts,
 * match as plain substrings.
 */
function matchesNormalizedToken(normalizedText, normalizedToken) {
  const needsWordBoundary =
    normalizedToken.length <= 4 && !UNSEGMENTED_SCRIPT_PATTERN.test(normalizedToken);

  if (!needsWordBoundary) {
    return normalizedText.includes(normalizedToken);
  }

  // normalizeText() output contains only letters, digits, "_" and single
  // spaces, so the token needs no regex escaping.
  const wordBoundaryRegex = new RegExp(
    `(?:^|${NON_WORD_CHAR_SOURCE})${normalizedToken}(?:${NON_WORD_CHAR_SOURCE}|$)`,
    'u'
  );
  return wordBoundaryRegex.test(normalizedText);
}

/**
 * Check if normalized text includes any token from the list
 * Matches whole words for short tokens, substrings for longer ones
 * (same rules as getMatchedToken()).
 *
 * @param {string} normalizedText - Text already passed through normalizeText()
 * @param {string[]} tokenList - Raw tokens; each is normalized before matching
 * @returns {boolean} true when at least one token matches
 */
function includesAnyToken(normalizedText, tokenList) {
  return getMatchedToken(normalizedText, tokenList) !== null;
}

/**
 * Get the first matching token from a list for a given text
 * Returns the token as it appears in the list (not its normalized form),
 * or null when the text is empty, the list is not an array, or nothing
 * matches. Tokens that normalize to an empty string are skipped.
 *
 * @param {string} normalizedText - Text already passed through normalizeText()
 * @param {string[]} tokenList - Raw tokens, checked in list order
 * @returns {string|null} The matched token, or null
 */
function getMatchedToken(normalizedText, tokenList) {
  if (!normalizedText || !Array.isArray(tokenList)) {
    return null;
  }

  for (const token of tokenList) {
    const normalizedToken = normalizeText(token);

    if (!normalizedToken) {
      continue;
    }

    if (matchesNormalizedToken(normalizedText, normalizedToken)) {
      return token;
    }
  }

  return null;
}
204
+
205
/**
 * List the names of every semantic target defined in the dictionary.
 *
 * @returns {string[]} Own enumerable keys of SEMANTIC_DICTIONARY
 */
function getAvailableTargets() {
  const targetNames = Object.keys(SEMANTIC_DICTIONARY);
  return targetNames;
}
211
+
212
/**
 * Check if a semantic target exists in dictionary
 *
 * Only the dictionary's own keys count. The `in` operator also sees
 * inherited members, which made names such as "toString" or "constructor"
 * look like valid targets.
 *
 * @param {string} targetName - Target name to look up
 * @returns {boolean} true when the dictionary defines this target
 */
function isValidTarget(targetName) {
  return Object.prototype.hasOwnProperty.call(SEMANTIC_DICTIONARY, targetName);
}
218
+
219
/**
 * Get token list for a specific target
 *
 * Looks at the dictionary's own keys only, so inherited names such as
 * "constructor" yield an empty list instead of an Object.prototype member.
 *
 * @param {string} targetName - Target name to look up
 * @returns {string[]} The target's tokens, or [] for an unknown target
 */
function getTokensForTarget(targetName) {
  const isKnownTarget = Object.prototype.hasOwnProperty.call(SEMANTIC_DICTIONARY, targetName);
  return (isKnownTarget && SEMANTIC_DICTIONARY[targetName]) || [];
}
225
+
226
+ module.exports = {
227
+ SEMANTIC_DICTIONARY,
228
+ normalizeText,
229
+ includesAnyToken,
230
+ getMatchedToken,
231
+ getAvailableTargets,
232
+ isValidTarget,
233
+ getTokensForTarget
234
+ };
@@ -0,0 +1,257 @@
1
+ /**
2
+ * Site Introspection - DOM-based capability detection
3
+ * Deterministically identifies site features by inspecting the loaded page.
4
+ */
5
+
6
/**
 * Inspect the page for various capabilities.
 *
 * Runs one page.evaluate() probe per capability, in sequence. Every probe is
 * a self-contained function because it executes inside the page and cannot
 * see variables from this module.
 *
 * Failure handling: the first probe that throws stops the inspection. The
 * error is logged through console.warn, flags determined before the failure
 * keep their values and all remaining flags stay false. This function does
 * not throw for probe errors.
 *
 * @param {Page} page - Playwright page object (already loaded)
 * @returns {Promise<Object>} introspection results: boolean flags hasLogin,
 *   hasSignup, hasCheckout, hasNewsletter, hasContactForm, hasLanguageSwitch
 *   and hasContentSignals
 */
async function inspectSite(page) {
  const introspection = {
    hasLogin: false,
    hasSignup: false,
    hasCheckout: false,
    hasNewsletter: false,
    hasContactForm: false,
    hasLanguageSwitch: false,
    hasContentSignals: false
  };

  try {
    // Check for login indicators — STRONG SIGNALS ONLY
    introspection.hasLogin = await page.evaluate(() => {
      const isValidHrefForAuth = (href) => {
        if (!href) return false;
        const h = href.trim().toLowerCase();
        if (h.startsWith('javascript:')) return false;
        if (h.startsWith('#')) return false;
        try {
          const u = new URL(h, window.location.origin);
          const p = u.pathname.toLowerCase();
          // "sign-in" is accepted with a trailing slash or sub-path too,
          // matching how the signup probe treats "sign-up".
          return /(\/login|\/signin|\/sign-in|\/auth\/login)$|\/(login|signin|sign-in)(\/|$)/.test(p);
        } catch (_) {
          return false;
        }
      };

      // Strong signal 1: password input on page
      const hasPasswordInput = document.querySelectorAll('input[type="password"]').length > 0;

      // Strong signal 2: a form containing a password field
      const formWithPassword = Array.from(document.querySelectorAll('form')).some(f => f.querySelector('input[type="password"]'));

      // Strong signal 3: link to common auth routes
      const authRouteLink = Array.from(document.querySelectorAll('a')).some(a => isValidHrefForAuth(a.getAttribute('href')));

      return hasPasswordInput || formWithPassword || authRouteLink;
    });

    // Check for signup indicators — STRONG SIGNALS ONLY
    introspection.hasSignup = await page.evaluate(() => {
      const isValidHrefForSignup = (href) => {
        if (!href) return false;
        const h = href.trim().toLowerCase();
        if (h.startsWith('javascript:')) return false;
        if (h.startsWith('#')) return false;
        try {
          const u = new URL(h, window.location.origin);
          const p = u.pathname.toLowerCase();
          return /(\/signup|\/register|\/sign-up|\/auth\/signup)$|\/(signup|register|sign-up)(\/|$)/.test(p);
        } catch (_) {
          return false;
        }
      };

      // Strong signal 1: form that contains a password field AND signup/register text
      const formWithPasswordAndSignupText = Array.from(document.querySelectorAll('form')).some(form => {
        const hasPwd = !!form.querySelector('input[type="password"]');
        if (!hasPwd) return false;
        const txt = (form.textContent || '').toLowerCase();
        return /\b(sign ?up|register|create account|join|get started)\b/.test(txt);
      });

      // Strong signal 2: link to signup/register paths
      const authRouteLink = Array.from(document.querySelectorAll('a')).some(a => isValidHrefForSignup(a.getAttribute('href')));

      return formWithPasswordAndSignupText || authRouteLink;
    });

    // Check for checkout/cart indicators — STRONG SIGNALS ONLY
    introspection.hasCheckout = await page.evaluate(() => {
      const isValidHrefForCheckout = (href) => {
        if (!href) return false;
        const h = href.trim().toLowerCase();
        if (h.startsWith('javascript:')) return false;
        if (h.startsWith('#')) return false;
        try {
          const u = new URL(h, window.location.origin);
          const p = u.pathname.toLowerCase();
          // Strong checkout/cart routes
          return /(\/cart|\/checkout|\/basket)$|\/(cart|checkout|basket)(\/|$)/.test(p);
        } catch (_) {
          return false;
        }
      };

      // Strong signal 1: links to cart/checkout/basket routes
      const routeLink = Array.from(document.querySelectorAll('a')).some(a => isValidHrefForCheckout(a.getAttribute('href')));

      // Strong signal 2: buttons with explicit commerce actions
      const commerceButtons = Array.from(document.querySelectorAll('button, input[type="submit"]')).some(btn => {
        const text = (btn.textContent || btn.value || '').toLowerCase();
        return /\b(add to cart|buy now|checkout|purchase)\b/.test(text);
      });

      // Strong signal 3: explicit cart identifiers
      const cartIndicators = Array.from(document.querySelectorAll('[id*="cart" i], [class*="cart" i], [class*="basket" i]')).length > 0;

      return routeLink || commerceButtons || cartIndicators;
    });

    // Check for newsletter signup
    introspection.hasNewsletter = await page.evaluate(() => {
      // Check for newsletter-specific inputs
      const hasNewsletterInput = Array.from(document.querySelectorAll('input[type="email"]')).some(input => {
        const placeholder = (input.placeholder || '').toLowerCase();
        const id = (input.id || '').toLowerCase();
        const name = (input.name || '').toLowerCase();
        return placeholder.match(/newsletter|subscribe|email/) ||
          id.match(/newsletter|subscribe/) ||
          name.match(/newsletter|subscribe/);
      });

      // Check for newsletter text
      const hasNewsletterText = Array.from(document.querySelectorAll('form, div')).some(el => {
        const text = (el.textContent || '').toLowerCase();
        return text.match(/\b(newsletter|subscribe to|stay updated|get updates)\b/);
      });

      return hasNewsletterInput || hasNewsletterText;
    });

    // Check for contact form
    introspection.hasContactForm = await page.evaluate(() => {
      // Check links
      const contactLinks = Array.from(document.querySelectorAll('a')).some(a => {
        const text = (a.textContent || '').toLowerCase();
        const href = (a.href || '').toLowerCase();
        return text.match(/\b(contact|contact us|get in touch)\b/) ||
          href.match(/\/contact/);
      });

      // Check for forms with contact-related fields
      const hasContactForm = Array.from(document.querySelectorAll('form')).some(form => {
        const formText = (form.textContent || '').toLowerCase();
        const hasNameField = form.querySelectorAll('input[name*="name"]').length > 0;
        const hasEmailField = form.querySelectorAll('input[type="email"]').length > 0;
        const hasMessageField = form.querySelectorAll('textarea').length > 0;
        return formText.match(/contact|message|inquiry/) && hasNameField && hasEmailField && hasMessageField;
      });

      return contactLinks || hasContactForm;
    });

    // Check for language switch
    introspection.hasLanguageSwitch = await page.evaluate(() => {
      // Check for language selectors
      const hasLangSelect = Array.from(document.querySelectorAll('select')).some(select => {
        const id = (select.id || '').toLowerCase();
        const name = (select.name || '').toLowerCase();
        return id.match(/lang|language/) || name.match(/lang|language/);
      });

      // Check for language links (common patterns)
      const hasLangLinks = Array.from(document.querySelectorAll('a, button')).some(el => {
        const text = (el.textContent || '').toLowerCase().trim();
        const ariaLabel = (el.getAttribute('aria-label') || '').toLowerCase();
        // Common language codes
        return text.match(/^(en|es|fr|de|it|pt|ja|zh|ko|ru)$/i) ||
          ariaLabel.match(/language|lang/) ||
          text.match(/\b(english|español|français|deutsch)\b/i);
      });

      // Check for globe icon (common language switch indicator)
      const hasGlobeIcon = Array.from(document.querySelectorAll('[class*="globe"], [class*="lang"], [class*="language"]')).length > 0;

      return hasLangSelect || hasLangLinks || hasGlobeIcon;
    });

    // Content signals (generic): many internal links + article-like structure
    introspection.hasContentSignals = await page.evaluate(() => {
      try {
        const originHost = window.location.hostname;
        const internalLinks = Array.from(document.querySelectorAll('a')).filter(a => {
          const href = a.getAttribute('href');
          if (!href) return false;
          const h = href.trim().toLowerCase();
          if (h.startsWith('javascript:')) return false;
          if (h.startsWith('#')) return false;
          try {
            const u = new URL(h, window.location.origin);
            return u.hostname === originHost;
          } catch (_) {
            return false;
          }
        });

        const mainEl = document.querySelector('main') || document.querySelector('article');
        const paragraphCount = mainEl ? mainEl.querySelectorAll('p').length : 0;

        const manyInternalLinks = internalLinks.length >= 20;
        const hasArticleStructure = paragraphCount >= 10;

        // Tiny special-case acceptable for Wikipedia (high confidence content site)
        const isWikipedia = /(^|\.)wikipedia\.org$/.test(window.location.hostname);

        return (manyInternalLinks && hasArticleStructure) || isWikipedia;
      } catch (_) {
        return false;
      }
    });

  } catch (error) {
    // Fail-safe: keep whatever was detected before the failure; flags for
    // the failed and remaining probes stay false.
    console.warn(`[Introspection] Error during site inspection: ${error.message}`);
  }

  return introspection;
}
227
+
228
/**
 * Detect site profile based on introspection results
 *
 * Checks are ordered by priority, first match wins:
 *   1. checkout signals            -> 'ecommerce'
 *   2. login or signup signals     -> 'saas'
 *   3. language switch, contact form or content signals -> 'content'
 *   4. otherwise                   -> 'unknown'
 *
 * @param {Object} introspection - Result from inspectSite(); a missing
 *   (null/undefined) result is treated as "nothing detected"
 * @returns {string} Profile: 'ecommerce', 'saas', 'content', or 'unknown'
 */
function detectProfile(introspection) {
  // Guard against a missing result instead of throwing on property access.
  if (!introspection) {
    return 'unknown';
  }

  // E-commerce: strong checkout/cart signals
  if (introspection.hasCheckout) {
    return 'ecommerce';
  }

  // SaaS: strong auth signals (login/signup); checkout was ruled out above
  if (introspection.hasLogin || introspection.hasSignup) {
    return 'saas';
  }

  // Content site: absence of ecommerce & saas; presence of content signals
  if (introspection.hasLanguageSwitch || introspection.hasContactForm || introspection.hasContentSignals) {
    return 'content';
  }

  // Unknown: nothing detected
  return 'unknown';
}
253
+
254
+ module.exports = {
255
+ inspectSite,
256
+ detectProfile
257
+ };