@odavl/guardian 0.1.0-rc1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/CHANGELOG.md +62 -0
  2. package/README.md +3 -3
  3. package/bin/guardian.js +212 -8
  4. package/package.json +6 -1
  5. package/src/guardian/attempt-engine.js +19 -5
  6. package/src/guardian/attempt.js +61 -39
  7. package/src/guardian/attempts-filter.js +63 -0
  8. package/src/guardian/baseline.js +44 -10
  9. package/src/guardian/browser-pool.js +131 -0
  10. package/src/guardian/browser.js +28 -1
  11. package/src/guardian/ci-mode.js +15 -0
  12. package/src/guardian/ci-output.js +37 -0
  13. package/src/guardian/cli-summary.js +117 -4
  14. package/src/guardian/data-guardian-detector.js +189 -0
  15. package/src/guardian/detection-layers.js +271 -0
  16. package/src/guardian/first-run.js +49 -0
  17. package/src/guardian/flag-validator.js +97 -0
  18. package/src/guardian/flow-executor.js +309 -44
  19. package/src/guardian/language-detection.js +99 -0
  20. package/src/guardian/market-reporter.js +16 -1
  21. package/src/guardian/parallel-executor.js +116 -0
  22. package/src/guardian/prerequisite-checker.js +101 -0
  23. package/src/guardian/preset-loader.js +18 -12
  24. package/src/guardian/profile-loader.js +96 -0
  25. package/src/guardian/reality.js +382 -46
  26. package/src/guardian/run-summary.js +20 -0
  27. package/src/guardian/semantic-contact-detection.js +255 -0
  28. package/src/guardian/semantic-contact-finder.js +200 -0
  29. package/src/guardian/semantic-targets.js +234 -0
  30. package/src/guardian/smoke.js +258 -0
  31. package/src/guardian/snapshot.js +23 -1
  32. package/src/guardian/success-evaluator.js +214 -0
  33. package/src/guardian/timeout-profiles.js +57 -0
  34. package/src/guardian/wait-for-outcome.js +120 -0
  35. package/src/guardian/watch-runner.js +185 -0
@@ -0,0 +1,255 @@
1
+ /**
2
+ * Semantic Contact Detection
3
+ *
4
+ * Deterministic, multilingual detection of contact links and elements.
5
+ * Returns ranked candidates with source, confidence, and matched tokens.
6
+ */
7
+
8
+ const { getTokensForTarget, normalizeText, getMatchedToken } = require('./semantic-targets');
9
+
10
/**
 * Confidence labels attached to detected candidates.
 *
 * The string values are what ends up in `candidate.confidence`; the ranking
 * in detectContactCandidates orders them high, then medium, then low.
 */
const CONFIDENCE = {
  HIGH: 'high',
  MEDIUM: 'medium',
  LOW: 'low',
};
18
+
19
/**
 * Labels describing which signal produced a candidate.
 *
 * The string values are what ends up in `candidate.source`.
 */
const DETECTION_SOURCE = {
  DATA_GUARDIAN: 'data-guardian',
  ARIA: 'aria',
  HREF: 'href',
  TEXT: 'text',
  NAV_FOOTER: 'nav/footer',
  HEURISTIC: 'heuristic',
};
30
+
31
/**
 * Collect contact candidates from a page.
 *
 * Snapshots the relevant attributes of every clickable element inside the
 * browser context, scores each snapshot with evaluateElement, ranks the
 * resulting candidates by confidence and drops repeated entries.
 *
 * @param {Page} page - Playwright page object (only `page.evaluate` is used)
 * @param {string} baseUrl - Forwarded unchanged to evaluateElement
 * @returns {Promise<Array>} Candidates ordered high > medium > low. On failure
 *   a warning is logged and whatever was collected so far is returned.
 */
async function detectContactCandidates(page, baseUrl = '') {
  const candidates = [];

  try {
    // Executed in the page: only plain, serializable data is sent back
    const pageData = await page.evaluate(async () => {
      const clickable = document.querySelectorAll('a, button, [role="link"], [role="button"], [data-guardian], .nav a, footer a');

      return Array.from(clickable, (el) => ({
        tagName: el.tagName.toLowerCase(),
        text: el.textContent?.trim() || '',
        href: el.href || el.getAttribute('href') || '',
        dataGuardian: el.getAttribute('data-guardian') || '',
        ariaLabel: el.getAttribute('aria-label') || '',
        title: el.getAttribute('title') || '',
        className: el.className,
        isInNav: !!el.closest('nav, [role="navigation"]'),
        isInFooter: !!el.closest('footer, [role="contentinfo"]'),
      }));
    });

    // Score every snapshot; one element may yield several candidates
    for (const snapshot of pageData) {
      candidates.push(...evaluateElement(snapshot, baseUrl));
    }

    // Rank by confidence label
    const rank = { high: 0, medium: 1, low: 2 };
    candidates.sort((left, right) => rank[left.confidence] - rank[right.confidence]);

    // Keep the first (best-ranked) candidate for each matchedText/href pair
    const seenKeys = new Set();
    return candidates.filter((candidate) => {
      const key = `${candidate.matchedText}:${candidate.href}`;
      if (seenKeys.has(key)) {
        return false;
      }
      seenKeys.add(key);
      return true;
    });
  } catch (error) {
    console.warn(`Failed to detect contact candidates: ${error.message}`);
    return candidates;
  }
}
97
+
98
/**
 * Evaluate a single element snapshot for contact relevance.
 *
 * Rules, in order:
 *   A. `data-guardian` containing "contact" -> one high-confidence candidate,
 *      returned immediately (no other rule runs).
 *   B. href matches a contact token        -> high confidence.
 *   C. visible text matches a contact token -> medium confidence, raised to
 *      high (source "nav/footer") when the element sits in a nav or footer.
 *   D. aria-label or title matches a contact token -> medium confidence.
 *      Both attributes are checked (aria-label first), so a matching title is
 *      no longer ignored when a non-matching aria-label is present.
 *
 * @param {Object} element - Snapshot with tagName, text, href, dataGuardian,
 *   ariaLabel, title, isInNav and isInFooter
 * @param {string} baseUrl - Accepted for interface compatibility; not used here
 * @returns {Array<Object>} Zero or more candidates, each with selector,
 *   matchedText, matchedToken, source, confidence, href and ariaLabel
 */
function evaluateElement(element, baseUrl = '') {
  const candidates = [];
  const contactTokens = getTokensForTarget('contact');

  // Every candidate shares the same selector/href/ariaLabel; only the match
  // details differ. Key order is kept stable for consumers that print it.
  const makeCandidate = (match) => ({
    selector: buildSelector(element),
    ...match,
    href: element.href,
    ariaLabel: element.ariaLabel,
  });

  // Rule A: data-guardian attribute (highest priority)
  if (element.dataGuardian && normalizeText(element.dataGuardian).includes('contact')) {
    candidates.push(makeCandidate({
      matchedText: element.text || element.dataGuardian,
      matchedToken: 'contact',
      source: DETECTION_SOURCE.DATA_GUARDIAN,
      confidence: CONFIDENCE.HIGH,
    }));
    return candidates;
  }

  // Rule B: href-based detection
  if (element.href) {
    const hrefToken = getMatchedToken(normalizeText(element.href), contactTokens);
    if (hrefToken) {
      candidates.push(makeCandidate({
        matchedText: element.text || element.href,
        matchedToken: hrefToken,
        source: DETECTION_SOURCE.HREF,
        confidence: CONFIDENCE.HIGH,
      }));
    }
  }

  // Rule C: visible text-based detection
  if (element.text) {
    const textToken = getMatchedToken(normalizeText(element.text), contactTokens);
    if (textToken) {
      // Links inside navigation or footer are more likely to be the real thing
      const inNavOrFooter = Boolean(element.isInNav || element.isInFooter);
      candidates.push(makeCandidate({
        matchedText: element.text,
        matchedToken: textToken,
        source: inNavOrFooter ? DETECTION_SOURCE.NAV_FOOTER : DETECTION_SOURCE.TEXT,
        confidence: inNavOrFooter ? CONFIDENCE.HIGH : CONFIDENCE.MEDIUM,
      }));
    }
  }

  // Rule D: aria-label or title attribute (first one that matches wins)
  for (const label of [element.ariaLabel, element.title]) {
    if (!label) {
      continue;
    }
    const labelToken = getMatchedToken(normalizeText(label), contactTokens);
    if (labelToken) {
      candidates.push(makeCandidate({
        matchedText: label,
        matchedToken: labelToken,
        source: DETECTION_SOURCE.ARIA,
        confidence: CONFIDENCE.MEDIUM,
      }));
      break;
    }
  }

  return candidates;
}
188
+
189
/**
 * Build a CSS selector for an element snapshot.
 *
 * Preference order: `data-guardian` attribute, then (for anchors) a substring
 * match on the first path segment of the href, then aria-label, then the bare
 * tag name.
 *
 * Attribute values are escaped before being placed inside the double-quoted
 * CSS string, so labels containing `"`, `\` or line breaks no longer produce
 * a syntactically invalid selector.
 *
 * NOTE(review): when an anchor's href has no path segment the result is
 * `a[href*=""]`; per the Selectors spec an empty substring match represents
 * nothing. That behavior is kept as-is here.
 *
 * @param {Object} element - Snapshot with tagName, href, dataGuardian, ariaLabel
 * @returns {string} CSS selector
 */
function buildSelector(element) {
  // Escape a value for use inside a double-quoted CSS string
  const quote = (value) => String(value)
    .replace(/["\\]/g, '\\$&')
    .replace(/[\n\r\f]/g, (ch) => `\\${ch.charCodeAt(0).toString(16)} `);

  // Prefer data-guardian if available
  if (element.dataGuardian) {
    return `[data-guardian="${quote(element.dataGuardian)}"]`;
  }

  // For links, match on a stable part of the href
  if (element.tagName === 'a' && element.href) {
    return `a[href*="${quote(normalizeHrefForSelector(element.href))}"]`;
  }

  if (element.ariaLabel) {
    return `${element.tagName}[aria-label*="${quote(element.ariaLabel)}"]`;
  }

  // Fallback
  return `${element.tagName}`;
}
211
+
212
/**
 * Reduce an href to a short fragment usable in an `[href*="..."]` selector.
 *
 * The href is resolved against a dummy origin and the first non-empty path
 * segment is returned. If the URL cannot be parsed, the text between the
 * first and second "/" of the raw href is used instead.
 *
 * @param {string} href - Absolute or relative link target
 * @returns {string} First path segment, or '' when there is none
 */
function normalizeHrefForSelector(href) {
  let firstSegment;

  try {
    const { pathname } = new URL(href, 'http://localhost');
    firstSegment = pathname.split('/').find((part) => part);
  } catch {
    // Unparseable URL: fall back to a plain split of the raw string
    firstSegment = href.split('/')[1];
  }

  return firstSegment || '';
}
225
+
226
/**
 * Render one candidate as a single human-readable line.
 *
 * @param {Object} candidate - Needs source, matchedToken and confidence
 * @param {string} language - Language code to report (defaults to 'unknown')
 * @returns {string} e.g. "Contact detected, (lang=en, source=href, token=contact, confidence=high)"
 */
function formatDetectionResult(candidate, language = 'unknown') {
  const { source, matchedToken, confidence } = candidate;

  return `Contact detected, (lang=${language}, source=${source}, token=${matchedToken}, confidence=${confidence})`;
}
241
+
242
/**
 * Advice shown to the user when no contact candidate was detected.
 *
 * @returns {string} Fixed hint text
 */
function getNoContactFoundHint() {
  const hint = 'No contact found. Consider adding a stable marker like data-guardian="contact" or ensure contact link text/href is recognizable.';
  return hint;
}
248
+
249
+ module.exports = {
250
+ detectContactCandidates,
251
+ formatDetectionResult,
252
+ getNoContactFoundHint,
253
+ CONFIDENCE,
254
+ DETECTION_SOURCE
255
+ };
@@ -0,0 +1,200 @@
1
+ /**
2
+ * Semantic Contact Finder Integration
3
+ *
4
+ * Integrates semantic contact detection into the scanning flow.
5
+ * Works with Playwright to find contact links/forms in real pages.
6
+ *
7
+ * Now includes Wave 1.2 detection layers with data-guardian attribute support.
8
+ */
9
+
10
+ const { detectLanguage, getPrimaryLanguage, getLanguageName } = require('./language-detection');
11
+ const { detectContactCandidates, formatDetectionResult, getNoContactFoundHint } = require('./semantic-contact-detection');
12
+ const { getTokensForTarget, normalizeText } = require('./semantic-targets');
13
+ const { detectByLayers, LAYER, CONFIDENCE } = require('./detection-layers');
14
+
15
/**
 * Run language detection and semantic contact detection on a page.
 *
 * Never throws: on failure a warning is logged and the partially filled
 * result is returned with an explanatory `hint`.
 *
 * @param {Page} page - Playwright page object
 * @param {string} baseUrl - Base URL, forwarded to detectContactCandidates
 * @returns {Promise<Object>} `{ language, languageName, candidates, found, hint }`
 *   plus `primaryCandidate` (the first candidate) when at least one was found
 */
async function findContactOnPage(page, baseUrl = '') {
  const outcome = {
    language: 'unknown',
    languageName: 'Unknown',
    candidates: [],
    found: false,
    hint: '',
  };

  try {
    // Language first, so it is reported even if candidate detection fails
    outcome.language = await detectLanguage(page);
    outcome.languageName = getLanguageName(outcome.language);

    const detected = await detectContactCandidates(page, baseUrl);
    outcome.candidates = detected;

    if (detected.length === 0) {
      outcome.hint = getNoContactFoundHint();
      return outcome;
    }

    outcome.found = true;
    outcome.primaryCandidate = detected[0]; // candidates arrive ranked, best first
    return outcome;
  } catch (error) {
    console.warn(`Contact detection failed: ${error.message}`);
    outcome.hint = `Contact detection failed: ${error.message}. Fallback to default selectors.`;
    return outcome;
  }
}
54
+
55
/**
 * Turn ranked candidates into a comma-separated selector chain.
 *
 * Only the first three candidates are considered; those without a selector
 * are skipped.
 *
 * NOTE(review): a second declaration with the same name and body appears
 * later in this file. Function declarations are hoisted and the later one
 * takes precedence, so one of the two should be removed.
 *
 * @param {Array<Object>} candidates - Candidates carrying a `selector` field
 * @returns {string|null} Selector list, or null when nothing usable exists
 */
function generateSelectorsFromCandidates(candidates) {
  if (!candidates || candidates.length === 0) {
    return null;
  }

  // Gather the usable selectors among the top three candidates
  const chain = candidates
    .slice(0, 3)
    .reduce((picked, { selector }) => {
      if (selector) {
        picked.push(selector);
      }
      return picked;
    }, [])
    .join(', ');

  return chain || null;
}
75
+
76
/**
 * Locate a semantic target on a page through the Wave 1.2 detection layers.
 *
 * Detects the page language, delegates the search to detectByLayers and
 * copies its outcome into a flat result object. Never throws: on failure a
 * warning is logged and the result carries the error in `reason`.
 *
 * @param {Page} page - Playwright page object
 * @param {string} target - Detection target (contact, form, submit, about)
 * @param {string} baseUrl - Base URL, forwarded to detectByLayers
 * @returns {Promise<Object>} Result with language, target, found, layer,
 *   confidence, candidates, primaryCandidate, reason, hint and (on success
 *   of the layer call) evidence
 */
async function findElementByLayers(page, target, baseUrl = '') {
  const outcome = {
    language: 'unknown',
    languageName: 'Unknown',
    target,
    found: false,
    layer: null,
    confidence: null,
    candidates: [],
    primaryCandidate: null,
    reason: '',
    hint: '',
  };

  try {
    // Language first, so it is reported even if layer detection fails
    outcome.language = await detectLanguage(page);
    outcome.languageName = getLanguageName(outcome.language);

    // Delegate to the layered detector and mirror its fields
    const layerResult = await detectByLayers(page, target, baseUrl);

    outcome.found = layerResult.found;
    outcome.layer = layerResult.layer;
    outcome.confidence = layerResult.confidence;
    outcome.candidates = layerResult.candidates;
    outcome.primaryCandidate = layerResult.primaryCandidate;
    outcome.evidence = layerResult.evidence;
    outcome.reason = layerResult.reason;

    if (outcome.found) {
      return outcome;
    }

    outcome.hint = `No ${target} detected. Consider adding data-guardian="${target}" attribute for guaranteed stability.`;
    return outcome;
  } catch (error) {
    console.warn(`Detection by layers failed: ${error.message}`);
    outcome.reason = `Detection error: ${error.message}`;
    outcome.hint = 'Fallback to manual configuration or heuristic detection.';
    return outcome;
  }
}
127
+
128
/**
 * Generate a Playwright selector chain from semantic candidates.
 *
 * Takes the first three candidates (callers pass them ranked, best first),
 * keeps those that carry a selector and joins them with ", ".
 *
 * NOTE(review): this is the second declaration of this function in the file.
 * Function declarations are hoisted and the last one takes precedence, so
 * this is presumably the definition bound to the exported name; the earlier
 * copy should be removed.
 *
 * Robustness: non-array input and null/undefined entries are tolerated
 * instead of throwing a TypeError.
 *
 * @param {Array<Object>} candidates - Candidates carrying a `selector` field
 * @returns {string|null} Selector list, or null when nothing usable exists
 */
function generateSelectorsFromCandidates(candidates) {
  if (!Array.isArray(candidates) || candidates.length === 0) {
    return null;
  }

  // Take top 3 candidates and build the selector chain from them
  const selectors = candidates
    .slice(0, 3)
    .map((candidate) => candidate?.selector)
    .filter(Boolean)
    .join(', ');

  return selectors || null;
}
148
+
149
/**
 * Format a detection result for CLI reporting (Wave 1.2 enhanced).
 *
 * Prints the detected language, then either the list of candidates (with the
 * detection layer when the result has one) or the failure reason and hint.
 *
 * Results that have no `target` field (findContactOnPage produces such
 * results) are labelled "target" in both the success and the failure header,
 * instead of printing "undefined" in the success header. A missing or
 * non-array `candidates` field is treated as "nothing found".
 *
 * @param {Object} detectionResult - Result of findContactOnPage or findElementByLayers
 * @returns {string} Multi-line report
 */
function formatDetectionForReport(detectionResult) {
  const candidates = Array.isArray(detectionResult.candidates) ? detectionResult.candidates : [];
  const targetLabel = detectionResult.target || 'target';

  const lines = [
    `🌍 Language Detection: ${detectionResult.languageName}`,
    ` (lang=${detectionResult.language})`,
    '',
  ];

  if (detectionResult.found && candidates.length > 0) {
    // Show detection layer (Wave 1.2)
    if (detectionResult.layer) {
      lines.push(`📍 Detection Layer: ${detectionResult.layer} (confidence: ${detectionResult.confidence})`);
      lines.push(` ${detectionResult.reason}`);
      lines.push('');
    }

    lines.push(`✅ ${targetLabel} Detection Results (${candidates.length} candidate${candidates.length > 1 ? 's' : ''})`);

    candidates.forEach((candidate, idx) => {
      const formatted = formatDetectionResult(candidate, detectionResult.language);
      lines.push(` ${idx + 1}. ${formatted}`);
      if (candidate.matchedText) {
        lines.push(` Text: "${candidate.matchedText}"`);
      }
      if (candidate.href) {
        lines.push(` Link: ${candidate.href}`);
      }
    });
  } else {
    lines.push(`❌ No ${targetLabel} found`);
    if (detectionResult.reason) {
      lines.push(` Reason: ${detectionResult.reason}`);
    }
    if (detectionResult.hint) {
      lines.push(` 💡 ${detectionResult.hint}`);
    }
  }

  return lines.join('\n');
}
194
+
195
+ module.exports = {
196
+ findContactOnPage,
197
+ findElementByLayers,
198
+ generateSelectorsFromCandidates,
199
+ formatDetectionForReport
200
+ };
@@ -0,0 +1,234 @@
1
+ /**
2
+ * Semantic Targets & Multilingual Dictionary
3
+ *
4
+ * Provides deterministic, language-independent detection of semantic targets
5
+ * (contact, about, etc.) using normalized tokens from multiple languages.
6
+ */
7
+
8
/**
 * Multilingual dictionary for semantic targets.
 *
 * Keys are target names; values are arrays of token variants. Tokens are
 * passed through normalizeText before comparison, so accents, case and
 * punctuation in an entry do not matter. Entries are grouped by language for
 * readability only; the same token may therefore appear under more than one
 * language. Order matters: getMatchedToken reports the first matching entry.
 *
 * Fixed: the compact form of "write to us" was misspelled 'writetus'.
 */
const SEMANTIC_DICTIONARY = {
  contact: [
    // English
    'contact', 'contactus', 'contact-us', 'contact us', 'get in touch', 'getintouch',
    'reach out', 'reachout', 'contact form', 'contactform', 'contact page', 'contactpage',
    'inquiry', 'inquiries', 'message us', 'messageus', 'write to us', 'writetous',
    // German
    'kontakt', 'kontaktieren', 'kontaktaufnahme', 'kontaktformular', 'kontakten',
    'kontakts', 'kontakt formular', 'kontakt-formular', 'anfrage', 'anfragen',
    // Spanish
    'contacto', 'contactanos', 'contacta', 'formulario de contacto',
    'pongase en contacto', 'ponte en contacto', 'escribenos', 'escriba',
    // French
    'contact', 'contactez', 'contactez-nous', 'formulaire de contact',
    'nous contacter', 'nous ecrire',
    // Portuguese
    'contato', 'contacto', 'formulario de contato', 'entre em contato',
    'fale conosco', 'escreva para nos',
    // Italian
    'contatti', 'contatto', 'contattaci', 'modulo di contatto',
    'modulo contatti', 'mettersi in contatto',
    // Dutch
    'contact', 'contacteer', 'contact opnemen', 'contactformulier',
    // Swedish
    'kontakt', 'kontakta', 'kontaktformular',
    // Arabic
    'تواصل', 'اتصل', 'استفسار', 'استفسارات', 'نموذج الاتصال', 'نموذج تواصل',
    // Chinese
    '联系', '联系我们', '联系表单', '留言', '反馈'
  ],
  about: [
    // English
    'about', 'about us', 'aboutus', 'our story', 'about-us',
    'company', 'team', 'who we are', 'whoweare', 'more about us',
    // German
    'uber', 'über', 'ueber', 'uber uns', 'über uns', 'ueber uns',
    'uber unsere', 'über unsere', 'ueber unsere', 'team', 'unternehmen',
    // Spanish
    'acerca', 'acerca de', 'acerca de nosotros', 'sobre nosotros',
    'quienes somos', 'quiénes somos', 'nuestra empresa',
    // French
    'a propos', 'à propos', 'a propos de nous', 'à propos de nous',
    'qui sommes nous', 'qui nous sommes', 'notre histoire',
    // Portuguese
    'sobre', 'sobre nos', 'sobre nós', 'quem somos',
    // Italian
    'chi siamo', 'chi siamo noi', 'la nostra storia',
    // Dutch
    'over', 'over ons', 'wie zijn we',
    // Swedish
    'om', 'om oss', 'var historia',
    // Arabic
    'عن', 'عننا', 'عن الشركة', 'فريقنا', 'قصتنا'
  ],
  form: [
    // English
    'form', 'form submission', 'contact form', 'feedback form', 'inquiry form',
    'form page', 'form element', 'form section', 'form area',
    // German
    'formular', 'form', 'kontaktformular', 'feedback formular',
    // Spanish
    'formulario', 'formulario de contacto', 'formulario de envio',
    // French
    'formulaire', 'formulaire de contact', 'formulaire d envoi',
    // Portuguese
    'formulario', 'formulario de contato', 'formulario de envio',
    // Italian
    'modulo', 'modulo di contatto', 'modulo di invio',
    // Dutch
    'formulier', 'contactformulier', 'formulier voor contact',
    // Swedish
    'formular', 'kontaktformular', 'feedback formular'
  ],
  submit: [
    // English
    'submit', 'send', 'send message', 'send form', 'submit form',
    'submit button', 'send button', 'post', 'publish', 'share',
    // German
    'senden', 'absenden', 'abschicken', 'submit', 'ubermitteln',
    // Spanish
    'enviar', 'enviar formulario', 'enviar mensaje', 'publicar',
    // French
    'envoyer', 'soumettre', 'publier', 'partager',
    // Portuguese
    'enviar', 'enviar formulario', 'enviar mensagem', 'publicar',
    // Italian
    'inviare', 'inviare modulo', 'inviare messaggio', 'pubblicare',
    // Dutch
    'verzenden', 'verstuur', 'verstuur formulier', 'publiceren',
    // Swedish
    'skicka', 'skicka formular', 'publicera'
  ]
};
105
+
106
/**
 * Character class for "part of a word" in any script: letters, combining
 * marks, digits and underscore. Used to build Unicode-aware word boundaries.
 */
const WORD_CHAR_CLASS = '[\\p{L}\\p{M}\\p{N}_]';

/**
 * Scripts written without spaces between words. A word-boundary test makes
 * no sense for them, so their tokens are always matched as substrings.
 */
const UNSEGMENTED_SCRIPT = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u;

/**
 * Normalize text for comparison
 * - Lowercase
 * - Remove Latin diacritics (é → e, ü → u, etc.)
 * - Replace punctuation and symbols with spaces
 * - Collapse multiple spaces and trim
 *
 * Letters and digits of every script are kept. The previous ASCII-only
 * `\w` filter turned Arabic and Chinese text into an empty string, which made
 * the dictionary entries for those languages unmatchable.
 *
 * @param {*} text - Value to normalize; anything that is not a string yields ''
 * @returns {string} Normalized text
 */
function normalizeText(text) {
  if (typeof text !== 'string') {
    return '';
  }

  return text
    .toLowerCase()
    // NFD splits accented characters into base + combining mark; drop the marks
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    // Anything that is not a letter, mark, digit, underscore or whitespace
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Decide whether one normalized token occurs in normalized text.
 *
 * Tokens longer than 4 characters, and tokens in unsegmented scripts, match
 * as plain substrings. Short tokens must stand alone as a word so that e.g.
 * "form" does not match "platform".
 */
function matchesNormalizedToken(normalizedText, normalizedToken) {
  if (normalizedToken.length > 4 || UNSEGMENTED_SCRIPT.test(normalizedToken)) {
    return normalizedText.includes(normalizedToken);
  }

  // Defensive: normalized tokens should not contain regex syntax, but escape anyway
  const escaped = normalizedToken.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Lookarounds instead of \b, which only understands ASCII word characters
  const standalone = new RegExp(`(?<!${WORD_CHAR_CLASS})${escaped}(?!${WORD_CHAR_CLASS})`, 'u');
  return standalone.test(normalizedText);
}

/**
 * Check if normalized text includes any token from the list.
 * Uses the same matching rules as getMatchedToken.
 *
 * @param {string} normalizedText - Text already passed through normalizeText
 * @param {Array<string>} tokenList - Raw (un-normalized) tokens
 * @returns {boolean} True when at least one token matches
 */
function includesAnyToken(normalizedText, tokenList) {
  return getMatchedToken(normalizedText, tokenList) !== null;
}

/**
 * Get the first token from a list that matches the given text.
 *
 * Each token is normalized before comparison; tokens that normalize to an
 * empty string are skipped.
 *
 * @param {string} normalizedText - Text already passed through normalizeText
 * @param {Array<string>} tokenList - Raw (un-normalized) tokens
 * @returns {string|null} The matching token exactly as it appears in the
 *   list, or null when nothing matches or the arguments are unusable
 */
function getMatchedToken(normalizedText, tokenList) {
  if (!normalizedText || !Array.isArray(tokenList)) {
    return null;
  }

  for (const token of tokenList) {
    const normalizedToken = normalizeText(token);

    if (normalizedToken && matchesNormalizedToken(normalizedText, normalizedToken)) {
      return token;
    }
  }

  return null;
}
204
+
205
/**
 * List the target names defined in the dictionary.
 *
 * @returns {Array<string>} Own enumerable keys of SEMANTIC_DICTIONARY
 */
function getAvailableTargets() {
  const targetNames = Object.keys(SEMANTIC_DICTIONARY);
  return targetNames;
}
211
+
212
/**
 * Check if a semantic target exists in dictionary.
 *
 * Only the dictionary's own keys count. The `in` operator also accepted
 * names inherited from Object.prototype such as "toString" or "constructor".
 *
 * @param {string} targetName - Candidate target name
 * @returns {boolean} True when the dictionary defines this target
 */
function isValidTarget(targetName) {
  return Object.prototype.hasOwnProperty.call(SEMANTIC_DICTIONARY, targetName);
}
218
+
219
/**
 * Get token list for a specific target.
 *
 * Unknown names yield an empty array. That now includes names inherited from
 * Object.prototype (e.g. "constructor"), which previously returned the
 * inherited value instead of a token list.
 *
 * @param {string} targetName - Target name, e.g. 'contact'
 * @returns {Array<string>} The dictionary's token array for the target, or []
 */
function getTokensForTarget(targetName) {
  const isKnown = Object.prototype.hasOwnProperty.call(SEMANTIC_DICTIONARY, targetName);
  return (isKnown && SEMANTIC_DICTIONARY[targetName]) || [];
}
225
+
226
+ module.exports = {
227
+ SEMANTIC_DICTIONARY,
228
+ normalizeText,
229
+ includesAnyToken,
230
+ getMatchedToken,
231
+ getAvailableTargets,
232
+ isValidTarget,
233
+ getTokensForTarget
234
+ };