@adsim/wordpress-mcp-server 3.1.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/README.md +564 -176
  2. package/dxt/manifest.json +93 -9
  3. package/index.js +3624 -36
  4. package/package.json +1 -1
  5. package/src/confirmationToken.js +64 -0
  6. package/src/contentAnalyzer.js +476 -0
  7. package/src/htmlParser.js +80 -0
  8. package/src/linkUtils.js +158 -0
  9. package/src/pluginDetector.js +158 -0
  10. package/src/utils/contentCompressor.js +116 -0
  11. package/src/woocommerceClient.js +88 -0
  12. package/tests/unit/contentAnalyzer.test.js +397 -0
  13. package/tests/unit/pluginDetector.test.js +167 -0
  14. package/tests/unit/tools/analyzeEeatSignals.test.js +192 -0
  15. package/tests/unit/tools/approval.test.js +251 -0
  16. package/tests/unit/tools/auditCanonicals.test.js +149 -0
  17. package/tests/unit/tools/auditHeadingStructure.test.js +150 -0
  18. package/tests/unit/tools/auditMediaSeo.test.js +123 -0
  19. package/tests/unit/tools/auditOutboundLinks.test.js +175 -0
  20. package/tests/unit/tools/auditTaxonomies.test.js +173 -0
  21. package/tests/unit/tools/contentCompressor.test.js +320 -0
  22. package/tests/unit/tools/contentIntelligence.test.js +2168 -0
  23. package/tests/unit/tools/destructive.test.js +246 -0
  24. package/tests/unit/tools/findBrokenInternalLinks.test.js +222 -0
  25. package/tests/unit/tools/findKeywordCannibalization.test.js +183 -0
  26. package/tests/unit/tools/findOrphanPages.test.js +145 -0
  27. package/tests/unit/tools/findThinContent.test.js +145 -0
  28. package/tests/unit/tools/internalLinks.test.js +283 -0
  29. package/tests/unit/tools/perTargetControls.test.js +228 -0
  30. package/tests/unit/tools/pluginIntelligence.test.js +864 -0
  31. package/tests/unit/tools/site.test.js +6 -1
  32. package/tests/unit/tools/woocommerce.test.js +344 -0
  33. package/tests/unit/tools/woocommerceIntelligence.test.js +341 -0
  34. package/tests/unit/tools/woocommerceWrite.test.js +323 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adsim/wordpress-mcp-server",
3
- "version": "3.1.0",
3
+ "version": "4.5.0",
4
4
  "description": "A Model Context Protocol (MCP) server for WordPress REST API integration. Manage posts, search content, and interact with your WordPress site through any MCP-compatible client.",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -0,0 +1,64 @@
1
+ import { createHash } from 'crypto';
2
+
3
+ const SECRET = process.env.WP_MCP_SECRET ?? 'mcp-default-secret';
4
+
5
+ /**
6
+ * Generate a stateless confirmation token for a destructive action.
7
+ * Format: mcp_{action}_{postId}_{timestamp}_{hash4}
8
+ */
9
+ export function generateToken(postId, action) {
10
+ const timestamp = Math.floor(Date.now() / 1000);
11
+ const hash4 = createHash('sha256')
12
+ .update(`${postId}${action}${timestamp}${SECRET}`)
13
+ .digest('hex')
14
+ .substring(0, 4);
15
+ return `mcp_${action}_${postId}_${timestamp}_${hash4}`;
16
+ }
17
+
18
+ /**
19
+ * Validate a confirmation token.
20
+ * @returns {{ valid: boolean, reason?: string }}
21
+ */
22
+ export function validateToken(token, postId, action, ttlSeconds = 60) {
23
+ if (typeof token !== 'string' || !token.startsWith('mcp_')) {
24
+ return { valid: false, reason: 'Invalid token format' };
25
+ }
26
+
27
+ // We know postId and action, so reconstruct expected prefix
28
+ const expectedPrefix = `mcp_${action}_${postId}_`;
29
+ if (!token.startsWith(expectedPrefix)) {
30
+ return { valid: false, reason: 'Token does not match post or action' };
31
+ }
32
+
33
+ const suffix = token.slice(expectedPrefix.length);
34
+ const lastUnderscore = suffix.lastIndexOf('_');
35
+ if (lastUnderscore === -1) {
36
+ return { valid: false, reason: 'Invalid token format' };
37
+ }
38
+
39
+ const timestampStr = suffix.substring(0, lastUnderscore);
40
+ const hash4 = suffix.substring(lastUnderscore + 1);
41
+ const timestamp = parseInt(timestampStr, 10);
42
+
43
+ if (isNaN(timestamp)) {
44
+ return { valid: false, reason: 'Invalid token format' };
45
+ }
46
+
47
+ // Check expiry
48
+ const now = Math.floor(Date.now() / 1000);
49
+ if (now - timestamp > ttlSeconds) {
50
+ return { valid: false, reason: 'Token expired' };
51
+ }
52
+
53
+ // Verify hash
54
+ const expectedHash = createHash('sha256')
55
+ .update(`${postId}${action}${timestamp}${SECRET}`)
56
+ .digest('hex')
57
+ .substring(0, 4);
58
+
59
+ if (hash4 !== expectedHash) {
60
+ return { valid: false, reason: 'Invalid token hash' };
61
+ }
62
+
63
+ return { valid: true };
64
+ }
@@ -0,0 +1,476 @@
1
+ /**
2
+ * Content analysis utilities for WordPress MCP Server — Content Intelligence v4.4.
3
+ * Readability scoring, transition words, passive voice, content structure detection.
4
+ * Zero external dependencies — regex-based analysis.
5
+ */
6
+
7
+ import { extractHeadings } from './htmlParser.js';
8
+
9
+ // ── French vowels for syllable counting ──
10
+ const VOWELS = /[aeiouyàâäéèêëïîôùûüœæ]/gi;
11
+
12
+ /**
13
+ * Count syllables in a French word using vowel-group heuristic.
14
+ * @param {string} word
15
+ * @returns {number}
16
+ */
17
+ export function countSyllablesFr(word) {
18
+ if (!word) return 0;
19
+ let w = word.toLowerCase().trim();
20
+ if (w.length === 0) return 0;
21
+
22
+ // Strip trailing silent 'e' (unless word <= 3 chars)
23
+ if (w.length > 3 && w.endsWith('e') && !w.endsWith('ée') && !w.endsWith('ie') && !w.endsWith('ue')) {
24
+ w = w.slice(0, -1);
25
+ }
26
+
27
+ // Count vowel groups
28
+ let count = 0;
29
+ let prevVowel = false;
30
+ for (const ch of w) {
31
+ const isVowel = /[aeiouyàâäéèêëïîôùûüœæ]/.test(ch);
32
+ if (isVowel && !prevVowel) count++;
33
+ prevVowel = isVowel;
34
+ }
35
+
36
+ return Math.max(1, count);
37
+ }
38
+
39
+ /**
40
+ * Strip HTML tags and return plain text.
41
+ * @param {string} html
42
+ * @returns {string}
43
+ */
44
+ function stripToText(html) {
45
+ if (!html) return '';
46
+ return html
47
+ .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
48
+ .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
49
+ .replace(/<\/?(p|div|h[1-6]|li|br|tr|blockquote|pre)[^>]*>/gi, '\n')
50
+ .replace(/<[^>]+>/g, '')
51
+ .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>')
52
+ .replace(/&quot;/g, '"').replace(/&#039;/g, "'").replace(/&nbsp;/g, ' ')
53
+ .replace(/\n{3,}/g, '\n\n')
54
+ .replace(/[ \t]+/g, ' ')
55
+ .trim();
56
+ }
57
+
58
+ /**
59
+ * Split text into sentences.
60
+ * @param {string} text Plain text (no HTML)
61
+ * @returns {string[]}
62
+ */
63
+ function splitSentences(text) {
64
+ if (!text) return [];
65
+ return text
66
+ .split(/[.!?…]+/)
67
+ .map(s => s.trim())
68
+ .filter(s => s.length > 0);
69
+ }
70
+
71
+ /**
72
+ * Calculate readability score using French-adapted Flesch-Kincaid formula.
73
+ * @param {string} html
74
+ * @param {string} lang Language code (default 'fr')
75
+ * @returns {{ score: number, sentences: number, words: number, syllables: number, avg_words_per_sentence: number, avg_syllables_per_word: number, level: string }}
76
+ */
77
+ export function calculateReadabilityScore(html, lang = 'fr') {
78
+ const text = stripToText(html);
79
+ const sentences = splitSentences(text);
80
+ const sentenceCount = sentences.length || 1;
81
+ const words = text.split(/\s+/).filter(w => w.length > 0);
82
+ const wordCount = words.length || 1;
83
+
84
+ let totalSyllables = 0;
85
+ for (const w of words) {
86
+ totalSyllables += countSyllablesFr(w);
87
+ }
88
+
89
+ const avgWordsPerSentence = wordCount / sentenceCount;
90
+ const avgSyllablesPerWord = totalSyllables / wordCount;
91
+
92
+ // French-adapted Flesch-Kincaid
93
+ let raw = 207 - 1.015 * avgWordsPerSentence - 73.6 * avgSyllablesPerWord;
94
+ const score = Math.round(Math.max(0, Math.min(100, raw)) * 10) / 10;
95
+
96
+ let level;
97
+ if (score >= 80) level = 'très facile';
98
+ else if (score >= 60) level = 'facile';
99
+ else if (score >= 40) level = 'standard';
100
+ else if (score >= 20) level = 'difficile';
101
+ else level = 'très difficile';
102
+
103
+ return {
104
+ score,
105
+ sentences: sentenceCount,
106
+ words: wordCount,
107
+ syllables: totalSyllables,
108
+ avg_words_per_sentence: Math.round(avgWordsPerSentence * 10) / 10,
109
+ avg_syllables_per_word: Math.round(avgSyllablesPerWord * 100) / 100,
110
+ level
111
+ };
112
+ }
113
+
114
+ // ── French transition words ──
115
+ const TRANSITION_WORDS_FR = [
116
+ 'cependant', 'néanmoins', 'en effet', 'par conséquent', 'de plus',
117
+ 'en outre', 'toutefois', 'ainsi', 'par ailleurs', 'en revanche',
118
+ "c'est pourquoi", 'autrement dit', 'en somme', "d'une part", "d'autre part",
119
+ 'premièrement', 'deuxièmement', 'finalement', 'en conclusion', 'en résumé',
120
+ 'par exemple', 'notamment', 'en particulier', "c'est-à-dire", 'à savoir',
121
+ 'bien que', 'malgré', 'alors que', 'tandis que', 'puisque',
122
+ 'car', 'donc', 'or', 'mais', 'pourtant',
123
+ 'du coup', 'ensuite', 'puis', 'enfin', "d'abord"
124
+ ];
125
+
126
+ /**
127
+ * Extract transition words found in text.
128
+ * @param {string} text Plain text
129
+ * @param {string} lang Language code (default 'fr')
130
+ * @returns {{ count: number, density: number, words_found: string[] }}
131
+ */
132
+ export function extractTransitionWords(text, lang = 'fr') {
133
+ if (!text) return { count: 0, density: 0, words_found: [] };
134
+ const lower = text.toLowerCase();
135
+ const sentences = splitSentences(text);
136
+ const sentenceCount = sentences.length || 1;
137
+ const found = [];
138
+
139
+ for (const tw of TRANSITION_WORDS_FR) {
140
+ // Word-boundary aware search (handles multi-word transitions)
141
+ const escaped = tw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
142
+ const regex = new RegExp(`(?:^|[\\s,;:(])${escaped}(?:[\\s,;:.!?)]|$)`, 'gi');
143
+ if (regex.test(lower)) {
144
+ found.push(tw);
145
+ }
146
+ }
147
+
148
+ return {
149
+ count: found.length,
150
+ density: Math.round((found.length / sentenceCount) * 100) / 100,
151
+ words_found: found
152
+ };
153
+ }
154
+
155
+ // ── French passive voice patterns ──
156
+ const PASSIVE_AUX_PATTERN = /\b(?:est|sont|a\s+été|ont\s+été|sera|seront|fut|furent|était|étaient)\b/gi;
157
+ const PAST_PARTICIPLE = /\b\w+(?:é|ée|és|ées|i|ie|is|ise|it|ite|u|ue|us|ues|t|te|ts|tes)\b/;
158
+
159
+ /**
160
+ * Count sentences containing passive voice constructions.
161
+ * @param {string} text Plain text
162
+ * @param {string} lang Language code (default 'fr')
163
+ * @returns {{ count: number, total_sentences: number, ratio: number }}
164
+ */
165
+ export function countPassiveSentences(text, lang = 'fr') {
166
+ if (!text) return { count: 0, total_sentences: 0, ratio: 0 };
167
+ const sentences = splitSentences(text);
168
+ let passiveCount = 0;
169
+
170
+ for (const sentence of sentences) {
171
+ const auxRegex = new RegExp(PASSIVE_AUX_PATTERN.source, 'gi');
172
+ let match;
173
+ while ((match = auxRegex.exec(sentence)) !== null) {
174
+ const after = sentence.slice(match.index + match[0].length).trim();
175
+ const nextWords = after.split(/\s+/).slice(0, 3).join(' ');
176
+ if (PAST_PARTICIPLE.test(nextWords)) {
177
+ passiveCount++;
178
+ break;
179
+ }
180
+ }
181
+ }
182
+
183
+ return {
184
+ count: passiveCount,
185
+ total_sentences: sentences.length,
186
+ ratio: sentences.length > 0 ? Math.round((passiveCount / sentences.length) * 100) / 100 : 0
187
+ };
188
+ }
189
+
190
+ /**
191
+ * Extract headings outline from HTML (H1-H6).
192
+ * @param {string} html
193
+ * @returns {{ level: number, text: string }[]}
194
+ */
195
+ export function extractHeadingsOutline(html) {
196
+ return extractHeadings(html);
197
+ }
198
+
199
+ // ── Content section detection ──
200
+ const CONCLUSION_KEYWORDS = ['conclusion', 'résumé', 'en résumé', 'pour conclure', 'en conclusion'];
201
+ const FAQ_KEYWORDS = ['faq', 'questions fréquentes', 'questions courantes', 'foire aux questions'];
202
+
203
+ /**
204
+ * Detect content structure: intro, conclusion, FAQ, lists, tables, images.
205
+ * @param {string} html
206
+ * @returns {{ has_intro: boolean, has_conclusion: boolean, has_faq: boolean, lists_count: number, tables_count: number, images_count: number, headings_count: number }}
207
+ */
208
+ export function detectContentSections(html) {
209
+ if (!html) return { has_intro: false, has_conclusion: false, has_faq: false, lists_count: 0, tables_count: 0, images_count: 0, headings_count: 0 };
210
+
211
+ const headings = extractHeadings(html);
212
+ const headingsCount = headings.length;
213
+
214
+ // Intro = content before first H2
215
+ const firstH2Idx = html.search(/<h2\b/i);
216
+ const has_intro = firstH2Idx > 0 && stripToText(html.substring(0, firstH2Idx)).length > 30;
217
+
218
+ // Conclusion = last H2+ section contains conclusion keywords
219
+ let has_conclusion = false;
220
+ if (headings.length > 0) {
221
+ const lastHeading = headings[headings.length - 1];
222
+ const lastText = lastHeading.text.toLowerCase();
223
+ has_conclusion = CONCLUSION_KEYWORDS.some(kw => lastText.includes(kw));
224
+ }
225
+
226
+ // FAQ detection
227
+ const lowerHtml = html.toLowerCase();
228
+ const has_faq = FAQ_KEYWORDS.some(kw => lowerHtml.includes(kw));
229
+
230
+ // Counts
231
+ const lists_count = (html.match(/<(?:ul|ol)\b/gi) || []).length;
232
+ const tables_count = (html.match(/<table\b/gi) || []).length;
233
+ const images_count = (html.match(/<img\b/gi) || []).length;
234
+
235
+ return { has_intro, has_conclusion, has_faq, lists_count, tables_count, images_count, headings_count: headingsCount };
236
+ }
237
+
238
+ // ── TF-IDF and Cosine Similarity ──
239
+
240
+ const STOP_WORDS_FR = new Set(['les', 'des', 'une', 'pour', 'dans', 'par', 'sur', 'avec', 'son', 'ses', 'aux', 'qui', 'que', 'est', 'sont', 'ont', 'été', 'pas', 'plus', 'tout', 'tous', 'cette', 'ces', 'mais', 'comme', 'être', 'avoir', 'faire', 'peut', 'nous', 'vous', 'ils', 'elle', 'leur', 'même', 'entre', 'après', 'sans', 'aussi', 'bien', 'quel', 'autre', 'très', 'encore', 'fait', 'dit', 'deux', 'dont', 'quand']);
241
+
242
+ /**
243
+ * Build TF-IDF vectors for a set of documents.
244
+ * @param {{ id: any, text: string }[]} documents
245
+ * @returns {{ vectors: Map<any, Map<string, number>>, terms: Set<string> }}
246
+ */
247
+ export function buildTFIDFVectors(documents) {
248
+ if (!documents || documents.length === 0) return { vectors: new Map(), terms: new Set() };
249
+
250
+ const tokenize = (text) => {
251
+ return (text || '').toLowerCase()
252
+ .split(/[^a-zàâäéèêëïîôùûüœæç0-9]+/i)
253
+ .filter(t => t.length >= 3 && !STOP_WORDS_FR.has(t));
254
+ };
255
+
256
+ const docTokens = new Map();
257
+ const docContainingTerm = new Map();
258
+
259
+ for (const doc of documents) {
260
+ const tokens = tokenize(doc.text);
261
+ const tf = new Map();
262
+ const total = tokens.length || 1;
263
+ const seen = new Set();
264
+
265
+ for (const token of tokens) {
266
+ tf.set(token, (tf.get(token) || 0) + 1 / total);
267
+ if (!seen.has(token)) {
268
+ seen.add(token);
269
+ if (!docContainingTerm.has(token)) docContainingTerm.set(token, new Set());
270
+ docContainingTerm.get(token).add(doc.id);
271
+ }
272
+ }
273
+ docTokens.set(doc.id, tf);
274
+ }
275
+
276
+ const totalDocs = documents.length;
277
+ const allTerms = new Set();
278
+ const vectors = new Map();
279
+
280
+ for (const doc of documents) {
281
+ const tf = docTokens.get(doc.id);
282
+ const tfidf = new Map();
283
+ for (const [term, tfVal] of tf) {
284
+ const idf = Math.log(1 + totalDocs / docContainingTerm.get(term).size);
285
+ tfidf.set(term, tfVal * idf);
286
+ allTerms.add(term);
287
+ }
288
+ vectors.set(doc.id, tfidf);
289
+ }
290
+
291
+ return { vectors, terms: allTerms };
292
+ }
293
+
294
+ /**
295
+ * Compute cosine similarity between two TF-IDF vectors.
296
+ * @param {Map<string, number>} vec1
297
+ * @param {Map<string, number>} vec2
298
+ * @returns {number}
299
+ */
300
+ export function computeCosineSimilarity(vec1, vec2) {
301
+ if (!vec1 || !vec2 || vec1.size === 0 || vec2.size === 0) return 0;
302
+
303
+ let dot = 0;
304
+ let mag1 = 0;
305
+ let mag2 = 0;
306
+
307
+ for (const [term, val] of vec1) {
308
+ mag1 += val * val;
309
+ if (vec2.has(term)) dot += val * vec2.get(term);
310
+ }
311
+ for (const [, val] of vec2) {
312
+ mag2 += val * val;
313
+ }
314
+
315
+ const denom = Math.sqrt(mag1) * Math.sqrt(mag2);
316
+ return denom === 0 ? 0 : dot / denom;
317
+ }
318
+
319
+ /**
320
+ * Find near-duplicate document pairs above a similarity threshold.
321
+ * @param {{ id: any, title: string, text: string }[]} documents
322
+ * @param {number} threshold
323
+ * @returns {{ doc1_id: any, doc2_id: any, similarity: number }[]}
324
+ */
325
+ export function findDuplicatePairs(documents, threshold = 0.7) {
326
+ if (!documents || documents.length < 2) return [];
327
+
328
+ const { vectors } = buildTFIDFVectors(documents);
329
+ const pairs = [];
330
+ const ids = [...vectors.keys()];
331
+
332
+ for (let i = 0; i < ids.length; i++) {
333
+ for (let j = i + 1; j < ids.length; j++) {
334
+ const sim = computeCosineSimilarity(vectors.get(ids[i]), vectors.get(ids[j]));
335
+ if (sim >= threshold) {
336
+ pairs.push({ doc1_id: ids[i], doc2_id: ids[j], similarity: sim });
337
+ }
338
+ }
339
+ }
340
+
341
+ return pairs;
342
+ }
343
+
344
+ // ── Named Entity Extraction ──
345
+
346
+ const ENTITY_EXCLUSIONS = new Set(['Le', 'La', 'Les', 'Un', 'Une', 'Des', 'Ce', 'Cette', 'Ces', 'Il', 'Elle', 'Ils', 'Elles', 'On', 'Nous', 'Vous', 'Mon', 'Ma', 'Mes', 'Son', 'Sa', 'Ses', 'Leur', 'Leurs', 'Notre', 'Votre', 'Tout', 'Tous', 'Toute', 'Toutes', 'Quel', 'Quelle', 'Mais', 'Donc', 'Car', 'Puis', 'Aussi', 'Bien', 'Très', 'Plus', 'Moins', 'Pour', 'Dans', 'Avec', 'Sans', 'Sur', 'Sous', 'Par', 'Entre', 'Après', 'Avant', 'Depuis', 'Pendant', 'Comme', 'Si', 'Quand', 'Où', 'Comment', 'Pourquoi', 'À', 'Au', 'Aux', 'En', 'De', 'Du']);
347
+
348
+ const KNOWN_BRANDS = new Set(['Google', 'Facebook', 'Meta', 'Microsoft', 'Apple', 'Amazon', 'AWS', 'Azure', 'WordPress', 'Shopify', 'HubSpot', 'Salesforce', 'SEMrush', 'Ahrefs', 'Moz', 'Yoast', 'RankMath', 'WooCommerce', 'Elementor', 'Cloudflare', 'GitHub', 'Twitter', 'LinkedIn', 'Instagram', 'YouTube', 'TikTok', 'ChatGPT', 'OpenAI', 'Anthropic', 'Claude']);
349
+
350
+ const KNOWN_LOCATIONS = new Set(['Belgique', 'France', 'Bruxelles', 'Paris', 'Liège', 'Europe', 'Wallonie', 'Flandre', 'Luxembourg', 'Suisse', 'Genève', 'Canada', 'Montréal', 'Québec', 'États-Unis', 'New York', 'Londres', 'Berlin', 'Amsterdam']);
351
+
352
+ const KNOWN_FIRSTNAMES = new Set(['Jean', 'Pierre', 'Marie', 'Paul', 'Michel', 'Jacques', 'Philippe', 'François', 'Nicolas', 'Laurent', 'Julien', 'Thomas', 'David', 'Sophie', 'Julie', 'Isabelle', 'Nathalie', 'Stéphane', 'Christophe', 'Sébastien', 'Georges', 'Antoine', 'Alexandre', 'Marc', 'Olivier']);
353
+
354
+ const BRAND_CONTEXT_WORDS = ['plateforme', 'outil', 'logiciel', 'solution', 'service', 'application', 'app'];
355
+ const ORG_SUFFIXES = ['SA', 'SRL', 'SPRL', 'ASBL', 'Inc', 'Corp', 'Ltd', 'GmbH', 'SAS', 'SARL'];
356
+ const ORG_PREFIXES = ["l'entreprise", 'la société', "l'agence", 'le groupe'];
357
+
358
+ /**
359
+ * Extract named entities from plain text using regex heuristics.
360
+ * @param {string} text Plain text (no HTML)
361
+ * @returns {{ name: string, type: string, count: number, contexts: string[] }[]}
362
+ */
363
+ export function extractEntities(text) {
364
+ if (!text) return [];
365
+
366
+ const sentences = text.split(/(?<=[.!?…])\s+/).filter(s => s.length > 0);
367
+ const entityMap = new Map(); // name -> { type, count, contexts }
368
+
369
+ for (const sentence of sentences) {
370
+ const words = sentence.split(/\s+/);
371
+ if (words.length < 2) continue;
372
+
373
+ let i = 0;
374
+ while (i < words.length) {
375
+ const raw = words[i];
376
+ const clean = raw.replace(/[,;:.!?()]+$/, '');
377
+ if (/^[A-ZÀ-Ÿ]/.test(clean) && !ENTITY_EXCLUSIONS.has(clean)) {
378
+ // Group consecutive capitalized words (break on trailing punctuation like comma)
379
+ const parts = [clean];
380
+ const hasPunct = raw !== clean; // word had trailing punctuation
381
+ let j = i + 1;
382
+ if (!hasPunct) {
383
+ while (j < words.length) {
384
+ const rawJ = words[j];
385
+ const cleanJ = rawJ.replace(/[,;:.!?()]+$/, '');
386
+ if (!/^[A-ZÀ-Ÿ]/.test(cleanJ) || ENTITY_EXCLUSIONS.has(cleanJ)) break;
387
+ parts.push(cleanJ);
388
+ if (rawJ !== cleanJ) { j++; break; } // trailing punctuation breaks group
389
+ j++;
390
+ }
391
+ } else {
392
+ // Trailing punct on first word — don't group further
393
+ }
394
+ const entityName = parts.join(' ');
395
+ if (entityName.length < 2) { i = j; continue; }
396
+
397
+ // Classify
398
+ const afterWords = words.slice(j, j + 3).map(w2 => w2.toLowerCase());
399
+ const beforeWords = words.slice(Math.max(0, i - 3), i).map(w2 => w2.toLowerCase()).join(' ');
400
+ let type = 'unknown';
401
+
402
+ // Check brand
403
+ if (parts.some(p => KNOWN_BRANDS.has(p)) || BRAND_CONTEXT_WORDS.some(bc => afterWords.includes(bc))) {
404
+ type = 'brand';
405
+ }
406
+ // Check location
407
+ else if (KNOWN_LOCATIONS.has(entityName) || parts.some(p => KNOWN_LOCATIONS.has(p))) {
408
+ type = 'location';
409
+ }
410
+ // Check person (firstname + surname)
411
+ else if (parts.length >= 2 && KNOWN_FIRSTNAMES.has(parts[0])) {
412
+ type = 'person';
413
+ }
414
+ // Check organization
415
+ else if (ORG_SUFFIXES.some(s => afterWords[0] === s.toLowerCase() || parts[parts.length - 1] === s) ||
416
+ ORG_PREFIXES.some(p => beforeWords.includes(p))) {
417
+ type = 'organization';
418
+ }
419
+
420
+ if (!entityMap.has(entityName)) {
421
+ entityMap.set(entityName, { type, count: 0, contexts: [] });
422
+ }
423
+ const entry = entityMap.get(entityName);
424
+ entry.count++;
425
+ if (entry.contexts.length < 2) {
426
+ const ctx = sentence.length > 120 ? sentence.substring(0, 120) + '…' : sentence;
427
+ entry.contexts.push(ctx);
428
+ }
429
+
430
+ i = j;
431
+ } else {
432
+ i++;
433
+ }
434
+ }
435
+ }
436
+
437
+ return [...entityMap.entries()].map(([name, data]) => ({
438
+ name, type: data.type, count: data.count, contexts: data.contexts
439
+ }));
440
+ }
441
+
442
+ // ── Text Diff ──
443
+
444
+ /**
445
+ * Compute a simplified line-by-line diff between two texts.
446
+ * @param {string} textA Base text
447
+ * @param {string} textB Target text
448
+ * @returns {{ lines_added: number, lines_removed: number, lines_unchanged: number, words_added: number, words_removed: number, change_ratio: number, added_lines: string[], removed_lines: string[] }}
449
+ */
450
+ export function computeTextDiff(textA, textB) {
451
+ const linesA = (textA || '').split('\n').filter(l => l.trim().length > 0);
452
+ const linesB = (textB || '').split('\n').filter(l => l.trim().length > 0);
453
+
454
+ const setA = new Set(linesA);
455
+ const setB = new Set(linesB);
456
+
457
+ const removed = linesA.filter(l => !setB.has(l));
458
+ const added = linesB.filter(l => !setA.has(l));
459
+ const unchanged = linesA.filter(l => setB.has(l));
460
+
461
+ const countW = (lines) => lines.reduce((sum, l) => sum + l.split(/\s+/).filter(w => w.length > 0).length, 0);
462
+
463
+ const total = added.length + removed.length + unchanged.length;
464
+ const changeRatio = total > 0 ? (added.length + removed.length) / total : 0;
465
+
466
+ return {
467
+ lines_added: added.length,
468
+ lines_removed: removed.length,
469
+ lines_unchanged: unchanged.length,
470
+ words_added: countW(added),
471
+ words_removed: countW(removed),
472
+ change_ratio: changeRatio,
473
+ added_lines: added.slice(0, 20),
474
+ removed_lines: removed.slice(0, 20)
475
+ };
476
+ }
@@ -0,0 +1,80 @@
1
+ /**
2
+ * HTML parsing utilities for WordPress MCP Server.
3
+ * Zero external dependencies — regex-based parsing.
4
+ */
5
+
6
+ /**
7
+ * Extract <img> tags from HTML content.
8
+ * @param {string} html
9
+ * @returns {{ src: string, alt: string, title: string }[]}
10
+ */
11
+ export function parseImagesFromHtml(html) {
12
+ if (!html) return [];
13
+ const images = [];
14
+ const regex = /<img\s[^>]*?>/gi;
15
+ let match;
16
+ while ((match = regex.exec(html)) !== null) {
17
+ const tag = match[0];
18
+ const src = (tag.match(/src=["']([^"']+)["']/i) || [])[1] || '';
19
+ const alt = (tag.match(/alt=["']([^"']*?)["']/i) || [])[1] || '';
20
+ const title = (tag.match(/title=["']([^"']*?)["']/i) || [])[1] || '';
21
+ if (src) images.push({ src, alt, title });
22
+ }
23
+ return images;
24
+ }
25
+
26
+ /**
27
+ * Extract headings (H1-H6) from HTML content.
28
+ * @param {string} html
29
+ * @returns {{ level: number, text: string }[]}
30
+ */
31
+ export function extractHeadings(html) {
32
+ if (!html) return [];
33
+ const headings = [];
34
+ const regex = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi;
35
+ let match;
36
+ while ((match = regex.exec(html)) !== null) {
37
+ const level = parseInt(match[1], 10);
38
+ const text = match[2].replace(/<[^>]*>/g, '').trim();
39
+ headings.push({ level, text });
40
+ }
41
+ return headings;
42
+ }
43
+
44
+ /**
45
+ * Extract internal links from HTML (returns normalised absolute URLs).
46
+ * @param {string} html
47
+ * @param {string} siteUrl e.g. https://example.com
48
+ * @returns {string[]}
49
+ */
50
+ export function extractInternalLinks(html, siteUrl) {
51
+ if (!html || !siteUrl) return [];
52
+ let siteHost;
53
+ try { siteHost = new URL(siteUrl).host; } catch { return []; }
54
+ const links = [];
55
+ const regex = /<a\s[^>]*?href=["']([^"']+)["'][^>]*?>/gi;
56
+ let match;
57
+ while ((match = regex.exec(html)) !== null) {
58
+ const href = match[1];
59
+ try {
60
+ if (href.startsWith('/') && !href.startsWith('//')) {
61
+ links.push(`${siteUrl.replace(/\/+$/, '')}${href}`);
62
+ } else if (href.startsWith('http')) {
63
+ if (new URL(href).host === siteHost) links.push(href);
64
+ }
65
+ } catch { /* invalid URL, skip */ }
66
+ }
67
+ return links;
68
+ }
69
+
70
+ /**
71
+ * Count words in HTML content (strips tags first).
72
+ * @param {string} html
73
+ * @returns {number}
74
+ */
75
+ export function countWords(html) {
76
+ if (!html) return 0;
77
+ const text = html.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
78
+ if (!text) return 0;
79
+ return text.split(/\s+/).length;
80
+ }