@adsim/wordpress-mcp-server 3.1.0 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +564 -176
- package/dxt/manifest.json +93 -9
- package/index.js +3624 -36
- package/package.json +1 -1
- package/src/confirmationToken.js +64 -0
- package/src/contentAnalyzer.js +476 -0
- package/src/htmlParser.js +80 -0
- package/src/linkUtils.js +158 -0
- package/src/pluginDetector.js +158 -0
- package/src/utils/contentCompressor.js +116 -0
- package/src/woocommerceClient.js +88 -0
- package/tests/unit/contentAnalyzer.test.js +397 -0
- package/tests/unit/pluginDetector.test.js +167 -0
- package/tests/unit/tools/analyzeEeatSignals.test.js +192 -0
- package/tests/unit/tools/approval.test.js +251 -0
- package/tests/unit/tools/auditCanonicals.test.js +149 -0
- package/tests/unit/tools/auditHeadingStructure.test.js +150 -0
- package/tests/unit/tools/auditMediaSeo.test.js +123 -0
- package/tests/unit/tools/auditOutboundLinks.test.js +175 -0
- package/tests/unit/tools/auditTaxonomies.test.js +173 -0
- package/tests/unit/tools/contentCompressor.test.js +320 -0
- package/tests/unit/tools/contentIntelligence.test.js +2168 -0
- package/tests/unit/tools/destructive.test.js +246 -0
- package/tests/unit/tools/findBrokenInternalLinks.test.js +222 -0
- package/tests/unit/tools/findKeywordCannibalization.test.js +183 -0
- package/tests/unit/tools/findOrphanPages.test.js +145 -0
- package/tests/unit/tools/findThinContent.test.js +145 -0
- package/tests/unit/tools/internalLinks.test.js +283 -0
- package/tests/unit/tools/perTargetControls.test.js +228 -0
- package/tests/unit/tools/pluginIntelligence.test.js +864 -0
- package/tests/unit/tools/site.test.js +6 -1
- package/tests/unit/tools/woocommerce.test.js +344 -0
- package/tests/unit/tools/woocommerceIntelligence.test.js +341 -0
- package/tests/unit/tools/woocommerceWrite.test.js +323 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@adsim/wordpress-mcp-server",
|
|
3
|
-
"version": "3.1.0",
|
|
3
|
+
"version": "4.5.0",
|
|
4
4
|
"description": "A Model Context Protocol (MCP) server for WordPress REST API integration. Manage posts, search content, and interact with your WordPress site through any MCP-compatible client.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { createHash } from 'crypto';
|
|
2
|
+
|
|
3
|
+
// Shared secret used to salt token hashes. Falls back to a hard-coded value
// when WP_MCP_SECRET is unset — NOTE(review): the default provides no real
// protection; deployments should always set WP_MCP_SECRET.
const SECRET = process.env.WP_MCP_SECRET ?? 'mcp-default-secret';

/**
 * Generate a stateless confirmation token for a destructive action.
 * Format: mcp_{action}_{postId}_{timestamp}_{hash4}
 *
 * @param {number|string} postId - Target post identifier.
 * @param {string} action - Destructive action name (e.g. "delete").
 * @returns {string} Token bound to postId, action and the current second.
 */
export function generateToken(postId, action) {
  const issuedAt = Math.floor(Date.now() / 1000);
  const digest = createHash('sha256')
    .update(`${postId}${action}${issuedAt}${SECRET}`)
    .digest('hex');
  // Only the first 4 hex chars are kept — a deliberate size/strength
  // trade-off for short-lived, human-visible confirmation tokens.
  return `mcp_${action}_${postId}_${issuedAt}_${digest.slice(0, 4)}`;
}

/**
 * Validate a confirmation token previously produced by generateToken().
 *
 * @param {string} token - Token to check.
 * @param {number|string} postId - Expected post identifier.
 * @param {string} action - Expected action name.
 * @param {number} [ttlSeconds=60] - Maximum accepted token age in seconds.
 * @returns {{ valid: boolean, reason?: string }}
 */
export function validateToken(token, postId, action, ttlSeconds = 60) {
  if (typeof token !== 'string' || !token.startsWith('mcp_')) {
    return { valid: false, reason: 'Invalid token format' };
  }

  // postId and action are known, so the expected prefix can be rebuilt.
  const expectedPrefix = `mcp_${action}_${postId}_`;
  if (!token.startsWith(expectedPrefix)) {
    return { valid: false, reason: 'Token does not match post or action' };
  }

  // The remainder is "{timestamp}_{hash4}".
  const remainder = token.slice(expectedPrefix.length);
  const sep = remainder.lastIndexOf('_');
  if (sep === -1) {
    return { valid: false, reason: 'Invalid token format' };
  }

  const timestamp = parseInt(remainder.slice(0, sep), 10);
  const providedHash = remainder.slice(sep + 1);
  if (Number.isNaN(timestamp)) {
    return { valid: false, reason: 'Invalid token format' };
  }

  // Reject tokens older than the TTL.
  const now = Math.floor(Date.now() / 1000);
  if (now - timestamp > ttlSeconds) {
    return { valid: false, reason: 'Token expired' };
  }

  // Recompute the truncated hash over the same fields and compare.
  const expectedHash = createHash('sha256')
    .update(`${postId}${action}${timestamp}${SECRET}`)
    .digest('hex')
    .slice(0, 4);

  return providedHash === expectedHash
    ? { valid: true }
    : { valid: false, reason: 'Invalid token hash' };
}
|
|
@@ -0,0 +1,476 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content analysis utilities for WordPress MCP Server — Content Intelligence v4.4.
|
|
3
|
+
* Readability scoring, transition words, passive voice, content structure detection.
|
|
4
|
+
* Zero external dependencies — regex-based analysis.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { extractHeadings } from './htmlParser.js';
|
|
8
|
+
|
|
9
|
+
// ── French vowels for syllable counting ──
// NOTE(review): declared with the stateful /g flag but appears unused in the
// visible module — countSyllablesFr tests characters with its own non-global
// literal. Confirm before removing.
const VOWELS = /[aeiouyàâäéèêëïîôùûüœæ]/gi;
|
|
11
|
+
|
|
12
|
+
/**
 * Count syllables in a French word using a vowel-group heuristic.
 * A trailing silent 'e' is stripped (except for -ée/-ie/-ue endings and
 * words of 3 characters or fewer), then each maximal run of vowels counts
 * as one syllable.
 *
 * @param {string} word
 * @returns {number} Syllable estimate; at least 1 for non-empty words, 0 for empty input.
 */
export function countSyllablesFr(word) {
  if (!word) return 0;
  let w = word.toLowerCase().trim();
  if (w.length === 0) return 0;

  // Strip trailing silent 'e' (unless word <= 3 chars, or the ending is
  // -ée / -ie / -ue where the final vowels form a pronounced group).
  if (w.length > 3 && w.endsWith('e') && !w.endsWith('ée') && !w.endsWith('ie') && !w.endsWith('ue')) {
    w = w.slice(0, -1);
  }

  // Hoisted out of the loop: the original rebuilt this literal for every
  // character. It must stay NON-global — test() on a /g regex is stateful
  // (lastIndex advances between calls) and would skip characters.
  const VOWEL = /[aeiouyàâäéèêëïîôùûüœæ]/;

  // Count maximal vowel runs.
  let count = 0;
  let prevVowel = false;
  for (const ch of w) {
    const isVowel = VOWEL.test(ch);
    if (isVowel && !prevVowel) count++;
    prevVowel = isVowel;
  }

  return Math.max(1, count);
}
|
|
38
|
+
|
|
39
|
+
/**
 * Strip HTML tags and decode common named entities, returning plain text.
 * Block-level tags are converted to newlines so sentence/paragraph
 * structure survives for the readability analysis.
 *
 * @param {string} html
 * @returns {string}
 */
function stripToText(html) {
  if (!html) return '';
  return html
    // Drop script/style elements wholesale, including their contents.
    .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
    .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
    // Block-level boundaries become newlines.
    .replace(/<\/?(p|div|h[1-6]|li|br|tr|blockquote|pre)[^>]*>/gi, '\n')
    .replace(/<[^>]+>/g, '')
    // Decode the common named entities. '&amp;' is decoded LAST so that a
    // double-escaped sequence like '&amp;lt;' yields '&lt;' rather than '<'.
    .replace(/&lt;/g, '<').replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"').replace(/&#0?39;/g, "'").replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&')
    // Collapse excess whitespace.
    .replace(/\n{3,}/g, '\n\n')
    .replace(/[ \t]+/g, ' ')
    .trim();
}
|
|
57
|
+
|
|
58
|
+
/**
 * Split plain text into sentences on terminal punctuation (. ! ? …).
 * Empty fragments are dropped and surrounding whitespace trimmed.
 * NOTE(review): abbreviations ("M. Dupont") split too — acceptable for the
 * coarse statistics this module computes.
 *
 * @param {string} text Plain text (no HTML)
 * @returns {string[]}
 */
function splitSentences(text) {
  if (!text) return [];
  const sentences = [];
  for (const fragment of text.split(/[.!?…]+/)) {
    const trimmed = fragment.trim();
    if (trimmed.length > 0) sentences.push(trimmed);
  }
  return sentences;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Readability score via a French-adapted Flesch-Kincaid formula:
 *   207 - 1.015 * (words/sentence) - 73.6 * (syllables/word),
 * clamped to [0, 100] and rounded to one decimal.
 *
 * @param {string} html
 * @param {string} lang Language code (default 'fr'; currently unused)
 * @returns {{ score: number, sentences: number, words: number, syllables: number, avg_words_per_sentence: number, avg_syllables_per_word: number, level: string }}
 */
export function calculateReadabilityScore(html, lang = 'fr') {
  const text = stripToText(html);
  // `|| 1` guards the divisions below against empty content.
  const sentenceCount = splitSentences(text).length || 1;
  const words = text.split(/\s+/).filter(w => w.length > 0);
  const wordCount = words.length || 1;

  const totalSyllables = words.reduce((sum, w) => sum + countSyllablesFr(w), 0);

  const avgWordsPerSentence = wordCount / sentenceCount;
  const avgSyllablesPerWord = totalSyllables / wordCount;

  const raw = 207 - 1.015 * avgWordsPerSentence - 73.6 * avgSyllablesPerWord;
  const score = Math.round(Math.max(0, Math.min(100, raw)) * 10) / 10;

  // Map the numeric score onto a human-readable difficulty bucket.
  const level =
    score >= 80 ? 'très facile' :
    score >= 60 ? 'facile' :
    score >= 40 ? 'standard' :
    score >= 20 ? 'difficile' :
    'très difficile';

  return {
    score,
    sentences: sentenceCount,
    words: wordCount,
    syllables: totalSyllables,
    avg_words_per_sentence: Math.round(avgWordsPerSentence * 10) / 10,
    avg_syllables_per_word: Math.round(avgSyllablesPerWord * 100) / 100,
    level
  };
}
|
|
113
|
+
|
|
114
|
+
// ── French transition words ──
// Single- and multi-word connectors, all lowercase; matched
// case-insensitively with boundary-aware regexes in extractTransitionWords().
const TRANSITION_WORDS_FR = [
  'cependant', 'néanmoins', 'en effet', 'par conséquent', 'de plus',
  'en outre', 'toutefois', 'ainsi', 'par ailleurs', 'en revanche',
  "c'est pourquoi", 'autrement dit', 'en somme', "d'une part", "d'autre part",
  'premièrement', 'deuxièmement', 'finalement', 'en conclusion', 'en résumé',
  'par exemple', 'notamment', 'en particulier', "c'est-à-dire", 'à savoir',
  'bien que', 'malgré', 'alors que', 'tandis que', 'puisque',
  'car', 'donc', 'or', 'mais', 'pourtant',
  'du coup', 'ensuite', 'puis', 'enfin', "d'abord"
];
|
|
125
|
+
|
|
126
|
+
/**
 * Find which French transition words occur in the text.
 * Density = distinct transition words found / sentence count.
 *
 * @param {string} text Plain text
 * @param {string} lang Language code (default 'fr'; currently unused)
 * @returns {{ count: number, density: number, words_found: string[] }}
 */
export function extractTransitionWords(text, lang = 'fr') {
  if (!text) return { count: 0, density: 0, words_found: [] };

  const lower = text.toLowerCase();
  const sentenceCount = splitSentences(text).length || 1;

  // Boundary-aware match: the transition word must be delimited by start/end
  // of text, whitespace or punctuation (handles multi-word transitions).
  const wordsFound = TRANSITION_WORDS_FR.filter(tw => {
    const escaped = tw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Fresh regex per word, so the /g flag's lastIndex state cannot leak.
    const boundary = new RegExp(`(?:^|[\\s,;:(])${escaped}(?:[\\s,;:.!?)]|$)`, 'gi');
    return boundary.test(lower);
  });

  return {
    count: wordsFound.length,
    density: Math.round((wordsFound.length / sentenceCount) * 100) / 100,
    words_found: wordsFound
  };
}
|
|
154
|
+
|
|
155
|
+
// ── French passive voice patterns ──
// Auxiliary forms of "être" that can introduce a passive construction.
// Declared with /g — countPassiveSentences derives a FRESH regex from
// .source per sentence, avoiding lastIndex statefulness.
const PASSIVE_AUX_PATTERN = /\b(?:est|sont|a\s+été|ont\s+été|sera|seront|fut|furent|était|étaient)\b/gi;
// Loose past-participle ending test. NOTE(review): deliberately broad — it
// also matches many nouns/adjectives, so passive counts are an upper bound.
const PAST_PARTICIPLE = /\b\w+(?:é|ée|és|ées|i|ie|is|ise|it|ite|u|ue|us|ues|t|te|ts|tes)\b/;
|
|
158
|
+
|
|
159
|
+
/**
 * Count sentences that look like passive voice: an auxiliary form of "être"
 * followed, within the next three words, by a past-participle-like ending.
 * Each sentence is counted at most once.
 *
 * @param {string} text Plain text
 * @param {string} lang Language code (default 'fr'; currently unused)
 * @returns {{ count: number, total_sentences: number, ratio: number }}
 */
export function countPassiveSentences(text, lang = 'fr') {
  if (!text) return { count: 0, total_sentences: 0, ratio: 0 };

  const sentences = splitSentences(text);
  let passiveCount = 0;

  for (const sentence of sentences) {
    // Fresh regex per sentence: a shared /g regex would carry lastIndex
    // state across sentences.
    const auxRegex = new RegExp(PASSIVE_AUX_PATTERN.source, 'gi');
    for (let m = auxRegex.exec(sentence); m !== null; m = auxRegex.exec(sentence)) {
      const tail = sentence.slice(m.index + m[0].length).trim();
      const window = tail.split(/\s+/).slice(0, 3).join(' ');
      if (PAST_PARTICIPLE.test(window)) {
        passiveCount++;
        break; // one hit per sentence is enough
      }
    }
  }

  const total = sentences.length;
  return {
    count: passiveCount,
    total_sentences: total,
    ratio: total > 0 ? Math.round((passiveCount / total) * 100) / 100 : 0
  };
}
|
|
189
|
+
|
|
190
|
+
/**
 * Extract headings outline from HTML (H1-H6).
 * Thin alias over htmlParser's extractHeadings, re-exported here for API
 * naming parity with the other outline helpers in this module.
 * @param {string} html
 * @returns {{ level: number, text: string }[]}
 */
export function extractHeadingsOutline(html) {
  return extractHeadings(html);
}
|
|
198
|
+
|
|
199
|
+
// ── Content section detection ──
// Lowercase keyword lists; compared against lowercased heading text / HTML
// in detectContentSections().
const CONCLUSION_KEYWORDS = ['conclusion', 'résumé', 'en résumé', 'pour conclure', 'en conclusion'];
const FAQ_KEYWORDS = ['faq', 'questions fréquentes', 'questions courantes', 'foire aux questions'];
|
|
202
|
+
|
|
203
|
+
/**
 * Detect content structure: intro, conclusion, FAQ, lists, tables, images.
 *
 * - has_intro: more than 30 chars of plain text before the first <h2>
 * - has_conclusion: the LAST heading contains a conclusion keyword
 * - has_faq: any FAQ keyword appears anywhere in the HTML
 *
 * @param {string} html
 * @returns {{ has_intro: boolean, has_conclusion: boolean, has_faq: boolean, lists_count: number, tables_count: number, images_count: number, headings_count: number }}
 */
export function detectContentSections(html) {
  if (!html) {
    return { has_intro: false, has_conclusion: false, has_faq: false, lists_count: 0, tables_count: 0, images_count: 0, headings_count: 0 };
  }

  const headings = extractHeadings(html);

  // Intro = meaningful text before the first H2 (false if the doc opens
  // with an H2 at index 0).
  const firstH2Idx = html.search(/<h2\b/i);
  const has_intro = firstH2Idx > 0 && stripToText(html.substring(0, firstH2Idx)).length > 30;

  // Conclusion = the last heading mentions a conclusion keyword.
  let has_conclusion = false;
  if (headings.length > 0) {
    const lastText = headings[headings.length - 1].text.toLowerCase();
    has_conclusion = CONCLUSION_KEYWORDS.some(kw => lastText.includes(kw));
  }

  const lowerHtml = html.toLowerCase();
  const has_faq = FAQ_KEYWORDS.some(kw => lowerHtml.includes(kw));

  // Opening-tag occurrence counters.
  const countTag = (re) => (html.match(re) || []).length;

  return {
    has_intro,
    has_conclusion,
    has_faq,
    lists_count: countTag(/<(?:ul|ol)\b/gi),
    tables_count: countTag(/<table\b/gi),
    images_count: countTag(/<img\b/gi),
    headings_count: headings.length
  };
}
|
|
237
|
+
|
|
238
|
+
// ── TF-IDF and Cosine Similarity ──

// Frequent French words (3+ chars) excluded from TF-IDF tokenization.
const STOP_WORDS_FR = new Set(['les', 'des', 'une', 'pour', 'dans', 'par', 'sur', 'avec', 'son', 'ses', 'aux', 'qui', 'que', 'est', 'sont', 'ont', 'été', 'pas', 'plus', 'tout', 'tous', 'cette', 'ces', 'mais', 'comme', 'être', 'avoir', 'faire', 'peut', 'nous', 'vous', 'ils', 'elle', 'leur', 'même', 'entre', 'après', 'sans', 'aussi', 'bien', 'quel', 'autre', 'très', 'encore', 'fait', 'dit', 'deux', 'dont', 'quand']);
|
|
241
|
+
|
|
242
|
+
/**
 * Build TF-IDF vectors for a set of documents.
 * TF = occurrences / document token count; IDF = log(1 + N / df).
 * Tokens shorter than 3 characters and French stop words are ignored.
 *
 * @param {{ id: any, text: string }[]} documents
 * @returns {{ vectors: Map<any, Map<string, number>>, terms: Set<string> }}
 */
export function buildTFIDFVectors(documents) {
  if (!documents || documents.length === 0) return { vectors: new Map(), terms: new Set() };

  const tokenize = (text) =>
    (text || '').toLowerCase()
      .split(/[^a-zàâäéèêëïîôùûüœæç0-9]+/i)
      .filter(t => t.length >= 3 && !STOP_WORDS_FR.has(t));

  // Pass 1: term frequencies per document + document frequency per term.
  const docTokens = new Map();
  const docContainingTerm = new Map();

  for (const { id, text } of documents) {
    const tokens = tokenize(text);
    const total = tokens.length || 1;
    const tf = new Map();
    for (const token of tokens) {
      if (!tf.has(token)) {
        // First occurrence of this term in the doc — record the df.
        let holders = docContainingTerm.get(token);
        if (!holders) {
          holders = new Set();
          docContainingTerm.set(token, holders);
        }
        holders.add(id);
      }
      tf.set(token, (tf.get(token) || 0) + 1 / total);
    }
    docTokens.set(id, tf);
  }

  // Pass 2: weight each TF entry by its IDF.
  const totalDocs = documents.length;
  const allTerms = new Set();
  const vectors = new Map();

  for (const { id } of documents) {
    const tfidf = new Map();
    for (const [term, tfVal] of docTokens.get(id)) {
      const idf = Math.log(1 + totalDocs / docContainingTerm.get(term).size);
      tfidf.set(term, tfVal * idf);
      allTerms.add(term);
    }
    vectors.set(id, tfidf);
  }

  return { vectors, terms: allTerms };
}
|
|
293
|
+
|
|
294
|
+
/**
 * Cosine similarity between two sparse TF-IDF vectors.
 * Returns 0 when either vector is missing/empty or a magnitude is zero.
 *
 * @param {Map<string, number>} vec1
 * @param {Map<string, number>} vec2
 * @returns {number} Similarity in [0, 1] for non-negative weights.
 */
export function computeCosineSimilarity(vec1, vec2) {
  if (!vec1 || !vec2 || vec1.size === 0 || vec2.size === 0) return 0;

  let dot = 0;
  let normSq1 = 0;
  let normSq2 = 0;

  // Single pass over vec1 accumulates its squared norm and the dot product
  // (only terms present in both vectors contribute to the dot product).
  vec1.forEach((val, term) => {
    normSq1 += val * val;
    if (vec2.has(term)) dot += val * vec2.get(term);
  });
  vec2.forEach((val) => {
    normSq2 += val * val;
  });

  const denom = Math.sqrt(normSq1) * Math.sqrt(normSq2);
  return denom === 0 ? 0 : dot / denom;
}
|
|
318
|
+
|
|
319
|
+
/**
 * Find near-duplicate document pairs above a similarity threshold.
 * Compares every unordered pair — O(n^2) in the number of documents.
 *
 * @param {{ id: any, title: string, text: string }[]} documents
 * @param {number} threshold Minimum cosine similarity (default 0.7)
 * @returns {{ doc1_id: any, doc2_id: any, similarity: number }[]}
 */
export function findDuplicatePairs(documents, threshold = 0.7) {
  if (!documents || documents.length < 2) return [];

  const { vectors } = buildTFIDFVectors(documents);
  const ids = [...vectors.keys()];
  const pairs = [];

  ids.forEach((idA, i) => {
    for (const idB of ids.slice(i + 1)) {
      const similarity = computeCosineSimilarity(vectors.get(idA), vectors.get(idB));
      if (similarity >= threshold) {
        pairs.push({ doc1_id: idA, doc2_id: idB, similarity });
      }
    }
  });

  return pairs;
}
|
|
343
|
+
|
|
344
|
+
// ── Named Entity Extraction ──

// Capitalized French function words that must not start or extend an entity
// (sentence-initial articles, pronouns, prepositions, conjunctions, …).
const ENTITY_EXCLUSIONS = new Set(['Le', 'La', 'Les', 'Un', 'Une', 'Des', 'Ce', 'Cette', 'Ces', 'Il', 'Elle', 'Ils', 'Elles', 'On', 'Nous', 'Vous', 'Mon', 'Ma', 'Mes', 'Son', 'Sa', 'Ses', 'Leur', 'Leurs', 'Notre', 'Votre', 'Tout', 'Tous', 'Toute', 'Toutes', 'Quel', 'Quelle', 'Mais', 'Donc', 'Car', 'Puis', 'Aussi', 'Bien', 'Très', 'Plus', 'Moins', 'Pour', 'Dans', 'Avec', 'Sans', 'Sur', 'Sous', 'Par', 'Entre', 'Après', 'Avant', 'Depuis', 'Pendant', 'Comme', 'Si', 'Quand', 'Où', 'Comment', 'Pourquoi', 'À', 'Au', 'Aux', 'En', 'De', 'Du']);

// Well-known product/company names classified as 'brand'.
const KNOWN_BRANDS = new Set(['Google', 'Facebook', 'Meta', 'Microsoft', 'Apple', 'Amazon', 'AWS', 'Azure', 'WordPress', 'Shopify', 'HubSpot', 'Salesforce', 'SEMrush', 'Ahrefs', 'Moz', 'Yoast', 'RankMath', 'WooCommerce', 'Elementor', 'Cloudflare', 'GitHub', 'Twitter', 'LinkedIn', 'Instagram', 'YouTube', 'TikTok', 'ChatGPT', 'OpenAI', 'Anthropic', 'Claude']);

// Place names classified as 'location' (francophone-centric list).
const KNOWN_LOCATIONS = new Set(['Belgique', 'France', 'Bruxelles', 'Paris', 'Liège', 'Europe', 'Wallonie', 'Flandre', 'Luxembourg', 'Suisse', 'Genève', 'Canada', 'Montréal', 'Québec', 'États-Unis', 'New York', 'Londres', 'Berlin', 'Amsterdam']);

// Common French first names used to detect 'person' entities.
const KNOWN_FIRSTNAMES = new Set(['Jean', 'Pierre', 'Marie', 'Paul', 'Michel', 'Jacques', 'Philippe', 'François', 'Nicolas', 'Laurent', 'Julien', 'Thomas', 'David', 'Sophie', 'Julie', 'Isabelle', 'Nathalie', 'Stéphane', 'Christophe', 'Sébastien', 'Georges', 'Antoine', 'Alexandre', 'Marc', 'Olivier']);

// Context hints: a nearby noun like "plateforme" suggests a brand;
// legal-form suffixes and descriptive prefixes suggest an organization.
const BRAND_CONTEXT_WORDS = ['plateforme', 'outil', 'logiciel', 'solution', 'service', 'application', 'app'];
const ORG_SUFFIXES = ['SA', 'SRL', 'SPRL', 'ASBL', 'Inc', 'Corp', 'Ltd', 'GmbH', 'SAS', 'SARL'];
const ORG_PREFIXES = ["l'entreprise", 'la société', "l'agence", 'le groupe'];
|
|
357
|
+
|
|
358
|
+
/**
 * Extract named entities from plain text using regex heuristics.
 * Runs of capitalized words are grouped into candidate entities, then
 * classified (brand / location / person / organization / unknown) from the
 * known-name sets and surrounding context words above.
 * NOTE: an entity keeps the type assigned at its FIRST occurrence; later
 * mentions only increase the count and (up to 2) context snippets.
 * @param {string} text Plain text (no HTML)
 * @returns {{ name: string, type: string, count: number, contexts: string[] }[]}
 */
export function extractEntities(text) {
  if (!text) return [];

  // Split on sentence-terminal punctuation, keeping the delimiter attached
  // via lookbehind so context snippets stay readable.
  const sentences = text.split(/(?<=[.!?…])\s+/).filter(s => s.length > 0);
  const entityMap = new Map(); // name -> { type, count, contexts }

  for (const sentence of sentences) {
    const words = sentence.split(/\s+/);
    if (words.length < 2) continue;

    // Manual index scan: `i` is the current word, `j` ends up one past the
    // grouped entity so the scan can resume after it.
    let i = 0;
    while (i < words.length) {
      const raw = words[i];
      const clean = raw.replace(/[,;:.!?()]+$/, '');
      if (/^[A-ZÀ-Ÿ]/.test(clean) && !ENTITY_EXCLUSIONS.has(clean)) {
        // Group consecutive capitalized words (break on trailing punctuation like comma)
        const parts = [clean];
        const hasPunct = raw !== clean; // word had trailing punctuation
        let j = i + 1;
        if (!hasPunct) {
          while (j < words.length) {
            const rawJ = words[j];
            const cleanJ = rawJ.replace(/[,;:.!?()]+$/, '');
            if (!/^[A-ZÀ-Ÿ]/.test(cleanJ) || ENTITY_EXCLUSIONS.has(cleanJ)) break;
            parts.push(cleanJ);
            if (rawJ !== cleanJ) { j++; break; } // trailing punctuation breaks group
            j++;
          }
        } else {
          // Trailing punct on first word — don't group further
        }
        const entityName = parts.join(' ');
        if (entityName.length < 2) { i = j; continue; }

        // Classify using up to 3 words of context on either side.
        const afterWords = words.slice(j, j + 3).map(w2 => w2.toLowerCase());
        const beforeWords = words.slice(Math.max(0, i - 3), i).map(w2 => w2.toLowerCase()).join(' ');
        let type = 'unknown';

        // Check brand (first-match-wins ordering: brand > location > person > org)
        if (parts.some(p => KNOWN_BRANDS.has(p)) || BRAND_CONTEXT_WORDS.some(bc => afterWords.includes(bc))) {
          type = 'brand';
        }
        // Check location
        else if (KNOWN_LOCATIONS.has(entityName) || parts.some(p => KNOWN_LOCATIONS.has(p))) {
          type = 'location';
        }
        // Check person (firstname + surname)
        else if (parts.length >= 2 && KNOWN_FIRSTNAMES.has(parts[0])) {
          type = 'person';
        }
        // Check organization (legal suffix after/inside, or org prefix before)
        else if (ORG_SUFFIXES.some(s => afterWords[0] === s.toLowerCase() || parts[parts.length - 1] === s) ||
                 ORG_PREFIXES.some(p => beforeWords.includes(p))) {
          type = 'organization';
        }

        if (!entityMap.has(entityName)) {
          entityMap.set(entityName, { type, count: 0, contexts: [] });
        }
        const entry = entityMap.get(entityName);
        entry.count++;
        // Keep at most 2 context snippets, each truncated to 120 chars.
        if (entry.contexts.length < 2) {
          const ctx = sentence.length > 120 ? sentence.substring(0, 120) + '…' : sentence;
          entry.contexts.push(ctx);
        }

        i = j;
      } else {
        i++;
      }
    }
  }

  return [...entityMap.entries()].map(([name, data]) => ({
    name, type: data.type, count: data.count, contexts: data.contexts
  }));
}
|
|
441
|
+
|
|
442
|
+
// ── Text Diff ──

/**
 * Simplified line-set diff between two texts.
 * Lines are compared as exact strings (blank lines ignored); a line counts
 * as unchanged if it appears ANYWHERE in the other text, so moved lines are
 * not reported as changes.
 *
 * @param {string} textA Base text
 * @param {string} textB Target text
 * @returns {{ lines_added: number, lines_removed: number, lines_unchanged: number, words_added: number, words_removed: number, change_ratio: number, added_lines: string[], removed_lines: string[] }}
 */
export function computeTextDiff(textA, textB) {
  const toLines = (text) => (text || '').split('\n').filter(l => l.trim().length > 0);
  const linesA = toLines(textA);
  const linesB = toLines(textB);

  const setA = new Set(linesA);
  const setB = new Set(linesB);

  const removed = linesA.filter(l => !setB.has(l));
  const added = linesB.filter(l => !setA.has(l));
  const unchanged = linesA.filter(l => setB.has(l));

  // Total word count across a list of lines.
  const wordCount = (lines) =>
    lines.reduce((sum, l) => sum + l.split(/\s+/).filter(w => w.length > 0).length, 0);

  const total = added.length + removed.length + unchanged.length;

  return {
    lines_added: added.length,
    lines_removed: removed.length,
    lines_unchanged: unchanged.length,
    words_added: wordCount(added),
    words_removed: wordCount(removed),
    change_ratio: total > 0 ? (added.length + removed.length) / total : 0,
    added_lines: added.slice(0, 20),     // capped previews
    removed_lines: removed.slice(0, 20)
  };
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML parsing utilities for WordPress MCP Server.
|
|
3
|
+
* Zero external dependencies — regex-based parsing.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Extract <img> tags from HTML content.
 * Images without a src attribute are skipped; missing alt/title become ''.
 *
 * @param {string} html
 * @returns {{ src: string, alt: string, title: string }[]}
 */
export function parseImagesFromHtml(html) {
  if (!html) return [];
  const images = [];
  const imgTag = /<img\s[^>]*?>/gi;
  for (let m = imgTag.exec(html); m !== null; m = imgTag.exec(html)) {
    const tag = m[0];
    const src = (tag.match(/src=["']([^"']+)["']/i) || [])[1] || '';
    if (!src) continue; // images without a src are not reported
    const alt = (tag.match(/alt=["']([^"']*?)["']/i) || [])[1] || '';
    const title = (tag.match(/title=["']([^"']*?)["']/i) || [])[1] || '';
    images.push({ src, alt, title });
  }
  return images;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Extract headings (H1-H6) from HTML content.
 * Inline markup nested inside a heading is stripped from its text.
 *
 * @param {string} html
 * @returns {{ level: number, text: string }[]}
 */
export function extractHeadings(html) {
  if (!html) return [];
  const found = [];
  // The \1 backreference ensures the closing tag level matches the opener.
  const headingRegex = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi;
  let m;
  while ((m = headingRegex.exec(html)) !== null) {
    found.push({
      level: Number.parseInt(m[1], 10),
      text: m[2].replace(/<[^>]*>/g, '').trim()
    });
  }
  return found;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Extract internal links from HTML (returns normalised absolute URLs).
 * Handles root-relative ("/path"), protocol-relative ("//host/path") and
 * absolute http(s) hrefs; links to other hosts, non-http(s) schemes, and
 * unparsable URLs are ignored.
 *
 * @param {string} html
 * @param {string} siteUrl e.g. https://example.com
 * @returns {string[]}
 */
export function extractInternalLinks(html, siteUrl) {
  if (!html || !siteUrl) return [];
  let site;
  try { site = new URL(siteUrl); } catch { return []; }
  const siteHost = site.host;
  const base = siteUrl.replace(/\/+$/, '');

  const links = [];
  const anchorRegex = /<a\s[^>]*?href=["']([^"']+)["'][^>]*?>/gi;
  let match;
  while ((match = anchorRegex.exec(html)) !== null) {
    const href = match[1];
    try {
      if (href.startsWith('//')) {
        // Protocol-relative: resolve against the site's own scheme.
        // (Previously dropped entirely, losing valid internal links.)
        const url = new URL(`${site.protocol}${href}`);
        if (url.host === siteHost) links.push(url.href);
      } else if (href.startsWith('/')) {
        links.push(`${base}${href}`);
      } else if (/^https?:\/\//i.test(href)) {
        // Absolute: only http(s) URLs on the same host are internal.
        // (A bare startsWith('http') also matched schemes like httpx://.)
        if (new URL(href).host === siteHost) links.push(href);
      }
    } catch { /* invalid URL, skip */ }
  }
  return links;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Count words in HTML content (strips tags first).
 * @param {string} html
 * @returns {number} Whitespace-delimited word count; 0 for empty input.
 */
export function countWords(html) {
  if (!html) return 0;
  // Tags become spaces so adjacent words don't fuse, then whitespace is
  // normalised before splitting.
  const plain = html.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
  return plain ? plain.split(/\s+/).length : 0;
}
|