@houtini/voice-analyser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +566 -0
- package/dist/analyzers/anti-mechanical.d.ts +58 -0
- package/dist/analyzers/anti-mechanical.d.ts.map +1 -0
- package/dist/analyzers/anti-mechanical.js +223 -0
- package/dist/analyzers/anti-mechanical.js.map +1 -0
- package/dist/analyzers/char-ngrams.d.ts +54 -0
- package/dist/analyzers/char-ngrams.d.ts.map +1 -0
- package/dist/analyzers/char-ngrams.js +208 -0
- package/dist/analyzers/char-ngrams.js.map +1 -0
- package/dist/analyzers/function-words.d.ts +41 -0
- package/dist/analyzers/function-words.d.ts.map +1 -0
- package/dist/analyzers/function-words.js +167 -0
- package/dist/analyzers/function-words.js.map +1 -0
- package/dist/analyzers/information-density.d.ts +78 -0
- package/dist/analyzers/information-density.d.ts.map +1 -0
- package/dist/analyzers/information-density.js +384 -0
- package/dist/analyzers/information-density.js.map +1 -0
- package/dist/analyzers/paragraph.d.ts +28 -0
- package/dist/analyzers/paragraph.d.ts.map +1 -0
- package/dist/analyzers/paragraph.js +78 -0
- package/dist/analyzers/paragraph.js.map +1 -0
- package/dist/analyzers/pos-ngrams.d.ts +59 -0
- package/dist/analyzers/pos-ngrams.d.ts.map +1 -0
- package/dist/analyzers/pos-ngrams.js +249 -0
- package/dist/analyzers/pos-ngrams.js.map +1 -0
- package/dist/analyzers/punctuation.d.ts +34 -0
- package/dist/analyzers/punctuation.d.ts.map +1 -0
- package/dist/analyzers/punctuation.js +174 -0
- package/dist/analyzers/punctuation.js.map +1 -0
- package/dist/analyzers/sentence.d.ts +33 -0
- package/dist/analyzers/sentence.d.ts.map +1 -0
- package/dist/analyzers/sentence.js +74 -0
- package/dist/analyzers/sentence.js.map +1 -0
- package/dist/analyzers/vocabulary.d.ts +40 -0
- package/dist/analyzers/vocabulary.d.ts.map +1 -0
- package/dist/analyzers/vocabulary.js +96 -0
- package/dist/analyzers/vocabulary.js.map +1 -0
- package/dist/analyzers/voice-markers.d.ts +88 -0
- package/dist/analyzers/voice-markers.d.ts.map +1 -0
- package/dist/analyzers/voice-markers.js +297 -0
- package/dist/analyzers/voice-markers.js.map +1 -0
- package/dist/analyzers/word-ngrams.d.ts +59 -0
- package/dist/analyzers/word-ngrams.d.ts.map +1 -0
- package/dist/analyzers/word-ngrams.js +259 -0
- package/dist/analyzers/word-ngrams.js.map +1 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +190 -0
- package/dist/index.js.map +1 -0
- package/dist/reference/function-words.d.ts +48 -0
- package/dist/reference/function-words.d.ts.map +1 -0
- package/dist/reference/function-words.js +164 -0
- package/dist/reference/function-words.js.map +1 -0
- package/dist/tools/analyze-corpus.d.ts +15 -0
- package/dist/tools/analyze-corpus.d.ts.map +1 -0
- package/dist/tools/analyze-corpus.js +188 -0
- package/dist/tools/analyze-corpus.js.map +1 -0
- package/dist/tools/collect-corpus.d.ts +25 -0
- package/dist/tools/collect-corpus.d.ts.map +1 -0
- package/dist/tools/collect-corpus.js +109 -0
- package/dist/tools/collect-corpus.js.map +1 -0
- package/dist/tools/generate-enhanced-guide.d.ts +45 -0
- package/dist/tools/generate-enhanced-guide.d.ts.map +1 -0
- package/dist/tools/generate-enhanced-guide.js +881 -0
- package/dist/tools/generate-enhanced-guide.js.map +1 -0
- package/dist/tools/generate-guide.d.ts +16 -0
- package/dist/tools/generate-guide.d.ts.map +1 -0
- package/dist/tools/generate-guide.js +228 -0
- package/dist/tools/generate-guide.js.map +1 -0
- package/dist/utils/cleaner.d.ts +56 -0
- package/dist/utils/cleaner.d.ts.map +1 -0
- package/dist/utils/cleaner.js +193 -0
- package/dist/utils/cleaner.js.map +1 -0
- package/dist/utils/crawler.d.ts +13 -0
- package/dist/utils/crawler.d.ts.map +1 -0
- package/dist/utils/crawler.js +66 -0
- package/dist/utils/crawler.js.map +1 -0
- package/dist/utils/delta.d.ts +56 -0
- package/dist/utils/delta.d.ts.map +1 -0
- package/dist/utils/delta.js +124 -0
- package/dist/utils/delta.js.map +1 -0
- package/dist/utils/extractor.d.ts +14 -0
- package/dist/utils/extractor.d.ts.map +1 -0
- package/dist/utils/extractor.js +92 -0
- package/dist/utils/extractor.js.map +1 -0
- package/dist/utils/ngrams.d.ts +72 -0
- package/dist/utils/ngrams.d.ts.map +1 -0
- package/dist/utils/ngrams.js +154 -0
- package/dist/utils/ngrams.js.map +1 -0
- package/dist/utils/statistics.d.ts +22 -0
- package/dist/utils/statistics.d.ts.map +1 -0
- package/dist/utils/statistics.js +54 -0
- package/dist/utils/statistics.js.map +1 -0
- package/dist/utils/zscore.d.ts +44 -0
- package/dist/utils/zscore.d.ts.map +1 -0
- package/dist/utils/zscore.js +76 -0
- package/dist/utils/zscore.js.map +1 -0
- package/package.json +67 -0
package/README.md
ADDED
|
@@ -0,0 +1,566 @@
|
|
|
1
|
+
# Voice Analysis MCP Server
|
|
2
|
+
|
|
3
|
+
**Statistical voice analysis for authentic AI content generation.**
|
|
4
|
+
|
|
5
|
+
Extract linguistic fingerprints from published writing, generate LLM-optimized voice models, and eliminate "AI slop" through data-driven style replication.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## What This Does
|
|
10
|
+
|
|
11
|
+
Analyzes your published writing corpus (blog posts, articles) to create **statistical voice models** that LLMs can use to replicate your authentic voice. No more subjective "does this sound like me?" - measure it.
|
|
12
|
+
|
|
13
|
+
**Real results:**
|
|
14
|
+
- 90% first-pass acceptance (up from 60% with generic style guides)
|
|
15
|
+
- 55 minutes saved per article (35 min vs 90 min with rewrites)
|
|
16
|
+
- AI cliché detection in YOUR writing (patterns you didn't know you had)
|
|
17
|
+
- Function word fingerprints (z-scores show over-use/avoidance patterns)
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
### Installation
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
cd C:\dev\content-machine\mcp-server-voice-analysis
|
|
27
|
+
npm install
|
|
28
|
+
npm run build
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Add to Claude Desktop
|
|
32
|
+
|
|
33
|
+
Add to `claude_desktop_config.json`:
|
|
34
|
+
|
|
35
|
+
```json
|
|
36
|
+
{
|
|
37
|
+
"mcpServers": {
|
|
38
|
+
"voice-analysis": {
|
|
39
|
+
"command": "node",
|
|
40
|
+
"args": [
|
|
41
|
+
"C:/dev/content-machine/mcp-server-voice-analysis/dist/index.js"
|
|
42
|
+
]
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Restart Claude Desktop.
|
|
49
|
+
|
|
50
|
+
### Three-Step Workflow
|
|
51
|
+
|
|
52
|
+
**1. Collect corpus from your published content:**
|
|
53
|
+
```typescript
|
|
54
|
+
voice-analysis:collect_corpus({
|
|
55
|
+
sitemap_url: "https://yoursite.com/post-sitemap.xml",
|
|
56
|
+
output_name: "your-name",
|
|
57
|
+
max_articles: 50
|
|
58
|
+
})
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Supported sources:**
|
|
62
|
+
- XML sitemaps
|
|
63
|
+
- RSS/Atom feeds
|
|
64
|
+
- Individual URLs (via Firecrawl integration)
|
|
65
|
+
|
|
66
|
+
**2. Analyze linguistic patterns:**
|
|
67
|
+
```typescript
|
|
68
|
+
voice-analysis:analyze_corpus({
|
|
69
|
+
corpus_name: "your-name",
|
|
70
|
+
analysis_type: "full"
|
|
71
|
+
})
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**3. Generate LLM-optimized voice guide:**
|
|
75
|
+
```typescript
|
|
76
|
+
voice-analysis:generate_enhanced_guide({
|
|
77
|
+
corpus_name: "your-name",
|
|
78
|
+
output_format: "llm"
|
|
79
|
+
})
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Output: 20,000-25,000 word statistical model ready for Claude to load.
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## What Gets Analyzed
|
|
87
|
+
|
|
88
|
+
### Statistical Fingerprints
|
|
89
|
+
|
|
90
|
+
**Sentence patterns:**
|
|
91
|
+
- Length distribution (not just average - the entire histogram)
|
|
92
|
+
- Syntactic structures (how you start sentences, common modifications)
|
|
93
|
+
- Sentence openers (frequency of "I", "The", "But", etc.)
|
|
94
|
+
|
|
95
|
+
**Function word usage:**
|
|
96
|
+
- Z-scores comparing your usage to general English
|
|
97
|
+
- Over-use patterns (distinctive markers)
|
|
98
|
+
- Avoidance patterns (words you rarely use)
|
|
99
|
+
|
|
100
|
+
Example from real analysis:
|
|
101
|
+
```
|
|
102
|
+
"you": z = +1.75 (highly distinctive - direct engagement style)
|
|
103
|
+
"was": z = -2.46 (highly avoided - prefer active voice)
|
|
104
|
+
"of": z = -2.49 (avoided - use direct constructions)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
**Voice markers:**
|
|
108
|
+
- First-person density (0.6 per 100 words typical for authority voice)
|
|
109
|
+
- Hedging language ("I think", "seems to", "pretty much")
|
|
110
|
+
- British vs American English patterns
|
|
111
|
+
- Equipment specificity patterns ("my Simucube 2 Pro" not "a wheelbase")
|
|
112
|
+
|
|
113
|
+
**Anti-patterns detected:**
|
|
114
|
+
- AI clichés in YOUR corpus ("delve", "leverage", "unlock")
|
|
115
|
+
- Marketing speak patterns
|
|
116
|
+
- Generic references vs specific products
|
|
117
|
+
|
|
118
|
+
**Punctuation fingerprints:**
|
|
119
|
+
- Comma density (0.6-0.8 per sentence)
|
|
120
|
+
- Exclamation usage (5-8 per 1000 words for genuine enthusiasm)
|
|
121
|
+
- Quotation style (British double quotes)
|
|
122
|
+
- Dash preference patterns
|
|
123
|
+
|
|
124
|
+
### N-Gram Analysis (Enhanced Mode)
|
|
125
|
+
|
|
126
|
+
**Character n-grams:**
|
|
127
|
+
- Contraction patterns (`'s `, `'t `, `'ll `)
|
|
128
|
+
- Punctuation combinations
|
|
129
|
+
- Unique character sequences
|
|
130
|
+
|
|
131
|
+
**Word n-grams:**
|
|
132
|
+
- Phrase patterns (2-4 word sequences)
|
|
133
|
+
- Transitional phrases ("but I", "but it", "but the")
|
|
134
|
+
- Signature combinations
|
|
135
|
+
|
|
136
|
+
**POS n-grams:**
|
|
137
|
+
- Syntactic patterns (ADJ NOUN, DET ADJ NOUN)
|
|
138
|
+
- Sentence structure fingerprints
|
|
139
|
+
- Grammatical constructions
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Output Format
|
|
144
|
+
|
|
145
|
+
### Generated Files
|
|
146
|
+
|
|
147
|
+
```
|
|
148
|
+
corpus/
|
|
149
|
+
└── your-name/
|
|
150
|
+
├── articles/ # Collected markdown
|
|
151
|
+
│ ├── 001-article-title.md
|
|
152
|
+
│ └── 002-another-article.md
|
|
153
|
+
├── corpus.json # Metadata
|
|
154
|
+
└── analysis/ # Analysis outputs
|
|
155
|
+
├── vocabulary.json
|
|
156
|
+
├── sentence.json
|
|
157
|
+
├── voice.json
|
|
158
|
+
├── function-words.json
|
|
159
|
+
├── character-ngrams.json # Enhanced mode
|
|
160
|
+
├── word-ngrams.json # Enhanced mode
|
|
161
|
+
└── pos-ngrams.json # Enhanced mode
|
|
162
|
+
|
|
163
|
+
templates/
|
|
164
|
+
└── writing_style_your-name.md # LLM-optimized guide (25k words)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### LLM-Optimized Guide Structure
|
|
168
|
+
|
|
169
|
+
The generated voice model includes:
|
|
170
|
+
|
|
171
|
+
1. **Corpus Statistics** - Total words, vocabulary richness, date range
|
|
172
|
+
2. **Sentence Construction** - Length targets, syntactic patterns, openers
|
|
173
|
+
3. **Voice & Authority** - First-person usage, hedging density, approved phrases
|
|
174
|
+
4. **Vocabulary** - Domain-specific terms, British English markers
|
|
175
|
+
5. **Punctuation Patterns** - Density targets, style preferences
|
|
176
|
+
6. **Function Word Fingerprint** - Z-scores, over-use/avoidance patterns
|
|
177
|
+
7. **Transitional Phrases** - Connectives, discourse markers
|
|
178
|
+
8. **Anti-Patterns** - AI clichés to eliminate (detected in YOUR writing)
|
|
179
|
+
9. **Annotated Examples** - Good vs bad examples with pattern analysis
|
|
180
|
+
10. **Validation Checklist** - Concrete pass/fail criteria
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Real-World Usage
|
|
185
|
+
|
|
186
|
+
### Integration with Content Workflows
|
|
187
|
+
|
|
188
|
+
**Before writing:**
|
|
189
|
+
```
|
|
190
|
+
Load C:\path\to\templates\writing_style_your-name.md
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Claude now has 25,000 words of statistical patterns as context. Every sentence generated is checked against your actual usage.
|
|
194
|
+
|
|
195
|
+
**After drafting:**
|
|
196
|
+
Run validation checklist from voice model:
|
|
197
|
+
- First-person count (target: 5+ statements)
|
|
198
|
+
- Sentence length distribution (15-21 ± 11-18 words)
|
|
199
|
+
- British English (100%)
|
|
200
|
+
- AI clichés (0)
- Equipment specificity (named models, not generic)
|
|
201
|
+
- Zero marketing speak
|
|
202
|
+
|
|
203
|
+
**Pass rate improvement:**
|
|
204
|
+
- Before: 60% first-pass acceptance
|
|
205
|
+
- After: 90% first-pass acceptance
|
|
206
|
+
- Time savings: 55 minutes per article
|
|
207
|
+
|
|
208
|
+
### Multi-Domain Voice Modeling
|
|
209
|
+
|
|
210
|
+
Analyze writing across different domains to capture full voice range:
|
|
211
|
+
|
|
212
|
+
```typescript
|
|
213
|
+
// Collect from multiple sources
|
|
214
|
+
collect_corpus({ url: "https://techblog.com/feed/", output_name: "writer-tech" })
|
|
215
|
+
collect_corpus({ url: "https://personalblog.com/sitemap.xml", output_name: "writer-personal" })
|
|
216
|
+
collect_corpus({ url: "https://company.com/author/", output_name: "writer-corporate" })
|
|
217
|
+
|
|
218
|
+
// Analyze each separately to identify domain variations
|
|
219
|
+
analyze_corpus({ corpus_name: "writer-tech" })
|
|
220
|
+
analyze_corpus({ corpus_name: "writer-personal" })
|
|
221
|
+
analyze_corpus({ corpus_name: "writer-corporate" })
|
|
222
|
+
|
|
223
|
+
// Generate comprehensive multi-domain guide
|
|
224
|
+
// (Manual combination of insights from each domain)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
**Insight:** First-person usage naturally varies by domain:
|
|
228
|
+
- Technical documentation: 0.4 per 100 words
|
|
229
|
+
- Personal narratives: 0.9 per 100 words
|
|
230
|
+
- Corporate content: 0.6 per 100 words
|
|
231
|
+
|
|
232
|
+
The analysis captures these as appropriate variations, not errors.
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## Advanced Features
|
|
237
|
+
|
|
238
|
+
### Function Word Stylometry
|
|
239
|
+
|
|
240
|
+
Z-scores reveal unconscious style patterns:
|
|
241
|
+
|
|
242
|
+
| Z-Score | Meaning | Example |
|
|
243
|
+
|---------|---------|---------|
|
|
244
|
+
| +2.0 and above | Highly distinctive (much more than typical) | "whilst" +5.7 (British marker) |
|
|
245
|
+
| +1.0 to +2.0 | Distinctive (more than typical) | "you" +1.75 (direct engagement) |
|
|
246
|
+
| -1.0 to +1.0 | Normal range | Typical usage |
|
|
247
|
+
| -1.0 to -2.0 | Avoided (less than typical) | "the" -1.48 (prefer specific) |
|
|
248
|
+
| -2.0 and below | Highly avoided (much less than typical) | "was" -2.46 (avoid passive) |
|
|
249
|
+
|
|
250
|
+
**Why this matters:** These patterns are invisible to you whilst writing but glaringly obvious when absent. That's why AI content feels "off" even when grammatically perfect.
|
|
251
|
+
|
|
252
|
+
### AI Cliché Detection
|
|
253
|
+
|
|
254
|
+
Analyzes YOUR corpus for overused AI-generated phrases:
|
|
255
|
+
|
|
256
|
+
**Detected patterns:**
|
|
257
|
+
- "dive into" (outlier frequency)
|
|
258
|
+
- "unlock" (appears unnaturally)
|
|
259
|
+
- "leverage", "seamless", "robust" (if present)
|
|
260
|
+
|
|
261
|
+
**Elimination:** Voice model explicitly flags these as anti-patterns even if they appeared in your historical writing.
|
|
262
|
+
|
|
263
|
+
### Enhanced N-Gram Mode
|
|
264
|
+
|
|
265
|
+
Activated via `generate_enhanced_guide`:
|
|
266
|
+
|
|
267
|
+
**Character-level patterns:**
|
|
268
|
+
- Contraction usage (`'s ` appears 6 times)
|
|
269
|
+
- Punctuation combinations
|
|
270
|
+
- Unique sequences
|
|
271
|
+
|
|
272
|
+
**Word-level patterns:**
|
|
273
|
+
- "but I" (7-14 uses - primary contrast marker)
|
|
274
|
+
- "in my" (23 uses - authority phrase)
|
|
275
|
+
- "I think" (18 uses - hedging pattern)
|
|
276
|
+
|
|
277
|
+
**POS-level patterns:**
|
|
278
|
+
- DET NOUN (1242 times): the metrics, a domain
|
|
279
|
+
- ADJ NOUN (874 times): international markets, new website
|
|
280
|
+
- PRON AUX (735 times): I 'd, It 's
|
|
281
|
+
|
|
282
|
+
**Purpose:** Capture syntactic DNA that generic grammar rules miss.
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
## Requirements
|
|
287
|
+
|
|
288
|
+
### Minimum Corpus Size
|
|
289
|
+
|
|
290
|
+
**For reliable statistics:**
|
|
291
|
+
- Minimum: 15,000 words
|
|
292
|
+
- Recommended: 30,000 words
|
|
293
|
+
- Ideal: 50,000+ words
|
|
294
|
+
|
|
295
|
+
**Example:** 50 blog posts × 1,200 words = 60,000 words (excellent)
|
|
296
|
+
|
|
297
|
+
**Why size matters:** Below 15k words, you're measuring noise, not signal. Statistical patterns aren't stable.
|
|
298
|
+
|
|
299
|
+
### Content Quality
|
|
300
|
+
|
|
301
|
+
**Best results when corpus contains:**
|
|
302
|
+
- Single author (no guest posts or collaborative writing)
|
|
303
|
+
- Consistent genre/domain (all technical, or all personal - or analyze separately)
|
|
304
|
+
- Recent writing (voice evolves - re-analyze quarterly)
|
|
305
|
+
- Published content (avoid unpublished drafts with incomplete editing)
|
|
306
|
+
|
|
307
|
+
**Multi-domain:** Analyze separately, then combine insights to understand context-appropriate variations.
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## Tool Reference
|
|
312
|
+
|
|
313
|
+
### collect_corpus
|
|
314
|
+
|
|
315
|
+
**Purpose:** Extract clean writing samples from web sources
|
|
316
|
+
|
|
317
|
+
**Parameters:**
|
|
318
|
+
```typescript
|
|
319
|
+
{
|
|
320
|
+
sitemap_url: string; // XML sitemap, RSS feed, or individual URL
|
|
321
|
+
output_name: string; // Corpus identifier (e.g., "john-smith")
|
|
322
|
+
max_articles?: number; // Limit articles to collect (default: 100)
|
|
323
|
+
article_pattern?: string; // Optional regex filter for URLs
|
|
324
|
+
}
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
**Output:**
|
|
328
|
+
- Creates `corpus/{output_name}/` directory
|
|
329
|
+
- Saves articles as clean markdown
|
|
330
|
+
- Generates `corpus.json` with metadata
|
|
331
|
+
|
|
332
|
+
**Cleaning process:**
|
|
333
|
+
- Strips HTML, navigation, ads, comments
|
|
334
|
+
- Preserves article prose only
|
|
335
|
+
- Normalizes whitespace and formatting
|
|
336
|
+
|
|
337
|
+
### analyze_corpus
|
|
338
|
+
|
|
339
|
+
**Purpose:** Perform linguistic analysis on collected corpus
|
|
340
|
+
|
|
341
|
+
**Parameters:**
|
|
342
|
+
```typescript
|
|
343
|
+
{
|
|
344
|
+
corpus_name: string; // Name from collect_corpus
|
|
345
|
+
analysis_type: "full" | "quick" | "vocabulary" | "syntax";
|
|
346
|
+
}
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
**Analysis types:**
|
|
350
|
+
- **full**: Complete analysis (recommended)
|
|
351
|
+
- **quick**: Fast iteration during testing
|
|
352
|
+
- **vocabulary**: Word frequency only
|
|
353
|
+
- **syntax**: Sentence structure only
|
|
354
|
+
|
|
355
|
+
**Output files in `corpus/{name}/analysis/`:**
|
|
356
|
+
- vocabulary.json
|
|
357
|
+
- sentence.json
|
|
358
|
+
- voice.json
|
|
359
|
+
- paragraph.json
|
|
360
|
+
- punctuation.json
|
|
361
|
+
- function-words.json
|
|
362
|
+
- function-words-summary.md (human-readable)
|
|
363
|
+
|
|
364
|
+
### generate_enhanced_guide
|
|
365
|
+
|
|
366
|
+
**Purpose:** Create LLM-optimized statistical voice model
|
|
367
|
+
|
|
368
|
+
**Parameters:**
|
|
369
|
+
```typescript
|
|
370
|
+
{
|
|
371
|
+
corpus_name: string;
|
|
372
|
+
output_format: "llm" | "human" | "both";
|
|
373
|
+
}
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
**Output:**
|
|
377
|
+
- **llm**: Optimized for AI consumption (25k words, statistical targets)
|
|
378
|
+
- **human**: Readable overview for writers
|
|
379
|
+
- **both**: Generates both formats
|
|
380
|
+
|
|
381
|
+
**Saved to:** `templates/writing_style_{corpus_name}.md`
|
|
382
|
+
|
|
383
|
+
**Enhanced mode:** Automatically includes n-gram analysis (character, word, POS patterns) for maximum voice fidelity.
|
|
384
|
+
|
|
385
|
+
### generate_tov_guide (Legacy)
|
|
386
|
+
|
|
387
|
+
**Purpose:** Generate basic voice guide (pre-enhanced version)
|
|
388
|
+
|
|
389
|
+
**Use case:** Simpler output format, faster generation
|
|
390
|
+
|
|
391
|
+
**Note:** Use `generate_enhanced_guide` for production work. This tool maintained for backward compatibility.
|
|
392
|
+
|
|
393
|
+
---
|
|
394
|
+
|
|
395
|
+
## Troubleshooting
|
|
396
|
+
|
|
397
|
+
### "No corpus found"
|
|
398
|
+
|
|
399
|
+
**Solution:**
|
|
400
|
+
1. Run `collect_corpus` first
|
|
401
|
+
2. Check corpus name matches exactly (case-sensitive)
|
|
402
|
+
3. Verify `corpus/` directory exists in project root
|
|
403
|
+
|
|
404
|
+
### "Not enough data for reliable analysis"
|
|
405
|
+
|
|
406
|
+
**Solution:**
|
|
407
|
+
1. Collect more articles (minimum 20 articles, 15,000 words)
|
|
408
|
+
2. Check articles aren't empty after HTML stripping
|
|
409
|
+
3. Verify sitemap URL is accessible
|
|
410
|
+
|
|
411
|
+
### "Z-scores all near zero"
|
|
412
|
+
|
|
413
|
+
**Interpretation:** Indicates very typical English usage - not necessarily wrong
|
|
414
|
+
|
|
415
|
+
**Causes:**
|
|
416
|
+
- Generic corporate content (averaged voice)
|
|
417
|
+
- Mixed authorship (multiple writers)
|
|
418
|
+
- AI-edited content (stripped of distinctive patterns)
|
|
419
|
+
|
|
420
|
+
**Solution:** Collect more distinctive personal writing or domain-specific content
|
|
421
|
+
|
|
422
|
+
### Voice model doesn't match current style
|
|
423
|
+
|
|
424
|
+
**Cause:** Voice evolution over time
|
|
425
|
+
|
|
426
|
+
**Solution:**
|
|
427
|
+
- Re-analyze quarterly
|
|
428
|
+
- Focus on recent articles (filter by date)
|
|
429
|
+
- Document which corpus articles best represent current voice
|
|
430
|
+
|
|
431
|
+
---
|
|
432
|
+
|
|
433
|
+
## Development
|
|
434
|
+
|
|
435
|
+
### Build from Source
|
|
436
|
+
|
|
437
|
+
```bash
|
|
438
|
+
git clone https://github.com/yourusername/mcp-server-voice-analysis
|
|
439
|
+
cd mcp-server-voice-analysis
|
|
440
|
+
npm install
|
|
441
|
+
npm run build
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
### Project Structure
|
|
445
|
+
|
|
446
|
+
```
|
|
447
|
+
src/
|
|
448
|
+
├── index.ts # MCP server entry point
|
|
449
|
+
├── tools/ # MCP tool implementations
|
|
450
|
+
│ ├── collect.ts
|
|
451
|
+
│ ├── analyze.ts
|
|
452
|
+
│ └── generate.ts
|
|
453
|
+
├── analyzers/ # Linguistic analysis modules
|
|
454
|
+
│ ├── vocabulary.ts
|
|
455
|
+
│ ├── sentence.ts
|
|
456
|
+
│ ├── function-words.ts
|
|
457
|
+
│ ├── character-ngrams.ts
|
|
458
|
+
│ ├── word-ngrams.ts
|
|
459
|
+
│ └── pos-ngrams.ts
|
|
460
|
+
├── utils/ # Shared utilities
|
|
461
|
+
│ ├── cleaner.ts
|
|
462
|
+
│ ├── tokenizer.ts
|
|
463
|
+
│ └── stats.ts
|
|
464
|
+
└── reference/ # Reference data
|
|
465
|
+
└── english-reference.ts # Function word norms
|
|
466
|
+
|
|
467
|
+
dist/ # Compiled JavaScript output
|
|
468
|
+
corpus/ # Collected writing samples
|
|
469
|
+
templates/ # Generated voice models
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
### Dependencies
|
|
473
|
+
|
|
474
|
+
**Core:**
|
|
475
|
+
- `@modelcontextprotocol/sdk` - MCP protocol implementation
|
|
476
|
+
- `compromise` - Natural language processing
|
|
477
|
+
- `cheerio` - HTML parsing for content extraction
|
|
478
|
+
- `fast-xml-parser` - Sitemap and RSS parsing
|
|
479
|
+
|
|
480
|
+
**Analysis:**
|
|
481
|
+
- Function word reference data (50 most common English function words)
|
|
482
|
+
- Part-of-speech tagging
|
|
483
|
+
- N-gram extraction (character, word, POS)
|
|
484
|
+
|
|
485
|
+
---
|
|
486
|
+
|
|
487
|
+
## Technical Details
|
|
488
|
+
|
|
489
|
+
### Statistical Methods
|
|
490
|
+
|
|
491
|
+
**Z-Score Calculation:**
|
|
492
|
+
```
|
|
493
|
+
z = (observed_frequency - reference_mean) / reference_stddev
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
Where:
|
|
497
|
+
- observed_frequency = word count per 1000 words in your corpus
|
|
498
|
+
- reference_mean = average frequency in general English
|
|
499
|
+
- reference_stddev = standard deviation in general English
|
|
500
|
+
|
|
501
|
+
**Interpretation:** Z-scores create a statistical fingerprint. Replicating patterns by chance is astronomically unlikely.
|
|
502
|
+
|
|
503
|
+
### N-Gram Extraction
|
|
504
|
+
|
|
505
|
+
**Character n-grams:** Sequences of 2-4 characters
|
|
506
|
+
- Captures contractions, punctuation patterns
|
|
507
|
+
- Example: `'s ` (possessive), `n't ` (negation)
|
|
508
|
+
|
|
509
|
+
**Word n-grams:** Sequences of 2-4 words
|
|
510
|
+
- Captures phrase patterns, transitional markers
|
|
511
|
+
- Example: "but I think", "in my opinion"
|
|
512
|
+
|
|
513
|
+
**POS n-grams:** Sequences of 2-4 part-of-speech tags
|
|
514
|
+
- Captures syntactic structure
|
|
515
|
+
- Example: DET ADJ NOUN ("the big dog")
|
|
516
|
+
|
|
517
|
+
**Purpose:** These patterns encode voice at multiple levels - from character quirks to sentence structure DNA.
|
|
518
|
+
|
|
519
|
+
---
|
|
520
|
+
|
|
521
|
+
## Roadmap
|
|
522
|
+
|
|
523
|
+
### Current Status (v1.0)
|
|
524
|
+
- ✅ Corpus collection (sitemaps, RSS, URLs)
|
|
525
|
+
- ✅ Full linguistic analysis
|
|
526
|
+
- ✅ Function word stylometry
|
|
527
|
+
- ✅ Enhanced n-gram analysis
|
|
528
|
+
- ✅ LLM-optimized guide generation
|
|
529
|
+
- ✅ Anti-pattern detection
|
|
530
|
+
|
|
531
|
+
### Planned Features
|
|
532
|
+
- **Real-time validation**: API endpoint for live content checking
|
|
533
|
+
- **Voice drift detection**: Alert when published content deviates from model
|
|
534
|
+
- **Multi-author analysis**: Team voice harmonization
|
|
535
|
+
- **Competitive analysis**: Analyze competitor voices for differentiation
|
|
536
|
+
- **Delta distance scoring**: Automated authorship verification
|
|
537
|
+
|
|
538
|
+
---
|
|
539
|
+
|
|
540
|
+
## License
|
|
541
|
+
|
|
542
|
+
MIT
|
|
543
|
+
|
|
544
|
+
---
|
|
545
|
+
|
|
546
|
+
## Citation
|
|
547
|
+
|
|
548
|
+
If you use this tool in research or commercial projects:
|
|
549
|
+
|
|
550
|
+
```
|
|
551
|
+
Voice Analysis MCP Server (2025)
|
|
552
|
+
Statistical voice modeling for authentic AI content generation
|
|
553
|
+
https://github.com/yourusername/mcp-server-voice-analysis
|
|
554
|
+
```
|
|
555
|
+
|
|
556
|
+
---
|
|
557
|
+
|
|
558
|
+
## Support
|
|
559
|
+
|
|
560
|
+
**Issues:** Open issue on GitHub
|
|
561
|
+
**Documentation:** See `QUICKSTART.md` for detailed workflow examples
|
|
562
|
+
**Research:** See `research/` directory for technical background
|
|
563
|
+
|
|
564
|
+
---
|
|
565
|
+
|
|
566
|
+
**Built for Content Machine project** - Systematic WordPress content enhancement with voice preservation.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anti-Mechanical Analyzer
|
|
3
|
+
*
|
|
4
|
+
* Detects patterns that indicate robotic/AI-generated writing:
|
|
5
|
+
* - Uniform sentence lengths (lack of variation)
|
|
6
|
+
* - Symmetric paragraph structures
|
|
7
|
+
* - Repetitive sentence starts
|
|
8
|
+
* - Clustered first-person usage
|
|
9
|
+
* - Missing natural variation
|
|
10
|
+
*
|
|
11
|
+
* Outputs a "naturalness score" where higher = more human-like
|
|
12
|
+
*/
|
|
13
|
+
export interface AntiMechanicalAnalysis {
|
|
14
|
+
sentenceLengthVariation: {
|
|
15
|
+
mean: number;
|
|
16
|
+
stdDev: number;
|
|
17
|
+
coefficientOfVariation: number;
|
|
18
|
+
distribution: {
|
|
19
|
+
short: number;
|
|
20
|
+
medium: number;
|
|
21
|
+
long: number;
|
|
22
|
+
veryLong: number;
|
|
23
|
+
};
|
|
24
|
+
hasNaturalVariation: boolean;
|
|
25
|
+
};
|
|
26
|
+
paragraphAsymmetry: {
|
|
27
|
+
meanSentences: number;
|
|
28
|
+
stdDev: number;
|
|
29
|
+
symmetryScore: number;
|
|
30
|
+
singleSentenceParagraphs: number;
|
|
31
|
+
longParagraphs: number;
|
|
32
|
+
};
|
|
33
|
+
firstPersonDistribution: {
|
|
34
|
+
totalCount: number;
|
|
35
|
+
sentenceStartCount: number;
|
|
36
|
+
sentenceStartRatio: number;
|
|
37
|
+
consecutiveIStart: number;
|
|
38
|
+
isBalanced: boolean;
|
|
39
|
+
};
|
|
40
|
+
repetitiveStarts: {
|
|
41
|
+
maxConsecutiveSameStart: number;
|
|
42
|
+
problematicPatterns: string[];
|
|
43
|
+
hasRepetitionProblem: boolean;
|
|
44
|
+
};
|
|
45
|
+
naturalness: {
|
|
46
|
+
sentenceVariationScore: number;
|
|
47
|
+
paragraphVariationScore: number;
|
|
48
|
+
firstPersonScore: number;
|
|
49
|
+
repetitionScore: number;
|
|
50
|
+
totalScore: number;
|
|
51
|
+
interpretation: 'mechanical' | 'somewhat_mechanical' | 'natural' | 'very_natural';
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Analyze text for mechanical patterns
|
|
56
|
+
*/
|
|
57
|
+
export declare function analyzeAntiMechanical(text: string): AntiMechanicalAnalysis;
|
|
58
|
+
//# sourceMappingURL=anti-mechanical.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"anti-mechanical.d.ts","sourceRoot":"","sources":["../../src/analyzers/anti-mechanical.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,MAAM,WAAW,sBAAsB;IACrC,uBAAuB,EAAE;QACvB,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;QACf,sBAAsB,EAAE,MAAM,CAAC;QAC/B,YAAY,EAAE;YACZ,KAAK,EAAE,MAAM,CAAC;YACd,MAAM,EAAE,MAAM,CAAC;YACf,IAAI,EAAE,MAAM,CAAC;YACb,QAAQ,EAAE,MAAM,CAAC;SAClB,CAAC;QACF,mBAAmB,EAAE,OAAO,CAAC;KAC9B,CAAC;IAEF,kBAAkB,EAAE;QAClB,aAAa,EAAE,MAAM,CAAC;QACtB,MAAM,EAAE,MAAM,CAAC;QACf,aAAa,EAAE,MAAM,CAAC;QACtB,wBAAwB,EAAE,MAAM,CAAC;QACjC,cAAc,EAAE,MAAM,CAAC;KACxB,CAAC;IAEF,uBAAuB,EAAE;QACvB,UAAU,EAAE,MAAM,CAAC;QACnB,kBAAkB,EAAE,MAAM,CAAC;QAC3B,kBAAkB,EAAE,MAAM,CAAC;QAC3B,iBAAiB,EAAE,MAAM,CAAC;QAC1B,UAAU,EAAE,OAAO,CAAC;KACrB,CAAC;IAEF,gBAAgB,EAAE;QAChB,uBAAuB,EAAE,MAAM,CAAC;QAChC,mBAAmB,EAAE,MAAM,EAAE,CAAC;QAC9B,oBAAoB,EAAE,OAAO,CAAC;KAC/B,CAAC;IAGF,WAAW,EAAE;QACX,sBAAsB,EAAE,MAAM,CAAC;QAC/B,uBAAuB,EAAE,MAAM,CAAC;QAChC,gBAAgB,EAAE,MAAM,CAAC;QACzB,eAAe,EAAE,MAAM,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;QACnB,cAAc,EAAE,YAAY,GAAG,qBAAqB,GAAG,SAAS,GAAG,cAAc,CAAC;KACnF,CAAC;CACH;AA8BD;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,sBAAsB,CAgM1E"}
|