@houtini/voice-analyser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. package/README.md +566 -0
  2. package/dist/analyzers/anti-mechanical.d.ts +58 -0
  3. package/dist/analyzers/anti-mechanical.d.ts.map +1 -0
  4. package/dist/analyzers/anti-mechanical.js +223 -0
  5. package/dist/analyzers/anti-mechanical.js.map +1 -0
  6. package/dist/analyzers/char-ngrams.d.ts +54 -0
  7. package/dist/analyzers/char-ngrams.d.ts.map +1 -0
  8. package/dist/analyzers/char-ngrams.js +208 -0
  9. package/dist/analyzers/char-ngrams.js.map +1 -0
  10. package/dist/analyzers/function-words.d.ts +41 -0
  11. package/dist/analyzers/function-words.d.ts.map +1 -0
  12. package/dist/analyzers/function-words.js +167 -0
  13. package/dist/analyzers/function-words.js.map +1 -0
  14. package/dist/analyzers/information-density.d.ts +78 -0
  15. package/dist/analyzers/information-density.d.ts.map +1 -0
  16. package/dist/analyzers/information-density.js +384 -0
  17. package/dist/analyzers/information-density.js.map +1 -0
  18. package/dist/analyzers/paragraph.d.ts +28 -0
  19. package/dist/analyzers/paragraph.d.ts.map +1 -0
  20. package/dist/analyzers/paragraph.js +78 -0
  21. package/dist/analyzers/paragraph.js.map +1 -0
  22. package/dist/analyzers/pos-ngrams.d.ts +59 -0
  23. package/dist/analyzers/pos-ngrams.d.ts.map +1 -0
  24. package/dist/analyzers/pos-ngrams.js +249 -0
  25. package/dist/analyzers/pos-ngrams.js.map +1 -0
  26. package/dist/analyzers/punctuation.d.ts +34 -0
  27. package/dist/analyzers/punctuation.d.ts.map +1 -0
  28. package/dist/analyzers/punctuation.js +174 -0
  29. package/dist/analyzers/punctuation.js.map +1 -0
  30. package/dist/analyzers/sentence.d.ts +33 -0
  31. package/dist/analyzers/sentence.d.ts.map +1 -0
  32. package/dist/analyzers/sentence.js +74 -0
  33. package/dist/analyzers/sentence.js.map +1 -0
  34. package/dist/analyzers/vocabulary.d.ts +40 -0
  35. package/dist/analyzers/vocabulary.d.ts.map +1 -0
  36. package/dist/analyzers/vocabulary.js +96 -0
  37. package/dist/analyzers/vocabulary.js.map +1 -0
  38. package/dist/analyzers/voice-markers.d.ts +88 -0
  39. package/dist/analyzers/voice-markers.d.ts.map +1 -0
  40. package/dist/analyzers/voice-markers.js +297 -0
  41. package/dist/analyzers/voice-markers.js.map +1 -0
  42. package/dist/analyzers/word-ngrams.d.ts +59 -0
  43. package/dist/analyzers/word-ngrams.d.ts.map +1 -0
  44. package/dist/analyzers/word-ngrams.js +259 -0
  45. package/dist/analyzers/word-ngrams.js.map +1 -0
  46. package/dist/index.d.ts +7 -0
  47. package/dist/index.d.ts.map +1 -0
  48. package/dist/index.js +190 -0
  49. package/dist/index.js.map +1 -0
  50. package/dist/reference/function-words.d.ts +48 -0
  51. package/dist/reference/function-words.d.ts.map +1 -0
  52. package/dist/reference/function-words.js +164 -0
  53. package/dist/reference/function-words.js.map +1 -0
  54. package/dist/tools/analyze-corpus.d.ts +15 -0
  55. package/dist/tools/analyze-corpus.d.ts.map +1 -0
  56. package/dist/tools/analyze-corpus.js +188 -0
  57. package/dist/tools/analyze-corpus.js.map +1 -0
  58. package/dist/tools/collect-corpus.d.ts +25 -0
  59. package/dist/tools/collect-corpus.d.ts.map +1 -0
  60. package/dist/tools/collect-corpus.js +109 -0
  61. package/dist/tools/collect-corpus.js.map +1 -0
  62. package/dist/tools/generate-enhanced-guide.d.ts +45 -0
  63. package/dist/tools/generate-enhanced-guide.d.ts.map +1 -0
  64. package/dist/tools/generate-enhanced-guide.js +881 -0
  65. package/dist/tools/generate-enhanced-guide.js.map +1 -0
  66. package/dist/tools/generate-guide.d.ts +16 -0
  67. package/dist/tools/generate-guide.d.ts.map +1 -0
  68. package/dist/tools/generate-guide.js +228 -0
  69. package/dist/tools/generate-guide.js.map +1 -0
  70. package/dist/utils/cleaner.d.ts +56 -0
  71. package/dist/utils/cleaner.d.ts.map +1 -0
  72. package/dist/utils/cleaner.js +193 -0
  73. package/dist/utils/cleaner.js.map +1 -0
  74. package/dist/utils/crawler.d.ts +13 -0
  75. package/dist/utils/crawler.d.ts.map +1 -0
  76. package/dist/utils/crawler.js +66 -0
  77. package/dist/utils/crawler.js.map +1 -0
  78. package/dist/utils/delta.d.ts +56 -0
  79. package/dist/utils/delta.d.ts.map +1 -0
  80. package/dist/utils/delta.js +124 -0
  81. package/dist/utils/delta.js.map +1 -0
  82. package/dist/utils/extractor.d.ts +14 -0
  83. package/dist/utils/extractor.d.ts.map +1 -0
  84. package/dist/utils/extractor.js +92 -0
  85. package/dist/utils/extractor.js.map +1 -0
  86. package/dist/utils/ngrams.d.ts +72 -0
  87. package/dist/utils/ngrams.d.ts.map +1 -0
  88. package/dist/utils/ngrams.js +154 -0
  89. package/dist/utils/ngrams.js.map +1 -0
  90. package/dist/utils/statistics.d.ts +22 -0
  91. package/dist/utils/statistics.d.ts.map +1 -0
  92. package/dist/utils/statistics.js +54 -0
  93. package/dist/utils/statistics.js.map +1 -0
  94. package/dist/utils/zscore.d.ts +44 -0
  95. package/dist/utils/zscore.d.ts.map +1 -0
  96. package/dist/utils/zscore.js +76 -0
  97. package/dist/utils/zscore.js.map +1 -0
  98. package/package.json +67 -0
package/README.md ADDED
@@ -0,0 +1,566 @@
1
+ # Voice Analysis MCP Server
2
+
3
+ **Statistical voice analysis for authentic AI content generation.**
4
+
5
+ Extract linguistic fingerprints from published writing, generate LLM-optimized voice models, and eliminate "AI slop" through data-driven style replication.
6
+
7
+ ---
8
+
9
+ ## What This Does
10
+
11
+ Analyzes your published writing corpus (blog posts, articles) to create **statistical voice models** that LLMs can use to replicate your authentic voice. No more subjective "does this sound like me?" - measure it.
12
+
13
+ **Real results:**
14
+ - 90% first-pass acceptance (up from 60% with generic style guides)
15
+ - 55 minutes saved per article (35 min vs 90 min with rewrites)
16
+ - AI cliché detection in YOUR writing (patterns you didn't know you had)
17
+ - Function word fingerprints (z-scores show over-use/avoidance patterns)
18
+
19
+ ---
20
+
21
+ ## Quick Start
22
+
23
+ ### Installation
24
+
25
+ ```bash
26
+ cd C:\dev\content-machine\mcp-server-voice-analysis
27
+ npm install
28
+ npm run build
29
+ ```
30
+
31
+ ### Add to Claude Desktop
32
+
33
+ Add to `claude_desktop_config.json`:
34
+
35
+ ```json
36
+ {
37
+ "mcpServers": {
38
+ "voice-analysis": {
39
+ "command": "node",
40
+ "args": [
41
+ "C:/dev/content-machine/mcp-server-voice-analysis/dist/index.js"
42
+ ]
43
+ }
44
+ }
45
+ }
46
+ ```
47
+
48
+ Restart Claude Desktop.
49
+
50
+ ### Three-Step Workflow
51
+
52
+ **1. Collect corpus from your published content:**
53
+ ```typescript
54
+ voice-analysis:collect_corpus({
55
+ sitemap_url: "https://yoursite.com/post-sitemap.xml",
56
+ output_name: "your-name",
57
+ max_articles: 50
58
+ })
59
+ ```
60
+
61
+ **Supported sources:**
62
+ - XML sitemaps
63
+ - RSS/Atom feeds
64
+ - Individual URLs (via Firecrawl integration)
65
+
66
+ **2. Analyze linguistic patterns:**
67
+ ```typescript
68
+ voice-analysis:analyze_corpus({
69
+ corpus_name: "your-name",
70
+ analysis_type: "full"
71
+ })
72
+ ```
73
+
74
+ **3. Generate LLM-optimized voice guide:**
75
+ ```typescript
76
+ voice-analysis:generate_enhanced_guide({
77
+ corpus_name: "your-name",
78
+ output_format: "llm"
79
+ })
80
+ ```
81
+
82
+ Output: 20,000-25,000 word statistical model ready for Claude to load.
83
+
84
+ ---
85
+
86
+ ## What Gets Analyzed
87
+
88
+ ### Statistical Fingerprints
89
+
90
+ **Sentence patterns:**
91
+ - Length distribution (not just average - the entire histogram)
92
+ - Syntactic structures (how you start sentences, common modifications)
93
+ - Sentence openers (frequency of "I", "The", "But", etc.)
94
+
95
+ **Function word usage:**
96
+ - Z-scores comparing your usage to general English
97
+ - Over-use patterns (distinctive markers)
98
+ - Avoidance patterns (words you rarely use)
99
+
100
+ Example from real analysis:
101
+ ```
102
+ "you": z = +1.75 (highly distinctive - direct engagement style)
103
+ "was": z = -2.46 (highly avoided - prefer active voice)
104
+ "of": z = -2.49 (avoided - use direct constructions)
105
+ ```
106
+
107
+ **Voice markers:**
108
+ - First-person density (0.6 per 100 words typical for authority voice)
109
+ - Hedging language ("I think", "seems to", "pretty much")
110
+ - British vs American English patterns
111
+ - Equipment specificity patterns ("my Simucube 2 Pro" not "a wheelbase")
112
+
113
+ **Anti-patterns detected:**
114
+ - AI clichés in YOUR corpus ("delve", "leverage", "unlock")
115
+ - Marketing speak patterns
116
+ - Generic references vs specific products
117
+
118
+ **Punctuation fingerprints:**
119
+ - Comma density (0.6-0.8 per sentence)
120
+ - Exclamation usage (5-8 per 1000 words for genuine enthusiasm)
121
+ - Quotation style (British double quotes)
122
+ - Dash preference patterns
123
+
124
+ ### N-Gram Analysis (Enhanced Mode)
125
+
126
+ **Character n-grams:**
127
+ - Contraction patterns (`'s `, `'t `, `'ll `)
128
+ - Punctuation combinations
129
+ - Unique character sequences
130
+
131
+ **Word n-grams:**
132
+ - Phrase patterns (2-4 word sequences)
133
+ - Transitional phrases ("but I", "but it", "but the")
134
+ - Signature combinations
135
+
136
+ **POS n-grams:**
137
+ - Syntactic patterns (ADJ NOUN, DET ADJ NOUN)
138
+ - Sentence structure fingerprints
139
+ - Grammatical constructions
140
+
141
+ ---
142
+
143
+ ## Output Format
144
+
145
+ ### Generated Files
146
+
147
+ ```
148
+ corpus/
149
+ └── your-name/
150
+ ├── articles/ # Collected markdown
151
+ │ ├── 001-article-title.md
152
+ │ └── 002-another-article.md
153
+ ├── corpus.json # Metadata
154
+ └── analysis/ # Analysis outputs
155
+ ├── vocabulary.json
156
+ ├── sentence.json
157
+ ├── voice.json
158
+ ├── function-words.json
159
+ ├── character-ngrams.json # Enhanced mode
160
+ ├── word-ngrams.json # Enhanced mode
161
+ └── pos-ngrams.json # Enhanced mode
162
+
163
+ templates/
164
+ └── writing_style_your-name.md # LLM-optimized guide (25k words)
165
+ ```
166
+
167
+ ### LLM-Optimized Guide Structure
168
+
169
+ The generated voice model includes:
170
+
171
+ 1. **Corpus Statistics** - Total words, vocabulary richness, date range
172
+ 2. **Sentence Construction** - Length targets, syntactic patterns, openers
173
+ 3. **Voice & Authority** - First-person usage, hedging density, approved phrases
174
+ 4. **Vocabulary** - Domain-specific terms, British English markers
175
+ 5. **Punctuation Patterns** - Density targets, style preferences
176
+ 6. **Function Word Fingerprint** - Z-scores, over-use/avoidance patterns
177
+ 7. **Transitional Phrases** - Connectives, discourse markers
178
+ 8. **Anti-Patterns** - AI clichés to eliminate (detected in YOUR writing)
179
+ 9. **Annotated Examples** - Good vs bad examples with pattern analysis
180
+ 10. **Validation Checklist** - Concrete pass/fail criteria
181
+
182
+ ---
183
+
184
+ ## Real-World Usage
185
+
186
+ ### Integration with Content Workflows
187
+
188
+ **Before writing:**
189
+ ```
190
+ Load C:\path\to\templates\writing_style_your-name.md
191
+ ```
192
+
193
+ Claude now has 25,000 words of statistical patterns as context. Every sentence generated is checked against your actual usage.
194
+
195
+ **After drafting:**
196
+ Run validation checklist from voice model:
197
+ - First-person count (target: 5+ statements)
198
+ - Sentence length distribution (15-21 ± 11-18 words)
199
+ - British English (100%)
200
+ - AI clichés (0)
+ - Equipment specificity (named models, not generic)
201
+ - Zero marketing speak
202
+
203
+ **Pass rate improvement:**
204
+ - Before: 60% first-pass acceptance
205
+ - After: 90% first-pass acceptance
206
+ - Time savings: 55 minutes per article
207
+
208
+ ### Multi-Domain Voice Modeling
209
+
210
+ Analyze writing across different domains to capture full voice range:
211
+
212
+ ```typescript
213
+ // Collect from multiple sources
214
+ collect_corpus({ sitemap_url: "https://techblog.com/feed/", output_name: "writer-tech" })
215
+ collect_corpus({ sitemap_url: "https://personalblog.com/sitemap.xml", output_name: "writer-personal" })
216
+ collect_corpus({ sitemap_url: "https://company.com/author/", output_name: "writer-corporate" })
217
+
218
+ // Analyze each separately to identify domain variations
219
+ analyze_corpus({ corpus_name: "writer-tech" })
220
+ analyze_corpus({ corpus_name: "writer-personal" })
221
+ analyze_corpus({ corpus_name: "writer-corporate" })
222
+
223
+ // Generate comprehensive multi-domain guide
224
+ // (Manual combination of insights from each domain)
225
+ ```
226
+
227
+ **Insight:** First-person usage naturally varies by domain:
228
+ - Technical documentation: 0.4 per 100 words
229
+ - Personal narratives: 0.9 per 100 words
230
+ - Corporate content: 0.6 per 100 words
231
+
232
+ The analysis captures these as appropriate variations, not errors.
233
+
234
+ ---
235
+
236
+ ## Advanced Features
237
+
238
+ ### Function Word Stylometry
239
+
240
+ Z-scores reveal unconscious style patterns:
241
+
242
+ | Z-Score | Meaning | Example |
243
+ |---------|---------|---------|
244
+ | +2.0+ | Highly distinctive (much more than typical) | "whilst" +5.7 (British marker) |
245
+ | +1.0 to +2.0 | Distinctive (more than typical) | "you" +1.75 (direct engagement) |
246
+ | -1.0 to +1.0 | Normal range | Typical usage |
247
+ | -1.0 to -2.0 | Avoided (less than typical) | "the" -1.48 (prefer specific) |
248
+ | -2.0 or below | Highly avoided (much less than typical) | "was" -2.46 (avoid passive) |
249
+
250
+ **Why this matters:** These patterns are invisible to you whilst writing but glaringly obvious when absent. That's why AI content feels "off" even when grammatically perfect.
251
+
252
+ ### AI Cliché Detection
253
+
254
+ Analyzes YOUR corpus for overused AI-generated phrases:
255
+
256
+ **Detected patterns:**
257
+ - "dive into" (outlier frequency)
258
+ - "unlock" (appears unnaturally)
259
+ - "leverage", "seamless", "robust" (if present)
260
+
261
+ **Elimination:** Voice model explicitly flags these as anti-patterns even if they appeared in your historical writing.
262
+
263
+ ### Enhanced N-Gram Mode
264
+
265
+ Activated via `generate_enhanced_guide`:
266
+
267
+ **Character-level patterns:**
268
+ - Contraction usage (`'s ` appears 6 times)
269
+ - Punctuation combinations
270
+ - Unique sequences
271
+
272
+ **Word-level patterns:**
273
+ - "but I" (7-14 uses - primary contrast marker)
274
+ - "in my" (23 uses - authority phrase)
275
+ - "I think" (18 uses - hedging pattern)
276
+
277
+ **POS-level patterns:**
278
+ - DET NOUN (1242 times): the metrics, a domain
279
+ - ADJ NOUN (874 times): international markets, new website
280
+ - PRON AUX (735 times): I 'd, It 's
281
+
282
+ **Purpose:** Capture syntactic DNA that generic grammar rules miss.
283
+
284
+ ---
285
+
286
+ ## Requirements
287
+
288
+ ### Minimum Corpus Size
289
+
290
+ **For reliable statistics:**
291
+ - Minimum: 15,000 words
292
+ - Recommended: 30,000 words
293
+ - Ideal: 50,000+ words
294
+
295
+ **Example:** 50 blog posts × 1,200 words = 60,000 words (excellent)
296
+
297
+ **Why size matters:** Below 15k words, you're measuring noise, not signal. Statistical patterns aren't stable.
298
+
299
+ ### Content Quality
300
+
301
+ **Best results when corpus contains:**
302
+ - Single author (no guest posts or collaborative writing)
303
+ - Consistent genre/domain (all technical, or all personal - or analyze separately)
304
+ - Recent writing (voice evolves - re-analyze quarterly)
305
+ - Published content (avoid unpublished drafts with incomplete editing)
306
+
307
+ **Multi-domain:** Analyze separately, then combine insights to understand context-appropriate variations.
308
+
309
+ ---
310
+
311
+ ## Tool Reference
312
+
313
+ ### collect_corpus
314
+
315
+ **Purpose:** Extract clean writing samples from web sources
316
+
317
+ **Parameters:**
318
+ ```typescript
319
+ {
320
+ sitemap_url: string; // XML sitemap, RSS feed, or individual URL
321
+ output_name: string; // Corpus identifier (e.g., "john-smith")
322
+ max_articles?: number; // Limit articles to collect (default: 100)
323
+ article_pattern?: string; // Optional regex filter for URLs
324
+ }
325
+ ```
326
+
327
+ **Output:**
328
+ - Creates `corpus/{output_name}/` directory
329
+ - Saves articles as clean markdown
330
+ - Generates `corpus.json` with metadata
331
+
332
+ **Cleaning process:**
333
+ - Strips HTML, navigation, ads, comments
334
+ - Preserves article prose only
335
+ - Normalizes whitespace and formatting
336
+
337
+ ### analyze_corpus
338
+
339
+ **Purpose:** Perform linguistic analysis on collected corpus
340
+
341
+ **Parameters:**
342
+ ```typescript
343
+ {
344
+ corpus_name: string; // Name from collect_corpus
345
+ analysis_type: "full" | "quick" | "vocabulary" | "syntax";
346
+ }
347
+ ```
348
+
349
+ **Analysis types:**
350
+ - **full**: Complete analysis (recommended)
351
+ - **quick**: Fast iteration during testing
352
+ - **vocabulary**: Word frequency only
353
+ - **syntax**: Sentence structure only
354
+
355
+ **Output files in `corpus/{name}/analysis/`:**
356
+ - vocabulary.json
357
+ - sentence.json
358
+ - voice.json
359
+ - paragraph.json
360
+ - punctuation.json
361
+ - function-words.json
362
+ - function-words-summary.md (human-readable)
363
+
364
+ ### generate_enhanced_guide
365
+
366
+ **Purpose:** Create LLM-optimized statistical voice model
367
+
368
+ **Parameters:**
369
+ ```typescript
370
+ {
371
+ corpus_name: string;
372
+ output_format: "llm" | "human" | "both";
373
+ }
374
+ ```
375
+
376
+ **Output:**
377
+ - **llm**: Optimized for AI consumption (25k words, statistical targets)
378
+ - **human**: Readable overview for writers
379
+ - **both**: Generates both formats
380
+
381
+ **Saved to:** `templates/writing_style_{corpus_name}.md`
382
+
383
+ **Enhanced mode:** Automatically includes n-gram analysis (character, word, POS patterns) for maximum voice fidelity.
384
+
385
+ ### generate_tov_guide (Legacy)
386
+
387
+ **Purpose:** Generate basic voice guide (pre-enhanced version)
388
+
389
+ **Use case:** Simpler output format, faster generation
390
+
391
+ **Note:** Use `generate_enhanced_guide` for production work. This tool maintained for backward compatibility.
392
+
393
+ ---
394
+
395
+ ## Troubleshooting
396
+
397
+ ### "No corpus found"
398
+
399
+ **Solution:**
400
+ 1. Run `collect_corpus` first
401
+ 2. Check corpus name matches exactly (case-sensitive)
402
+ 3. Verify `corpus/` directory exists in project root
403
+
404
+ ### "Not enough data for reliable analysis"
405
+
406
+ **Solution:**
407
+ 1. Collect more articles (minimum 20 articles, 15,000 words)
408
+ 2. Check articles aren't empty after HTML stripping
409
+ 3. Verify sitemap URL is accessible
410
+
411
+ ### "Z-scores all near zero"
412
+
413
+ **Interpretation:** Indicates very typical English usage - not necessarily wrong
414
+
415
+ **Causes:**
416
+ - Generic corporate content (averaged voice)
417
+ - Mixed authorship (multiple writers)
418
+ - AI-edited content (stripped of distinctive patterns)
419
+
420
+ **Solution:** Collect more distinctive personal writing or domain-specific content
421
+
422
+ ### Voice model doesn't match current style
423
+
424
+ **Cause:** Voice evolution over time
425
+
426
+ **Solution:**
427
+ - Re-analyze quarterly
428
+ - Focus on recent articles (filter by date)
429
+ - Document which corpus articles best represent current voice
430
+
431
+ ---
432
+
433
+ ## Development
434
+
435
+ ### Build from Source
436
+
437
+ ```bash
438
+ git clone https://github.com/yourusername/mcp-server-voice-analysis
439
+ cd mcp-server-voice-analysis
440
+ npm install
441
+ npm run build
442
+ ```
443
+
444
+ ### Project Structure
445
+
446
+ ```
447
+ src/
448
+ ├── index.ts # MCP server entry point
449
+ ├── tools/ # MCP tool implementations
450
+ │ ├── collect.ts
451
+ │ ├── analyze.ts
452
+ │ └── generate.ts
453
+ ├── analyzers/ # Linguistic analysis modules
454
+ │ ├── vocabulary.ts
455
+ │ ├── sentence.ts
456
+ │ ├── function-words.ts
457
+ │ ├── character-ngrams.ts
458
+ │ ├── word-ngrams.ts
459
+ │ └── pos-ngrams.ts
460
+ ├── utils/ # Shared utilities
461
+ │ ├── cleaner.ts
462
+ │ ├── tokenizer.ts
463
+ │ └── stats.ts
464
+ └── reference/ # Reference data
465
+ └── english-reference.ts # Function word norms
466
+
467
+ dist/ # Compiled JavaScript output
468
+ corpus/ # Collected writing samples
469
+ templates/ # Generated voice models
470
+ ```
471
+
472
+ ### Dependencies
473
+
474
+ **Core:**
475
+ - `@modelcontextprotocol/sdk` - MCP protocol implementation
476
+ - `compromise` - Natural language processing
477
+ - `cheerio` - HTML parsing for content extraction
478
+ - `fast-xml-parser` - Sitemap and RSS parsing
479
+
480
+ **Analysis:**
481
+ - Function word reference data (50 most common English function words)
482
+ - Part-of-speech tagging
483
+ - N-gram extraction (character, word, POS)
484
+
485
+ ---
486
+
487
+ ## Technical Details
488
+
489
+ ### Statistical Methods
490
+
491
+ **Z-Score Calculation:**
492
+ ```
493
+ z = (observed_frequency - reference_mean) / reference_stddev
494
+ ```
495
+
496
+ Where:
497
+ - observed_frequency = word count per 1000 words in your corpus
498
+ - reference_mean = average frequency in general English
499
+ - reference_stddev = standard deviation in general English
500
+
501
+ **Interpretation:** Z-scores create a statistical fingerprint. Replicating patterns by chance is astronomically unlikely.
502
+
503
+ ### N-Gram Extraction
504
+
505
+ **Character n-grams:** Sequences of 2-4 characters
506
+ - Captures contractions, punctuation patterns
507
+ - Example: `'s ` (possessive), `n't ` (negation)
508
+
509
+ **Word n-grams:** Sequences of 2-4 words
510
+ - Captures phrase patterns, transitional markers
511
+ - Example: "but I think", "in my opinion"
512
+
513
+ **POS n-grams:** Sequences of 2-4 part-of-speech tags
514
+ - Captures syntactic structure
515
+ - Example: DET ADJ NOUN ("the big dog")
516
+
517
+ **Purpose:** These patterns encode voice at multiple levels - from character quirks to sentence structure DNA.
518
+
519
+ ---
520
+
521
+ ## Roadmap
522
+
523
+ ### Current Status (v1.0)
524
+ - ✅ Corpus collection (sitemaps, RSS, URLs)
525
+ - ✅ Full linguistic analysis
526
+ - ✅ Function word stylometry
527
+ - ✅ Enhanced n-gram analysis
528
+ - ✅ LLM-optimized guide generation
529
+ - ✅ Anti-pattern detection
530
+
531
+ ### Planned Features
532
+ - **Real-time validation**: API endpoint for live content checking
533
+ - **Voice drift detection**: Alert when published content deviates from model
534
+ - **Multi-author analysis**: Team voice harmonization
535
+ - **Competitive analysis**: Analyze competitor voices for differentiation
536
+ - **Delta distance scoring**: Automated authorship verification
537
+
538
+ ---
539
+
540
+ ## License
541
+
542
+ MIT
543
+
544
+ ---
545
+
546
+ ## Citation
547
+
548
+ If you use this tool in research or commercial projects:
549
+
550
+ ```
551
+ Voice Analysis MCP Server (2025)
552
+ Statistical voice modeling for authentic AI content generation
553
+ https://github.com/yourusername/mcp-server-voice-analysis
554
+ ```
555
+
556
+ ---
557
+
558
+ ## Support
559
+
560
+ **Issues:** Open issue on GitHub
561
+ **Documentation:** See `QUICKSTART.md` for detailed workflow examples
562
+ **Research:** See `research/` directory for technical background
563
+
564
+ ---
565
+
566
+ **Built for Content Machine project** - Systematic WordPress content enhancement with voice preservation.
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Anti-Mechanical Analyzer
3
+ *
4
+ * Detects patterns that indicate robotic/AI-generated writing:
5
+ * - Uniform sentence lengths (lack of variation)
6
+ * - Symmetric paragraph structures
7
+ * - Repetitive sentence starts
8
+ * - Clustered first-person usage
9
+ * - Missing natural variation
10
+ *
11
+ * Outputs a "naturalness score" where higher = more human-like
12
+ */
13
+ export interface AntiMechanicalAnalysis {
14
+ sentenceLengthVariation: {
15
+ mean: number;
16
+ stdDev: number;
17
+ coefficientOfVariation: number;
18
+ distribution: {
19
+ short: number;
20
+ medium: number;
21
+ long: number;
22
+ veryLong: number;
23
+ };
24
+ hasNaturalVariation: boolean;
25
+ };
26
+ paragraphAsymmetry: {
27
+ meanSentences: number;
28
+ stdDev: number;
29
+ symmetryScore: number;
30
+ singleSentenceParagraphs: number;
31
+ longParagraphs: number;
32
+ };
33
+ firstPersonDistribution: {
34
+ totalCount: number;
35
+ sentenceStartCount: number;
36
+ sentenceStartRatio: number;
37
+ consecutiveIStart: number;
38
+ isBalanced: boolean;
39
+ };
40
+ repetitiveStarts: {
41
+ maxConsecutiveSameStart: number;
42
+ problematicPatterns: string[];
43
+ hasRepetitionProblem: boolean;
44
+ };
45
+ naturalness: {
46
+ sentenceVariationScore: number;
47
+ paragraphVariationScore: number;
48
+ firstPersonScore: number;
49
+ repetitionScore: number;
50
+ totalScore: number;
51
+ interpretation: 'mechanical' | 'somewhat_mechanical' | 'natural' | 'very_natural';
52
+ };
53
+ }
54
+ /**
55
+ * Analyze text for mechanical patterns
56
+ */
57
+ export declare function analyzeAntiMechanical(text: string): AntiMechanicalAnalysis;
58
+ //# sourceMappingURL=anti-mechanical.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"anti-mechanical.d.ts","sourceRoot":"","sources":["../../src/analyzers/anti-mechanical.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,MAAM,WAAW,sBAAsB;IACrC,uBAAuB,EAAE;QACvB,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;QACf,sBAAsB,EAAE,MAAM,CAAC;QAC/B,YAAY,EAAE;YACZ,KAAK,EAAE,MAAM,CAAC;YACd,MAAM,EAAE,MAAM,CAAC;YACf,IAAI,EAAE,MAAM,CAAC;YACb,QAAQ,EAAE,MAAM,CAAC;SAClB,CAAC;QACF,mBAAmB,EAAE,OAAO,CAAC;KAC9B,CAAC;IAEF,kBAAkB,EAAE;QAClB,aAAa,EAAE,MAAM,CAAC;QACtB,MAAM,EAAE,MAAM,CAAC;QACf,aAAa,EAAE,MAAM,CAAC;QACtB,wBAAwB,EAAE,MAAM,CAAC;QACjC,cAAc,EAAE,MAAM,CAAC;KACxB,CAAC;IAEF,uBAAuB,EAAE;QACvB,UAAU,EAAE,MAAM,CAAC;QACnB,kBAAkB,EAAE,MAAM,CAAC;QAC3B,kBAAkB,EAAE,MAAM,CAAC;QAC3B,iBAAiB,EAAE,MAAM,CAAC;QAC1B,UAAU,EAAE,OAAO,CAAC;KACrB,CAAC;IAEF,gBAAgB,EAAE;QAChB,uBAAuB,EAAE,MAAM,CAAC;QAChC,mBAAmB,EAAE,MAAM,EAAE,CAAC;QAC9B,oBAAoB,EAAE,OAAO,CAAC;KAC/B,CAAC;IAGF,WAAW,EAAE;QACX,sBAAsB,EAAE,MAAM,CAAC;QAC/B,uBAAuB,EAAE,MAAM,CAAC;QAChC,gBAAgB,EAAE,MAAM,CAAC;QACzB,eAAe,EAAE,MAAM,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;QACnB,cAAc,EAAE,YAAY,GAAG,qBAAqB,GAAG,SAAS,GAAG,cAAc,CAAC;KACnF,CAAC;CACH;AA8BD;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,sBAAsB,CAgM1E"}